《統計學習方法》: A Maximum Entropy Model Implementation

2021-08-30 11:33:10

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import math

import pandas as pd
from collections import defaultdict
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split

class maxent(object):

    def init_params(self, xs, ys):
        self.x_ = xs              # xs is a list of samples, a 2-D array
        self.y_ = set()

        self.cal_pxy_px(xs, ys)

        self.N = len(xs)          # training-set size
        self.n = len(self.pxy)    # number of distinct (x, y) pairs
        self.m = 10000.0          # the constant M on p. 91; it effectively sets the step size

        self.build_dict()
        self.cal_epxy()

    def cal_pxy_px(self, xs, ys):
        '''p. 82:
        the empirical joint distribution p(x, y) and the empirical marginal distribution p(x)
        '''
        # defaultdict returns the default value instead of raising KeyError on a missing key
        self.pxy = defaultdict(int)
        self.px = defaultdict(int)

        for features, label in zip(xs, ys):
            self.y_.add(label)

            for x in features:  # iterate over every feature of the sample
                self.pxy[(x, label)] += 1
                self.px[x] += 1
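    # What the counts above stand for (both are divided by N where they are used):
    #   p~(x, y) = count(x, y) / N
    #   p~(x)    = count(x) / N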

    def build_dict(self):
        '''Give every (x, y) pair an id and build the mapping in both directions;
        identical (x, y) pairs share a single id.
        '''
        self.id2xy = {}
        self.xy2id = {}

        for i, (x, y) in enumerate(self.pxy):
            self.id2xy[i] = (x, y)
            self.xy2id[(x, y)] = i

    def fxy(self, x, y):
        '''The feature function f(x, y): does the pair (x, y) appear in the training data?'''
        return (x, y) in self.xy2id

    def cal_pyx(self, features, y):
        '''The inner term of the normalizer Z in (6.23),
        which is also the numerator of (6.22).
        '''
        result = 0.0
        for x in features:  # features is one sample; iterate over its features
            if self.fxy(x, y):
                id = self.xy2id[(x, y)]
                result += self.w[id]  # f(x, y) = 1, so no multiplication is needed
        return (math.exp(result), y)

    def cal_probability(self, features):  # features is one sample
        '''Compute (6.22) on p. 85.'''
        pyxs = [self.cal_pyx(features, y) for y in self.y_]
        z = sum([prob for prob, y in pyxs])  # the normalizer Z_w(x)
        return [(prob / z, y) for prob, y in pyxs]
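    # The formulas being computed (p. 85):
    #   (6.22)  P_w(y|x) = exp(sum_i w_i * f_i(x, y)) / Z_w(x)
    #   (6.23)  Z_w(x)   = sum_y exp(sum_i w_i * f_i(x, y))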

    def cal_epx(self):
        '''The model expectation E_p(f) at the top of p. 83.'''
        self.epx = [0.0 for i in range(self.n)]  # one entry per (x, y) pair

        for features in self.x_:  # iterate over every sample
            # p(y|x), x fixed, iterating over y (the author leaves open why this is
            # the whole sample x rather than a single feature)
            pyxs = self.cal_probability(features)

            for x in features:  # iterate over every feature of the sample
                for pyx, y in pyxs:  # pyx is the value for this sample and label y
                    if self.fxy(x, y):
                        id = self.xy2id[(x, y)]
                        self.epx[id] += pyx * (1.0 / self.N)  # 1/N implicitly sums the empirical p~(x)
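    # Formula (top of p. 83): E_p(f) = sum_{x,y} p~(x) * P(y|x) * f(x, y),
    # with p~(x) approximated by weight 1/N per training sample.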

    def cal_epxy(self):
        '''The empirical expectation E_p~(f) at the bottom of p. 82.'''
        self.epxy = defaultdict(float)

        for id in range(self.n):
            (x, y) = self.id2xy[id]
            # divide by the training-set size N, not the pair count n
            self.epxy[id] = float(self.pxy[(x, y)]) / float(self.N)
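    # Formula (bottom of p. 82): E_p~(f) = sum_{x,y} p~(x, y) * f(x, y),
    # which for an indicator feature reduces to count(x, y) / N.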

    def train(self, xs, ys):
        self.init_params(xs, ys)
        self.w = [0.0 for i in range(self.n)]

        max_iteration = 1000
        for times in range(max_iteration):
            print('iteration times %d' % times)

            deltas = []
            self.cal_epx()
            for i in range(self.n):
                # GIS step (p. 91); see the note after this method
                delta = 1 / self.m * math.log(self.epxy[i] / self.epx[i])
                deltas.append(delta)

            self.w = [self.w[i] + deltas[i] for i in range(self.n)]
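    # Note on the update above: when every sample activates the same number of
    # features M, the IIS derivation on p. 91 reduces to the GIS step
    #   delta_i = (1 / M) * log(E_p~(f_i) / E_p(f_i)),
    # i.e. each weight moves to pull the model expectation toward the empirical
    # one; the large constant M (10000 here) makes the steps small and slow.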

    def predict(self, testset):
        results = []
        for test in testset:
            result = self.cal_probability(test)
            # choose the label with the highest conditional probability
            results.append(max(result, key=lambda pair: pair[0])[1])
        return results

def rebuild_features(features):
    '''Turn each original feature vector (a0, a1, a2, a3, ...)
    into the form (0_a0, 1_a1, 2_a2, 3_a3, ...),
    because f(x, y) needs to distinguish the same value in different positions.
    '''
    new_features = []
    for feature in features:
        new_feature = []
        for i, f in enumerate(feature):  # iterate over the columns of one row
            new_feature.append(str(i) + '_' + str(f))
        new_features.append(new_feature)
    return new_features
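# For example, rebuild_features([[3, 0, 0]]) returns [['0_3', '1_0', '2_0']]:
# the value 0 in column 1 and the value 0 in column 2 become the distinct
# features '1_0' and '2_0'.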

if __name__ == '__main__':
    print('start reading data:')
    time1 = time.time()

    raw_data = pd.read_csv('data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[:, 1:]
    labels = data[:, 0]

    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=11111)

    train_features = rebuild_features(train_features)
    test_features = rebuild_features(test_features)

    print(len(train_features))
    print(len(test_features))
    # print(train_features[0])

    time2 = time.time()
    print('read data cost %f seconds' % (time2 - time1))

    print('start training:')
    met = maxent()
    met.train(train_features, train_labels)

    time3 = time.time()
    print('training cost %f seconds' % (time3 - time2))

    print('start predicting:')
    test_predict = met.predict(test_features)

    time4 = time.time()
    print('predicting cost %f seconds' % (time4 - time3))

    accuracy = sum([test_labels[i] == test_predict[i]
                    for i in range(len(test_labels))]) / len(test_labels)
    print('the accuracy is %f!' % accuracy)

'''output:
start reading data:
28140
13860
read data cost 42.410567 seconds
start training:
iteration times 0
iteration times 1
iteration times 2
... (the full run takes ten-odd hours; accuracy can reach over 97%)
'''
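For a quick sanity check that does not need the MNIST-style train_binary.csv, the class can be exercised on a tiny hand-made dataset. The snippet below is a minimal sketch: the feature values and labels are made up, the module name in the import is hypothetical, and it is meant to replace the CSV-driven __main__ block above (train still runs its hardcoded 1000 iterations and prints one line each).

from maxent_demo import maxent, rebuild_features  # hypothetical module name

# Made-up toy data: four samples, three binary features, two classes.
toy_x = [[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0]]
toy_y = [1, 1, 0, 0]

model = maxent()
model.train(rebuild_features(toy_x), toy_y)  # prints one line per iteration

# Predicting on two of the training samples should recover their labels, [1, 0].
print(model.predict(rebuild_features([[1, 0, 1], [0, 1, 0]])))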
