#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import math
import random
import time
from collections import defaultdict

import numpy as np
import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
class maxent(object):
    """Maximum-entropy classifier trained with a simplified GIS update.

    Features are indicator pairs: f(x, y) = 1 iff feature value x was
    observed together with label y in the training data.  Page/equation
    references below point to "统计学习方法" (Li Hang), chapter 6.
    """

    def init_params(self, x, y):
        """Collect empirical distributions and set up model state.

        x: list of samples, each a list of (string-tagged) feature values
        y: list of labels, parallel to x
        """
        self.x_ = x
        self.y_ = set()            # distinct labels seen in training

        self.cal_pxy_px(x, y)

        self.N = len(x)            # training-set size
        self.n = len(self.pxy)     # number of distinct (x, y) pairs
        # Book p.91: M — acts like an inverse learning rate in the update.
        self.m = 10000.0

        self.build_dict()
        self.cal_epxy()

    def cal_pxy_px(self, x, y):
        """p.82: count empirical joint p~(x, y) and marginal p~(x).

        Raw frequencies are stored; normalization by N happens later.
        defaultdict returns 0 for unseen keys instead of raising.
        """
        self.pxy = defaultdict(int)
        self.px = defaultdict(int)
        # Use fresh names (xi, yi, feat): rebinding the parameters here
        # would corrupt the remaining iterations.
        for xi, yi in zip(x, y):
            self.y_.add(yi)
            for feat in xi:        # every feature value of this sample
                self.pxy[(feat, yi)] += 1
                self.px[feat] += 1

    def build_dict(self):
        """Assign one id per distinct (x, y) pair; build both mappings."""
        self.id2xy = {}
        self.xy2id = {}
        for i, pair in enumerate(self.pxy):
            self.id2xy[i] = pair
            self.xy2id[pair] = i

    def fxy(self, x, y):
        """Indicator feature: True iff the pair (x, y) was seen in training."""
        return (x, y) in self.xy2id

    def cal_pyx(self, x, y):
        """Inner term of normalizer Z (6.23), also the numerator of (6.22).

        x is one sample (list of feature values); returns (exp(sum_w), y).
        """
        result = 0.0
        for feat in x:
            if self.fxy(feat, y):
                # f(x, y) = 1, so the term is just the weight itself.
                result += self.w[self.xy2id[(feat, y)]]
        return (math.exp(result), y)

    def cal_probability(self, x):
        """p.85 (6.22): conditional distribution p(y|x) for one sample x."""
        pyxs = [self.cal_pyx(x, label) for label in self.y_]
        z = sum(prob for prob, _ in pyxs)   # normalizer Z
        return [(prob / z, label) for prob, label in pyxs]

    def cal_epx(self):
        """p.83 top: model expectation E_p[f] for every feature id."""
        self.epx = [0.0] * self.n
        for sample in self.x_:
            pyxs = self.cal_probability(sample)   # p(y|x), all labels
            for feat in sample:
                for pyx, label in pyxs:
                    if self.fxy(feat, label):
                        # each occurrence carries empirical weight 1/N
                        self.epx[self.xy2id[(feat, label)]] += pyx * (1.0 / self.N)

    def cal_epxy(self):
        """p.82 bottom: empirical expectation E_p~[f] for every feature id."""
        self.epxy = defaultdict(float)
        for fid in range(self.n):
            pair = self.id2xy[fid]
            self.epxy[fid] = float(self.pxy[pair]) / float(self.N)

    def train(self, x, y, max_iteration=1000):
        """Fit the weights with the simplified iterative-scaling update.

        max_iteration: number of full sweeps; default keeps the original
        1000, exposed as a parameter so short runs are possible.
        """
        self.init_params(x, y)
        self.w = [0.0] * self.n
        for times in range(max_iteration):
            print('iteration times %d' % times)
            self.cal_epx()
            # GIS step (p.91): delta_i = (1/M) * log(E_p~[f_i] / E_p[f_i])
            deltas = [
                1 / self.m * math.log(self.epxy[i] / self.epx[i])
                for i in range(self.n)
            ]
            self.w = [self.w[i] + deltas[i] for i in range(self.n)]

    def predict(self, testset):
        """Return the most probable label for every sample in testset."""
        results = []
        for test in testset:
            probs = self.cal_probability(test)
            # argmax over (probability, label) pairs -> keep the label
            results.append(max(probs, key=lambda item: item[0])[1])
        return results
def rebuild_features(features):
    """Turn each row (a0, a1, a2, ...) into ['0_a0', '1_a1', '2_a2', ...].

    Prefixing every value with its column index keeps equal values from
    different columns distinct, which the indicator feature f(x, y) needs.
    """
    new_features = []
    for feature in features:
        new_feature = [str(i) + '_' + str(f) for i, f in enumerate(feature)]
        new_features.append(new_feature)
    return new_features
if __name__ == '__main__':
    print('start reading data:')
    time1 = time.time()

    # Column 0 is the label; the remaining columns are pixel values.
    raw_data = pd.read_csv('data/train_binary.csv', header=0)
    data = raw_data.values
    imgs = data[:, 1:]
    labels = data[:, 0]

    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=11111)
    train_features = rebuild_features(train_features)
    test_features = rebuild_features(test_features)
    print(len(train_features))
    print(len(test_features))
    # print(train_features[0])

    time2 = time.time()
    print('read data cost %f seconds' % (time2 - time1))

    print('start training:')
    met = maxent()
    met.train(train_features, train_labels)
    time3 = time.time()
    print('training cost %f seconds' % (time3 - time2))

    print('start predicting:')
    test_predict = met.predict(test_features)
    time4 = time.time()
    print('predicting cost %f seconds' % (time4 - time3))

    correct = sum(test_labels[i] == test_predict[i]
                  for i in range(len(test_labels)))
    accuracy = correct / len(test_labels)
    print('the accuracy is %f!' % accuracy)
'''output:
start reading data:
28140
13860
read data cost 42.410567 seconds
start training:
iteration times 0
iteration times 1
iteration times 2
...(要執行十幾個小時,正確率可達97%以上)
'''
# --- Scraped article footers below (related-post blurbs, not code) ---
# 《統計學習方法》筆記07 最大熵模型
# 最大熵模型,最初在吳軍博士 數學之美 看到。那節題目為 不要把雞蛋放在乙個籃子裡 最大熵模型 吳軍談到最大熵原理在人們日常生活中不自覺用到.比如擲乙個色子,六面均勻的情況下6出現的概率為1/6,這幾乎是所有人都會給出的答案,但為什麼是1/6?其實其中蘊含了最大熵原理。吳軍談到這個模型挺複雜,搞了好長...
# 《統計學習方法》學習筆記5 關於最大熵模型學習
# 部落格 如何理解拉格朗日乘子法?解密svm系列 一 關於拉格朗日乘子法和kkt條件 解密svm系列 二 svm的理論基礎 深入理解拉格朗日乘子法 lagrange multiplier 和kkt條件 適用條件 應用於凸函式的帶約束的組合優化問題。kkt條件 關於帶等式以及不等式的約束條件的凸函式優化...
# 統計學習方法 樹模型
# 樹模型 上思維導圖來自知乎 夕小瑤 決策樹演算法主要包括決策樹的生成與剪枝。決策樹可以從兩個方面解釋 決策樹學習的本質是從訓練資料集中歸納出一組分類規則,也可以看做是對特徵空間劃分類的條件概率分布。首先,按照根據統計學習三要素來分析決策樹學習的過程 假設空間 對特徵空間進行劃分所有可能的決策樹 損失...