Decision Trees: Information Entropy Gain

2021-10-24 19:19:41

from math import log
import operator


def createdataset():
    """Return the toy loan-application dataset and its feature names.

    Each row is [年齡, 有工作, 有自己的房子, 信貸情況, class label].
    """
    dataset = [["青年", "否", "否", "一般", "否"],
               ["青年", "否", "否", "好", "否"],
               ["青年", "是", "否", "好", "是"],
               ["青年", "是", "是", "一般", "是"],
               ["青年", "否", "否", "一般", "否"],
               ["中年", "否", "否", "一般", "否"],
               ["中年", "否", "否", "好", "否"],
               ["中年", "是", "是", "好", "是"],
               ["中年", "否", "是", "非常好", "是"],
               ["中年", "否", "是", "非常好", "是"],
               ["老年", "否", "是", "非常好", "是"],
               ["老年", "否", "是", "好", "是"],
               ["老年", "是", "否", "好", "是"],
               ["老年", "是", "否", "非常好", "是"],
               ["老年", "否", "否", "一般", "否"]]
    features_names = ["年齡", "有工作", "有自己的房子", "信貸情況"]
    return dataset, features_names

def calcshannonent(dataset):
    """
    :param dataset: dataset whose last column is the class label
    :return: the Shannon entropy of the label distribution
    """
    # count the occurrences of each label with a dict
    label = {}
    num = len(dataset)
    for i in range(num):
        if dataset[i][-1] not in label.keys():
            label[dataset[i][-1]] = 1
        else:
            label[dataset[i][-1]] += 1
    shannonent = 0
    for key in label:
        p = label[key] / num
        shannonent += -p * log(p, 2)
    return shannonent
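A quick sanity check (my own worked example, not from the original post): 9 of the 15 rows are labelled "是" and 6 are "否", so the entropy of the whole dataset should be H(D) = -(9/15)·log2(9/15) - (6/15)·log2(6/15) ≈ 0.971 bits.

dataset, features_names = createdataset()
print(calcshannonent(dataset))  # ≈ 0.971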

def datasplit(dataset, axis, value):
    """
    :param dataset: dataset including labels
    :param axis: index of the feature to split on
    :param value: value of that feature to select
    :return: the rows whose feature `axis` equals `value`, with that feature removed
    """
    num = len(dataset)
    data_feature = []
    for i in range(num):
        if dataset[i][axis] == value:
            # keep the row, minus the column that was split on
            data_left = dataset[i][:axis]
            data_right = dataset[i][axis + 1:]
            data_merge = data_left + data_right
            data_feature.append(data_merge)
    return data_feature
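A small usage illustration (my own example, continuing from the snippet above): splitting on feature index 2 (有自己的房子) with value "是" keeps the six matching rows and drops that column.

subset = datasplit(dataset, 2, "是")
print(len(subset))  # 6
print(subset[0])    # ['青年', '是', '一般', '是']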

def choosebestfeaturetosplit(dataset):
    """
    :param dataset: dataset
    :return:
        best_feature: index of the best feature
        infogainratemax: the largest information gain ratio
    """
    base_shanonent = calcshannonent(dataset)
    features_nums = len(dataset[0]) - 1
    data_nums = len(dataset)
    infogainratemax = 0
    best_feature = -1
    for i in range(features_nums):
        # values taken by feature i across all rows
        feature_list = [feature[i] for feature in dataset]
        feature_list_unique = set(feature_list)
        shanonent_ = 0  # conditional entropy H(D|A)
        h_feature = 0   # split information (intrinsic value) H_A(D)
        for j in feature_list_unique:
            data_feature = datasplit(dataset, i, j)
            p = len(data_feature) / data_nums
            h_feature += -p * log(p, 2)
            shanonent_ += p * calcshannonent(data_feature)
        # information gain ratio = (H(D) - H(D|A)) / H_A(D)
        infogainrate = (base_shanonent - shanonent_) / h_feature
        if infogainrate > infogainratemax:
            infogainratemax = infogainrate
            best_feature = i
    return best_feature, infogainratemax
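Note that despite the title, the criterion implemented here is C4.5's information gain ratio rather than ID3's raw information gain: the gain is divided by the split information h_feature. One caveat: if a feature took only a single value, h_feature would be 0 and the division would fail; that cannot happen with this toy dataset. Checking the selection on the full dataset (my own numbers, computed from the definitions above):

best, ratio = choosebestfeaturetosplit(dataset)
print(features_names[best], round(ratio, 3))  # 有自己的房子 0.433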

def majoritycnt(classlist):
    """
    :param classlist: list of class labels
    :return: the most frequent label in the list
    """
    classcount = {}
    for vote in classlist:
        if vote not in classcount.keys():
            classcount[vote] = 0
        classcount[vote] += 1
    # sort labels by count, most frequent first (dict.items() under Python 3)
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
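A one-line check (my own example); this majority vote is the fallback used by createtree below when no features are left to split on:

print(majoritycnt(["是", "否", "是"]))  # 是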

def createtree(dataset, features_names):
    classlist = [feature[-1] for feature in dataset]
    if len(set(classlist)) == 1:
        return classlist[0]  # all samples share one label: leaf node
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)  # no features left: majority vote
    best_feature, infogainratemax = choosebestfeaturetosplit(dataset)  # index of the best feature
    best_feature_name = features_names[best_feature]  # name of the best feature
    mytree = {best_feature_name: {}}
    del features_names[best_feature]
    featvalues = [feature[best_feature] for feature in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        sublabels = features_names[:]
        mytree[best_feature_name][value] = createtree(
            datasplit(dataset, best_feature, value), sublabels)
    return mytree


dataset, features_names = createdataset()
print(createtree(dataset, features_names))

Result (dict key order may vary between runs):

{'有自己的房子': {'否': {'有工作': {'否': '否', '是': '是'}}, '是': '是'}}
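Reading the result: the root test is 有自己的房子; rows that own a house are classified 是 immediately, and the rest are decided by 有工作. One thing to be aware of: createtree deletes entries from the features_names list it is given (del features_names[best_feature]), so the caller's list is consumed by the call; pass a copy such as features_names[:] if the names are needed afterwards.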
