CART Decision Tree Implementation in Python


from sklearn import tree
import pydotplus
import pandas as pd

def cart_skl_test():
    df = pd.read_csv("../dataset/liquefaction_data_mle.csv")
    x = df[['csr', 'vs']]
    y = df['target']
    clf = tree.DecisionTreeClassifier()
    clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("carttree.png")
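For a more readable graph, export_graphviz also accepts feature and class names. A minimal variation, assuming the same df and clf as above and that 'target' holds binary 0/1 labels (both assumptions, since the dataset itself is not shown):

dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['csr', 'vs'],   # the two input columns used above
    class_names=['0', '1'],        # assumed label values of 'target'
    filled=True,                   # colour nodes by majority class
)
pydotplus.graph_from_dot_data(dot_data).write_png("carttree_named.png")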

import pandas as pd
import math

def get_gini(dataset):
    num_instances = len(dataset)  # number of samples
    label_counts = {}  # count of each label in the current node
    for featvec in dataset:
        current_label = featvec[-1]
        if current_label not in label_counts.keys():
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    sum_prob = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / num_instances
        sum_prob = sum_prob + math.pow(prob, 2)
    gini = 1 - sum_prob
    return gini
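get_gini implements the Gini impurity Gini(D) = 1 - sum_k p_k^2, where p_k is the fraction of samples in D carrying label k. A quick sanity check on hypothetical toy rows (the label sits in the last column, as get_gini expects):

toy = [[0.1, 'yes'], [0.2, 'yes'], [0.3, 'no'], [0.4, 'no']]
print(get_gini(toy))   # 1 - (0.5**2 + 0.5**2) = 0.5  (maximally mixed)

pure = [[0.1, 'yes'], [0.2, 'yes']]
print(get_gini(pure))  # 1 - 1.0**2 = 0.0  (pure node)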

def splitdataset(dataset, axis, value):
    leftdataset = []
    rightdataset = []
    for featvec in dataset:
        if featvec[axis] <= value:
            leftdataset.append(featvec)
        else:
            rightdataset.append(featvec)
    # print(leftdataset)
    # print(rightdataset)
    return leftdataset, rightdataset
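A quick check of splitdataset on the same kind of toy data: splitting column 0 at 0.25 sends rows with feature <= 0.25 left and the rest right.

toy = [[0.1, 'yes'], [0.2, 'yes'], [0.3, 'no'], [0.4, 'no']]
left, right = splitdataset(toy, 0, 0.25)
print(left)   # [[0.1, 'yes'], [0.2, 'yes']]
print(right)  # [[0.3, 'no'], [0.4, 'no']]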

def choosebestfeaturetosplit(dataset):
    # the last column is the class label, not a feature
    numfeatures = len(dataset[0]) - 1
    bestinfogini = 1.0
    bestfeature = -1
    bestsplitvalue = -1
    basegini = get_gini(dataset)
    for i in range(numfeatures):
        # pull out column i as a one-dimensional list
        featlist = [example[i] for example in dataset]
        # drop duplicate values, then sort
        uniquevals = list(set(featlist))
        uniquevals.sort()
        featuresplit = -1  # best split point for the current feature
        featuregini = 1.0  # minimum Gini for the current feature
        # pick the best split point for the current feature
        for j in range(len(uniquevals) - 1):
            value = (uniquevals[j] + uniquevals[j + 1]) / 2
            left_dataset, right_dataset = splitdataset(dataset, i, value)
            prob = len(left_dataset) / float(len(dataset))
            currentgini = prob * get_gini(left_dataset) + (1 - prob) * get_gini(right_dataset)
            if currentgini < featuregini:
                featuregini = currentgini
                featuresplit = value
        # pick the best feature and its split point
        if featuregini < bestinfogini:
            bestinfogini = featuregini
            bestfeature = i
            bestsplitvalue = featuresplit
    print("bestfeature: {}, bestsplitvalue: {}, gini: {}".format(
        bestfeature, bestsplitvalue, basegini))
    return bestfeature, bestsplitvalue, bestinfogini
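choosebestfeaturetosplit scores each candidate midpoint between consecutive sorted feature values by the weighted child impurity, Gini_split = (|L|/|D|) * Gini(L) + (|R|/|D|) * Gini(R), and keeps the minimum. On the toy data above it finds the perfect cut:

toy = [[0.1, 'yes'], [0.2, 'yes'], [0.3, 'no'], [0.4, 'no']]
feat, split, gini = choosebestfeaturetosplit(toy)
print(feat, split, gini)  # 0 0.25 0.0 -- the midpoint 0.25 separates the two classes exactly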

def createtree(dataset, parafeaturename):
    # copy out the labels
    classlist = [example[-1] for example in dataset]
    # when every label in the node is the same --> leaf node
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    bestfeat, bestsplit, gini = choosebestfeaturetosplit(dataset)
    bestfeaturename = parafeaturename[bestfeat]
    mytree = {bestfeaturename: {}}  # store the tree as a nested dict
    # build the subtrees recursively
    lefttree, righttree = splitdataset(dataset, bestfeat, bestsplit)
    mytree[bestfeaturename]["<=" + str(bestsplit)] = createtree(lefttree, parafeaturename)
    mytree[bestfeaturename][">" + str(bestsplit)] = createtree(righttree, parafeaturename)
    return mytree

if __name__ == "__main__":
    # cart_skl_test()
    df = pd.read_csv("../dataset/liquefaction_data_mle.csv")  # read the .csv data
    featurename = df.columns.values
    dataset = []
    for i in df.values:
        dataset.append(list(i))  # collect each row as a plain list
    tree = createtree(dataset, featurename)
    print(tree)
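createtree returns a nested dict of the form {feature: {'<=split': subtree, '>split': subtree}}, with bare labels at the leaves. The post stops at printing the tree; a hypothetical classify helper (not in the original) that walks this structure for a single sample might look like:

def classify(mytree, featurename, sample):
    # Hypothetical helper: leaf nodes are stored as bare labels, not dicts.
    if not isinstance(mytree, dict):
        return mytree
    feat = next(iter(mytree))                # feature tested at this node
    branches = mytree[feat]
    split_key = next(iter(branches))         # e.g. '<=0.25'
    threshold = float(split_key.lstrip('<=>'))
    idx = list(featurename).index(feat)      # column index of this feature
    key = ('<=' if sample[idx] <= threshold else '>') + str(threshold)
    return classify(branches[key], featurename, sample)

# e.g. classify(tree, featurename, [0.25, 180.0]) for a hypothetical [csr, vs] sample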

Output of the hand-written code: the nested dict printed by createtree (only fragments of the printout remain here, e.g. the branch keys '>0.26', '>15.55': 1.0, and '>16.35').

Output of the sklearn version: the decision-tree graph rendered to carttree.png (image not shown here).
