機器學習實戰 決策樹

2021-08-29 18:39:49 字數 4109 閱讀 5350

tree.py 

from math import log

import operator

def createdataset():
    """Build the toy 'is it a fish?' dataset.

    Each row is [feature1, feature2, class]; the last column is the
    class label ('yes' = fish, 'no' = not a fish).

    Returns:
        (dataset, labels): the sample rows and the two feature names.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    # Feature names matching the first two columns of each row.
    feature_names = ['no su***cing', 'flippers']
    return samples, feature_names

#計算熵

def calcshannonent(dataset):
    """Compute the Shannon entropy of the class labels in *dataset*.

    The class label is taken from the last element of each row.

    Returns:
        float: -sum(p * log2(p)) over the label distribution.
    """
    total = len(dataset)
    # Tally how many times each class label occurs.
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # Fold the label frequencies into the entropy sum.
    entropy = 0.0
    for count in counts.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy

#劃分資料集

def splitdataset(dataset, axis, value):
    """Return the rows of *dataset* whose feature at *axis* equals *value*,
    with that feature column removed.

    Args:
        dataset: list of rows (each a list; last element is the class label).
        axis: index of the feature column to split on.
        value: keep only rows where row[axis] == value.

    Returns:
        A NEW list of reduced rows; the caller's dataset is never mutated.
    """
    # Fresh list so the original dataset object is left untouched.
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Splice out the column at `axis`: everything before it ...
            reducedfeatvec = featvec[:axis]
            # ... plus everything after it.
            reducedfeatvec.extend(featvec[axis + 1:])
            retdataset.append(reducedfeatvec)
    return retdataset

def choosebestfeaturetosplit(dataset):
    """Pick the feature index whose split yields the largest information gain.

    Information gain is the base entropy minus the weighted entropy of
    the subsets produced by splitting on each feature value.

    Returns:
        int: index of the best feature, or -1 if no split improves entropy.
    """
    n_features = len(dataset[0]) - 1  # last column is the class label
    base_entropy = calcshannonent(dataset)
    best_gain, best_feature = 0.0, -1
    for feat in range(n_features):
        # Distinct values this feature takes across the dataset.
        values = {row[feat] for row in dataset}
        # Weighted entropy of the partition induced by this feature.
        split_entropy = 0.0
        for val in values:
            subset = splitdataset(dataset, feat, val)
            weight = len(subset) / float(len(dataset))
            split_entropy += weight * calcshannonent(subset)
        gain = base_entropy - split_entropy
        if gain > best_gain:
            best_gain, best_feature = gain, feat
    return best_feature

def majoritycnt(classlist):
    """Return the most frequent class label in *classlist*.

    Used as a tie-breaker when all features are exhausted but a leaf
    still contains a mix of classes.

    Args:
        classlist: non-empty list of class labels.

    Returns:
        The label with the highest count.
    """
    classcount = {}
    for vote in classlist:
        classcount[vote] = classcount.get(vote, 0) + 1
    # Py2's dict.iteritems() and the bare name `true` do not exist in
    # Python 3; use items() and the proper True constant instead.
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]

def createtree(dataset, labels):
    """Recursively build an ID3 decision tree.

    The tree is a nested dict: {feature_label: {feature_value: subtree_or_class}}.

    Args:
        dataset: list of rows; last column of each row is the class label.
        labels: feature names aligned with the dataset columns.
            NOTE: this list is mutated (the chosen feature is deleted),
            matching the book's original behavior — pass a copy if the
            caller needs the list intact.

    Returns:
        A nested dict tree, or a bare class label for a leaf.
    """
    classlist = [example[-1] for example in dataset]
    # Stop: every sample in this branch has the same class.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Stop: no features left to split on — fall back to the majority class.
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeatlabel = labels[bestfeat]
    # Root of this subtree keyed by the chosen feature's name.
    mytree = {bestfeatlabel: {}}
    del labels[bestfeat]
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        # Copy labels so recursive calls don't corrupt sibling branches.
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(
            splitdataset(dataset, bestfeat, value), sublabels)
    return mytree

def classify(inputtree, featlabels, testvec):
    """Walk a decision tree built by createtree() to classify *testvec*.

    Args:
        inputtree: nested dict {feature_label: {value: subtree_or_class}}.
        featlabels: feature names aligned with testvec positions.
        testvec: feature values for the instance to classify.

    Returns:
        The class label reached at the leaf.
    """
    # Root node's feature name. dict.keys() is a view in Python 3 and
    # cannot be indexed, so take the first key via an iterator instead.
    firststr = next(iter(inputtree))
    seconddict = inputtree[firststr]
    featindex = featlabels.index(firststr)
    key = testvec[featindex]
    valueoffeat = seconddict[key]
    if isinstance(valueoffeat, dict):
        # Internal node: descend into the matching branch.
        classlabel = classify(valueoffeat, featlabels, testvec)
    else:
        # Leaf: the stored value is the class label itself.
        classlabel = valueoffeat
    return classlabel

def storetree(inputtree, filename):
    """Pickle *inputtree* to *filename*.

    The file is opened in binary mode ('wb'): pickle writes bytes, and
    text mode raises TypeError in Python 3. A `with` block guarantees
    the file is closed even if dump() raises.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputtree, fw)

def grabtree(filename):
    """Unpickle and return a tree previously saved with storetree().

    The file is opened in binary mode ('rb'): pickle data is bytes, and
    the default text mode fails in Python 3. The `with` block also fixes
    the original's leaked file handle.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

機器學習實戰 決策樹

決策樹 2 python語言在函式中傳遞的是列表的引用,在函式內部對列表物件的修改,將會影響該列表物件的整個生存週期。為了消除這個不良影響,我們需要在函式的開始宣告乙個新列表物件。在本節中,指的是在劃分資料集函式中,傳遞的引數dataset列表的引用,為了不影響dataset我們重新宣告了乙個ret...

機器學習實戰決策樹

這幾天一直在學習機器學習實戰python 實現,在程式清單的3 6 獲取及誒單數程式,書上的程式是這樣的 def getnumleafs mytree numleafs 0.0 firststr list dict.keys mytree 0 seconddict mytree firststr p...

機器學習實戰 決策樹

class sklearn.tree.decisiontreeclassifier criterion gini splitter best max depth none,min samples split 2,min samples leaf 1,min weight fraction leaf ...