機器學習實戰 決策樹

2021-08-29 18:39:49 字數 4109 閱讀 5350

tree.py 

from math import log

import operator

def createdataset():
    """Build the toy 'is it a fish?' dataset.

    Each row is [feature1, feature2, class]; the last column is the
    class label ('yes' = fish, 'no' = not a fish).

    Returns:
        (dataset, labels): the sample rows and the two feature names.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    # Feature names matching the first two columns of each row.
    feature_names = ['no su***cing', 'flippers']
    return samples, feature_names

#計算熵

def calcshannonent(dataset):
    """Compute the Shannon entropy of the class labels in *dataset*.

    The class label is taken from the last element of each row.

    Returns:
        float: -sum(p * log2(p)) over the label distribution.
    """
    total = len(dataset)
    # Tally how many times each class label occurs.
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # Fold the label frequencies into the entropy sum.
    entropy = 0.0
    for count in counts.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy

#劃分資料集

def splitdataset(dataset, axis, value):
    """Return the rows of *dataset* whose feature at *axis* equals *value*,
    with that feature column removed.

    Args:
        dataset: list of rows (each a list; last element is the class label).
        axis: index of the feature column to split on.
        value: keep only rows where row[axis] == value.

    Returns:
        A NEW list of reduced rows; the caller's dataset is never mutated.
    """
    # Fresh list so the original dataset object is left untouched.
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Splice out the column at `axis`: everything before it ...
            reducedfeatvec = featvec[:axis]
            # ... plus everything after it.
            reducedfeatvec.extend(featvec[axis + 1:])
            retdataset.append(reducedfeatvec)
    return retdataset

def choosebestfeaturetosplit(dataset):
    """Pick the feature index whose split yields the largest information gain.

    Information gain is the base entropy minus the weighted entropy of
    the subsets produced by splitting on each feature value.

    Returns:
        int: index of the best feature, or -1 if no split improves entropy.
    """
    n_features = len(dataset[0]) - 1  # last column is the class label
    base_entropy = calcshannonent(dataset)
    best_gain, best_feature = 0.0, -1
    for feat in range(n_features):
        # Distinct values this feature takes across the dataset.
        values = {row[feat] for row in dataset}
        # Weighted entropy of the partition induced by this feature.
        split_entropy = 0.0
        for val in values:
            subset = splitdataset(dataset, feat, val)
            weight = len(subset) / float(len(dataset))
            split_entropy += weight * calcshannonent(subset)
        gain = base_entropy - split_entropy
        if gain > best_gain:
            best_gain, best_feature = gain, feat
    return best_feature

def majoritycnt(classlist):
    """Return the most frequent class label in *classlist*.

    Used as a tie-breaker when all features are exhausted but a leaf
    still contains a mix of classes.

    Args:
        classlist: non-empty list of class labels.

    Returns:
        The label with the highest count.
    """
    classcount = {}
    for vote in classlist:
        classcount[vote] = classcount.get(vote, 0) + 1
    # Py2's dict.iteritems() and the bare name `true` do not exist in
    # Python 3; use items() and the proper True constant instead.
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]

def createtree(dataset, labels):
    """Recursively build an ID3 decision tree.

    The tree is a nested dict: {feature_label: {feature_value: subtree_or_class}}.

    Args:
        dataset: list of rows; last column of each row is the class label.
        labels: feature names aligned with the dataset columns.
            NOTE: this list is mutated (the chosen feature is deleted),
            matching the book's original behavior — pass a copy if the
            caller needs the list intact.

    Returns:
        A nested dict tree, or a bare class label for a leaf.
    """
    classlist = [example[-1] for example in dataset]
    # Stop: every sample in this branch has the same class.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Stop: no features left to split on — fall back to the majority class.
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeatlabel = labels[bestfeat]
    # Root of this subtree keyed by the chosen feature's name.
    mytree = {bestfeatlabel: {}}
    del labels[bestfeat]
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        # Copy labels so recursive calls don't corrupt sibling branches.
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(
            splitdataset(dataset, bestfeat, value), sublabels)
    return mytree

def classify(inputtree, featlabels, testvec):
    """Walk a decision tree built by createtree() to classify *testvec*.

    Args:
        inputtree: nested dict {feature_label: {value: subtree_or_class}}.
        featlabels: feature names aligned with testvec positions.
        testvec: feature values for the instance to classify.

    Returns:
        The class label reached at the leaf.
    """
    # Root node's feature name. dict.keys() is a view in Python 3 and
    # cannot be indexed, so take the first key via an iterator instead.
    firststr = next(iter(inputtree))
    seconddict = inputtree[firststr]
    featindex = featlabels.index(firststr)
    key = testvec[featindex]
    valueoffeat = seconddict[key]
    if isinstance(valueoffeat, dict):
        # Internal node: descend into the matching branch.
        classlabel = classify(valueoffeat, featlabels, testvec)
    else:
        # Leaf: the stored value is the class label itself.
        classlabel = valueoffeat
    return classlabel

def storetree(inputtree, filename):
    """Pickle *inputtree* to *filename*.

    The file is opened in binary mode ('wb'): pickle writes bytes, and
    text mode raises TypeError in Python 3. A `with` block guarantees
    the file is closed even if dump() raises.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputtree, fw)

def grabtree(filename):
    """Unpickle and return a tree previously saved with storetree().

    The file is opened in binary mode ('rb'): pickle data is bytes, and
    the default text mode fails in Python 3. The `with` block also fixes
    the original's leaked file handle.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

機器學習實戰 決策樹

決策樹 2 python語言在函式中傳遞的是列表的引用,在函式內部對列表物件的修改,將會影響該列表物件的整個生存週期。為了消除這個不良影響,我們需要在函式的開始宣告乙個新列表物件。在本節中,指的是在劃分資料集函式中,傳遞的引數dataset列表的引用,為了不影響dataset我們重新宣告了乙個ret...

機器學習實戰決策樹

這幾天一直在學習機器學習實戰python 實現,在程式清單的3 6 獲取及誒單數程式,書上的程式是這樣的 def getnumleafs mytree numleafs 0.0 firststr list dict.keys mytree 0 seconddict mytree firststr p...

機器學習實戰 決策樹

class sklearn.tree.decisiontreeclassifier criterion gini splitter best max depth none,min samples split 2,min samples leaf 1,min weight fraction leaf ...