Implementing a simple decision tree algorithm with numpy


Following the divide-and-conquer idea behind decision trees, this post encapsulates a decision tree classifier based on the Gini criterion, with support for tuning two hyperparameters: the maximum tree depth (max_depth) and the minimum number of samples in a leaf node (min_samples_leaf).
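For reference, the impurity measure implemented by the gini() helper below is the standard Gini impurity. For a sample set D with K classes,

    Gini(D) = 1 - \sum_{k=1}^{K} p_k^2

where p_k is the fraction of samples in D belonging to class k; a pure node scores 0. One thing to notice in the code: try_split scores a candidate split by the unweighted sum gini(y_left) + gini(y_right), whereas CART-style trees usually weight each child's impurity by its share of the samples. The unweighted form keeps the sketch simple, but it can favor splits that peel off very small subsets.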

import numpy as np
from collections import Counter

'''encapsulate the decision tree method

author: evan
'''

class DecisionTreeClassifier:

    def __init__(self, max_depth=2, min_samples_leaf=1):
        self.tree_ = None
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

    def fit(self, x, y):
        self.tree_ = self.create_tree(x, y)
        return self

    def create_tree(self, x, y, current_depth=1):
        # Stop splitting once the maximum depth is exceeded.
        if current_depth > self.max_depth:
            return None
        d, v, g = try_split(x, y, self.min_samples_leaf)
        # No valid split found, or the best split is already pure.
        if d == -1 or g == 0:
            return None
        node = Node(d, v, g)
        x_left, x_right, y_left, y_right = cut(x, y, v, d)
        node.children_left = self.create_tree(x_left, y_left, current_depth + 1)
        if node.children_left is None:
            # The left branch cannot be split further: make it a leaf
            # labeled with the majority class.
            label = Counter(y_left).most_common(1)[0][0]
            node.children_left = Node(l=label)
        node.children_right = self.create_tree(x_right, y_right, current_depth + 1)
        if node.children_right is None:
            label = Counter(y_right).most_common(1)[0][0]
            node.children_right = Node(l=label)
        return node

    def predict(self, x):
        assert self.tree_ is not None, 'call the fit() method first'
        return np.array([self._predict(sample, self.tree_) for sample in x])

    def _predict(self, x, node):
        # A node carrying a label is a leaf.
        if node.label is not None:
            return node.label
        if x[node.dim] <= node.value:
            return self._predict(x, node.children_left)
        else:
            return self._predict(x, node.children_right)

def cut(x, y, v, d):
    '''Split the data in two: samples with feature d <= v go left, the rest go right.'''
    ind_left = (x[:, d] <= v)
    ind_right = (x[:, d] > v)
    return x[ind_left], x[ind_right], y[ind_left], y[ind_right]

def try_split(x, y, min_samples_leaf):
    '''Scan every feature and threshold; return the best split found.'''
    best_g = 1
    best_d = -1
    best_v = -1
    for d in range(x.shape[1]):
        sorted_index = np.argsort(x[:, d])
        for i in range(len(x) - 1):
            # Equal neighboring values cannot be separated by a threshold.
            if x[sorted_index[i], d] == x[sorted_index[i + 1], d]:
                continue
            # Candidate threshold: midpoint of two adjacent sorted values.
            v = (x[sorted_index[i], d] + x[sorted_index[i + 1], d]) / 2
            # print("d={},v={}".format(d, v))
            x_left, x_right, y_left, y_right = cut(x, y, v, d)
            gini_all = gini(y_left) + gini(y_right)
            # print("d={},v={},g={}".format(d, v, gini_all))
            # Keep the split with the lowest total Gini impurity, provided
            # both children hold at least min_samples_leaf samples.
            if gini_all < best_g and len(y_left) >= min_samples_leaf and len(y_right) >= min_samples_leaf:
                best_g = gini_all
                best_d = d
                best_v = v
    return best_d, best_v, best_g

# Define the tree node class.
class Node:

    def __init__(self, d=None, v=None, g=None, l=None):
        self.dim = d      # index of the splitting feature
        self.value = v    # splitting threshold
        self.gini = g     # total Gini impurity of this node's split
        self.label = l    # class label; a non-None label marks a leaf
        self.children_left = None
        self.children_right = None

    def __repr__(self):
        return "Node(d={},v={},g={},l={})".format(self.dim, self.value, self.gini, self.label)

# Compute the Gini impurity of a label array.
def gini(y):
    counter = Counter(y)
    result = 0
    for v in counter.values():
        result += (v / len(y)) ** 2
    return 1 - result
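To see the classifier in action, here is a minimal usage sketch; the one-dimensional toy dataset is made up for illustration and is not from the original post. Note that create_tree treats a zero-Gini best split as "already pure" and returns None, so the toy labels include one overlapping sample to keep the root split impure.

if __name__ == '__main__':
    # Made-up toy data: class 0 below x=3.5, class 1 above, plus one
    # overlapping class-0 sample at x=6 so no single split is perfect.
    x = np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]])
    y = np.array([0, 0, 0, 1, 1, 0])

    clf = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1).fit(x, y)
    print(clf.tree_)  # root splits on feature 0 at v=3.5
    print(clf.predict(np.array([[2.5], [4.5]])))  # -> [0 1]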
