機器學習實戰 ch02 1

'''
knn.py
'''
import operator

from numpy import *
def createdataset():
    '''Return a tiny hand-made training set for trying out the classifier.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 NumPy array with one
        sample per row, and ``labels`` is a list holding the class label of
        each row, in the same order.
    '''
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels
def classify0(intx, dataset, labels, k):
    '''Classify ``intx`` by majority vote among its ``k`` nearest neighbours.

    Args:
        intx: Input vector to classify; it needs one entry per column of
            ``dataset``.
        dataset: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of
            row ``i`` of ``dataset``.
        k: Number of nearest neighbours that take part in the vote. Values
            larger than the number of samples simply use every sample.

    Returns:
        The label that received the most votes. On Python 3.7+ a tie goes
        to the label that was met first, i.e. the one of the nearer sample.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    '''
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Euclidean distance between intx and every training row: repeat intx
    # once per row with tile(), subtract, square, sum per row, square root.
    diffmat = tile(intx, (dataset.shape[0], 1)) - dataset
    sqdistances = (diffmat ** 2).sum(axis=1)
    distances = sqdistances ** 0.5

    # argsort() yields the row indices ordered from nearest to farthest.
    sorteddistindicies = distances.argsort()

    # Count how often each label occurs among the k nearest rows. Slicing
    # (instead of range(k)) cannot run past the end of the index array.
    classcount = {}
    for neighbour in sorteddistindicies[:k]:
        voteilabel = labels[neighbour]
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1

    # Order the (label, votes) pairs by vote count, highest first.
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedclasscount[0][0]
def file2matrix(filename):
    '''Load a tab-separated data file into a feature matrix and a label list.

    Every non-empty line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnmat, classlabelvector)`` tuple: ``returnmat`` is an
        ``(n, 3)`` float array with the first three columns of each line,
        and ``classlabelvector`` is a list with the integer label of each
        line, in file order.
    '''
    # Read everything up front so the matrix can be sized before filling;
    # the with-block closes the file even if parsing fails later.
    with open(filename) as fr:
        arrayoflines = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline) carry no sample.
    arrayoflines = [line for line in arrayoflines if line]

    returnmat = zeros((len(arrayoflines), 3))  # three feature columns
    classlabelvector = []
    for index, line in enumerate(arrayoflines):
        listfromline = line.split('\t')
        returnmat[index, :] = [float(value) for value in listfromline[0:3]]
        # The last column is the class label of the sample.
        classlabelvector.append(int(listfromline[-1]))
    return returnmat, classlabelvector
def autonorm(dataset):
    '''Rescale every column of ``dataset`` to the range [0, 1].

    Normalisation formula, applied column by column:
        newvalue = (oldvalue - minvalue) / (maxvalue - minvalue)

    Args:
        dataset: 2-D NumPy array with one sample per row.

    Returns:
        A ``(normdataset, ranges, minvals)`` tuple: the normalised copy of
        ``dataset``, the per-column ``max - min`` and the per-column
        minimum. The last two let a caller normalise new samples the same
        way.
    '''
    minvals = dataset.min(0)
    maxvals = dataset.max(0)
    ranges = maxvals - minvals
    # A constant column has range 0; divide it by 1 instead so it becomes
    # all zeros rather than NaN. The returned ranges stay untouched.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normdataset = (dataset - minvals) / divisors
    return normdataset, ranges, minvals
def datingdatatest(filename='datingtestset2.txt', horatio=0.10):
    '''Measure the classifier's error rate on a held-out part of the data.

    The first ``horatio`` fraction of the rows is used as test vectors and
    the remaining rows as the training set. Every prediction is printed
    next to the true label, followed by the overall error rate.

    Args:
        filename: Data file handed to ``file2matrix``.
        horatio: Fraction of the rows (taken from the start of the file)
            that is held out for testing.
    '''
    datingdatamat, datinglabels = file2matrix(filename)
    normmat, ranges, minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    if numtestvecs == 0:
        # Nothing to test; avoids a division by zero in the error rate.
        print("no test vectors: the data set is too small for the hold-out ratio")
        return

    errorcount = 0.0
    for i in range(numtestvecs):
        # Rows [numtestvecs, m) are the training set; row i is the query.
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m],
                                     3)
        print("the classifier came back with: %d, the real answer is:%d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    print("the total error rate is: %f" % (errorcount / float(numtestvecs)))
def classifyperson(filename='datingtestset2.txt'):
    '''Ask for one person's three features and print the predicted class.

    The answers are normalised with the ranges and minimums of the training
    data, classified with the 3 nearest neighbours and reported as one of
    the texts in ``resultlist``.

    Args:
        filename: Data file handed to ``file2matrix`` to build the training
            set.
    '''
    # raw_input only exists on Python 2; Python 3 calls it input.
    try:
        ask = raw_input
    except NameError:
        ask = input

    # Index 0..2 corresponds to class labels 1..3 (see the lookup below).
    resultlist = ['not at all', 'in small doses', 'in large doses']
    percenttats = float(ask("percentage of time spent playing video games?"))
    ffmiles = float(ask("frequent flier miles earned per year?"))
    icecream = float(ask("liters of ice cream consumed per year?"))

    datingdatamat, datinglabels = file2matrix(filename)
    normmat, ranges, minvals = autonorm(datingdatamat)
    # NOTE(review): the order of the entries must match the column order of
    # the data file, which is not visible from here -- confirm that the
    # file really stores game time, flier miles, ice cream in that order.
    inarr = array([percenttats, ffmiles, icecream])
    # Normalise the query exactly like the training data before classifying.
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat,
                                 datinglabels,
                                 3)
    print("you will probably like this person: ",
          resultlist[classifierresult - 1])

《機器學習實戰》筆記（三） Ch3 決策樹

資訊增益、熵、劃分資料集、遞迴構建決策樹；測試演算法：使用決策樹執行分類；使用演算法：決策樹的儲存。例子：使用決策樹預測隱形眼鏡型別。目標：通過決策樹預測患者需要佩戴的隱形眼鏡型別。fr = open('lenses.txt')；lenses = [inst.strip().split('\t') for inst in fr.rea...

機器學習實戰

花了一段時間，總算把《機器學習實戰》粗讀了一遍，重點就在這個「粗讀」上。這本書的確不錯，機器學習的幾個經典演算法都涉及了，每個演算法都有 1～2 個實際例子進行說明，都有實實在在的程式碼，讓我想起了 Linus 的「Talk is cheap. Show me the code.」那句名言。但多年來養成的習慣，從來都是喜...

ch1機器學習基礎

分類：將例項資料劃分到合適的分類中。回歸：通過給定資料點擬合最優曲線，從而預測數值型資料。以上兩個任務都屬於監督學習，因為這類演算法必須知道預測什麼，即目標變數的分類資訊或目標數值。無監督學習：資料沒有類別資訊，也不會給出目標值。聚類：將資料集分成由類似的物件組成的多個類的過程。密度估計：將尋找描述資料統...

機器學習實戰 ch02 1

《機器學習實戰》筆記（三） Ch3 決策樹

機器學習實戰

ch1機器學習基礎

相關推薦