機器學習實戰 ch02 1

2021-07-27 06:33:46 字數 3892 閱讀 4819

'''

knn.py

'''
import operator

from numpy import *

def createdataset():
    """Return a tiny labelled sample set for exercising the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array holding one
        2-D point per row and ``labels`` is a list with the class label of
        each row, in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels

def classify0(intx, dataset, labels, k):
    """Classify ``intx`` by majority vote among its ``k`` nearest neighbours.

    Args:
        intx: Input vector to classify. Its length must equal the number of
            columns of ``dataset``.
        dataset: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of row
            ``i`` of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows of ``dataset``
        closest to ``intx`` (Euclidean distance).
    """
    # Euclidean distance from intx to every row. Broadcasting subtracts the
    # vector from each row, so no explicit tile() of intx is needed.
    diffmat = asarray(intx) - dataset
    sqdistances = (diffmat ** 2).sum(axis=1)  # axis=1 sums across each row
    distances = sqdistances ** 0.5

    # argsort() returns the row indices ordered from smallest to largest
    # distance, i.e. nearest neighbour first.
    sorteddistindicies = distances.argsort()

    # Count how often each label appears among the k nearest rows.
    classcount = {}
    for i in range(k):
        voteilabel = labels[sorteddistindicies[i]]
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1

    # Sort the (label, votes) pairs by vote count, highest first.
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedclasscount[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is split on tabs. The first three fields become one
    row of the feature matrix and the last field becomes the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnmat, classlabelvector)`` where ``returnmat`` is an
        ``(n, 3)`` float array and ``classlabelvector`` is a list of ``n``
        labels.
    """
    rows = []
    classlabelvector = []
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()  # drop the trailing newline
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            listfromline = line.split('\t')
            rows.append([float(value) for value in listfromline[0:3]])
            # Labels are parsed as int: the callers in this file format them
            # with %d and use them as a list index.
            classlabelvector.append(int(listfromline[-1]))
    # reshape keeps the 3-column shape even when the file has no data lines.
    returnmat = array(rows, dtype=float).reshape(-1, 3)
    return returnmat, classlabelvector

def autonorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Normalisation formula, applied per column::

        newvalue = (oldvalue - minvalue) / (maxvalue - minvalue)

    Args:
        dataset: 2-D array with one sample per row and one feature per column.

    Returns:
        tuple: ``(normdataset, ranges, minvals)`` where ``normdataset`` has
        the shape of ``dataset``, ``ranges`` is the per-column
        ``max - min`` and ``minvals`` is the per-column minimum.
    """
    minvals = dataset.min(0)  # axis 0: one minimum per column
    maxvals = dataset.max(0)
    ranges = maxvals - minvals
    # A constant column has range 0. Divide by 1 there so it maps to 0
    # instead of producing NaN; the returned ranges are left untouched.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column minimum and divisor to every row.
    normdataset = (dataset - minvals) / divisors
    return normdataset, ranges, minvals

def datingdatatest():
    """Measure the kNN error rate on a hold-out slice of the dating data.

    Loads ``datingtestset2.txt``, normalises it, classifies the first 10% of
    the rows against the remaining 90% with k=3, prints each prediction next
    to the true label, and finally prints the overall error rate.
    """
    horatio = 0.10  # fraction of the rows held out for testing
    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    errorcount = 0.0
    for i in range(numtestvecs):
        # Rows [0, numtestvecs) are the test set; the rest is training data.
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m],
                                     3)
        print("the classifier came back with: %d, the real answer is:%d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    # With fewer than 10 rows the hold-out set is empty; report 0 rather
    # than dividing by zero.
    errorrate = errorcount / numtestvecs if numtestvecs else 0.0
    print("the total error rate is: %f" % errorrate)

def classifyperson():
    """Interactively classify one person with the dating kNN model.

    Prompts for three numeric features on standard input, normalises them
    with the minimum and range of the training data loaded from
    ``datingtestset2.txt``, classifies the result with k=3 and prints the
    matching entry of the result list.
    """
    resultlist = ['not at all', 'in small doses', 'in large doses']
    percenttats = float(input(
        "percentage of time spent playing video games?"))
    ffmiles = float(input(
        "frequent flier miles earned per year?"))
    icecream = float(input(
        "liters of ice cream consumed per year?"))
    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    # NOTE(review): the feature order here must match the column order of
    # the data file; that order is not visible in this file - confirm.
    inarr = array([percenttats, ffmiles, icecream])
    # Scale the input exactly like the training data before classifying.
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat,
                                 datinglabels,
                                 3)
    # Labels are 1-based, so shift by one to index the result list.
    print("you will probably like this person: ",
          resultlist[classifierresult - 1])

《機器學習實戰》筆記(三) Ch3 決策樹

資訊增益 熵 劃分資料集 遞迴構建決策樹 測試演算法 使用決策樹執行分類 使用演算法 決策樹的儲存 例子 使用決策樹預測隱形眼鏡型別 目標 通過決策樹預測患者需要佩戴的隱形眼鏡型別。fr open lenses.txt lenses inst.strip split t for inst in fr.rea...

機器學習實戰

花了一段時間,總算把《機器學習實戰》粗讀了一遍,重點就在這個粗讀上。這本書的確不錯,機器學習的幾個經典演算法都涉及了,每個演算法都有1~2個實際例子進行說明,都有實實在在的程式碼,讓我想起了linus的「talk is cheap, show me the code」那句名言。但多年來養成的習慣,從來都是喜...

ch1機器學習基礎

分類:將例項資料劃分到合適的分類中。回歸:通過給定資料點擬合最優曲線,從而預測數值型資料。以上兩個任務都屬於監督學習,因為這類演算法必須知道預測什麼,即目標變數的分類資訊或目標數值。無監督學習:資料沒有類別資訊,也不會給出目標值。聚類:將資料集分成由類似的物件組成的多個類的過程。密度估計:將尋找描述資料統...