# Machine Learning in Action (機器學習實戰), ch02, part 1: k-nearest-neighbour classifier

import operator

import numpy as np
from numpy import *  # kept for existing code; new code uses the explicit np namespace

def createdataset():
    """Build the four-point toy data set used to try out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points (one per row) and ``labels`` is a list holding the
        class label of each row.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels

def classify0(intx, dataset, labels, k):
    """Classify ``intx`` by majority vote among its k nearest neighbours.

    Args:
        intx: Input vector; its length must equal the number of columns of
            ``dataset``.
        dataset: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of row
            ``i`` of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.
        When several labels share the top count, the one whose first voter
        is closest to ``intx`` is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataset``.
    """
    dataset = np.asarray(dataset, dtype=float)
    datasetsize = dataset.shape[0]
    if not 1 <= k <= datasetsize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (datasetsize, k))

    # Euclidean distance from intx to every row; broadcasting replaces the
    # explicit tile() of the input vector.
    diffmat = dataset - np.asarray(intx, dtype=float)
    distances = (diffmat ** 2).sum(axis=1) ** 0.5

    # argsort returns the row indices ordered from nearest to farthest.
    nearest_labels = [labels[idx] for idx in distances.argsort()[:k]]

    # Count how often each label occurs among the k nearest rows.
    classcount = {}
    for voteilabel in nearest_labels:
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1

    # max() keeps the first maximal element, so iterating the labels in
    # nearest-first order makes the tie-break deterministic.
    return max(nearest_labels, key=classcount.get)

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must hold at least three numeric columns; the first
    three become one row of the returned matrix and the last column is read
    as the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnmat, classlabelvector)`` where ``returnmat`` is an
        ``(n, 3)`` float array and ``classlabelvector`` is a list of ``n``
        integer labels, in file order.

    Raises:
        ValueError: If a feature or label column cannot be converted to a
            number, or a line has fewer than three columns.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Skip blank lines so a trailing newline does not produce a bogus row.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnmat = np.zeros((len(rows), 3))  # one row per sample, 3 features
    classlabelvector = []
    for index, listfromline in enumerate(rows):
        returnmat[index, :] = [float(value) for value in listfromline[0:3]]
        # NOTE(review): the label list was never filled in the original; an
        # integer label in the last column is assumed -- confirm against the
        # data file.
        classlabelvector.append(int(listfromline[-1]))
    return returnmat, classlabelvector

def autonorm(dataset):
    """Min-max normalise every column of ``dataset`` into the range [0, 1].

    Applies ``newvalue = (oldvalue - minvalue) / (maxvalue - minvalue)``
    column by column.

    Args:
        dataset: 2-D array-like of samples, one sample per row.

    Returns:
        tuple: ``(normdataset, ranges, minvals)`` where ``normdataset`` has
        the same shape as ``dataset``, ``ranges`` is the per-column
        ``max - min`` and ``minvals`` is the per-column minimum.
    """
    dataset = np.asarray(dataset, dtype=float)
    minvals = dataset.min(0)
    maxvals = dataset.max(0)
    ranges = maxvals - minvals

    # A constant column has range 0; divide by 1 instead so it maps to 0.0
    # rather than NaN. The returned ``ranges`` is left untouched.
    safe_ranges = np.where(ranges == 0, 1.0, ranges)

    # Subtract the minimum from the data itself (not from a zero matrix);
    # broadcasting applies the per-column vectors to every row.
    normdataset = (dataset - minvals) / safe_ranges
    return normdataset, ranges, minvals


def datingdatatest():
    """Estimate the classifier's error rate on the dating data set.

    Loads ``datingtestset2.txt``, normalises it, holds out the first 10% of
    the rows as test vectors and classifies each of them against the
    remaining rows with k = 3. Prints every prediction next to the real
    label, followed by the overall error rate.
    """
    horatio = 0.10  # fraction of the rows held out for testing
    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, _ranges, _minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    errorcount = 0.0
    for i in range(numtestvecs):
        # NOTE(review): the arguments of this call were lost in the original
        # text; training on the rows after the held-out slice is
        # reconstructed from the variables in scope -- confirm.
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m],
                                     3)
        print("the classifier came back with: %d, the real answer is:%d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    print("the total error rate is: %f" % (errorcount / float(numtestvecs)))


def classifyperson():
    """Interactively classify one person from three typed-in features.

    Prompts for the three feature values, normalises them with the minimum
    and range of ``datingtestset2.txt``, classifies the result with k = 3
    and prints the matching entry of the result list.
    """
    resultlist = ['not at all', 'in small doses', 'in large doses']

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    percenttats = float(read_line(
        "percentage of time spent playing video games?"))
    ffmiles = float(read_line(
        "frequent flier miles earned per year?"))
    icecream = float(read_line(
        "liters of ice cream consumed per year?"))

    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    inarr = np.array([percenttats, ffmiles, icecream])

    # NOTE(review): the tail of this call and of the print below were lost
    # in the original text; they are reconstructed from the variables in
    # scope. Labels are assumed to be 1-based indices into resultlist --
    # confirm against the data file.
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat, datinglabels, 3)
    print("you will probably like this person: ",
          resultlist[classifierresult - 1])


