kNN演算法及其python實現

2021-08-07 20:29:17 字數 4966 閱讀 2570

1、knn演算法原理

k近鄰(k-nearest neighbor)是比較簡單的機器學習演算法,它通過計算向量(特徵值)間的距離衡量相似度來進行分類。它的思想很簡單:如果一個樣本在特徵空間中的k個最近鄰(最相似)的樣本中的大多數屬於某一類別,則該樣本也屬於這個類別。說白了就是一句話:對某個樣本,與它最相似的k個樣本中多數屬於哪一類,它就屬於哪一類。

2、knn的python實現(進行文字分類)

a、準備資料

準備資料,儲存在nbayes_lib.py中

import numpy as np

from numpy import *

def loaddataset():
    """Return the toy training corpus and its class labels.

    Returns:
        A ``(postinglist, classvec)`` tuple. ``postinglist`` is a list of six
        tokenised documents (each a list of lowercase words) and ``classvec``
        holds one integer label (0 or 1) per document, in the same order.
    """
    postinglist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him', 'my'],
        ['stop', 'posting', 'stupid', 'workless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec

class nbayes(object):
    """Bag-of-words text model: term frequencies, class priors and a naive-Bayes predictor.

    Typical use is ``train_set(docs, labels)`` followed by reading ``tf`` /
    ``tdm`` or calling ``map2vocab`` + ``predict``.

    Class labels are used directly as row indices into ``tdm``, so they must
    be the integers ``0 .. n_classes - 1``.
    """

    def __init__(self):
        self.vocabulary = []  # distinct words of the training corpus
        self.idf = 0          # 1 x vocablen document-frequency (or IDF) row
        self.tf = 0           # doclength x vocablen term-frequency matrix
        self.tdm = 0          # n_classes x vocablen P(word | class) matrix
        self.pcates = {}      # class label -> prior probability
        self.labels = []      # class label of every training document
        self.doclength = 0    # number of training documents
        self.vocablen = 0     # number of vocabulary words
        self.testset = 0      # 1 x vocablen count vector built by map2vocab

    def train_set(self, trainset, classvec):
        """Fit the model on tokenised documents and their labels.

        Args:
            trainset: list of documents, each a list of word strings.
            classvec: list with one integer class label per document.
        """
        self.cate_prob(classvec)
        self.doclength = len(trainset)
        # Sorted so the column order is reproducible across interpreter runs
        # (plain set iteration order varies with string-hash randomisation).
        self.vocabulary = sorted({word for doc in trainset for word in doc})
        self.vocablen = len(self.vocabulary)
        self.calc_wordfreq(trainset)
        self.build_tdm()

    def cate_prob(self, classvec):
        """Store the labels and compute the prior probability of each class.

        Args:
            classvec: list of class labels (``list.count`` is used on it).
        """
        self.labels = classvec
        total = float(len(self.labels))
        # Rebuilt from scratch so retraining does not keep stale classes.
        self.pcates = {
            label: float(self.labels.count(label)) / total
            for label in set(self.labels)
        }

    def calc_wordfreq(self, trainset):
        """Fill ``tf`` with raw word counts and ``idf`` with document frequencies.

        Args:
            trainset: the same documents that were passed to ``train_set``.
        """
        self.idf = np.zeros([1, self.vocablen])
        self.tf = np.zeros([self.doclength, self.vocablen])
        for indx in range(self.doclength):
            for word in trainset[indx]:
                self.tf[indx, self.vocabulary.index(word)] += 1
            # Each distinct word counts once per document.
            for signleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(signleword)] += 1

    def build_tdm(self):
        """Build ``tdm``: per-class word counts normalised to sum to 1 per row."""
        self.tdm = np.zeros([len(self.pcates), self.vocablen])
        for indx in range(self.doclength):
            # The label doubles as the row index of its class.
            self.tdm[self.labels[indx]] += self.tf[indx]
        sumlist = self.tdm.sum(axis=1, keepdims=True)
        self.tdm = self.tdm / sumlist

    def map2vocab(self, testdata):
        """Convert a tokenised document into a 1 x vocablen count vector.

        The result is stored in ``self.testset``.

        Args:
            testdata: list of word strings.

        Raises:
            ValueError: if a word is not in the training vocabulary.
        """
        self.testset = np.zeros([1, self.vocablen])
        for word in testdata:
            self.testset[0, self.vocabulary.index(word)] += 1

    def predict(self, testset):
        """Return the class with the highest naive-Bayes score for ``testset``.

        Args:
            testset: 2-D array whose second dimension equals ``vocablen``
                (for example the vector produced by ``map2vocab``).

        Returns:
            The winning class label, or ``""`` when every class scores 0.

        Raises:
            SystemExit: with status 0, after printing a message, when the
                width of ``testset`` does not match the vocabulary size.
        """
        if np.shape(testset)[1] != self.vocablen:
            print('輸入錯誤')
            # Same effect as the interactive-only ``exit(0)`` helper, but does
            # not depend on the ``site`` module having injected that builtin.
            raise SystemExit(0)
        predvalue = 0
        predclass = ""
        for keyclass, prior in self.pcates.items():
            # Look the row up by label: build_tdm indexes rows by label, so
            # this does not rely on dict order matching row order.
            temp = np.sum(testset * self.tdm[keyclass] * prior)
            if temp > predvalue:
                predvalue = temp
                predclass = keyclass
        return predclass

    def calc_tfidf(self, trainset):
        """Replace ``tf`` with TF-IDF weights and ``idf`` with log inverse document frequency.

        Args:
            trainset: the same documents that were passed to ``train_set``.
        """
        self.idf = np.zeros([1, self.vocablen])
        self.tf = np.zeros([self.doclength, self.vocablen])
        for indx in range(self.doclength):
            for word in trainset[indx]:
                self.tf[indx, self.vocabulary.index(word)] += 1
            # Length-normalise; an empty document keeps its all-zero row
            # instead of triggering a 0/0 division.
            if trainset[indx]:
                self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
            for signleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(signleword)] += 1
        self.idf = np.log(float(self.doclength) / self.idf)
        self.tf = np.multiply(self.tf, self.idf)

b、進行分類

import sys

import os

from numpy import *

import numpy as np

import operator

from nbayes_lib import *

import importlib

importlib.reload(sys)

# Number of nearest neighbours consulted by the demo below.
k = 3


def cosdist(v1, v2):
    """Return the cosine of the angle between two vectors.

    Despite the name this is a similarity: larger values mean the vectors
    point in closer directions.

    Args:
        v1: 1-D numeric array.
        v2: 1-D numeric array of the same length.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (the cosine is undefined there and would otherwise be NaN).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

def classify(testdata, traindata, listclass, k):
    """Classify one vector by majority vote among its k most similar training rows.

    Similarity is the cosine computed by ``cosdist``; the k rows with the
    largest cosine (smallest angle) vote, and ties go to the label that was
    encountered first among those neighbours.

    Args:
        testdata: 1-D feature vector to classify.
        traindata: 2-D array with one training sample per row.
        listclass: sequence of labels, one per row of ``traindata``.
        k: number of neighbours to consult; values larger than the number of
            training rows use every row.

    Returns:
        The label with the most votes.

    Raises:
        ValueError: if there are no training rows or ``k`` is less than 1.
    """
    datasetsize = traindata.shape[0]  # number of training samples (rows)
    if datasetsize == 0 or k < 1:
        raise ValueError("classify needs at least one training sample and k >= 1")

    # Cosine similarity between the query and every training sample.
    similarity = np.array(
        [cosdist(testdata, traindata[indx]) for indx in range(datasetsize)],
        dtype=float,
    )
    # Negate so argsort yields the most similar samples first.
    sorteddistindicies = np.argsort(-similarity)

    classcount = {}
    # Take the k entries with the smallest angle as the voters; the slice
    # silently caps k at the number of available samples.
    for indx in sorteddistindicies[:k]:
        voteilabel = listclass[indx]
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1

    # max() returns the first label among equals, matching a stable
    # descending sort by vote count.
    return max(classcount, key=classcount.get)

# Demo: train on the toy corpus, show the term-frequency matrix, then classify
# the fourth training document (index 3) against the whole training set.
dataset, listclasses = loaddataset()
print('資料集是', dataset, sep='\n')
print('label是:', listclasses, sep='\n')

nb = nbayes()
nb.train_set(dataset, listclasses)
print('tf[3]是:', nb.tf[3], sep='\n')
print('tf是', nb.tf, sep='\n')

print(classify(nb.tf[3], nb.tf, listclasses, k))

輸出結果為1,與第4篇文件(索引3)的標籤一致。注意:此處的測試樣本本身取自訓練集,所謂「準確率100%」僅說明程式可以正常運作,並不代表模型的泛化能力。

Python實現KNN演算法

from numpy import import operator def creatdataset group array 1.0,1.1 1.0,1.0 0,0 0,0.1 lables a a b b return group,lables def classify0 inx,dataset,...

python實現knn演算法

importnumpyasnp importoperator defcreatedataset group np.array 1.0 1.1 1.0 1.0 0.0 0.0 0.0 0.1 labels a a b b returngroup,labels 分類演算法 inx待分類的點 defcla...

python實現KNN演算法

具體 如下 import numpy as np import matplotlib.pyplot as plt class myknn def init self,k self.k k def compute self,x train,y train,x test dist i 0 計算歐式距離 ...