機器學習之K近鄰演算法 kNN 1

2021-08-22 17:52:47 字數 4730 閱讀 5612

可以說 kNN 是機器學習中非常特殊的、沒有模型的演算法;為了和其他演算法統一,可以認為訓練資料集就是模型本身。

from collections import Counter
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np

# Feature vectors of the training samples (two features per sample).
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]

# Class label of each training sample, in the same order as raw_data_x.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Training set as NumPy arrays so boolean masks can be used for plotting.
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# A new sample arrives; we want to decide which class it belongs to.
new = np.array([8.093607318, 3.365731514])

# Training data: class 0 in green, class 1 in red.
plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g')
plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r')

# The new sample in blue.
plt.scatter(new[0], new[1], color='b')

# Uncomment to display the figure:
# plt.show()

# From the plot, the new sample clearly falls into class 1.

# --- The kNN procedure ---

# Euclidean distance from the new sample to every training sample.
# np.sum((x - new) ** 2) is equivalent to
# (x[0] - new[0]) ** 2 + (x[1] - new[1]) ** 2
distances = []
for x in x_train:
    d = sqrt(np.sum((x - new) ** 2))
    distances.append(d)

# The same computation as a one-liner:
# distances = [sqrt(np.sum((x - new) ** 2)) for x in x_train]

# Indices of the training samples, ordered from nearest to farthest.
nearest = np.argsort(distances)

k = 6

# Labels of the k nearest training samples.
topk_y = [y_train[i] for i in nearest[:k]]

# Voting: count how often each label occurs among the k neighbours.
votes = Counter(topk_y)

# Predicted label: the most frequent one among the neighbours.
predict_y = votes.most_common(1)[0][0]

print(predict_y)

很容易把上述的過程整理成一個函式:

from collections import Counter
from math import sqrt

import numpy as np


def knn_classify(k, x_train, y_train, new):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        k: Number of neighbours that take part in the vote; must satisfy
            1 <= k <= number of training samples.
        x_train: 2-D array of training features, one row per sample.
        y_train: 1-D array of training labels, one entry per row of x_train.
        new: 1-D feature vector of the sample to classify; its length must
            equal the number of columns of x_train.

    Returns:
        The label that occurs most often among the k training samples with
        the smallest Euclidean distance to ``new``.

    Raises:
        AssertionError: If any of the argument checks below fails.
    """
    # Argument validation (kept as assertions so callers that rely on
    # AssertionError keep working; note that ``python -O`` strips them).
    assert 1 <= k <= x_train.shape[0], "k must be valid "
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must equal to the size of y_train"
    assert x_train.shape[1] == new.shape[0], \
        "the feature number of x must be equal to x_train"

    # Euclidean distance from ``new`` to every training sample.
    distance = [sqrt(np.sum((x - new) ** 2)) for x in x_train]

    # Training-sample indices ordered from nearest to farthest.
    nearest = np.argsort(distance)
    topk_y = [y_train[i] for i in nearest[:k]]

    # Voting: the most frequent label among the k neighbours wins.
    votes = Counter(topk_y)
    return votes.most_common(1)[0][0]

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Feature vectors of the training samples (two features per sample).
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]

# Class label of each training sample, in the same order as raw_data_x.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Training set.
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)

# The sample to classify, wrapped in an outer list so it forms a 2-D array
# with a single row, which is what ``predict`` is given below.
new = np.array([[8.093607318, 3.365731514]])

# Same k as in the hand-written version.
knn_classifier = KNeighborsClassifier(n_neighbors=6)
knn_classifier.fit(x_train, y_train)

print(knn_classifier.predict(new))

from collections import Counter
from math import sqrt

import numpy as np


class KNNClassifier:
    """A minimal k-nearest-neighbours classifier with a fit/predict API.

    ``fit`` only stores the training data; all distance computation
    happens in ``predict``.
    """

    def __init__(self, k):
        """Initialise the kNN classifier.

        Args:
            k: Number of neighbours that take part in the vote; must be >= 1.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert 1 <= k, "k must be valid "
        self.k = k
        # Both stay None until fit() is called; predict() checks for that.
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Store the training set x_train / y_train in the classifier.

        Args:
            x_train: 2-D array of training features, one row per sample.
            y_train: 1-D array of labels, one entry per row of x_train.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the sizes disagree or there are fewer
                training samples than ``k``.
        """
        assert x_train.shape[0] == y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"
        assert self.k <= x_train.shape[0], \
            "the size of x_train must be at least k"

        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, new):
        """Predict a label for every row of ``new``.

        Args:
            new: 2-D array of samples to classify, one row per sample, with
                the same number of columns as the training features.

        Returns:
            A NumPy array holding one predicted label per row of ``new``.

        Raises:
            AssertionError: If called before ``fit`` or if the number of
                features does not match the training data.
        """
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert new.shape[1] == self._x_train.shape[1], \
            "the feature number of new must be equal to x_train"

        y_predict = [self._predict(x) for x in new]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample ``x`` (1-D feature vector).

        Returns:
            The most frequent label among the ``k`` training samples with
            the smallest Euclidean distance to ``x``.
        """
        assert x.shape[0] == self._x_train.shape[1], \
            "the feature number of x must be equal to x_train"

        # Euclidean distance from x to every training sample.
        distance = [sqrt(np.sum((i - x) ** 2)) for i in self._x_train]

        # Training-sample indices ordered from nearest to farthest.
        nearest = np.argsort(distance)
        topk_y = [self._y_train[i] for i in nearest[:self.k]]

        # Voting: the most frequent label among the k neighbours wins.
        votes = Counter(topk_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return f"{type(self).__name__}(k={self.k})"


# Backward-compatible alias for the all-lowercase spelling of the class name.
knnclassifier = KNNClassifier

機器學習 kNN(1)

kNN(k-nearest neighbor)工作原理:存在一個樣本資料集合,也稱為訓練樣本集,並且樣本集中每個資料都存在標籤,即我們知道樣本集中每一資料與所屬分類對應的關係。輸入沒有標籤的資料後,將新資料中的每個特徵與樣本集中資料對應的特徵進行比較,提取出樣本集中特徵最相似資料(最近鄰)的分類標籤...

機器學習之 K近鄰演算法(KNN)

k近鄰演算法是機器學習中最基礎的演算法之一,也是必須會講解的演算法之一,因為其簡單、易懂。但是k近鄰演算法在實際應用中卻很少使用到,由於其侷限性很強,執行效率低下。那麼所謂的k近鄰中的近鄰是指:已有一個訓練資料集,現給定一個新的資料,利用現有資料集對其進行預測,那麼預測方法便是尋找與給定資料距離最近的k...

機器學習 k 近鄰 kNN 演算法

一、基本原理:存在一個樣本資料集合(也稱訓練樣本集),並且樣本集中每個資料都存在標籤。輸入沒有標籤的新資料後,將新資料的每個特徵與樣本集中資料對應的特徵進行比較,然後演算法提取樣本集中特徵最相似資料(最近鄰)的分類標籤。我們一般只選擇樣本集中前k(k通常是不大於20的整數)個最相似的資料,最後選擇k個...