python使用KNN文字分類

2022-07-22 07:33:10 字數 3557 閱讀 3410

上次爬取的爸爸、媽媽、老師和自己的作文,利用sklearn.neighbors.KNeighborsClassifier進行分類。

import itertools
import os

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Read the file contents.

# Root directory of the essay corpus; each sub-directory holds one category.
# A raw string keeps the backslash literal instead of an invalid escape.
path = r'e:\作文'


def load_corpus(root_dir):
    """Collect every file under *root_dir* into a DataFrame.

    Each row holds the file path, the file text with surrounding whitespace
    stripped, and the name of the directory that directly contains the file
    (used as the class label).

    Args:
        root_dir: Directory that is walked recursively.

    Returns:
        A DataFrame with the columns ``filepath``, ``text`` and ``kind``;
        it is empty when no file is found under *root_dir*.
    """
    rows = []
    for root, _dirs, files in os.walk(root_dir):
        # The containing directory name is the label for all of its files.
        kind = os.path.basename(root)
        for name in files:
            filepath = os.path.join(root, name)
            # The context manager closes the handle even if reading fails.
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            rows.append([filepath, text.strip(), kind])
    # Building the frame once avoids growing it row by row.
    return pd.DataFrame(rows, columns=['filepath', 'text', 'kind'])


corpos = load_corpus(path)

# Set up the stop words and the tokenizer used to build the term-frequency
# matrix.
def load_stopwords(filepath):
    """Read a one-entry-per-line stop-word file into a one-column DataFrame.

    The first non-blank line is used as the column name and every following
    non-blank line becomes one row. The file is read directly because current
    pandas versions reject ``sep='\\n'`` in ``read_csv``.

    Args:
        filepath: Path of the UTF-8 encoded stop-word file.

    Returns:
        A DataFrame with a single column named after the header line. An
        empty file yields an empty ``stopword`` column.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\r\n') for line in f]
    lines = [line for line in lines if line]
    if not lines:
        return pd.DataFrame({'stopword': []})
    # Drop a possible byte-order mark so the header matches the plain name.
    header = lines[0].lstrip('\ufeff')
    return pd.DataFrame({header: lines[1:]})


stopwords = load_stopwords(r'stopwords.txt')


def tokenizer(s):
    """Segment the string *s* with jieba and return the tokens as a list."""
    return list(jieba.cut(s))

# Bag-of-words vectorizer: jieba does the segmentation and the loaded stop
# words are removed from the vocabulary.
count = CountVectorizer(tokenizer=tokenizer,
                        stop_words=list(stopwords['stopword']))

# Dense document-term matrix, one row per document in ``corpos``.
countvector = count.fit_transform(corpos['text']).toarray()

# Convert the category names to numbers.
# ``kind`` holds the sorted unique category names; ``kind_codes`` gives, for
# every document, the position of its category inside ``kind``.
kind, kind_codes = np.unique(corpos['kind'].values, return_inverse=True)

# Numeric labels start at 1 and are stored as floats; the array length
# follows the corpus size instead of a fixed 700.
nkind = kind_codes + 1.0

# Project the term-frequency matrix onto two dimensions and draw a scatter
# plot with one colour/marker pair per category.
pca = PCA(n_components=2)
newvector = pca.fit_transform(countvector)

plt.figure()
colors = ['r', 'b', 'g', 'y']
markers = ['o', '^', '>', '<']
# zip stops at the shortest sequence, so at most four categories are drawn.
for label, c, m in zip(kind, colors, markers):
    # A boolean mask selects rows by position, independent of index labels.
    mask = (corpos['kind'] == label).values
    plt.scatter(newvector[mask, 0], newvector[mask, 1],
                c=c, marker=m, label=label)
plt.legend()
plt.xlim(-5, 10)
plt.ylim(-20, 50)
plt.xlabel('x label')
plt.ylabel('y label')

# Randomly pick the test set.
# A permutation yields distinct row positions, and the rows not chosen for
# testing form the training set, so no test document is seen during fitting.
n_samples = len(corpos)
test_size = min(200, n_samples // 2)
shuffled = np.random.permutation(n_samples)
index = shuffled[:test_size]
train_index = shuffled[test_size:]

x_test = countvector[index]
y_test = corpos.iloc[index, 2]
x_train = countvector[train_index]
y_train = corpos.iloc[train_index, 2]

# Classify with KNN (scikit-learn default settings).
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Mean accuracy on the held-out documents; printed so a script run shows it.
accuracy = knn.score(x_test, y_test)
print(accuracy)

# Draw the confusion matrix of the KNN classification result.
# Passing ``labels=kind`` fixes the row/column order to the tick labels below.
knn_confusion = confusion_matrix(y_test, y_pred, labels=kind)
# Example result from the original run:
# array([[61,  1,  0,  3],
#        [ 8, 35,  0,  1],
#        [ 1,  0, 53,  1],
#        [ 9,  1,  2, 24]])

# Open a new figure so the heat map is not drawn over the scatter plot.
plt.figure()
plt.imshow(knn_confusion, interpolation='nearest', cmap=plt.cm.Oranges)
plt.xlabel('y_pred')
plt.ylabel('y_true')
tick_marks = np.arange(len(kind))
plt.xticks(tick_marks, kind, rotation=90)
plt.yticks(tick_marks, kind)
plt.colorbar()
plt.title('confustion_matrix')

# Write each count into its cell: x is the predicted column, y the true row.
n_classes = len(knn_confusion)
for row, col in itertools.product(range(n_classes), repeat=2):
    plt.text(col, row, knn_confusion[row, col],
             horizontalalignment="center")

# Display all figures when the code runs as a plain script.
plt.show()

資料散點圖如下所示:

knn分類結果的混淆矩陣圖如下所示:

機器學習1 KNN文字分類

思想 1.找到與資料最相近k個資料 根據余弦相似度 2.分別找出k條資料的類別,同類別相加,得到最大值,則該類別為測試資料的所屬類。encoding utf 8 from pylab import reload sys defcreatedataset group 1.0,1.1 2.0,2.1 1...

使用python進行文字分類

coding utf 8 author lishuai importnumpy defloaddataset postinglist my dog has flea problems help please maybe not take him to dog park stupid my dalma...

fastText文字分類(python)

fasttext是word2vec作者提出的文字分類演算法。它是一個用於高效學習單詞表示和文字分類的庫。本篇部落格主要介紹fasttext在python下的基本應用 pip install fasttext訓練樣本train data.txt的格式介紹 每一行是文字 分類標籤 分類標籤最好形如 la...