python sklearn實現中文簡訊垃圾分類

2021-10-06 14:18:43 字數 1796 閱讀 5360

資料讀取

import pandas as pd

import jieba

# Load the labelled SMS corpus. The file is GBK-encoded; row 0 is treated as
# the header row and its column names are replaced by id / label / text.
data = pd.read_csv(
    r"e:\資料\實驗data\messages.csv",
    encoding='gbk',
    header=0,
    names=['id', 'label', 'text'],
)

#print(data.head())

簡訊分詞

# Segment every message with jieba and re-join the tokens with spaces so the
# whitespace-oriented scikit-learn vectorizer can split them again later.
# Without this step the 'cut_message' column read below does not exist.
# NOTE(review): assumes every 'text' cell is a string (no missing values) - confirm.
data['cut_message'] = data['text'].apply(lambda text: ' '.join(jieba.cut(text)))

#print(data.head())

# Features: the space-separated segmented messages.
x = data['cut_message'].values

# Targets: the class label of each message.
y = data['label'].values

訓練集、測試集劃分

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 10% of the messages for evaluation (test : train = 1 : 9).
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1)

模型訓練與預測

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Bag-of-words counts learned from the training messages only.
# NOTE: CountVectorizer's default token_pattern keeps tokens of two or more
# word characters, so single-character words produced by segmentation are dropped.
vectorizer = CountVectorizer()
x_train_termcounts = vectorizer.fit_transform(train_x)

# Re-weight the raw counts with TF-IDF, again fitted on the training split only.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_termcounts)

from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Multinomial naive Bayes trained on the TF-IDF features.
classifier = MultinomialNB().fit(x_train_tfidf, train_y)

# Apply the already-fitted vocabulary and IDF weights to the test messages
# (transform, not fit_transform, so nothing is learned from the test split).
x_input_termcounts = vectorizer.transform(test_x)
x_input_tfidf = tfidf_transformer.transform(x_input_termcounts)

# Predict the class of each test message.
predicted_categories = classifier.predict(x_input_tfidf)

準確率、召回率

from sklearn.metrics import accuracy_score, recall_score

# Share of held-out messages whose predicted label matches the true label.
accuracy_s = accuracy_score(y_true=test_y, y_pred=predicted_categories)

# Recall on the held-out messages, using recall_score's default settings.
recall_s = recall_score(y_true=test_y, y_pred=predicted_categories)

混淆矩陣

from sklearn.metrics import confusion_matrix

# Confusion matrix for the held-out messages: rows are true labels and
# columns are predicted labels. The value is not bound to a name, so it is
# only visible when this is run interactively (e.g. in a notebook cell).
confusion_matrix(y_true=test_y, y_pred=predicted_categories)

輸出一部分例項

# Human-readable names for the numeric class labels.
# NOTE(review): the original mapping literal was lost from this listing; this
# assumes labels are 0 = normal message and 1 = spam - confirm against the data.
category_map = {
    0: 'normal',
    1: 'spam',
}

# Show the first ten test messages with their predicted and true classes.
for sentence, category, real in zip(test_x[:10], predicted_categories[:10], test_y[:10]):
    print('\nmessage_content:', sentence, '\npredicted_type:', category_map[category], 'real_values:', category_map[real])

python sklearn庫實現簡單邏輯回歸

import xlrd import matplotlib.pyplot as plt import numpy as np from sklearn import model_selection from sklearn.linear_model import LogisticRegression...

Python sklearn 交叉驗證

from sklearn.datasets import load_boston from sklearn.model_selection import cross_val_score from sklearn.tree import DecisionTreeRegressor boston loa...

Python sklearn 中的SVM示例

coding utf 8 import pandas as pd from numpy.random import shuffle from sklearn import svm import joblib from sklearn import metrics inputfile data mom...