Principle: naive Bayes applies Bayes' theorem with the simplifying assumption that the words in a document are conditionally independent given its class; a document is assigned to the class with the highest posterior probability. The hand-rolled example below classifies short posts as abusive (1) or not abusive (0).
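For reference, the decision rule the code implements can be written as follows (notation added here, not from the original post):

$$P(c \mid w_1, \dots, w_n) \;\propto\; P(c) \prod_{i=1}^{n} P(w_i \mid c)$$

The class $c \in \{0, 1\}$ with the larger value wins; the implementation sums $\log$ probabilities instead of multiplying raw probabilities to avoid numerical underflow.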
Code:
1. First, build a small dataset
from numpy import *

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
2. Build the vocabulary list
def createVocabList(dataSet):
    vocabSet = set()  # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
3. Word vectors
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec
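A quick check of steps 1 to 3, assuming the three functions above live in the same file:

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
setOfWords2Vec(myVocabList, listOPosts[0])  # a 0/1 vector with one slot per vocabulary word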
4. Write a training function based on Bayes' theorem
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)  # change to ones() for Laplace smoothing
    p0Denom = 2.0; p1Denom = 2.0                    # change to 2.0 for the same reason
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # change to log() to avoid underflow
    p0Vect = log(p0Num / p0Denom)  # change to log() to avoid underflow
    return p0Vect, p1Vect, pAbusive
5. Classification
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
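To tie the five steps together, here is a minimal end-to-end driver; the two test phrases are arbitrary examples, not from the original post:

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))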
Principle: the same naive Bayes idea, but built from scikit-learn's off-the-shelf pieces: CountVectorizer turns the 20 newsgroups posts into bag-of-words counts, and MultinomialNB handles training and prediction.
Code:
1. Load the dataset
import numpy as np
import sklearn
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_train.target_names
2. Bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)  # bag-of-words model
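x_train_counts is a sparse document-term matrix. Two quick sanity checks on it, assuming the fit above ran:

x_train_counts.shape                     # (number of training documents, vocabulary size)
count_vect.vocabulary_.get('algorithm')  # column index assigned to the word 'algorithm'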
3. Train and evaluate
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(x_train_counts, twenty_train.target)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)  # build the test set
x_test_counts = count_vect.transform(twenty_test.data)
predicted = clf.predict(x_test_counts)
np.mean(predicted == twenty_test.target)
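The fitted model can also label new, made-up sentences; the two strings below are only illustrative:

docs_new = ['God is love', 'OpenGL on the GPU is fast']
x_new_counts = count_vect.transform(docs_new)
for doc, category in zip(docs_new, clf.predict(x_new_counts)):
    print('%r => %s' % (doc, twenty_train.target_names[category]))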
Principle: LDA (Latent Dirichlet Allocation) is a topic model that represents each document as a mixture of topics and each topic as a distribution over words. In gensim, Dictionary and doc2bow handle the bag-of-words step and LdaModel trains the topic model.
Code:
1. Read and process the data
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus
lda = LdaModel(common_corpus, num_topics=10)
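Once trained, the model can infer a topic distribution for an unseen document; the word list here is an arbitrary example:

other_text = ['computer', 'time', 'graph']
other_bow = common_dictionary.doc2bow(other_text)
lda[other_bow]  # list of (topic_id, probability) pairs for this document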
2. Convert text into the bag-of-words representation
from gensim.corpora import Dictionary

dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
dct.doc2bow(["this", "is", "máma"])                       # => [(2, 1)]
dct.doc2bow(["this", "is", "máma"], return_missing=True)  # => ([(2, 1)], {'this': 1, 'is': 1})
3. Apply the LDA model
from gensim.models import LdaModel

lda = LdaModel(common_corpus, num_topics=10)
lda.print_topic(1, topn=2)  # => '0.500*"9" + 0.045*"10"'