機器學習 貝葉斯

2021-09-16 13:17:15 字數 4960 閱讀 5893

bayes.py包含了所有函式的實現,需要做的是,明白各個函式的功能作用及輸入輸出,在指令碼中完成函式的呼叫,給出要求的格式的結果。

from numpy import *

import csv

import random

random.seed(21860251)

def loaddataset():
    """Return a small hard-coded corpus for the naive-Bayes demo.

    Returns:
        (posts, labels): `posts` is a list of 6 tokenized documents
        (lists of lowercase words); `labels[i]` is 1 if posts[i] is
        abusive, 0 otherwise.
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # 1 = abusive, 0 = not abusive
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels

def createvocablist(dataset):
    """Build the vocabulary of a corpus.

    Args:
        dataset: iterable of documents, each a list of word strings.

    Returns:
        A list of the unique words across all documents (order is
        arbitrary, as it comes from a set).
    """
    # Union of all documents' word sets in one call.
    return list(set().union(*dataset))

def setofwords2vec(vocablist, inputset):
    """Convert a document into a binary (set-of-words) vector.

    Args:
        vocablist: list of vocabulary words; output positions follow it.
        inputset: iterable of words from one document.

    Returns:
        A list of 0/1 flags, one per vocabulary word: 1 if the word
        occurs in the document. Unknown words are reported and skipped.
    """
    flags = [0] * len(vocablist)
    for token in inputset:
        if token not in vocablist:
            print("the word: %s is not in my vocabulary!" % token)
            continue
        flags[vocablist.index(token)] = 1
    return flags

def trainnb0(trainmatrix, traincategory):
    """Estimate naive-Bayes parameters from a binary document matrix.

    Args:
        trainmatrix: 2-D array, one word-count/flag vector per document.
        traincategory: 1-D array of 0/1 class labels, one per document.

    Returns:
        (p0vect, p1vect, pabusive): per-word log conditional
        probabilities for class 0 and class 1, and the prior P(class=1).
        Laplace smoothing (counts start at 1, denominators at 2) avoids
        zero probabilities; log avoids floating-point underflow.
    """
    n_docs = len(trainmatrix)
    n_words = len(trainmatrix[0])
    p_abusive = sum(traincategory) / float(n_docs)
    # Smoothed word counts and totals, keyed by class label.
    word_counts = {0: ones(n_words), 1: ones(n_words)}
    totals = {0: 2.0, 1: 2.0}
    for doc, label in zip(trainmatrix, traincategory):
        word_counts[label] += doc
        totals[label] += sum(doc)
    p0vect = log(word_counts[0] / totals[0])
    p1vect = log(word_counts[1] / totals[1])
    return p0vect, p1vect, p_abusive

def classifynb(vec2classify, p0vec, p1vec, pclass1):
    """Classify one document vector with trained naive-Bayes parameters.

    Args:
        vec2classify: word vector of the document to classify.
        p0vec, p1vec: per-word log probabilities for classes 0 and 1.
        pclass1: prior probability of class 1.

    Returns:
        1 if the class-1 log posterior is larger, else 0.
    """
    # Element-wise product picks out the log-probs of present words.
    score1 = sum(vec2classify * p1vec) + log(pclass1)
    score0 = sum(vec2classify * p0vec) + log(1.0 - pclass1)
    return 1 if score1 > score0 else 0

def bagofwords2vecmn(vocablist, inputset):
    """Convert a document into a bag-of-words count vector.

    Unlike setofwords2vec, repeated occurrences increment the count;
    words outside the vocabulary are silently ignored.

    Args:
        vocablist: list of vocabulary words; output positions follow it.
        inputset: iterable of words from one document.

    Returns:
        A list of per-word occurrence counts aligned with vocablist.
    """
    counts = [0] * len(vocablist)
    for token in inputset:
        if token not in vocablist:
            continue
        counts[vocablist.index(token)] += 1
    return counts

def testingnb():
    """Smoke-test the naive-Bayes pipeline on the toy posting corpus.

    Trains on all six posts from loaddataset() and prints the predicted
    class for two hand-picked test sentences.

    Bug fixed: the original had a truncated `trainmat =` statement and
    the document loop lost its body, so trainnb0 was (re)invoked inside
    the loop on an unbuilt matrix. The matrix is now built first, then
    the model is trained once.
    """
    listoposts, listclasses = loaddataset()
    myvocablist = createvocablist(listoposts)
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))
    p0v, p1v, pab = trainnb0(array(trainmat), array(listclasses))
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, 'classified as: ', classifynb(thisdoc, p0v, p1v, pab))
    testentry = ['stupid', 'garbage']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, 'classified as: ', classifynb(thisdoc, p0v, p1v, pab))

def textparse(bigstring):  # input is big string, output is word list
    """Tokenize a raw text blob into lowercase words longer than 2 chars.

    Args:
        bigstring: the full text of one email/document.

    Returns:
        List of lowercase tokens with len > 2 (drops '', 'a', 'is', ...).

    Bug fixed: the original split on r'\w*' (WORD characters), which
    destroys the tokens themselves and, with zero-width matches, yields
    garbage. Splitting on r'\W+' (runs of non-word characters) is the
    intended delimiter.
    """
    import re
    listoftokens = re.split(r'\W+', bigstring)
    return [tok.lower() for tok in listoftokens if len(tok) > 2]

def spamtest():
    """Train and evaluate a spam classifier with hold-out cross validation.

    Reads 25 spam emails from 'email/spam/<i>.txt' and 25 ham emails
    from 'email/ham/<i>.txt', randomly holds out 10 documents as the
    test set, trains naive Bayes on the remaining 40, then writes each
    test prediction to 'test_result.csv' and prints the error rate.

    Bugs fixed: every list initializer in the original was truncated
    (`doclist =`, `testset =`, ...) and the loops had lost their append
    statements, so no documents, labels, hold-out indices, or training
    rows were ever collected. Files are now also closed via `with`.
    """
    doclist = []
    classlist = []
    fulltext = []
    for i in range(1, 26):
        # spam documents are labelled 1, ham documents 0
        with open('email/spam/%d.txt' % i) as fh:
            wordlist = textparse(fh.read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        with open('email/ham/%d.txt' % i) as fh:
            wordlist = textparse(fh.read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
    vocablist = createvocablist(doclist)  # create vocabulary
    trainingset = list(range(50))
    testset = []  # randomly build the hold-out test set
    for i in range(10):
        randindex = int(random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    trainmat = []
    trainclasses = []
    for docindex in trainingset:  # collect training rows/labels
        trainmat.append(bagofwords2vecmn(vocablist, doclist[docindex]))
        trainclasses.append(classlist[docindex])
    # train the classifier (get probs) once, on the full training matrix
    p0v, p1v, pspam = trainnb0(array(trainmat), array(trainclasses))
    errorcount = 0
    with open('test_result.csv', 'w', newline='') as out:
        csv_write = csv.writer(out, dialect='excel')
        csv_write.writerow(['testset', 'predict_value', 'truth'])
        for docindex in testset:  # classify the remaining items
            wordvector = bagofwords2vecmn(vocablist, doclist[docindex])
            predicted = classifynb(array(wordvector), p0v, p1v, pspam)
            if predicted != classlist[docindex]:
                errorcount += 1
                print("classification error", doclist[docindex])
            csv_write.writerow([docindex, predicted, classlist[docindex]])
    print('the error rate is: ', float(errorcount) / len(testset))
    # return vocablist, fulltext

Sample contents of test_result.csv (columns: testset = held-out document
index, predict_value = predicted label, truth = true label). NOTE: the
rows below were garbled during text extraction — the column boundaries
were lost — so they are reproduced as captured:

testset  predict_value  truth
90
03411
31031
10101
14211
181121
00361
11411

機器學習 樸素貝葉斯

樸素貝葉斯原理 1.貝葉斯公式 2.樸素貝葉斯的模型 3.後驗概率最大化的含義 4.樸素貝葉斯的引數估計 4.1.特徵是離散值 假設符合多項式分布 4.2.特徵是稀疏的離散值 假設符合伯努利分布 4.3.特徵是連續值 假設符合正態分佈 5.樸素貝葉斯演算法過程 6.樸素貝葉斯演算法小結 scikit...

機器學習之貝葉斯

scikit learn 樸素貝葉斯類庫使用小結 demo 貝葉斯定理是18世紀英國數學家托馬斯 貝葉斯 thomas bayes 提出得重要概率論理論。以下摘一段 wikipedia 上的簡介 所謂的貝葉斯定理源於他生前為解決乙個 逆概 問題寫的一篇文章,而這篇文章是在他死後才由他的一位朋友發表出...

機器學習 樸素貝葉斯 例子

一 學習樸素貝葉斯之前先了解一下 條件概率下 貝葉斯公式 1 舉例是兩個工具機m1和m2 用例生產扳手機器 生產扳手 個數 mach1 30mach2 20不合格佔1 defective 機器 次品分布比例 mach1 50 mach2 50 2 問題 mach2生產的不合格產品的概率是多少?p m...