樸素貝葉斯實現的文字分類

2021-08-08 19:55:40 字數 4492 閱讀 6918

參考文章: 樸素貝葉斯實現的文字分類原理

# coding=utf-8

'''created on 2017

@author: xyj

'''
import jieba

import os

import random

import math

def textprocessing(floder_path, train_size=0.8, encoding='ansi', tokenizer=None):
    """Load a one-sub-folder-per-class corpus and split it into train/test data.

    Every entry of ``floder_path`` is treated as a class folder and every
    entry inside it as one document. Each document is read as bytes, decoded
    (undecodable bytes are dropped), tokenised, and stripped of whitespace /
    NUL tokens. Documents of a class are shuffled with the global ``random``
    state and the first ``int(n * train_size)`` of them become training data.

    Args:
        floder_path: Directory that contains one sub-directory per class.
        train_size: Fraction of each class's documents used for training.
        encoding: Codec used to decode the raw bytes. The default ``'ansi'``
            is kept for backward compatibility; it is an alias of the
            Windows-only ``mbcs`` codec, so pass an explicit codec
            (NOTE(review): presumably ``'gbk'`` for this corpus - confirm)
            on other platforms.
        tokenizer: Callable taking the decoded text and returning an iterable
            of tokens. Defaults to ``jieba.cut(raw, cut_all=False)``.

    Returns:
        A 4-tuple ``(train_data_list, test_data_list, train_class_list,
        test_class_list)``. ``train_data_list[i]`` is the flat token list of
        all training documents of class ``i``; ``test_data_list[i]`` is a
        list of per-document token lists; both class lists hold the class
        folder names in the same order.
    """
    if tokenizer is None:
        def tokenizer(raw):
            # Default segmentation, unchanged from the original code.
            return jieba.cut(raw, cut_all=False)

    # Tokens that carry no information; filtered in a single pass instead of
    # the original repeated O(n) ``list.remove`` calls.
    junk_tokens = {'\u3000', '\r\n', '\x00', '\n'}

    train_data_list = []
    train_class_list = []
    test_data_list = []
    test_class_list = []

    for floder in os.listdir(floder_path):
        new_floder_path = os.path.join(floder_path, floder)

        # One token list per document of this class.
        word_list = []
        for file_name in os.listdir(new_floder_path):
            with open(os.path.join(new_floder_path, file_name), 'rb') as f:
                raw = f.read().decode(encoding, 'ignore')
            word_list.append(
                [token for token in tokenizer(raw) if token not in junk_tokens]
            )

        random.shuffle(word_list)
        size = int(len(word_list) * train_size)
        print(floder)
        print(size)

        tem_train_list = word_list[:size]
        tem_test_list = word_list[size:]

        # Training documents of one class are merged into a single bag of
        # words; test documents stay separate so each can be classified.
        tem_train_word = [word for doc in tem_train_list for word in doc]

        # Build the training and test data sets.
        train_data_list.append(tem_train_word)
        train_class_list.append(floder)
        test_data_list.append(tem_test_list)
        test_class_list.append(floder)

    return train_data_list, test_data_list, train_class_list, test_class_list

def makestopwordsset(stopwords_file):
    """Read a UTF-8 stop-word file (one word per line) into a set.

    Args:
        stopwords_file: Path of the stop-word file.

    Returns:
        The set of non-empty lines of the file, without their line endings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    words_set = set()
    with open(stopwords_file, 'rb') as f:
        for line in f:
            # Strip the actual line terminator instead of blindly cutting two
            # bytes: ``line[:-2]`` ate a real character on LF-only files and
            # on a final line that has no terminator at all.
            word = line.rstrip(b'\r\n').decode('utf-8')
            if word:
                words_set.add(word)
    return words_set

def listtodict(data_list, stopwords_set=None):
    """Count how often each usable word occurs in ``data_list``.

    A word is skipped when it is in ``stopwords_set`` or when it consists
    only of digits (``str.isdigit``).

    Args:
        data_list: Iterable of word strings.
        stopwords_set: Container of words to ignore; ``None`` (the default)
            means no stop words. ``None`` replaces the former mutable
            ``set()`` default.

    Returns:
        A dict mapping each kept word to its number of occurrences.
    """
    if stopwords_set is None:
        stopwords_set = frozenset()

    data_dict = {}
    for word in data_list:
        if word in stopwords_set or word.isdigit():
            continue
        data_dict[word] = data_dict.get(word, 0) + 1
    return data_dict

def clearlist(test_list, stopwords_set=None):
    """Return the words of ``test_list`` that are usable for classification.

    Applies the same filter as ``listtodict``: stop words and all-digit
    tokens are dropped. Order and duplicates of the kept words are preserved.

    Args:
        test_list: Iterable of word strings (one tokenised document).
        stopwords_set: Container of words to ignore; ``None`` (the default)
            means no stop words. ``None`` replaces the former mutable
            ``set()`` default.

    Returns:
        A new list with the kept words.
    """
    if stopwords_set is None:
        stopwords_set = frozenset()

    # The extracted source had lost both the list initialiser and the append;
    # this comprehension restores "collect every word that passes the filter".
    return [
        word
        for word in test_list
        if word not in stopwords_set and not word.isdigit()
    ]

def predicted(test_list, train_data_list_dict, train_class_list, train_data_count):
    """Pick the class whose word model gives ``test_list`` the highest score.

    For every class the score is the sum of ``p(word, dic, count)`` over the
    words of the document, i.e. a sum of smoothed log-likelihoods. No class
    prior term is added. On a tie the class that comes first wins.

    Args:
        test_list: Words of the document to classify.
        train_data_list_dict: One word -> count dict per class.
        train_class_list: Class labels, index-aligned with the dicts.
        train_data_count: Total word count per class, index-aligned too.

    Returns:
        The label from ``train_class_list`` with the best score.

    Raises:
        ValueError: If there is no trained class to choose from.
    """
    # One score per class; the extracted source had lost the list initialiser
    # and the line that stored each class's accumulated score.
    predicte = []
    for dic, count in zip(train_data_list_dict, train_data_count):
        laplace = 0
        for word in test_list:
            laplace += p(word, dic, count)
        predicte.append(laplace)

    if not predicte:
        raise ValueError('predicted() needs at least one trained class')

    # Index of the first maximum, same result as list.index(predicte, max).
    best = max(range(len(predicte)), key=predicte.__getitem__)
    return train_class_list[best]

def p(word, dic, count):
    """Return the base-10 log of the add-one smoothed probability of ``word``.

    The probability is ``(dic.get(word, 0) + 1) / (count + len(dic))``, so a
    word missing from ``dic`` is treated as if it had been seen zero times.

    Args:
        word: The word to score.
        dic: Word -> occurrence count dict of one class.
        count: Total number of word occurrences of that class.

    Returns:
        ``log10`` of the smoothed probability, or ``-inf`` when the class has
        no data at all (``count + len(dic) == 0``), which previously raised
        ``ZeroDivisionError``.
    """
    denominator = count + len(dic)
    if denominator == 0:
        # A class without any training words can never explain a document.
        return float('-inf')
    return math.log10((dic.get(word, 0) + 1) / denominator)

def main():
    """Train the classifier on the ``reduced`` corpus and print per-class accuracy.

    Both ``stopwords_cn.txt`` and the ``reduced`` corpus folder are looked up
    in the parent directory of the current working directory. The function
    prints progress messages, the 200 most frequent words of every class and
    finally the percentage of correctly classified test documents per class.
    """
    abspath = os.path.abspath(os.path.dirname(os.getcwd()))

    # ---------- load the stop-word set ----------
    # os.path.join replaces the hard-coded '\\' so the path also resolves on
    # non-Windows systems.
    stopwords_file = os.path.join(abspath, 'stopwords_cn.txt')
    stopwords_set = makestopwordsset(stopwords_file)

    # ---------- load the data set ----------
    folder_path = os.path.join(abspath, 'reduced')
    train_data_list, test_data_list, train_class_list, test_class_list = textprocessing(folder_path, train_size=0.8)

    # ---------- process the training data ----------
    # One word -> count dict per class.
    train_data_list_dict = [
        listtodict(word_list, stopwords_set) for word_list in train_data_list
    ]
    print('訓練資料集處理完成')

    # ---------- process the test data ----------
    # The cleaned documents must be stored back: the original assigned the
    # result to the loop variable only, so the cleaning was silently discarded.
    test_data_list = [
        [clearlist(test, stopwords_set) for test in test_list]
        for test_list in test_data_list
    ]
    print('測試資料集處理完成')

    for a in train_data_list_dict:
        internet_list = sorted(a.items(), key=lambda f: f[1], reverse=True)
        print(internet_list[:200])

    # Total number of words per class, needed to compute p(b_i | a).
    train_data_count = [sum(dic.values()) for dic in train_data_list_dict]

    # ---------- evaluate ----------
    for li, classtpye in zip(test_data_list, test_class_list):
        corr = 0
        count = 0
        for lis in li:
            name = predicted(lis, train_data_list_dict, train_class_list, train_data_count)
            count += 1
            if name == classtpye:
                corr += 1
        if count == 0:
            # No test documents for this class: there is no accuracy to
            # report, and dividing would raise ZeroDivisionError.
            continue
        print(classtpye + '類**成功率為 %.3f %%' % (corr * 100 / count))

# Run the experiment only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()

樸素貝葉斯分類

1. 貝葉斯分類是一類分類演算法的總稱,這類演算法均以貝葉斯定理為基礎,故統稱為貝葉斯分類。2. 樸素貝葉斯的思想基礎是這樣的:對於給出的待分類項,求解在此項出現的條件下各個類別出現的概率,哪個最大,就認為此待分類項屬於哪個類別。通俗來說,就好比這麼個道理:你在街上看到一個黑人,我問你你猜這哥們哪裡來的,...

樸素貝葉斯分類

摘自寫在公司內部的wiki 要解決的問題 表中增加欄位classification,有四個取值 0 初始值,未分類 1 positive 2 normal 99 negative review submit前,由樸素貝葉斯分類器決定該條review的flag屬於negative還是positive ...

分類 樸素貝葉斯

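原始的貝葉斯公式為 $$P(B \mid A) = \frac{P(A, B)}{P(A)} = \frac{P(A \mid B)\,P(B)}{P(A)} \tag{1}$$ 在分類問題中,y 為類別,x 為樣本特徵,則已知待預測的樣本特徵 x,它為類別 $y_i$ 的概率為 $$P(y_i \mid x) = \frac{P(x \mid y_i)\,P(y_i)}{P(x)} = \frac{P(y_i)\prod_j P(x_j \mid y_i)}{P(x)} \tag{2}$$ $P(y_i) = \dfrac{\text{類別為 } y_i \text{ 的樣本數}}{\text{總樣本數}}$...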