import os
import re

import jieba.posseg as pseg
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Directory holding the raw Weibo text files and the generated CSV files.
# It must be a real assignment: the training stage below reads from it.
dataset_path = './dataset'

# Raw input text files.
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# CSV file holding the raw (uncleaned) data.
output_text_filename = 'raw_weibo_text.csv'

# CSV file holding the cleaned text data.
output_cln_text_filename = 'clean_weibo_text.csv'

# Load the stop-word list, one entry per line; the context manager makes sure
# the file handle is closed once the list has been read.
with open('./中文停用詞庫.txt', 'r', encoding='utf-8') as _stopwords_file:
    stopwords1 = [line.rstrip() for line in _stopwords_file]

stopwords = stopwords1


def build_raw_csv(dataset_dir=None, filenames=None, output_filename=None):
    """Merge the per-label text files into a single labelled CSV file.

    This stage used to be disabled by wrapping it in a string literal, and
    parts of that code were truncated. It is restored here as a function that
    is NOT called automatically, so running the script behaves as before.

    Args:
        dataset_dir: Directory containing the input files and receiving the
            CSV. Defaults to ``dataset_path``.
        filenames: Input file names. The first character of each name is
            parsed as the integer label. Defaults to ``text_filenames``.
        output_filename: Name of the CSV file to write. Defaults to
            ``output_text_filename``.

    Returns:
        The path of the CSV file that was written.
    """
    if dataset_dir is None:
        dataset_dir = dataset_path
    if filenames is None:
        filenames = text_filenames
    if output_filename is None:
        output_filename = output_text_filename

    text_w_label_df_lst = []
    for text_filename in filenames:
        text_file = os.path.join(dataset_dir, text_filename)

        # The label (0, 1, 2 or 3) is the first character of the file name.
        label = int(text_filename[0])

        # Read the text file.
        with open(text_file, 'r', encoding='utf-8') as f:
            # NOTE(review): the original right-hand side was lost; one record
            # per line without the trailing newline is assumed - confirm.
            lines = f.read().splitlines()

        # Build a two-column frame: one label per text line.
        text_w_label_df_lst.append(
            pd.DataFrame({'label': [label] * len(lines), 'text': lines})
        )

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save as a CSV file without the index column.
    output_file = os.path.join(dataset_dir, output_filename)
    result_df.to_csv(output_file, index=False, encoding='utf-8')
    return output_file


# 1. Data loading, processing, cleaning and preparation

# Matches every run of characters outside the Chinese range kept by the
# cleaner; compiled once instead of on every call.
_NON_CHINESE_PATTERN = re.compile('[^\u4e00-\u9fd5]+')


def proc_text(raw_line):
    """Clean one line of raw text and return its tokens joined by spaces.

    Args:
        raw_line: The raw text of one record.

    Returns:
        A string of the remaining words separated by single spaces; empty
        when nothing survives the filtering.
    """
    # 1. Remove all non-Chinese characters with the regular expression.
    chinese_only = _NON_CHINESE_PATTERN.sub('', raw_line)

    # 2. Segment with jieba; each item unpacks to (word, part-of-speech flag).
    words_lst = pseg.cut(chinese_only)

    # 3. Drop stop words. The flag could also be used to filter by part of
    #    speech, e.g. keep only verbs with ``flag == 'v'``.
    # NOTE(review): the list initialiser and the append under the ``if`` were
    # lost in the original; reconstructed from the surrounding code - confirm.
    meaninful_words = [word for word, _flag in words_lst
                       if word not in stopwords]
    return ' '.join(meaninful_words)


def build_clean_csv():
    """Read the raw CSV, clean every text row and write the cleaned CSV.

    This stage used to be disabled by wrapping it in a string literal. It is
    restored as a function that is NOT called automatically, so running the
    script behaves as before. Reads ``output_text_filename`` and writes
    ``output_cln_text_filename``, both inside ``dataset_path``.
    """
    # Read the prepared CSV file to build the data set.
    text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                          encoding='utf-8')

    # Process the text data.
    # NOTE(review): this statement was lost in the original; applying
    # proc_text to the 'text' column is assumed - confirm.
    text_df['text'] = text_df['text'].apply(proc_text)

    # Filter out rows whose text became an empty string.
    text_df = text_df[text_df['text'] != '']

    # Save the cleaned text data without the index column.
    text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=False, encoding='utf-8')


# 2. Split the data into training and test sets

# Emotion represented by each label:
# 0: joy
# 1: anger
# 2: disgust
# 3: low spirits
clean_text_df = pd.read_csv(
    os.path.join(dataset_path, output_cln_text_filename), encoding='utf-8')

# Split into training and test sets (25% held out for testing).
x_train, x_test, y_train, y_test = train_test_split(
    clean_text_df['text'].values, clean_text_df['label'].values,
    test_size=0.25)

# 3. Feature extraction
# TF-IDF weighting of the space-separated tokens.
tf = TfidfVectorizer()

# Fit the vectorizer on the training set only, then transform the training
# documents with it.
x_train = tf.fit_transform(x_train)

# The test set is only transformed, using what was fitted on the training set.
x_test = tf.transform(x_test)

# 4. Train the model: multinomial naive Bayes
mlt = MultinomialNB(alpha=1.0)

# The model must be fitted before predict()/score() can be called; the
# original fit call had been lost into a commented-out line.
mlt.fit(x_train, y_train)

y_predict = mlt.predict(x_test)

# 5. Report the accuracy on the test set (higher is better).
print("**的準確率:", mlt.score(x_test, y_test))


