A Function for Generating Word Clouds


A word-cloud-generating function I wrote.

This is the version without a custom mask image.

#coding:utf-8

__author__ = '英俊'

import warnings
warnings.filterwarnings("ignore")

# codecs provides an open() that lets you specify a file's encoding;
# it converts the content to unicode automatically while reading
import codecs

# word segmentation package
import jieba

# statistics
import pandas as pd

# numerical computing package
import numpy

# visualization
import matplotlib.pyplot as plt

# render plots inline on the current page (Jupyter magic)
%matplotlib inline

import matplotlib

# set the figure size
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

from wordcloud import WordCloud  # word cloud package

def createWordCloud(text_path):
    # e.g. "./data/entertainment_news.csv"
    df = pd.read_csv(text_path, encoding='utf-8')
    # drop empty rows
    df = df.dropna()
    # df.head()
    # turn the data into a list
    content = df.content.values.tolist()
    segment = []
    for line in content:
        try:
            # jieba.lcut returns the segmentation as a list
            segs = jieba.lcut(line)
            for seg in segs:
                # skip empty tokens and newline tokens
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except:
            print(line)
            continue
    words_df = pd.DataFrame({'segment': segment})
    # words_df.head()
    stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: no quoting at all
    # stopwords.head()
    # pick out the tokens that appear in the stopword list, then drop them
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # words_df.head()
    # this part is the tricky bit: word-frequency counting
    words_stat = words_df.groupby('segment').agg(
        計數=pd.NamedAgg(column='segment', aggfunc='size')
    ).reset_index().sort_values(by='計數', ascending=False)
    # words_stat.head()
    # font_path is needed because Chinese text is awkward in word clouds;
    # background_color sets the background, max_font_size caps the glyph size
    wordcloud = WordCloud(font_path="data/simhei.ttf", background_color="white", max_font_size=80)
    # dict comprehension: x[0] is the word itself, x[1] is its count
    word_frequence = {x[0]: x[1] for x in words_stat.values}
    # generate the word cloud
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
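The frequency count is flagged above as the tricky part; for reference, pandas' value_counts gives an equivalent, shorter version. This is a minimal sketch using the same words_df as above, not the original code:

# equivalent frequency count: value_counts returns counts sorted descending
word_frequence = words_df.segment.value_counts().to_dict()
# this dict can be passed straight to WordCloud.fit_words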

Call the function:

createWordCloud("./data/entertainment_news.csv")
The generated word cloud:
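For a quick result you can also skip the manual counting entirely: WordCloud's generate() method builds the frequency table itself from a whitespace-separated string, so joining the jieba tokens with spaces works for Chinese. A minimal sketch; the input string here is hypothetical:

segs = jieba.lcut("需要生成词云的一段中文文本")  # hypothetical input text
wc = WordCloud(font_path="data/simhei.ttf", background_color="white")
wc.generate(" ".join(segs))  # generate() splits on whitespace and counts for you
plt.imshow(wc)
plt.axis("off")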

Adding a custom mask image

from scipy.misc import imread  # note: removed in SciPy >= 1.2; use imageio.imread there

# word cloud figure size
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)

from wordcloud import WordCloud, ImageColorGenerator

def createSuperWordCloud(text_path, image_path):
    # e.g. "./data/entertainment_news.csv"
    df = pd.read_csv(text_path, encoding='utf-8')
    # drop empty rows
    df = df.dropna()
    # df.head()
    # turn the data into a list
    content = df.content.values.tolist()
    segment = []
    for line in content:
        try:
            # jieba.lcut returns the segmentation as a list
            segs = jieba.lcut(line)
            for seg in segs:
                # skip empty tokens and newline tokens
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except:
            print(line)
            continue
    words_df = pd.DataFrame({'segment': segment})
    # words_df.head()
    stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: no quoting at all
    # stopwords.head()
    # pick out the tokens that appear in the stopword list, then drop them
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # words_df.head()
    # this part is the tricky bit: word-frequency counting
    words_stat = words_df.groupby('segment').agg(
        計數=pd.NamedAgg(column='segment', aggfunc='size')
    ).reset_index().sort_values(by='計數', ascending=False)
    # read the image that will serve as the mask/background
    # e.g. 'image/entertainment.jpeg'
    bimg = imread(image_path)
    # build the word cloud
    wordcloud = WordCloud(background_color="white", mask=bimg, font_path='data/simhei.ttf', max_font_size=200)
    # build the word-frequency dict: x[0] is the word, x[1] is its count
    word_frequence = {x[0]: x[1] for x in words_stat.values}
    wordcloud = wordcloud.fit_words(word_frequence)
    # recolor the words from the source image
    bimgColors = ImageColorGenerator(bimg)
    # turn the axes off
    plt.axis("off")
    # redraw using the image's colors
    plt.imshow(wordcloud.recolor(color_func=bimgColors))

Call the function:

createSuperWordCloud("./data/entertainment_news.csv", 'image/entertainment.jpeg')
The generated word cloud:
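To keep the result as an image file rather than only displaying it, the wordcloud library provides a to_file method. A minimal sketch of a line one could add inside either function after fit_words; the output path is hypothetical:

wordcloud.to_file("entertainment_wordcloud.png")  # hypothetical output path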
