爬取微博標題並儲存到txt,製作詞云

2021-10-11 09:29:39 字數 4393 閱讀 8378

# encoding=utf-8

def get():
    """Abandoned scraping experiment, kept only for reference.

    The original body was entirely commented-out Selenium code that
    clicked each hot-search entry open in a new tab, read its title
    and "price" text, and closed the tab again.  It was replaced by
    the simpler flow under ``__main__`` below and now just returns 0.
    """
    # NOTE(review): the dead commented-out click/switch_to.window code
    # that used to live here was removed; see version control if the
    # per-entry tab-opening approach is ever needed again.
    return 0


if __name__ == '__main__':
    from selenium import webdriver

    # Target URL -- left empty in the original source; fill in the
    # Weibo hot-search page before running.
    url = ''

    # Open the site in a Chrome browser.
    # BUG FIX: the callable is ``webdriver.Chrome`` -- ``webdriver.chrome``
    # is the sub-module and is not callable, so the original crashed here.
    driver = webdriver.Chrome(
        r'c:\users\wangxutao\desktop\chromedriver_win32\chromedriver.exe')

    # Load the page.
    driver.get(url)

    # Collect every title paragraph from the hot-search list.
    pros = driver.find_elements_by_xpath(
        '//div[@class="keyword-out-div"]//p')

    # Page backwards through history by repeatedly clicking the second
    # "ui-btn ui-btn-inline" button (presumably the "previous/next"
    # pager button -- TODO confirm against the live page).
    for pp in range(11 * 30 * 24 * 30 * 10):
        buttons = driver.find_elements_by_xpath(
            '//button[@class="ui-btn ui-btn-inline"]')
        buttons[1].click()

    # Record the titles: every 4th element of the first 200 entries is
    # a title line.
    # FIX: open the output file once with a context manager instead of
    # re-opening and closing it on every iteration; the file is now
    # closed even if an element lookup raises.
    with open('2020year_hot_titles_for_weibo.txt', 'a+') as f:
        for i in pros[:200:4]:
            title = i.text
            # Strip the leading rank prefix ("12.xxx" -> "xxx") and the
            # ad-slot placeholder, then persist one title per line.
            title = title[title.find('.') + 1:]
            title = title.replace('(微博廣告位留空)', '')
            f.write(title + '\n')

# encoding=utf-8

# jieba cut

def jieba_cut(mytxt):
    """Segment Chinese text with jieba, returning space-joined tokens.

    WordCloud expects western-style space-separated words, so the raw
    Chinese text is run through jieba's tokenizer first.
    """
    import jieba

    tokens = jieba.cut(mytxt)
    return " ".join(tokens)

def makeword_cloud(txt):
    """Render *txt* as a word-cloud image and draw it with matplotlib.

    ``txt`` must already be space-separated (see ``jieba_cut``), and the
    Chinese-capable font file ``simsun.ttf`` must be reachable from the
    working directory.
    """
    # BUG FIX: the class exported by the ``wordcloud`` package is
    # ``WordCloud`` -- ``from wordcloud import wordcloud`` failed.
    from wordcloud import WordCloud

    # BUG FIX: transparent background is ``None`` (with mode='rgba');
    # the original referenced an undefined lowercase name ``none``.
    cloud = WordCloud(
        font_path='simsun.ttf',
        width=1920,
        height=1080,
        mode='rgba',
        background_color=None,
    ).generate(txt)

    import matplotlib.pyplot as plt
    # plt.switch_backend('agg')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")


if __name__ == '__main__':
    word_cloud = '2020year_hot_titles_for_weibo.txt'

    # FIX: read the titles through a context manager so the handle is
    # always closed (the original never called f.close()).
    with open(file=word_cloud, mode='r') as f:
        content = f.read()
    # content = content.replace('(微博廣告位留空)', ' ')

    content_for_ciyun = jieba_cut(content)

    # Re-theme the 2020 hot-search vocabulary towards remote work:
    # 感染->釘釘, 輸入->辦公, 時間->會議; 境外 and 開學 are deleted.
    # BUG FIX: ``str.replace`` requires a replacement argument -- the
    # original ``replace('境外',)`` / ``replace('開學',)`` raised
    # TypeError; '' deletes the word instead.
    content_for_ciyun = content_for_ciyun.replace('感染', '釘釘')
    content_for_ciyun = content_for_ciyun.replace('境外', '')
    content_for_ciyun = content_for_ciyun.replace('輸入', '辦公')
    content_for_ciyun = content_for_ciyun.replace('開學', '')
    content_for_ciyun = content_for_ciyun.replace('時間', '會議')
    print(content_for_ciyun)
    # content_for_ciyun = content_for_ciyun.replace('確診', '釘釘')
    makeword_cloud(content_for_ciyun)

Python爬取熱門微博,並儲存到MySQL中

目標 m.weibo.cn。url的獲取可以從瀏覽器的F12開發者工具中Network面板的XHR請求中找到。weibo_demo.py: import requests / import json / from w3lib.html import remove_tags / from mysqlhelper import my...

Scrapy爬取並儲存到TXT檔案

在建立完成專案並建立爬蟲的基礎上,編寫儲存到txt的專案 1.將 robotstxt obey 設定為false 2.將 item pipelines 開啟 item是scrapy提供的類似於字典型別的資料容器,它與字典最大的區別在於它規定了統一的資料規格樣式,即具有統一性與結構性。這樣既方便資料的...

scrapy爬取資料並儲存到文字

1.scrapy專案結構如下 2.開啟spidler目錄下的duba.py檔案,如下 這個是根據豆瓣一部分頁面獲取的熱門話題內容,有6條資料 coding utf 8 import scrapy from scrapydemo.items import scrapydemoitem from lxm...