Python多程序爬取豆瓣Top250資料

2021-10-01 02:18:23 字數 3306 閱讀 4212

以前電腦死活跑不了多程序,重灌了一下系統,居然啥都解決了,於是乎就跑了一下:

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

"""@file : jianshu.py.py

@time : 2019/9/1 20:42

@author : sound_of_silence

"""
import ast
import re
import time
from multiprocessing import Pool, pool
from urllib.parse import urljoin

import requests
from lxml.html import etree

def get_url_text(url1):
    """Fetch ``url1`` with an HTTP GET and return the body decoded as UTF-8.

    Args:
        url1: URL to request.

    Returns:
        The response text, or an empty string when the request fails
        (connection error, timeout, malformed URL, ...).
    """
    # NOTE(review): the value of ``headers`` is missing from the published
    # listing; a generic browser User-Agent is assumed here -- confirm.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/76.0.3809.132 Safari/537.36'
        ),
    }
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        res = requests.get(url1, headers=headers, timeout=10)
    except requests.RequestException:
        # Best-effort fetch: callers treat '' as "nothing downloaded".
        return ''
    res.encoding = 'utf-8'
    return res.text


def clean_str(string):
    """Return ``string`` with every whitespace character removed."""
    # ``\s`` already matches newlines, so no separate ``\n`` branch is needed.
    return re.sub(r'\s+', '', string)

def _parse_number(text):
    """Turn the text of a numeric field into an ``int`` or ``float``."""
    # The listing called eval() here. The text comes from a downloaded page,
    # i.e. untrusted input, so only Python literals are accepted instead.
    return ast.literal_eval(clean_str(text))


def get_info(url2):
    """Download one listing page and print one dict per entry found on it.

    The length of the downloaded HTML and the page URL are printed first.
    Each entry dict has the keys ``title``, ``author``, ``score``,
    ``comment``, ``thumb_up``, ``earning``, ``url`` and ``abstract``.

    Args:
        url2: URL of the listing page to scrape.

    Returns:
        None. Results are only printed. If a numeric field cannot be parsed
        the error is printed and the rest of the page is skipped.
    """
    html = get_url_text(url2)
    print(len(html), end=' ')
    print(url2)
    if not html:
        # get_url_text returns '' on failure; there is nothing to parse.
        return

    response = etree.HTML(html)
    entry = '//div[@id="list-container"]//li'
    meta = entry + '//div[@class="meta"]'
    try:
        titles = response.xpath(entry + '//a[@class="title"]/text()')
        authors = response.xpath(entry + '//a[@class="nickname"]/text()')
        # Only every second text node (starting at index 1) is kept for
        # these two queries, as in the original listing.
        scores = response.xpath(
            entry + '//span[@class="jsd-meta"]/text()')[1::2]
        comments = response.xpath(meta + '/a[2]/text()')[1::2]
        thumb_ups = response.xpath(meta + '/span[2]/text()')
        earnings = response.xpath(meta + '/span[3]/text()')
        urls = response.xpath(entry + '//a[@class="title"]/@href')
        abstracts = response.xpath(entry + '//p[@class="abstract"]/text()')

        rows = zip(titles, authors, scores, comments,
                   thumb_ups, earnings, urls, abstracts)
        for (title, author, score, comment,
             thumb_up, earning, url, abstract) in rows:
            item = {
                'title': clean_str(title),
                'author': clean_str(author),
                'score': _parse_number(score),
                'comment': _parse_number(comment),
                'thumb_up': _parse_number(thumb_up),
                'earning': _parse_number(earning),
                # Resolve the href against the page URL so the result is
                # absolute whenever url2 is absolute.
                'url': urljoin(url2, clean_str(url)),
                'abstract': clean_str(abstract),
            }
            print(item)
    except (IndexError, SyntaxError, ValueError) as e:
        # literal_eval raises SyntaxError or ValueError on non-literal text.
        print(e)

if __name__ == '__main__':
    # Script entry point: scrape the ten listing pages twice -- first through
    # a pool of 8 worker processes, then sequentially -- and print the
    # elapsed time of each run.
    #
    # NOTE(review): the published listing has no scheme/host in these URLs,
    # so every request would fail. The jianshu.com host is inferred from the
    # file name in the module docstring and the URL shape -- confirm.
    base_url = 'https://www.jianshu.com'
    urls = [
        f'{base_url}/c/bdhhpk?order_by=top&page={i}' for i in range(1, 11)
    ]

    # Kept because get_info declares ``global count``; nothing visible in
    # this file reads or updates it.
    count = 0

    p = Pool(8)
    start = time.perf_counter()
    for url in urls:
        # NOTE(review): the submission call is missing from the published
        # listing (only ``(urls[i],)`` survives); apply_async is assumed.
        p.apply_async(get_info, args=(url,))
        time.sleep(2)
    p.close()
    p.join()
    pooled_end = time.perf_counter()
    print(f'multi-process: {pooled_end - start:.2f} s')

    start = time.perf_counter()
    for url in urls:
        get_info(url)
        time.sleep(2)
    end = time.perf_counter()
    print(f'single-process: {end - start:.2f} s')

因為加了sleep,單程序時40秒左右,多程序時20秒左右,效果明顯!

Python 多程序爬取豆瓣TOP250

import requests from bs4 import beautifulsoup import multiprocessing import time import os 程序1獲取網頁真實位址並存入佇列中 class geturl multiprocessing.process def ...

python爬取豆瓣影評

看的別人的 爬取某部影片的影評 沒有模擬登入只能爬6頁 encoding utf 8 import requests from bs4 import beautifulsoup import re import random import io import sys import time 使用se...

python爬取資料豆瓣讀書

xpath爬取指令碼 from urllib import request from lxml import etree base url response request.urlopen base url html response.read decode utf 8 htmls etree.ht...