以前電腦死活跑不了多程序,重灌了一下系統,居然啥都解決了,於是乎就跑了一下:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""@file : jianshu.py.py
@time : 2019/9/1 20:42
@author : sound_of_silence
"""import requests
import time
import re
from lxml.html import etree
from multiprocessing import pool
def get_url_text(url1):
    """Fetch *url1* and return the response body decoded as UTF-8 text.

    Returns an empty string on any request failure so callers can treat
    "page missing" and "page empty" uniformly.
    """
    # NOTE(review): the headers dict was lost in the original source; a
    # User-Agent is typically needed to avoid being blocked — confirm value.
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }
    try:
        # timeout added: requests.get() without one can hang forever.
        res = requests.get(url1, headers=headers, timeout=10)
        res.encoding = 'utf-8'  # force UTF-8 regardless of server header
        return res.text
    except requests.RequestException:
        # Narrowed from BaseException: only network/HTTP errors are expected
        # here; KeyboardInterrupt and programming errors should propagate.
        return ''
def clean_str(string):
    """Return *string* with all whitespace removed.

    Collapses spaces, tabs and newlines so scraped text fragments become
    compact single-token values.
    """
    # r'\s' already matches '\n'; the original r'\s|\n' alternation was redundant.
    return re.sub(r'\s', '', string)
def get_info(url2):
    """Scrape one jianshu list page and print each article as a dict.

    Extracts title / author / score / comment / thumb-up / earning / url /
    abstract for every article on the page. Prints results to stdout and
    returns None.
    """
    html = get_url_text(url2)
    print(len(html), end=' ')
    print(url2)
    # etree.HTML (capitalized) is the lxml HTML parser entry point; the
    # lowercase etree.html in the original would raise AttributeError.
    response = etree.HTML(html)
    try:
        titles = response.xpath(
            '//div[@id="list-container"]//li//a[@class="title"]/text()')
        authors = response.xpath(
            '//div[@id="list-container"]//li//a[@class="nickname"]/text()')
        # [1::2]: presumably each value appears twice per <li> (icon text +
        # label text) and every other entry is kept — TODO confirm on the
        # live markup.
        scores = response.xpath(
            '//div[@id="list-container"]//li//span[@class="jsd-meta"]/text()')[1::2]
        comments = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/a[2]/text()')[1::2]
        thumb_ups = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/span[2]/text()')
        earnings = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/span[3]/text()')
        urls = response.xpath(
            '//div[@id="list-container"]//li//a[@class="title"]/@href')
        abstracts = response.xpath(
            '//div[@id="list-container"]//li//p[@class="abstract"]/text()')
        for title, author, score, comment, thumb_up, earning, url, abstract in zip(
                titles, authors, scores, comments, thumb_ups, earnings, urls,
                abstracts):
            item = dict()
            item['title'] = clean_str(title)
            item['author'] = clean_str(author)
            # SECURITY: eval() on scraped page text executes arbitrary code if
            # the page is hostile; int()/float() parsing would be the safe
            # replacement. Kept to preserve the original numeric coercion.
            item['score'] = eval(clean_str(score))
            item['comment'] = eval(clean_str(comment))
            item['thumb_up'] = eval(clean_str(thumb_up))
            item['earning'] = eval(clean_str(earning))
            # NOTE(review): the site base URL was lost from the original
            # source; '' + href yields a relative URL — prepend the host
            # when known.
            item['url'] = '' + clean_str(url)
            item['abstract'] = clean_str(abstract)
            print(item)
    except (IndexError, SyntaxError) as e:
        # eval() of malformed text raises SyntaxError; short result lists
        # raise IndexError — log and skip the page instead of crashing the
        # pool worker. (Original used lowercase class names: NameError.)
        print(e)
if __name__ == '__main__':
    # NOTE(review): the site base URL was lost from the original source;
    # these are relative paths for pages 1-10 of the jianshu collection.
    urls = ['/c/bdhhpk?order_by=top&page={}'.format(i) for i in range(1, 11)]
    count = 0

    # --- multi-process run (8 workers) ---
    # Pool must be the multiprocessing.Pool class; the lowercase `pool`
    # module in the original import is not callable with an int.
    p = Pool(8)
    start = time.perf_counter()
    for i in range(10):
        # The apply_async call was mangled in the original source;
        # reconstructed from the surviving argument tuple (urls[i],).
        p.apply_async(get_info, (urls[i],))
        time.sleep(2)
    p.close()
    p.join()

    # --- single-process run, for timing comparison ---
    for url in urls:
        get_info(url)
        time.sleep(2)
    end = time.perf_counter()
    # The f-string body was lost in the original (`f' s'`); report the
    # elapsed wall time in seconds.
    print(f'{end - start:.2f} s')
因為加了sleep,單程序時40秒左右,多程序時20秒左右,效果明顯!

Python 多程序爬取豆瓣TOP250
import requests from bs4 import beautifulsoup import multiprocessing import time import os 程序1獲取網頁真實位址並存入佇列中 class geturl multiprocessing.process def ...
python爬取豆瓣影評
看的別人的 爬取某部影片的影評 沒有模擬登入只能爬6頁 encoding utf 8 import requests from bs4 import beautifulsoup import re import random import io import sys import time 使用se...
python爬取資料豆瓣讀書
xpath爬取指令碼 from urllib import request from lxml import etree base url response request.urlopen base url html response.read decode utf 8 htmls etree.ht...