使用多執行緒爬取 糗事百科 前十頁段子

2022-07-27 17:18:23 字數 3124 閱讀 5207

定義裝飾器函式

1

# NOTE(review): the article extractor destroyed the original import block;
# stdlib imports are restored here, third-party ones (requests, lxml) are
# kept function-scoped inside the workers that use them.
import queue
import threading


def run_forever(func):
    """Decorator for worker-thread targets: call *func*(obj) in an endless loop.

    The published snippet dropped the ``return wrapper`` line; without it every
    decorated method would be ``None``.  Wrapped functions are meant to run in
    daemon threads, so the infinite loop dies with the process.
    """
    def wrapper(obj):
        while True:
            func(obj)
    return wrapper


class get_qiushibaike:
    """Multi-threaded joke scraper for qiushibaike.com.

    Pipeline, each stage driven by daemon worker threads:
      url_queue             -> listing-page URLs to fetch
      url_queue_all         -> joke-detail URLs scraped from listing pages
      get_url_content_queue -> detail URLs handed to the downloader/writer

    NOTE(review): the class name is reconstructed from the call site
    ``get_qiushibaike(12)``; kept lowercase for backward compatibility even
    though PEP 8 would prefer PascalCase.
    """

    def __init__(self, page):
        """page: crawl listing pages 1 .. page-1 (exclusive upper bound)."""
        self.max_page = page
        # NOTE(review): the host literal was stripped by the extractor;
        # 'https://www.qiushibaike.com' is the site the article targets — confirm.
        self.url_head = 'https://www.qiushibaike.com'
        self.url_mid = 'text/page/'
        self.url_detail = '/'
        self.count = 0                               # jokes written to disk so far
        self.url_queue = queue.Queue()               # listing-page URL queue
        self.get_url_content_queue = queue.Queue()   # single joke-page queue
        self.url_queue_all = queue.Queue()           # all joke URLs found on listings
        self.page_url_list = []                      # reconstructed: original line was cut off

    def add_url_to_queue(self):
        """Seed url_queue with listing pages 1 .. max_page-1."""
        for i in range(1, self.max_page):
            self.url_queue.put(self.url_head + self.url_detail +
                               self.url_mid + str(i) + self.url_detail)

    @run_forever
    def get_page_url_to_list(self):
        """Worker: pop one listing page, push its joke links onto url_queue_all."""
        import requests
        from lxml import etree

        url = self.url_queue.get()
        response = requests.get(url)
        if response.status_code != 200:
            self.url_queue.put(url)  # re-queue for another attempt
            print('url {}驗證失敗 重新寫入'.format(url))
        else:
            html = etree.HTML(response.text)
            url_list = html.xpath('//a[@class="contentherf"]/@href')
            for href in url_list:
                self.url_queue_all.put(self.url_head + href)
        # task_done() must run once per get() on BOTH paths, otherwise
        # url_queue.join() in run() blocks forever after a failed fetch.
        self.url_queue.task_done()

    @run_forever
    def get_url_to_content_queue(self):
        """Worker: move joke URLs into the download queue (logging each one)."""
        url = self.url_queue_all.get()
        print(url)
        self.get_url_content_queue.put(url)
        self.url_queue_all.task_done()

    @run_forever
    def get_content(self):
        """Worker: download one joke page and append title/content to qiushi.txt."""
        import requests
        from lxml import etree

        url = self.get_url_content_queue.get()
        try:
            response = requests.get(url, timeout=1)
            if response.status_code != 200:
                self.get_url_content_queue.put(url)  # retry later
            else:
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@class="article-title"]/text()')
                contents = html.xpath('//div[@class="content"]/text()')
                with open('qiushi.txt', 'a', encoding='utf8') as p:
                    for x in title:
                        p.write("title:" + x)
                        p.write('\n')
                    for i in contents:
                        p.write(i + '\n')
                    p.write('\n')
                response.close()
                self.count += 1
                # NOTE(review): the original progress message was stripped by
                # the extractor; only the counter formatting remains.
                print('{}'.format(self.count))
        except requests.RequestException:
            # Narrowed from the published bare ``except``: network failures
            # (including the 1s timeout) re-queue the URL for retry.
            print("url truble:{}".format(url))
            self.get_url_content_queue.put(url)
        finally:
            # Fix: the original only called task_done() on the success path,
            # so one exception would make get_url_content_queue.join() hang.
            self.get_url_content_queue.task_done()

    def run_sue_more_task(self, func, count=1):
        """Start *count* daemon threads running *func* (name kept as published)."""
        for _ in range(count):
            t = threading.Thread(target=func)
            t.daemon = True  # modern spelling of the deprecated setDaemon(true)
            t.start()

    def run(self):
        """Seed URLs, start the worker pools, then wait for every queue to drain."""
        self.add_url_to_queue()
        self.run_sue_more_task(self.get_page_url_to_list, 3)
        self.run_sue_more_task(self.get_url_to_content_queue, 3)
        self.run_sue_more_task(self.get_content, 5)
        # Join in pipeline order so upstream stages drain before downstream ones.
        self.url_queue.join()
        self.url_queue_all.join()
        self.get_url_content_queue.join()

建立例項,呼叫方法

if __name__ == '__main__':
    # Crawl listing pages 1-11: the constructor argument is an exclusive
    # upper bound (range(1, max_page) in add_url_to_queue).
    qbs = get_qiushibaike(12)
    qbs.run()

ps:爬蟲有風險,封ip需謹慎,執行緒一時爽,封號火葬場

多執行緒爬去糗事百科

import queue import threading from fake_useragent import UserAgent import time import requests from requests.exceptions import RequestException from l...

簡單爬取糗事百科

剛剛入門,對於爬蟲還要折騰很久才行,雖然很多功能還沒開始掌握,但是爬取下來就很開心,接下來還會爭取進步的。把自己出現的一些錯誤都加上了注釋,我目前還在學習當中,大家一起進步。期間學了乙個新的函式,在這裡分享下 strip 網上是這麼說的 需要注意的是,傳入的是乙個字元陣列,編譯器去除兩端所有相應的字...

python 爬取糗事百科

step 1 構建乙個提取糗事百科笑話的函式import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...