Multithreaded Crawler in Practice


from urllib import request, error
import re

headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0')
opener = request.build_opener()
opener.addheaders = [headers]
request.install_opener(opener)

for i in range(1, 2):
    url = '' + str(i) + '.html'          # base URL omitted in the original post
    file = './data/' + str(i) + '.html'  # intended local save path (unused in this snippet)
    try:
        data = request.urlopen(url).read().decode('utf-8', 'ignore')
        pat = 'target="_blank" href="(.*?)" class="all-read"'
        ret = re.compile(pat).findall(data)
        # crawl the full text of each joke page
        for j in range(0, len(ret)):
            all_url = '' + ret[j]        # base URL omitted here as well
            all_data = request.urlopen(all_url).read().decode('gbk', 'ignore')
            all_pat = '(.*?)'            # the enclosing HTML tags were lost from the original pattern
            all_ret = re.compile(all_pat, re.S).findall(all_data)  # re.S lets . match newlines
            print(all_ret)
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    except Exception as e:
        print(e)
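A side note on the header setup: install_opener changes process-wide state, so every later urlopen call carries the User-Agent. If you only want the header on individual requests, urllib's Request object takes a headers dict directly; a minimal sketch, with a placeholder URL since the original post omits the real site:

from urllib import request

req = request.Request(
    'https://example.com/1.html',  # placeholder; the crawled site is not named in the post
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'},
)
data = request.urlopen(req).read().decode('utf-8', 'ignore')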

Multithreading

import threading

# define threads A and B
class A(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for i in range(0, 100):
            print('I am A')

class B(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for i in range(0, 10):
            print('I am B')

# start the threads
t1 = A()
t1.start()
t2 = B()
t2.start()
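One thing the example does not show: start() returns immediately, so the main thread carries on while A and B are still printing. If later work depends on both threads having finished, Thread.join() blocks until a thread is done; a minimal addition to the code above:

# block until both threads have finished
t1.join()
t2.join()
print('both threads finished')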

Multithreaded Crawler

from urllib import request, error
import re
import threading

headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0')
opener = request.build_opener()
opener.addheaders = [headers]
request.install_opener(opener)

class One(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # crawl the odd-numbered pages
        for i in range(1, 30, 2):
            get_data(i)

class Two(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # crawl the even-numbered pages
        for i in range(2, 30, 2):
            get_data(i)

def get_data(i):
    url = '' + str(i) + '.html'  # base URL omitted in the original post
    try:
        data = request.urlopen(url).read().decode('utf-8', 'ignore')
        pat = 'target="_blank" href="(.*?)" class="all-read"'
        ret = re.compile(pat).findall(data)
        # crawl the full text of each joke page
        for j in range(0, len(ret)):
            all_url = '' + ret[j]  # base URL omitted here as well
            all_data = request.urlopen(all_url).read().decode('gbk', 'ignore')
            all_pat = '(.*?)'      # the enclosing HTML tags were lost from the original pattern
            all_ret = re.compile(all_pat, re.S).findall(all_data)  # re.S lets . match newlines
            print(all_ret)
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    except Exception as e:
        print(e)

a = One()
b = Two()
a.start()
b.start()
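Splitting the pages into odd and even ranges hard-codes the thread count at two. A more flexible pattern feeds page numbers through a thread-safe queue.Queue, so any number of workers can pull from it. A minimal sketch, assuming the get_data(i) function defined above and an arbitrary worker count of 4:

import threading
from queue import Queue, Empty

q = Queue()
for i in range(1, 30):
    q.put(i)  # one page number per task

def worker():
    while True:
        try:
            page = q.get_nowait()  # raises Empty once the queue is drained
        except Empty:
            break
        get_data(page)  # the crawl function defined above
        q.task_done()

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()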
