Python 多程序爬取豆瓣TOP250

2021-08-15 19:33:49 字數 2793 閱讀 2076

import multiprocessing
import os
import queue
import time

import requests
from bs4 import BeautifulSoup

# Process 1: build the paginated listing URLs and put them on the shared queue.
class geturl(multiprocessing.Process):
    """Producer process that enqueues one listing-page URL per page.

    Args:
        urlqueue: Queue-like object with a ``put`` method; receives the URLs.
        count: Offset of the first entry to request (the ``start`` query
            value of the first URL).
        url: Base listing URL that the query string is appended to.
        total: Number of entries in the listing; offsets at or beyond it
            are not requested.
        step: Number of entries per page, i.e. the offset increment.

    Raises:
        ValueError: If ``step`` is not positive (the loop could not end).
    """

    def __init__(self, urlqueue, count, url, total=250, step=25):
        super().__init__()
        if step <= 0:
            raise ValueError('step must be a positive number of entries')
        self.urlqueue = urlqueue
        self.url = url
        self.count = count
        self.total = total
        self.step = step

    def run(self):
        """Enqueue the URLs for offsets ``count, count + step, ...`` below ``total``."""
        # The upper bound is exclusive: with 250 entries at 25 per page the
        # last page starts at offset 225, and offset 250 is past the end.
        while 0 <= self.count < self.total:
            page_url = '{}?start={}&filter='.format(self.url, self.count)
            self.urlqueue.put(page_url)
            self.count += self.step

# Process 2: fetch each queued page, extract its entries and append them to a text file.
class getcontent(multiprocessing.Process):
    """Consumer process: download listing pages and append their entries to a file.

    Args:
        urlqueue: Queue-like object whose ``get(timeout=...)`` yields page
            URLs and raises ``queue.Empty`` when the timeout expires.
        output_path: Text file the extracted entries are appended to (UTF-8).
        idle_timeout: Seconds to wait for the next URL before the process
            stops. ``None`` waits forever.
        delay: Seconds to pause after each page.
    """

    DEFAULT_OUTPUT = 'c:\\users\\dell\\desktop\\douban.txt'
    # NOTE(review): the 'user-agent' value is absent from the text this code
    # was recovered from; 'Mozilla/5.0' is a placeholder -- replace it with
    # the intended browser string.
    HEADERS = {'referer': '', 'user-agent': 'Mozilla/5.0'}
    REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the worker

    def __init__(self, urlqueue, output_path=DEFAULT_OUTPUT, idle_timeout=60,
                 delay=1):
        super().__init__()
        self.urlqueue = urlqueue
        self.output_path = output_path
        self.idle_timeout = idle_timeout
        self.delay = delay

    def run(self):
        """Process queued URLs until none arrives within ``idle_timeout`` seconds."""
        while True:
            try:
                url = self.urlqueue.get(timeout=self.idle_timeout)
            except queue.Empty:
                # No producer activity for idle_timeout seconds: stop instead
                # of blocking forever on an exhausted queue.
                return

            html = self._fetch(url)
            if html is not None:
                soup = BeautifulSoup(html, 'html.parser')
                records = [self._extract_fields(item)
                           for item in soup.select('.info')]
                if records:
                    self._append_records(records)
            time.sleep(self.delay)

    def _fetch(self, url):
        """Return the page body for ``url``, or ``None`` if the request failed."""
        try:
            res = requests.get(url, headers=self.HEADERS,
                               timeout=self.REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as err:
            # One bad page should not kill the worker; report it and move on.
            print('request failed for {}: {}'.format(url, err))
            return None
        return res.text

    @staticmethod
    def _extract_fields(item):
        """Return ``[title, people, details, score, votes, quote]`` for one entry.

        ``item`` is one '.info' node. Any field whose selector matches
        nothing (or too few nodes) is returned as an empty string, so a
        missing field never raises and never reuses a previous entry's value.
        """
        def text_at(nodes, index):
            return nodes[index].text if len(nodes) > index else ''

        heading = item.select('.hd')
        # Collapse all whitespace inside the heading text.
        title = ''.join(heading[0].text.split()) if heading else ''

        people = details = ''
        paragraphs = item.select('.bd p')
        if paragraphs:
            # The first and third children of the paragraph are read as text
            # -- presumably the two lines around a <br>; verify against the page.
            parts = paragraphs[0].contents
            if len(parts) > 0:
                people = parts[0].strip()
            if len(parts) > 2:
                details = parts[2].strip()

        score = text_at(item.select('.bd .star .rating_num'), 0)
        votes = text_at(item.select('.bd .star span'), 3)
        quote = text_at(item.select('.bd .quote .inq'), 0)
        return [title, people, details, score, votes, quote]

    def _append_records(self, records):
        """Append each record to ``output_path``, one field per line."""
        with open(self.output_path, 'a', encoding='utf-8') as file:
            for record in records:
                file.write('\n'.join(record))
                # NOTE(review): one newline ends the last field and two blank
                # lines separate records; the original loop boundaries were
                # lost in extraction, so this is a best reading -- confirm.
                file.write('\n\n\n')

# Process 3: watch the URL queue and report once it has drained.
class contrl(multiprocessing.Process):
    """Monitor process that periodically checks whether the URL queue is empty.

    Only the queue is inspected: an empty queue means every URL has been
    taken off it, not necessarily that the consumer has finished with the
    last page.

    Args:
        urlqueue: Queue-like object with an ``empty()`` method.
        interval: Seconds to sleep between checks.
    """

    def __init__(self, urlqueue, interval=60):
        super().__init__()
        self.urlqueue = urlqueue
        self.interval = interval

    def run(self):
        """Print a heartbeat every ``interval`` seconds until the queue is empty."""
        while True:
            print("程式執行中")
            time.sleep(self.interval)
            if self.urlqueue.empty():
                print("程式執行完畢!")
                # Returning from run() ends this process; the interactive
                # exit() helper is not needed for that.
                return

if __name__ == '__main__':
    # NOTE(review): the base URL literal was empty in the recovered source,
    # which makes every request URL schemeless. The value below is the
    # Douban Top 250 listing implied by the article title -- confirm.
    url = 'https://movie.douban.com/top250'
    count = 0

    # Must be multiprocessing.Queue: it is the queue type that can be shared
    # between the processes started below.
    urlqueue = multiprocessing.Queue()

    # Producer, consumer and monitor, started in that order.
    t1 = geturl(urlqueue, count, url)
    t1.start()
    t2 = getcontent(urlqueue)
    t2.start()
    t3 = contrl(urlqueue)
    t3.start()

    # Wait for the children explicitly so the script's lifetime is obvious.
    for worker in (t1, t2, t3):
        worker.join()

注意:這裡不能使用 queue 模組的 Queue(它只能在同一程序內的執行緒之間共享),而要使用 multiprocessing 模組的 Queue 類別(multiprocessing.Queue),不然會報錯。

Python多程序爬取豆瓣Top250資料

以前電腦死活跑不了多程序,重灌了一下系統,居然啥都解決了,於是乎就跑了一下 usr bin env python encoding utf 8 file jianshu.py.py time 2019 9 1 20 42 author sound of silence import requests...

python爬取豆瓣影評

看的別人的 爬取某部影片的影評 沒有模擬登入只能爬6頁 encoding utf 8 import requests from bs4 import beautifulsoup import re import random import io import sys import time 使用se...

python爬取資料豆瓣讀書

xpath爬取指令碼 from urllib import request from lxml import etree base url response request.urlopen base url html response.read decode utf 8 htmls etree.ht...