python爬取豆瓣影評

2021-08-30 08:51:42 字數 2143 閱讀 6384

參考別人的程式碼，爬取某部影片的影評；因為沒有模擬登入，只能爬取前 6 頁。

# -*- encoding:utf-8 -*-

import io
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

# Use a session to keep the login information across requests

s = requests.session()

def get_ip_list(url, headers):
    """Fetch a proxy-listing page and return its entries as "ip:port" strings.

    The proxies are used to rotate the outgoing IP so the scraper is less
    likely to be banned.

    Args:
        url: Address of the HTML page that lists proxies in a table.
        headers: HTTP headers sent with the request.

    Returns:
        A list of "ip:port" strings, one per usable table row. Empty when the
        page contains no data rows.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            a timeout).
    """
    # A timeout keeps the scraper from hanging forever on a dead proxy site.
    web_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')
    rows = soup.find_all('tr')

    ip_list = []
    # The first <tr> is skipped, as in the original loop that started at
    # index 1 (presumably the table header row).
    for row in rows[1:]:
        tds = row.find_all('td')
        # NOTE(review): the statement that filled ip_list was lost from the
        # published snippet. Columns 1 and 2 are assumed to hold the IP and
        # the port; confirm against the actual proxy page layout.
        if len(tds) < 3:
            continue
        ip_list.append(
            tds[1].get_text(strip=True) + ':' + tds[2].get_text(strip=True)
        )
    return ip_list

def get_random_ip(ip_list):
    """Pick one proxy at random and return it as a requests ``proxies`` dict.

    Args:
        ip_list: Iterable of "ip:port" strings.

    Returns:
        A dict of the form ``{'http': 'http://ip:port'}``, or ``None`` when
        ``ip_list`` is empty (``proxies=None`` makes requests connect
        directly instead of crashing the scraper).
    """
    # NOTE(review): the 'http://' prefix and the {'http': ...} mapping were
    # reconstructed; the literals were stripped from the published snippet.
    proxy_list = ['http://' + ip for ip in ip_list]
    if not proxy_list:
        return None
    proxy_ip = random.choice(proxy_list)
    return {'http': proxy_ip}

def get_data(html):
    """Extract the comment nodes and the next-page link from a comments page.

    Args:
        html: Raw HTML (bytes or str) of one comments page.

    Returns:
        A tuple ``(comment_list, next_page)``. ``comment_list`` holds the
        elements matched by the CSS selector ``.comment > p``. ``next_page``
        is the ``href`` of the first element with class ``next``, or ``None``
        when the page has no such element.
    """
    soup = BeautifulSoup(html, "lxml")
    comment_list = soup.select('.comment > p')

    # Indexing [0] on an empty result raised IndexError when no "next" link
    # was present; report the absence explicitly instead.
    next_links = soup.select('.next')
    next_page = next_links[0].get('href') if next_links else None
    return comment_list, next_page

if __name__ == "__main__":
    # Without a simulated login only the first 6 pages can be scraped.
    MAX_PAGES = 6
    # Switch to another proxy after this many pages.
    PROXY_ROTATE_EVERY = 20

    # TODO: the URL literals are empty in the published snippet; set
    # `absolute` to the base URL of the comments listing and `url` to the
    # proxy-listing page before running.
    absolute = ''
    # TODO: the original header dict was lost; add at least a User-Agent.
    headers = {}

    # Fetch the pool of proxy addresses.
    url = ''
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)

    current_page = absolute
    comment_list = []

    for page_no in range(1, MAX_PAGES + 1):
        print("爬取第" + str(page_no) + "頁")
        # Throttle requests so the site is less likely to block us.
        time.sleep(5)
        html = s.get(
            current_page, headers=headers, proxies=proxies, timeout=10
        ).content
        temp_list, next_page = get_data(html)
        comment_list.extend(temp_list)

        # Stop early when the page offers no link to a following page.
        if not next_page:
            break
        current_page = absolute + next_page

        if page_no % PROXY_ROTATE_EVERY == 0:
            proxies = get_random_ip(ip_list)
        print(current_page)

    # utf-8 is given explicitly so the comments do not depend on the
    # platform's default encoding; the with-block closes the file itself.
    with open("f:\\comments.txt", 'a', encoding='utf-8') as f:
        for written, node in enumerate(comment_list, start=1):
            comment = node.get_text().strip().replace("\n", "")
            f.write(comment + "\n")
            print("寫了" + str(written) + "個")

nodejs爬取豆瓣影評

爬取豆瓣心靈奇旅影評,包括使用者主頁頭像 let request require request let fs require fs const path require path var startnum 0 起始爬取位置 傳送請求 function reqdata url else 請求處理 a...

豆瓣影評爬取 中國機長

10月大火的中國機長相信大家都看過了吧 悄悄的說,我還有二刷?超級超級超級超級喜歡袁泉姐姐,溫柔又堅定,真誠又勇敢!import requests from bs4 import beautifulsoup import time import pandas as pd import os def ...

python爬蟲實戰 爬取豆瓣影評資料

爬取豆瓣影評資料步驟 1 獲取網頁請求 2 解析獲取的網頁 3 提速資料 4 儲存檔案 1 匯入需要的庫 import urllib.request from bs4 import beautifulsoup 隨機數的庫 import random 時間庫 import time 庫 import ...