爬蟲 爬取今日頭條街拍2

2021-08-24 20:15:28 字數 2125 閱讀 2660

import re,json

import requests

from urllib import request

import os

def b(url):
    """Fetch one article page and pull out its embedded gallery payload.

    The page is downloaded with ``requests``, its HTML is searched for the
    ``gallery: json.parse(...)`` assignment, and the ``load`` directory is
    created so that later download steps have somewhere to write.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The raw text captured between the parentheses of the gallery
        assignment, or ``None`` when the page contains no such assignment.
        The captured text is not decoded here.
    """
    # NOTE(review): the header mapping has no value in the source this was
    # recovered from; an empty dict keeps the request valid. Presumably a
    # browser-like User-Agent belongs here -- confirm against the site.
    headers = {}

    response = requests.get(url, headers=headers)
    html_str = response.text

    # NOTE(review): the pattern is kept exactly as found; pages may spell the
    # call as ``JSON.parse`` -- confirm the letter case against a live page.
    pattern = r'gallery: json\.parse\((.*)\),'
    match_res = re.search(pattern, html_str)

    # Create the download folder; exist_ok avoids the check-then-create race.
    os.makedirs('load', exist_ok=True)

    if match_res:
        # group(1) is already a str; it still has to be JSON-decoded by
        # whoever consumes it.
        json_origin = match_res.group(1)
        return json_origin

    # NOTE(review): indentation was lost in the source, so attaching this
    # message to the "no gallery found" path is an assumption -- confirm.
    print('你寫錯了, 不應該來我這')
    return None

def a(offset):
    """Walk the search-result pages and hand every article URL to ``b``.

    One result page is requested for ``offset``. While ``offset / 20`` is
    below 4, each item that carries an ``article_url`` is printed and passed
    to ``b``, then the function calls itself with the offset advanced by 20.

    Args:
        offset: Paging offset substituted into the request URL template.
    """
    # NOTE(review): the URL template is empty in the source this was
    # recovered from. It needs the real endpoint with a ``{}`` placeholder
    # for the offset; requests cannot fetch an empty URL.
    url = ''
    a_url = url.format(offset)

    response = requests.get(a_url)
    # response.json() gives the decoded object (a dict) directly.
    html_json_dict = response.json()
    # The list of result items lives under the 'data' key.
    data_list = html_json_dict['data']

    num = offset / 20
    if num < 4:
        offset += 20
        # Only items that expose an article_url are followed.
        for data_item in data_list:
            if 'article_url' in data_item:
                article_url = data_item['article_url']
                print(article_url)
                b(article_url)
        # The recursive step sits inside the page limit so the crawl stops.
        a(offset)

if __name__ == '__main__':
    # Start the crawl at offset 0.
    a(0)

爬取今日頭條Ajax請求

搜尋頭條,可以得到這個:search?keyword=%E8%A1%97%E6%8B%8D。開發者工具檢視,我們在搜尋中並沒有發現上面的文字,那麼我們可以初步判定,這個由 ajax 載入,然後渲染出來的。此時切換到 xhr 過濾,可以看到確實是 ajax 請求。觀察...

今日頭條站長平台 頭條搜尋爬蟲spider介紹

頭條搜尋的爬蟲 UA 為 Bytespider(首字母為大寫)。例如:Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0....

python2 spider 今日頭條

requests json 直接上 usr bin python coding utf 8 import requests import json url wbdata requests.get url text data json.loads wbdata news data data pc fe...