Scraping Baidu News (爬取百度資訊)

2021-10-01 08:18:23 · 3,403 characters · 5,746 views

Building the URL is just a matter of urlencoding the keyword. Cleaning the scraped content means filtering out carriage returns, spaces, and other assorted junk before writing out the result. The code is below:
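Before the full script, a minimal sketch of those two ideas in isolation; the keyword and the sample string here are invented for illustration:

from urllib import parse
import re

# urlencode the keyword into a query string
query = parse.urlencode({'word': '華山'})    # -> 'word=%E8%8F%AF%E5%B1%B1'

# filter out newlines, non-breaking spaces and similar junk
raw = '新華網\n\xa0 2021-09-30\n'
clean = re.sub(r"\n|\xa0", '', raw).strip()  # -> '新華網 2021-09-30'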

import re
import time
from urllib import parse

import requests
import pandas as pd
from bs4 import BeautifulSoup

def html_decode(url):
    # NOTE: the header and attrs dict literals were stripped from the original
    # post, so the User-Agent string and the result-div class below are
    # reconstructed assumptions, not the author's exact values.
    agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0 Safari/537.36')
    headers = {'User-Agent': agent}
    time.sleep(5)  # throttle requests so Baidu does not block the crawler
    html = requests.get(url, headers=headers)
    content = html.content
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    content_list = []
    author_list = []
    time_list = []
    for mulu in soup.find_all('div', attrs={'class': 'result'}):
        # get subject
        subject = mulu.h3.a.get_text()
        author_time = mulu.div.p.get_text()
        # drop newlines and non-breaking spaces; the second alternative in the
        # original regex rendered as a bare space and was most likely '\xa0'
        text1 = re.sub(r"\n|\xa0", '', subject)
        text2 = re.sub(r"\n|\xa0", '', author_time).strip()
        author = text2.split(' ')[0]
        pub_time = text2.split(' ')[1]  # renamed so the time module is not shadowed
        content_list.append(text1)
        author_list.append(author)
        time_list.append(pub_time)
    return content_list, author_list, time_list
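# For reference, a worked example of the split above. The markup shape is an
# assumption (Baidu result blocks looked roughly like
# <div class="result"><h3><a>標題</a></h3><div><p>來源 時間</p></div></div>),
# so after cleanup:
#   text2 = '新華網 2021-09-30'
#   text2.split(' ')[0]  -> '新華網'       (author / source)
#   text2.split(' ')[1]  -> '2021-09-30'  (publish time)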

def get_context(keyword, page):
    all_content = []
    all_author = []
    all_time = []
    parameter_page = page * 10
    if isinstance(keyword, str):
        # The search root and query keys were also stripped from the post;
        # these values are assumptions modelled on Baidu's news search.
        main_url = r'https://www.baidu.com/'
        parameter = {
            'tn': 'news',
            'word': keyword,
            'pn': parameter_page,
            # must stay the last key: the first-page URL is cut at '&inputt'
            'inputt': int(time.time() * 1000),
        }
    else:
        print('url wrong!')
        return all_content, all_author, all_time
    # get first page
    url_data = parse.urlencode(parameter)
    first_page_parameter = str(url_data).split('&inputt')[0]
    first_page = parse.urljoin(main_url, 's?' + str(first_page_parameter))
    first_item, first_author_list, first_time_list = html_decode(first_page)
    all_content.extend(first_item)
    all_author.extend(first_author_list)
    all_time.extend(first_time_list)
    # get other pages
    for num in range(1, page + 1):
        print('this is page %d!' % num)
        parameter['pn'] = num * 10
        url_data = parse.urlencode(parameter)
        all_url = parse.urljoin(main_url, 's?' + str(url_data))
        # print(all_url)
        other_items, other_author_list, other_time_list = html_decode(all_url)
        all_content.extend(other_items)
        all_author.extend(other_author_list)
        all_time.extend(other_time_list)
    return all_content, all_author, all_time

if __name__ == '__main__':
    date = time.strftime('%m-%d', time.localtime())
    key_word = '華山'
    page_num = 5
    all_content, all_author, all_time = get_context(key_word, page_num)
    # The DataFrame columns were stripped from the post; assembling it from
    # the three result lists is the natural reconstruction.
    result = pd.DataFrame({'content': all_content,
                           'author': all_author,
                           'time': all_time})
    result.to_excel('result_%s.xlsx' % date)
    print('%d records in total!' % len(all_content))
    print('Scraping finished!')
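Two notes on the script above. First, Baidu's pn parameter is a result offset rather than a page index, so page num starts at result num * 10; here is a quick sanity check of the URLs the loop builds, using the same assumed host and query keys as in get_context:

from urllib import parse

main_url = r'https://www.baidu.com/'   # assumed host, as in the script
parameter = {'tn': 'news', 'word': '華山', 'pn': 0}
for num in range(1, 4):
    parameter['pn'] = num * 10
    print(parse.urljoin(main_url, 's?' + parse.urlencode(parameter)))
# https://www.baidu.com/s?tn=news&word=%E8%8F%AF%E5%B1%B1&pn=10
# https://www.baidu.com/s?tn=news&word=%E8%8F%AF%E5%B1%B1&pn=20
# https://www.baidu.com/s?tn=news&word=%E8%8F%AF%E5%B1%B1&pn=30

Second, result.to_excel() needs an Excel engine such as openpyxl installed; without one, pandas raises ImportError when writing the .xlsx file.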

Referenced blog posts:

Web crawler: scraping Baidu news (網頁爬蟲 爬取百度諮詢新聞)


Scraping Baidu (Youdao) Translate (爬取百度(有道)翻譯)


Scraping Baidu Tieba (爬取百度貼吧)
