python爬蟲 爬取勵志語錄

2021-10-01 18:54:23 字數 3187 閱讀 7963

將爬取到的網頁寫入檔案中

import urllib.request

# NOTE(review): stray "))" left over from the blog-platform extraction — removed

爬取勵志語錄
import urllib.request

import urllib.parse

import re

import os

import time

# NOTE(review): dead code — an earlier, unrefactored version of the scraper,
# kept disabled inside a triple-quoted string. It is also mangled by the blog
# extraction (empty `header =`, lowercase `urllib.request.request`, stripped
# regex tags) and is not runnable as-is; the functions below supersede it.
'''start_page = int(input("請輸入開始頁面:"))

end_page = int(input("請輸入結束頁面:"))

for page in range(start_page ,end_page+1):

url = ''+str(page)+'.html'

header =

request = urllib.request.request(url=url,headers= header)

response = urllib.request.urlopen(request)

content = response.read().decode('utf8')

with open('f1.html', 'w', encoding='utf8') as fp:

fp.write(content)

pattern = re.compile(r'(.*?).*?',re.s)

ret = pattern.findall(content)

#print(ret)

for text_info in ret:

text_title = text_info[0]

text_main = text_info[1]

dirname = 'lizhi'

if not os.path.exists(dirname):

os.mkdir(dirname)

filename = str(text_title).split('——')[0]

filepath = os.path.join(dirname, filename)

with open(filepath+ '.txt', 'w',encoding = 'utf8') as fp:

fp.write(text_main)

'''# Encapsulate the above process into functions

def handle_request(url, page):
    """Build a urllib Request for page number *page* of the quote site.

    Appends ``str(page) + '.html'`` to *url* and attaches a browser-like
    User-Agent header so the site does not reject the crawler.

    Args:
        url: Base URL prefix; the page number and '.html' are appended.
        page: Page number (int) to fetch.

    Returns:
        urllib.request.Request ready to be passed to urlopen().
    """
    url += str(page) + '.html'
    # NOTE(review): the original header dict was stripped when the post was
    # published (`header =` with no value); a standard desktop User-Agent is
    # restored here — confirm against the target site's requirements.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36',
    }
    # Bug fix: the class is urllib.request.Request (capital R);
    # urllib.request.request raises AttributeError.
    request = urllib.request.Request(url=url, headers=header)
    return request

def parse_content(request):
    """Download one page, snapshot it to f1.html, extract quotes, save them.

    Args:
        request: urllib.request.Request produced by handle_request().

    Side effects:
        Overwrites 'f1.html' with the raw page, then delegates to writein()
        to persist each extracted (title, body) pair under ./lizhi.
    """
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf8')
    # Keep a snapshot of the last fetched page for debugging.
    with open('f1.html', 'w', encoding='utf8') as fp:
        fp.write(content)
    # Bug fix: the DOTALL flag is re.S (uppercase); re.s does not exist.
    # NOTE(review): the HTML tags inside the pattern were stripped by the
    # blog platform, so r'(.*?).*?' is incomplete — writein() expects each
    # match to be a (title, body) 2-tuple, i.e. the pattern needs TWO capture
    # groups wrapped in the site's markup. Restore the tags before running.
    pattern = re.compile(r'(.*?).*?', re.S)
    ret = pattern.findall(content)
    writein(ret)

def writein(ret):
    """Write each extracted quote to its own .txt file under ./lizhi.

    Args:
        ret: Iterable of (title, body) pairs, as produced by re.findall
             with two capture groups.

    Side effects:
        Creates the 'lizhi' directory if missing and writes one
        '<title>.txt' file per entry; progress is printed to stdout.
    """
    dirname = 'lizhi'  # hoisted: loop-invariant output directory name
    for text_info in ret:
        text_title = text_info[0]
        text_main = text_info[1]
        # Create the output directory lazily on first need.
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # NOTE(review): the disabled legacy version truncated the title at
        # '——' (author separator); this version keeps the full title, matching
        # the original function's behavior.
        filename = str(text_title)
        filepath = os.path.join(dirname, filename)
        print("%s start writing......" % filename)
        with open(filepath + '.txt', 'w', encoding='utf8') as fp:
            fp.write(text_main)
        print("%s write successfully" % filename)
        # time.sleep()  # optional per-file throttle, disabled in the original

def main():
    """Interactively download a range of quote pages.

    Prompts for a start and end page number, then fetches, parses, and
    saves each page in turn, sleeping one second between requests.
    """
    # NOTE(review): the base URL literal was stripped when the post was
    # published — fill in the quote site's listing-page URL prefix (the part
    # before '<page>.html') before running.
    url = ''
    start_page = int(input("請輸入開始頁面:"))
    end_page = int(input("請輸入結束頁面:"))
    for page in range(start_page, end_page + 1):
        print("start downloading %s頁......" % page)
        request = handle_request(url, page)
        parse_content(request)
        print("第%s頁 end download" % page)
        # Be polite to the server: at most one request per second.
        time.sleep(1)


if __name__ == '__main__':
    main()

Python 爬蟲爬取網頁

工具 python 2.7 import urllib import urllib2 defgetpage url 爬取網頁的方法 request urllib.request url 訪問網頁 reponse urllib2.urlopen request 返回網頁 return response...

python爬蟲爬取策略

在爬蟲系統中,待抓取url佇列是很重要的一部分。待抓取url佇列中的url以什麼樣的順序排列也是一個很重要的問題,因為這涉及到先抓取那個頁面,後抓取哪個頁面。而決定這些url排列順序的方法,叫做抓取策略。下面重點介紹幾種常見的抓取策略 一 深度優先遍歷策略 深度優先遍歷策略是指網路爬蟲會從起始頁開始...

python爬蟲 seebug爬取

1.找相關的標籤一步一步往下查詢 2.有cookie才能查詢 3.用import re而不用from re import 是為了防止衝突 coding utf 8 from requests import import re from bs4 import beautifulsoup as bs h...