Scraping Qiushibaike (Text Posts)


The spider below crawls the text section of Qiushibaike page by page, follows the link of every post on each listing page, and appends each post's text to a local file, using a record file to skip posts it has already saved. The original post left the URL template, the request headers, and the record filename blank; the values below are clearly marked placeholders.

import requests
from lxml import etree


class qiubaispider(object):
    def __init__(self):
        # URL template for the listing pages (blank in the original post;
        # the usual Qiushibaike text-section template is assumed here)
        self.url_temp = 'https://www.qiushibaike.com/text/page/{}/'
        # Request headers (also blank in the original; a generic User-Agent
        # is assumed so the site does not reject the request outright)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        # Record file used for de-duplication (the filename was stripped
        # from the original post; this name is a placeholder)
        self.record_file = 'records.txt'

    def get_url_list(self, page_num):
        # Build one URL per page; range(1, page_num) stops before page_num
        return [self.url_temp.format(i) for i in range(1, page_num)]
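    # A quick check of what get_url_list produces, assuming the placeholder
    # template above: page_num=4 yields pages 1 to 3, which is why run()
    # below passes page_num + 1.
    #
    #     qiubaispider().get_url_list(4)
    #     # ['https://www.qiushibaike.com/text/page/1/',
    #     #  'https://www.qiushibaike.com/text/page/2/',
    #     #  'https://www.qiushibaike.com/text/page/3/']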

    def paser_one_url(self, url1):
        # Request a listing page; decode the body to str
        response = requests.get(url1, headers=self.headers)
        return response.content.decode()

    def paser_two_url(self, url2):
        # Request a post's detail page; the raw bytes are fine here because
        # etree.HTML() accepts bytes as well as str
        response = requests.get(url2, headers=self.headers)
        return response.content

    def get_page_url(self, html_str):
        num = 1
        # The lxml constructor is etree.HTML (etree.html does not exist)
        html = etree.HTML(html_str)
        # Collect the link of every post on the listing page
        self.div_list = html.xpath('//div[@id="content-left"]/div/a[1]/@href')
        # Visit each post in turn
        for i in self.div_list:
            # Load the links already saved; on the first run the record
            # file does not exist yet, which the original code ignored
            try:
                with open(self.record_file, 'r', encoding='utf-8') as f:
                    a = f.readlines()
            except FileNotFoundError:
                a = []
            if i + '\n' not in a:
                # Build the full post URL (the site root was stripped from
                # the original post; this prefix is assumed)
                url2 = 'https://www.qiushibaike.com' + i
                url2_html = self.paser_two_url(url2)
                html2 = etree.HTML(url2_html)
                data_list = html2.xpath('//div[@id="single-next-link"]/div[@class="content"]/text()')
                data = ''.join(data_list)
                print('-' * 1000)
                print('%s.' % num)
                print(data)
                # Save the post text
                with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                    f.write(str(num) + '.')
                    f.write(data)
                    f.write('\n\n\n')
                num += 1
                print('saved successfully')
                print('-' * 1000)
                # Record the link so the post is not saved twice
                with open(self.record_file, 'a', encoding='utf-8') as f:
                    f.write(i)
                    f.write('\n')
            else:
                # Already recorded: skip (the original printed an empty line
                # and then called f.close(), which "with" already handles)
                print()
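    # Design note: "i + '\n' not in a" rescans the whole record list for
    # every post, and the record file is re-read on each iteration. Building
    # a set once per page would make the membership test O(1); this is a
    # sketch, not what the original author wrote:
    #
    #     with open(self.record_file, 'a+', encoding='utf-8') as f:
    #         f.seek(0)
    #         seen = {line.strip() for line in f}
    #     if i not in seen:
    #         ...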

    def run(self):
        # 1. Build the list of listing-page URLs
        page_num = int(input('Enter the number of pages to scrape: '))
        # +1 because range() in get_url_list excludes the end value
        url_list = self.get_url_list(page_num + 1)
        print(url_list)
        # 2. Request each listing page and extract its posts
        num = 1
        for url1 in url_list:
            print('page %s' % num)
            html_str = self.paser_one_url(url1)
            self.get_page_url(html_str)
            # The original post is truncated here, mid-statement, at
            # "with open('糗事百科.txt'"; advancing the page counter is the
            # assumed remainder of the loop body
            num += 1


if __name__ == '__main__':
    # Assumed entry point; the original post breaks off before this line
    qiubaispider().run()
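The two XPath selectors are the fragile part of this spider: if Qiushibaike changes its markup, they silently return empty lists. A minimal offline sketch (the HTML snippet is made up for illustration) shows how to verify the listing-page selector against saved markup without hitting the site:

from lxml import etree

# Hypothetical, simplified listing-page markup for testing the selector
sample = '''
<div id="content-left">
  <div><a href="/article/125000001">post 1</a></div>
  <div><a href="/article/125000002">post 2</a></div>
</div>
'''

html = etree.HTML(sample)
links = html.xpath('//div[@id="content-left"]/div/a[1]/@href')
print(links)  # ['/article/125000001', '/article/125000002']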
