Python爬取糗事百科

2022-06-14 20:18:11 字數 3403 閱讀 6220

1.用requests+beautifulsoup抓取糗事百科的文字內容;

2.將抓取的內容寫入txt。

1.獲取網頁原始碼

def get_html(url):
    """Download a page with requests and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within the timeout.
    """
    # requests waits indefinitely when no timeout is given; bound the wait
    # so one stalled server cannot hang the whole crawl.
    html = requests.get(url, timeout=10).text
    return html

2.檢視原始碼結構找到要抓取的目標

3.找到這幾樣就可以寫抓取程式碼,如下

# Parse the downloaded page (``html``) and walk every post it contains.
# NOTE(review): the page this listing was copied from lowercased identifiers
# (e.g. ``BeautifulSoup``), so the CSS class names below may originally have
# been camelCase (``articleGender``, ``womenIcon``, ``manIcon``) -- verify
# against the real markup, since class matching is case-sensitive.
soup = BeautifulSoup(html, 'lxml')
datas = soup.find(id="content-left")  # tag that wraps all posts
data_list = datas.find_all(class_="article")
for data in data_list:
    contents = data.find(class_="content").text.replace('\n', '')  # post text
    name = data.find('h2').text.replace('\n', '')  # author nickname
    age_gender = data.find(class_="articlegender")  # gender/age badge, may be absent
    if age_gender is not None:
        cll = age_gender['class']
        if 'womenicon' in cll:
            gender = '女'
        elif 'manicon' in cll:
            gender = '男'
        else:
            gender = ''
        age = age_gender.string
    else:
        # No badge on this post: leave both fields empty.
        gender = ''
        age = ''
    votes = data.find(class_="stats-vote").find(class_="number").text  # like count
    comments = data.find(class_="stats-comments").find(class_="number").text  # comment count

4.全部程式碼如下

import requests
from bs4 import BeautifulSoup

def get_html(url):
    """Download a page with requests and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within the timeout.
    """
    # requests waits indefinitely when no timeout is given; bound the wait
    # so one stalled server cannot hang the whole crawl.
    html = requests.get(url, timeout=10).text
    return html

def get_data(html):
    """Yield one record per post found in a listing page.

    Args:
        html: Markup of a listing page, as returned by ``get_html``.

    Yields:
        A dict with the keys ``name``, ``gender``, ``age``, ``contents``,
        ``votes`` and ``comments``. All values are strings taken from the
        page; ``gender`` and ``age`` are empty when the post carries no
        gender/age badge.

    NOTE(review): the dict literal was missing from the published listing
    (only ``dict =`` survived), so the key names here are reconstructed from
    the local variable names -- adjust if a specific output format is needed.
    NOTE(review): the published listing lowercased identifiers, so the CSS
    class names may originally have been camelCase (``articleGender``,
    ``womenIcon``, ``manIcon``) -- verify against the real markup, since
    class matching is case-sensitive.
    """
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find(id="content-left")  # tag that wraps all posts
    if container is None:
        # Page without the expected container (error page, layout change):
        # yield nothing instead of failing with AttributeError on None.
        return

    for data in container.find_all(class_="article"):
        contents = data.find(class_="content").text.replace('\n', '')  # post text
        name = data.find('h2').text.replace('\n', '')  # author nickname

        age_gender = data.find(class_="articlegender")  # gender/age badge, may be absent
        if age_gender is not None:
            cll = age_gender['class']
            if 'womenicon' in cll:
                gender = '女'
            elif 'manicon' in cll:
                gender = '男'
            else:
                gender = ''
            age = age_gender.string
        else:
            gender = ''
            age = ''

        votes = data.find(class_="stats-vote").find(class_="number").text  # like count
        comments = data.find(class_="stats-comments").find(class_="number").text  # comment count

        # Named ``record`` rather than ``dict`` to avoid shadowing the builtin.
        record = {
            'name': name,
            'gender': gender,
            'age': age,
            'contents': contents,
            'votes': votes,
            'comments': comments,
        }
        yield record

def get_txt(dict):
    """Append every item of an iterable to ``糗事百科.txt``, one per line.

    Args:
        dict: Iterable of records to write (``main`` passes the generator
            returned by ``get_data``). Each item is written as ``str(item)``
            followed by a newline. The parameter name shadows the builtin but
            is kept so existing callers keep working.
    """
    print('--' + '正在寫入......')
    # Append mode so that successive pages accumulate in the same file.
    with open('糗事百科.txt', 'a', encoding='utf-8') as f:
        f.writelines(str(item) + '\n' for item in dict)
    print('---' + '寫入完畢')

def main():
    """Crawl listing pages 1 through 19 and append their posts to the text file.

    For each page number the URL is built, the page is downloaded with
    ``get_html``, parsed with ``get_data`` and written out with ``get_txt``.
    """
    for i in range(1, 20):
        # Use %-interpolation; passing ``i`` as a second print argument would
        # output the literal "%d" followed by the number.
        print('正在爬取第%d頁' % i)
        # NOTE(review): the URL template was blank in the published listing.
        # This is the site's text-channel pagination pattern as best
        # reconstructed -- confirm before running.
        url = 'https://www.qiushibaike.com/text/page/{}/'.format(i)
        html = get_html(url)
        records = get_data(html)  # lazy generator of per-post records
        get_txt(records)

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

5.謝謝觀看

多執行緒爬取糗事百科

import queue import threading from fake useragent import useragent import time import requests from requests.exceptions import requestexception from l...

python 爬取糗事百科

step 1 構建一個提取糗事百科笑話的函式 import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...

Python爬取糗事百科

一 引入模組 因為urlopen功能比較簡單,所以設定 ip需引入proxyhandler和build opener模組,ip的獲取可以上西祠 查詢 import re from urllib.request import request,build opener,proxyhandler base...