Scraping content from 笑話大全 (Joke Collection)

# Required fields: at least the joke category, joke source, joke title, joke content, and joke URL

import re, time, random

import requests
import pymysql
from lxml import etree

# Request headers were stripped when this post was published; fill in your own.
headers = {}

# Get the joke-category URLs: '//div[@class="filter-links clearfix"]/a/@href'
new_url = ''  # base URL elided in the original post
response = requests.get(new_url, headers=headers).text
html2 = etree.HTML(response)  # note: lxml's HTML parser is etree.HTML, not etree.html
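
The contents of headers were stripped out along with every URL when this post was published. For a crawl like this the dict usually only needs a browser User-Agent; a minimal sketch (the UA string is a hypothetical stand-in, not from the original):

headers = {
    # Present as a desktop browser so the site serves the normal HTML
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}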

# Grab every joke category: link hrefs and display names
fl_urls = html2.xpath('//div[@class="filter-links clearfix"]/a/@href')
xhfl = html2.xpath('//div[@class="filter-links clearfix"]/a/text()')
xhfl = xhfl[1:]  # drop the first entry
# print(len(xhfl), xhfl)

# Build the absolute URL of every category
fl_urls_list = []
for fl_url in fl_urls:
    fl_urls_list.append('' + fl_url)  # base URL elided in the original post
fenlei_urls = fl_urls_list[1:]
# print(len(fenlei_urls), fenlei_urls)
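
Because the platform also removed the base address from every string concatenation, the joins above survive only as '' + href. Once the site address is restored, urllib.parse.urljoin from the standard library is a sturdier way to build absolute links than plain concatenation; a small sketch, with BASE standing in for the elided address:

from urllib.parse import urljoin

BASE = ''  # stand-in for the site address elided above
fenlei_urls = [urljoin(BASE, href) for href in fl_urls][1:]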

# Walk through every joke category
for w in range(len(fenlei_urls)):
    print('Now crawling category: %s' % xhfl[w])
    try:
        # url = '/lengxiaohua/'
        r = requests.get(fenlei_urls[w], headers=headers).text
        # print(r)
        html = etree.HTML(r)
        # print(html)
        # Detail-page link of every joke on the category page
        urls = html.xpath('//ul[@class="article-list"]/li/div[3]/a/@href')
        # alternative: '//ul[@class="article-list"]/li/span[2]/a/@href'
        url_list1 = []
        for url in urls:
            url_list1.append('' + url)  # base URL elided in the original post
        print(len(url_list1), url_list1)

        # The joke count on the first page doubles as the page count here;
        # listing-page URLs follow the pattern <category URL><n>.html
        for n in range(len(url_list1)):
            try:
                print('Crawling page {}'.format(n + 1))
                # first slot: the category URL; second slot: the page number
                url1 = '{}{}.html'.format(fenlei_urls[w], n + 1)
                r = requests.get(url1, headers=headers).text
                # print(r)
                html = etree.HTML(r)
                # Joke titles
                biaoti = html.xpath('//ul[@class="article-list"]/li/span/a/text()')
                print(len(biaoti), biaoti)
                # Source of each joke
                laiyuan = html.xpath('//ul[@class="article-list"]/li/div[@class="article-source"]/span[2]//text() | //ul[@class="article-list"]/li/div[@class="article-source"]/a//text()')
                print(len(laiyuan), laiyuan)
                # Detail links on this listing page
                urls = html.xpath('//ul[@class="article-list"]/li/div[3]/a/@href')
                url_list1 = []
                for url in urls:
                    url_list1.append('' + url)  # base URL elided in the original post
                print(len(url_list1), url_list1)
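
The script imports time and random at the top but never calls them, which suggests a crawl delay was planned. A one-line politeness pause would slot into either request loop; the 1-3 second interval below is my assumption, not something the original specifies:

time.sleep(random.uniform(1, 3))  # pause between fetches so the site is not hammered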

                # ----- Fetch the joke text behind every link on this page
                xhnr_list = []
                # info_list = []
                for j in range(len(url_list1)):
                    r1 = requests.get(url_list1[j], headers=headers).text
                    html1 = etree.HTML(r1)
                    xhnr1 = html1.xpath('//div[@class = "article-text"]//text()')
                    # print(xhnr1)
                    # Collapse tabs and surrounding whitespace in each text fragment
                    xhnr2 = [' '.join([i.strip() for i in xh.strip().split('\t')]) for xh in xhnr1]
                    # print(xhnr2)
                    # Keep only the fragments that are not empty
                    xhnr = [i for i in xhnr2 if len(str(i)) != 0]
                    # print(xhnr)
                    xhnr_list.append(xhnr)  # the append call was lost in the original post
                print(xhnr_list)
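
The row-by-row INSERT further down indexes biaoti, laiyuan, xhnr_list and url_list1 in parallel, which silently assumes all four lists came back the same length; if any XPath misses a node, the indexes drift or raise IndexError. A small defensive variant (my addition, not in the original) zips the lists so that ragged rows are dropped rather than misaligned:

rows = [(str(xhfl[w]), str(ly), str(bt), ' '.join(nr), u)
        for bt, ly, nr, u in zip(biaoti, laiyuan, xhnr_list, url_list1)]

cursor.executemany(insert_sql, rows) would then write the whole page in one call.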

                # Connect to the database with pymysql.connect
                # (as written, the script reconnects for every listing page)
                conn = pymysql.connect(host="localhost", user='root',
                                       password='123456', db='51job',
                                       charset='utf8')
                # After connecting, get a cursor via conn.cursor()
                cursor = conn.cursor()
                # Insert row by row
                insert_sql = "insert into xhdq (xhfl,laiyuan,biaoti,xhnr,xhurl) values (%s,%s,%s,%s,%s)"
                for i in range(0, len(url_list1)):
                    cursor.execute(insert_sql, (str(xhfl[w]),
                                                str(laiyuan[i]),
                                                str(biaoti[i]),
                                                str(xhnr_list[i]),
                                                str(url_list1[i])))
                conn.commit()
                cursor.close()
                conn.close()

            except Exception as e:
                print('The request raised an error: %s' % e)
    except Exception as e:
        print('The request raised an error: %s' % e)
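
The INSERT assumes a table named xhdq already exists in the 51job database. The post never shows its schema, so the sketch below is only a guess that matches the five inserted fields; the column types and sizes are my assumption:

# One-off setup; hypothetical schema matching the INSERT above
cursor.execute("""
    CREATE TABLE IF NOT EXISTS xhdq (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        xhfl    VARCHAR(64),   -- joke category
        laiyuan VARCHAR(255),  -- joke source
        biaoti  VARCHAR(255),  -- joke title
        xhnr    TEXT,          -- joke content
        xhurl   VARCHAR(255)   -- detail-page URL
    ) DEFAULT CHARSET = utf8
""")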
