python xpath爬取電影天堂

2021-09-29 10:54:39 字數 3730 閱讀 9567

import requests

from lxml import html

# Site root that relative detail-page links are prefixed with.
# NOTE(review): the value is an empty string in this copy of the file, so
# prefixed URLs stay relative -- fill in the site's scheme and host before
# running the crawl.
# Example of a list-page path: '/html/gndy/dyzz/list_23_1.html'
base_domain = ''

# HTTP headers sent with every request.
# NOTE(review): the original right-hand side is missing from this copy of the
# file; an empty dict keeps the name defined. Add e.g. a User-Agent entry if
# the site requires one.
headers = {}

def spider(first_page=1, last_page=7):
    """Crawl the list pages and print the parsed details of every movie.

    Args:
        first_page: Number of the first list page to visit (inclusive).
        last_page: Number of the last list page to visit (inclusive).
            The defaults reproduce the original hard-coded ``range(1, 8)``.

    Returns:
        A list with one dict per movie, in the order they were scraped.
    """
    # Prefix the list-page path with the site root; with an empty
    # ``base_domain`` this is exactly the original relative template.
    base_url = base_domain + "/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for num in range(first_page, last_page + 1):
        url = base_url.format(num)
        for detail_url in get_detail_urls(url):
            movie = parse_detail_page(detail_url)
            print(movie)
            # Keep the result so callers can use it, not just read stdout.
            movies.append(movie)
    return movies

def get_detail_urls(url):
    """Return the detail-page URLs linked from one list page.

    Args:
        url: URL of a list page.

    Returns:
        A list of strings: every ``href`` found inside a
        ``<table class="tbspan">``, prefixed with ``base_domain``.
    """
    # Decode as GBK; 'ignore' drops any bytes that are not valid GBK.
    resp = requests.get(url, headers=headers).content.decode('gbk', 'ignore')
    etree = html.etree
    # lxml's parser entry point is ``etree.HTML`` (upper case).
    htmlelements = etree.HTML(resp)
    hrefs = htmlelements.xpath('//table[@class="tbspan"]//a/@href')
    # Build a real list (not a lazy ``map``) so the result can be reused,
    # and avoid shadowing the ``url`` parameter.
    return [base_domain + href for href in hrefs]

def parse_info(info, rule):
    """Remove every occurrence of ``rule`` from ``info`` and strip whitespace."""
    return info.replace(rule, "").strip()


# Labels whose value sits on the same line, paired with the output dict key.
# NOTE(review): the labels are Traditional Chinese while the page is decoded
# as GBK -- confirm they match the text the site actually serves.
_SINGLE_LINE_FIELDS = (
    ("◎年  代", "year"),
    ("◎產  地", "country"),
    ("◎類  別", "categary"),
    ("◎豆瓣評分", "douban"),
)


def _collect_following(infos, index, stop_label):
    """Return the stripped, non-empty lines after ``index`` up to ``stop_label``."""
    # Lines are stripped before the comparison, so the stop label must be
    # stripped too; otherwise a label with trailing whitespace never matches
    # a line that consists of the bare label.
    stop = stop_label.strip()
    collected = []
    for raw in infos[index + 1:]:
        line = raw.strip()
        if line.startswith(stop):
            break
        if line:
            collected.append(line)
    return collected


def parse_zoom_infos(infos):
    """Extract the labelled movie fields from the text lines of the zoom div.

    Args:
        infos: Sequence of text strings, one per text node.

    Returns:
        A dict that may contain ``year``, ``country``, ``categary`` and
        ``douban`` (strings) plus ``director``, ``actor`` and ``profile``
        (lists of strings). Only the labels that were found are present.
    """
    movie = {}
    for index, info in enumerate(infos):
        for label, key in _SINGLE_LINE_FIELDS:
            if info.startswith(label):
                movie[key] = parse_info(info, label)
                break
        else:
            if info.startswith("◎導  演"):
                # The first name shares the label line; more names follow on
                # their own lines until the screenwriter label.
                first = parse_info(info, "◎導  演")
                movie["director"] = [first] + _collect_following(
                    infos, index, "◎編  劇")
            elif info.startswith("◎主  演"):
                first = parse_info(info, "◎主  演")
                movie["actor"] = [first] + _collect_following(
                    infos, index, "◎標  籤")
            elif info.startswith("◎簡  介 "):
                # The synopsis text starts on the line after the label.
                movie["profile"] = _collect_following(
                    infos, index, "◎獲獎情況 ")
    return movie


def parse_detail_page(url):
    """Download one movie detail page and return its fields as a dict.

    Args:
        url: URL of the detail page.

    Returns:
        A dict with ``title``, ``cover`` and ``screenshot`` when the page
        provides them, plus every field found by ``parse_zoom_infos``.
        Missing elements are skipped instead of raising ``IndexError``.
    """
    movie = {}
    # Decode as GBK; 'ignore' drops any bytes that are not valid GBK.
    resp = requests.get(url, headers=headers).content.decode('gbk', 'ignore')
    etree = html.etree
    # lxml's parser entry point is ``etree.HTML`` (upper case).
    htmlelements = etree.HTML(resp)

    titles = htmlelements.xpath(
        '//div[@class="title_all"]//font[@color="#07519a"]/text()')
    if titles:
        movie['title'] = titles[0]

    zooms = htmlelements.xpath('//div[@id="zoom"]')
    if not zooms:
        # Without the zoom container there is nothing more to extract.
        return movie
    zoome = zooms[0]

    # The first image is used as the cover, the second as the screenshot.
    imgs = zoome.xpath(".//img/@src")
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]

    movie.update(parse_zoom_infos(zoome.xpath('.//text()')))
    return movie

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()

1. `return movie` 可以改用 `yield` 代替,以提高效率。

2. 為什麼我的背景是黑色的?

爬取電影天堂

分析每頁的url,可以得到規律是 第t頁的url為 於是可以先分析第一頁,然後對頁數進迴圈,就可得到所有最新電影的詳細資訊。from lxml import etree headers defget movie url url resp requests.get url,headers header...

電影爬取案例

本次案例針對的是電影天堂上的電影,爬取所有電影的鏈結,以及每個鏈結對應的詳情頁。整體思路是 封裝兩個 函式,用來獲取url,對url進行解析。這個函式用來獲取每個頁面的url def get detail url url response requests.get url,headers heade...

爬取貓眼電影

有一份工作需要我列出兩個電影院的每天電影排期資訊,我不想每次都要去貓眼上複製貼上。所以做了個爬蟲 功能 能夠知道每天的電影排期資訊 使用限制 只能在當天使用,不能在前一晚上使用,後面我會再考慮修改 coding utf 8 import requests import re from bs4 imp...