Python 之lxml解析庫

2022-06-08 13:15:08 字數 3429 閱讀 8165

一、xpath常用規則

二、解析html檔案

from lxml import

etree

# Read an HTML file and parse it.
def parse_html_file():
    """Parse ``./test.html`` with lxml's HTML parser and print the result.

    The parsed tree is serialized with ``etree.tostring`` and the resulting
    bytes are decoded as UTF-8 before printing.
    """
    html = etree.parse("./test.html", parser=etree.HTMLParser())
    print(etree.tostring(html).decode("utf-8"))

'''

'''#

# Parse HTML text and read text nodes.
def get_text_node(text):
    """Print the text of the second ``<li>`` under each ``<ul>`` in *text*.

    Args:
        text: HTML markup as a string.

    The two queries are equivalent: in XPath, ``li[2]`` is shorthand for
    ``li[position()=2]``.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//ul/li[position()=2]/text()"))  # ['你好!!!']
    print(html.xpath("//ul/li[2]/text()"))  # ['你好!!!']

# Get all nodes.
def get_all_node(text):
    """Print every element node found in the HTML string *text*.

    Args:
        text: HTML markup as a string.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    # The source shows the output as "[, , , , , , , , ]"; the element reprs
    # appear to have been stripped, leaving only the separators.
    print(html.xpath("//*"))

# Get child nodes.
def get_children_node(text):
    """Print the ``<a>`` elements reached via the path ``div/ul/li/a``.

    Args:
        text: HTML markup as a string.

    Each ``/`` step selects direct children only, so an ``<a>`` nested more
    deeply inside an ``<li>`` is not matched.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//div/ul/li/a"))

# Get parent nodes.
def get_parent_node(text):
    """Print the parent element of every ``<a>`` element in *text*.

    Args:
        text: HTML markup as a string.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    # ".." steps from each matched <a> up to its parent. The source shows the
    # output as "[, ]" (element reprs apparently stripped).
    print(html.xpath("//a/.."))

# Attribute matching.
def math_attr(text):
    """Print the text of every ``<a>`` whose ``href`` is exactly ``2.html``.

    Args:
        text: HTML markup as a string.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//a[@href='2.html']/text()"))  # ['hello world']

# Attribute retrieval.
def get_attr(text):
    """Print the ``href`` attribute value of every ``<a>`` element in *text*.

    Args:
        text: HTML markup as a string.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//a/@href"))  # ['1.html', '2.html']

# Matching an attribute that holds multiple values.
def match_more_attr(text):
    """Print link text inside ``<li>`` elements whose class contains ``aaa``.

    Args:
        text: HTML markup as a string.

    ``contains()`` is a substring test on the whole ``class`` attribute, so it
    matches ``class="aaa bbb"`` but would also match a class such as ``aaab``.
    """
    html = etree.HTML(text, parser=etree.HTMLParser())
    print(html.xpath("//li[contains(@class, 'aaa')]/a/text()"))  # ['yangs']

if __name__ == '__main__':
    # NOTE(review): the sample markup inside this literal appears to have been
    # stripped from the source; only the surrounding newlines remain. Supply
    # the HTML to test with before passing `text` to the demo functions.
    text = '''

'''

三、去哪兒網html抓取案例

import

requests

from lxml import

etree

def go_where(keyword):
    """Fetch the search page for *keyword* and extract its sight entries.

    Args:
        keyword: Search term appended to the request URL.

    Returns:
        A list with one dict per ``<div class="sight_item">`` element, holding
        the keys ``name``, ``districts``, ``point``, ``address`` and
        ``img_url`` read from that element's ``data-*`` attributes (``None``
        where an attribute is absent). Returns ``None`` after printing the
        error when the request or the parsing fails.
    """
    # NOTE(review): the base URL literal is empty in the source (it appears to
    # have been stripped); fill in the real search endpoint before use.
    url = "" + keyword
    # NOTE(review): the original headers value is missing from the source; an
    # empty mapping stands in for it.
    headers = {}

    try:
        # A timeout keeps an unresponsive server from blocking forever.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.content.decode("utf-8")
    except (requests.RequestException, UnicodeDecodeError) as e:
        # Stop here: without a page there is nothing to parse.
        print(e)
        return None

    try:
        html_object = etree.HTML(html, parser=etree.HTMLParser())
    except (etree.LxmlError, ValueError) as e:
        print(e)
        return None
    if html_object is None:
        # Nothing parseable in the response body.
        return []

    # Read the attributes from each item element directly, so the fields of
    # one entry can never be paired with those of another and the document is
    # queried only once.
    return [
        {
            "name": item.get("data-sight-name"),
            "districts": item.get("data-districts"),
            "point": item.get("data-point"),
            "address": item.get("data-address"),
            "img_url": item.get("data-sight-img-u-r-l"),
        }
        for item in html_object.xpath("//div[@class='sight_item']")
    ]


if __name__ == '__main__':
    data = go_where("溫州")
    # The source shows the output as "[, , , , , , , , , , ]"; the reprs of the
    # result entries appear to have been stripped.
    print(data)

有對我的案例進行優化的,可以發給我。

Python 之lxml解析模組

lxml 是一個 HTML/XML 的解析器,主要的功能是解析和提取 HTML/XML 資料。一、lxml 示例 1、初步使用:使用 lxml 的 etree 庫(from lxml import etree);利用 etree.HTML,將字串解析為 HTML 文件:html = etree.HTML(text) ...

python的lxml庫簡介 lxml庫

lxml 是一個 HTML/XML 的解析器,主要的功能是解析和提取 HTML/XML 資料。lxml 和正則一樣,也是用 C 實現的,是一款高效能的 Python HTML/XML 解析器,我們可以利用之前學習的 XPath 語法,來快速的定位特定元素以及節點資訊。需要安裝 C 語言庫,可使用 pip 安...

python爬蟲網頁解析之lxml模組

Windows 系統下的安裝:方法一:pip3 install lxml;方法二:pip3 install <lxml-4.2.1-cp36-cp36m-win_amd64.whl 檔案所在的路徑>。Linux 下安裝:方法一:pip3 install lxml;方法二:yum install -y epel-rel...