python3 爬蟲新手筆記（三）資訊類網頁爬取

# start_requests: returns a one-element list containing a single request whose
# callback is self.parse_article.
# NOTE(review): the first positional argument of request(...) is missing from
# this extract (presumably the start URL -- TODO restore from the original
# script), and `request` is presumably Scrapy's `Request` with its case lost
# in extraction -- confirm. The line breaks below are extraction damage, not
# valid Python.
def
start_requests
(self)
:return
[request(
, callback=self.parse_article)
]# The first page of the list contains one article; crawl that article as well.
# first_page: generator callback that yields, in order,
#   1. an item holding the response body and URL,
#   2. one request (callback self.parse_article) per article link found in the
#      listing blocks,
#   3. one request (callback self.article_list) for the pagination entry that
#      follows the one whose class is 'current'.
# NOTE(review): `item` is not created in the visible lines (the '....' runs
# mark code elided from this extract); `none`/`request` are presumably
# `None`/`Request` with their case lost in extraction -- confirm. The line
# breaks are extraction damage, not valid Python.
deffirst_page
(self, response):)
# Take the page's HTML text and URL from the response.
item[
'html'
]= response.body
item[
'url'
]= response.url
....
..#print(item)
yield item
# Follow the link of every article block on the listing page.
blocks = response.xpath(
'.//div[@class="block-article bdaiafadein"]'
)for block in blocks:
article_href = block.xpath(
'article/header/h2[@class="entry-title"]/a/@href'
).extract_first(
)if article_href is
notnone
:yield request(article_href, callback=self.parse_article)
# Pagination: count children up to and including the one whose class is
# 'current'; childs[child_count] is then the entry right after it.
# NOTE(review): if response.xpath(...) returns a (possibly empty) list rather
# than None, the `is not None` check below is always true -- confirm.
# NOTE(review): when no child has class 'current', or it is the last child,
# child_count == len(childs) and childs[child_count] raises IndexError.
div_pagination = response.xpath(
'.//div[@class="bdaia-pagination"]'
)if div_pagination is
notnone
: childs = div_pagination.xpath(
'*')
child_count =0;
for child in childs:
child_class = child.xpath(
'@class'
).extract_first(
) child_count +=
1if child_class==
'current'
:break
next_page_href = childs[child_count]
.xpath(
'@href'
).extract_first(
)print
(next_page_href)
if next_page_href is
notnone
:yield request(next_page_href, callback=self.article_list)
# article_list: callback taking (self, response); it is the callback passed to
# the next-page request built in first_page.
# NOTE(review): the whole body is elided ('....') in this extract, so its
# behavior cannot be documented from here.
defarticle_list
(self, response):.
....
.# Crawl an individual article
# parse_article: generator callback that stores the response body and URL in
# `item`, gathers the article body found under div.bdaia-post-content into
# content_list via pronest(), stores that list as item['content'], and yields
# the item.
# NOTE(review): `item` is not created in the visible lines (the '.....' runs
# mark elided code) and `content_list =` has lost its right-hand side
# (presumably an empty list -- TODO confirm). `none` is presumably `None`.
# The line breaks are extraction damage, not valid Python.
defparse_article
(self, response):)
item[
'html'
]= response.body
item[
'url'
]= response.url
.....
. post_content = response.xpath(
'.//div[@class="bdaia-post-content"]'
)if post_content is
notnone
: content_list =
# The first <div> child is handled on its own, before the loop below.
first_div = post_content.xpath(
'div[1]'
)if first_div is
notnone
: pronest(content_list, first_div)
# '*' selects all element children of this node.
# The loop starts at index 2, so the first two element children are skipped.
childs = post_content.xpath(
'*')
for child in childs[2:
]:# Process each child with the recursive helper
pronest(content_list, child)
""" print("attrs: ")
attrs = child.xpath('.//@href | .//@src')
for attr in attrs:
print(attr)
"""item[
'content'
]= content_list
#print("content:", content_list)..
....
yield item
# Recursive helper used to flatten an article body into a list of strings.
def pronest(text, selector):
    """Recursively collect the content found under *selector* into *text*.

    Walks the direct child nodes of *selector* (XPath ``node()``, which unlike
    ``*`` also returns plain-text nodes) and appends to *text* in document
    order:

    * plain-text nodes: their extracted string, when non-empty;
    * ``img`` / ``iframe``: the ``@src`` value followed by the node's first
      text child, if any;
    * ``a`` / ``link``: the ``@href`` value followed by the node's first text
      child, if any;
    * any other element: all of its descendant text when it contains no
      ``a``/``link``/``img``/``iframe`` descendant, otherwise the element is
      processed recursively.

    Args:
        text: list that is extended in place with the collected strings.
        selector: object exposing ``xpath(query)``; the results must offer
            ``extract()`` / ``extract_first()`` (presumably a Scrapy
            selector -- confirm against the caller).

    Returns:
        None. The result is delivered through *text*.

    NOTE(review): the earlier version built the text / img / iframe / a / link
    strings but never stored them, so only the "other element" branch ever
    contributed to *text*; they are appended here on the assumption that this
    was the intent. It also concatenated ``extract_first()`` directly, which
    raises TypeError when the attribute is absent (``None``). The URL used to
    be wrapped in two string literals that are empty in the available source
    (their content may have been lost) and are therefore omitted.
    """
    # Tag name -> attribute that carries the element's URL.
    url_attr_by_tag = {
        "img": "@src",
        "iframe": "@src",
        "a": "@href",
        "link": "@href",
    }

    for child in selector.xpath("node()"):
        # 'name()' gives the node's tag name; no value comes back for a
        # plain-text node, which is how text is recognised here.
        child_name = child.xpath("name()").extract_first()

        if child_name is None:
            text_content = child.extract()
            if text_content:
                text.append(text_content)
            continue

        url_attr = url_attr_by_tag.get(child_name)
        if url_attr is not None:
            # A missing attribute or text child yields None; treat it as "".
            url = child.xpath(url_attr).extract_first() or ""
            label = child.xpath("text()").extract_first() or ""
            if url or label:
                text.append(url + label)
            continue

        # Other elements may still contain img / iframe / a / link deeper down.
        links = child.xpath(".//a | .//link")
        srcs = child.xpath(".//img | .//iframe")
        if len(links) == 0 and len(srcs) == 0:
            # Nothing special inside: take all descendant text in one go.
            text.extend(child.xpath(".//text()").extract())
        else:
            pronest(text, child)

python3爬蟲筆記

爬蟲是請求並提取資料的自動化程式，基本流程為：發起請求、獲取響應內容、解析文字內容、儲存資料。1. 瀏覽器傳送訊息給該網址所在的伺服器，這個過程叫做 HTTP request。2. 伺服器收到瀏覽器傳送的訊息後，能夠根據訊息的內容做相應的處理，然後把訊息回傳給瀏覽器，這個過程叫做 HTTP response。...

python3爬蟲學習筆記

爬蟲爬取京東某手機頁面 beautifulsoup 原文記錄內容太多現進行摘錄和分類 pip3 install jieba kou ubuntu python cat clahamlet.py usr bin env python coding utf 8 e10.1calhamlet.py def...

Python3爬蟲學習筆記1 0 什麼是爬蟲？

我們來思考一個問題：什麼是爬蟲？說一下我對爬蟲的理解。理解爬蟲之前，我們先思考一下網路是一種什麼樣的存在。爬蟲就是游離在這些網路之間的一個自動化程式，能夠完成對網路的瀏覽，自動採集網路中所有訪問到的內容，從而在網路中得到你需要的資訊。（參見維基百科「網路蜘蛛／爬蟲」條目。）順便說一句，Google 是世界上最大...

python3 爬蟲新手筆記（三） 資訊類網頁爬取

python3爬蟲筆記

python3爬蟲學習筆記

Python3爬蟲學習筆記1 0 什麼是爬蟲？

相關推薦

python3 爬蟲新手筆記（三）資訊類網頁爬取