Python Crawler Basics

2021-09-29 17:28:06

by:小?

POST requests
Cookies
Session
The BeautifulSoup library
The Scrapy framework

import requests

# the URL was stripped from this copy; substitute the page you want to fetch
response = requests.get("https://www.example.com")
# equivalently: response = requests.request("get", "https://www.example.com")

import requests

# the query dict and headers were stripped from this copy; judging from the
# output below, the original example searched Baidu for "北京" (Beijing)
kw = {"wd": "北京"}
headers = {"User-Agent": "Mozilla/5.0"}

# params accepts a dict (or a string) of query parameters; a dict is
# automatically URL-encoded
response = requests.get("https://www.baidu.com/s?", params=kw, headers=headers)

# view the response body; response.text returns the body decoded to unicode
print(response.text)

# view the response body; response.content returns the raw bytes
print(response.content)

# view the full URL that was requested
print(response.url)

# view the character encoding taken from the response headers
print(response.encoding)

# view the HTTP status code
print(response.status_code)

Sample output (the printed URL lost its host part in this copy; only the query string survives):

?wd=%e5%8c%97%e4%ba%ac
iso-8859-1
200
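The iso-8859-1 shown above is the fallback requests applies when the server's Content-Type header carries no charset, which is why Chinese pages often come out garbled in response.text. A minimal fix, using the library's own apparent_encoding guess:

response.encoding = response.apparent_encoding   # or set it explicitly, e.g. "utf-8"
print(response.text)                             # re-decoded with the corrected encoding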

import requests

# the form data, URL, and headers were stripped from this copy; fill in the
# form fields and endpoint of the site you are posting to
formdata = {}
url = ""
headers = {}

response = requests.post(url, data=formdata, headers=headers)
print(response.text)
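Because the original form data and URL did not survive, here is a self-contained sketch that posts to httpbin.org/post, a public echo service, purely for illustration; the field names are made up:

import requests

formdata = {"kw": "python"}                  # hypothetical form fields
headers = {"User-Agent": "Mozilla/5.0"}

# httpbin echoes the submitted form back as JSON
response = requests.post("https://httpbin.org/post", data=formdata, headers=headers)
print(response.json()["form"])               # {'kw': 'python'}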

import requests

# the proxy dict was stripped from this copy; requests expects the form
# {"http": "http://host:port", "https": "http://host:port"}
# if the proxy requires HTTP Basic Auth, embed the credentials:
# {"http": "http://user:password@host:port"}
proxy = {}

# the target URL was also stripped; example.com stands in for it here
response = requests.post("https://www.example.com", proxies=proxy)
print(response.text)
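The outline at the top also lists cookies and session, but those examples did not survive in this copy. A minimal sketch of both, again using httpbin.org only for illustration:

import requests

# a Session object keeps cookies (and connection pooling) across requests
session = requests.Session()
session.get("https://httpbin.org/cookies/set?name=value")   # server sets a cookie
print(session.cookies.get_dict())                           # {'name': 'value'}

# cookies can also be attached to a single request by hand
response = requests.get("https://httpbin.org/cookies", cookies={"token": "abc"})
print(response.text)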

# the HTML sample lost its tags in this copy; it is the classic "Dormouse"
# document from the BeautifulSoup documentation, reconstructed here
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, "html.parser")

print(soup.prettify())

prettify() prints the same document as a neatly indented HTML tree (output omitted here; the tags did not survive in this copy).

print(soup.title)
# <title>The Dormouse's story</title>

print(soup.title.name)
# u'title'

print(soup.title.string)
# u'The Dormouse's story'

print(soup.title.parent.name)
# u'head'

print("")
print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

print(soup.p['class'])
# ['title']

print(soup.p.string)

print("")
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.find_all('a'))
# [<a ...>Elsie</a>, <a ...>Lacie</a>, <a ...>Tillie</a>]

print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

Output of the statements above (the HTML tags were stripped from this copy, so only each tag's text survives):

The Dormouse's story
title
The Dormouse's story
head

The Dormouse's story
['title']
The Dormouse's story

Elsie
[Elsie, Lacie, Tillie]
Tillie
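find_all also accepts attribute filters; a small sketch against the same soup object (class_, attrs, and limit are standard find_all keyword arguments):

# every <a> whose class is "sister"
print(soup.find_all("a", class_="sister"))

# the same filter written as an attrs dict, capped at two results
print(soup.find_all(attrs={"class": "sister"}, limit=2))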

for link in soup.find_all('a'):
    print(link.get('href'))

print(soup.get_text())

Output (the three href URLs printed by the loop were stripped from this copy; soup.get_text() then prints all of the document's text):

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie,
Tillie;
and they lived at the bottom of a well.
...

A BeautifulSoup blog-scraping example
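The example this heading refers to did not survive in this copy; the sketch below shows the usual shape of such a script, assuming a hypothetical blog index page whose post titles are <a> links inside <h2> tags:

import requests
from bs4 import BeautifulSoup

url = "https://example.com/blog"                 # hypothetical blog index page
headers = {"User-Agent": "Mozilla/5.0"}

html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, "html.parser")

# adjust the selector to the real page's structure
for a in soup.select("h2 a"):
    print(a.get_text(strip=True), a.get("href"))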

# To create a Scrapy project, open a terminal, change into the directory where the project should live, and run:
scrapy startproject <project name>

# for example (the spider below imports from tutorial.items, so this project was named tutorial):
scrapy startproject tutorial
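startproject generates a skeleton; for a project named tutorial the layout looks roughly like this (the file names are Scrapy's defaults):

tutorial/
    scrapy.cfg            # deploy configuration
    tutorial/             # the project's Python package
        __init__.py
        items.py          # item definitions (edited below)
        pipelines.py
        settings.py
        spiders/          # spider modules live here
            __init__.py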

import scrapy

class DmozItem(scrapy.Item):
    title = scrapy.Field()   # the name to scrape
    link = scrapy.Field()    # the link to scrape
    desc = scrapy.Field()    # the description to scrape

# for example:
import scrapy
from tutorial.items import DmozItem   # import the item class defined in items.py

class DmozSpider(scrapy.Spider):
    name = "dmoz"                      # the spider's name
    allowed_domains = ["dmoz.org"]     # domains the spider is allowed to crawl
    start_urls = [
        "",                            # starting URLs (stripped from this copy)
        "",
    ]
    # the Chrome extension "XPath Helper" is recommended for working out XPath expressions

    def parse(self, response):
        for sel in response.xpath('//ul/li'):                 # every <li> under a <ul>
            item = DmozItem()                                 # one item per list entry
            item['title'] = sel.xpath('a/text()').extract()   # the <a> tag's text
            item['link'] = sel.xpath('a/@href').extract()     # the <a> tag's href
            item['desc'] = sel.xpath('text()').extract()      # the <li>'s own text
            yield item                                        # hand the item back to Scrapy
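With the item and the spider in place, the crawl is started from the project root; scrapy crawl takes the spider's name, and -o writes the scraped items to a file:

scrapy crawl dmoz -o items.json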

A Scrapy blog-scraping example
