Python Crawler Basics

2021-09-29 17:28:06

by:小?

POST requests
Cookies
Session
The BeautifulSoup library
The Scrapy framework

import requests

# the URL was stripped from this copy; substitute the page you want to fetch
response = requests.get("https://www.example.com")
# equivalently: response = requests.request("get", "https://www.example.com")

import requests

# the query dict and headers were stripped from this copy; judging from the
# output below, the original example searched Baidu for "北京" (Beijing)
kw = {"wd": "北京"}
headers = {"User-Agent": "Mozilla/5.0"}

# params accepts a dict (or a string) of query parameters; a dict is
# automatically URL-encoded
response = requests.get("https://www.baidu.com/s?", params=kw, headers=headers)

# view the response body; response.text returns the body decoded to unicode
print(response.text)

# view the response body; response.content returns the raw bytes
print(response.content)

# view the full URL that was requested
print(response.url)

# view the character encoding taken from the response headers
print(response.encoding)

# view the HTTP status code
print(response.status_code)

Sample output (the printed URL lost its host part in this copy; only the query string survives):

?wd=%e5%8c%97%e4%ba%ac
iso-8859-1
200
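The iso-8859-1 shown above is the fallback requests applies when the server's Content-Type header carries no charset, which is why Chinese pages often come out garbled in response.text. A minimal fix, using the library's own apparent_encoding guess:

response.encoding = response.apparent_encoding   # or set it explicitly, e.g. "utf-8"
print(response.text)                             # re-decoded with the corrected encoding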

import requests

# the form data, URL, and headers were stripped from this copy; fill in the
# form fields and endpoint of the site you are posting to
formdata = {}
url = ""
headers = {}

response = requests.post(url, data=formdata, headers=headers)
print(response.text)
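Because the original form data and URL did not survive, here is a self-contained sketch that posts to httpbin.org/post, a public echo service, purely for illustration; the field names are made up:

import requests

formdata = {"kw": "python"}                  # hypothetical form fields
headers = {"User-Agent": "Mozilla/5.0"}

# httpbin echoes the submitted form back as JSON
response = requests.post("https://httpbin.org/post", data=formdata, headers=headers)
print(response.json()["form"])               # {'kw': 'python'}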

import requests

# the proxy dict was stripped from this copy; requests expects the form
# {"http": "http://host:port", "https": "http://host:port"}
# if the proxy requires HTTP Basic Auth, embed the credentials:
# {"http": "http://user:password@host:port"}
proxy = {}

# the target URL was also stripped; example.com stands in for it here
response = requests.post("https://www.example.com", proxies=proxy)
print(response.text)
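The outline at the top also lists cookies and session, but those examples did not survive in this copy. A minimal sketch of both, again using httpbin.org only for illustration:

import requests

# a Session object keeps cookies (and connection pooling) across requests
session = requests.Session()
session.get("https://httpbin.org/cookies/set?name=value")   # server sets a cookie
print(session.cookies.get_dict())                           # {'name': 'value'}

# cookies can also be attached to a single request by hand
response = requests.get("https://httpbin.org/cookies", cookies={"token": "abc"})
print(response.text)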

# the HTML sample lost its tags in this copy; it is the classic "Dormouse"
# document from the BeautifulSoup documentation, reconstructed here
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, "html.parser")

print(soup.prettify())

prettify() prints the same document as a neatly indented HTML tree (output omitted here; the tags did not survive in this copy).

print(soup.title)
# <title>The Dormouse's story</title>

print(soup.title.name)
# u'title'

print(soup.title.string)
# u'The Dormouse's story'

print(soup.title.parent.name)
# u'head'

print("")
print(soup.p)
# <p class="title"><b>The Dormouse's story</b></p>

print(soup.p['class'])
# ['title']

print(soup.p.string)

print("")
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.find_all('a'))
# [<a ...>Elsie</a>, <a ...>Lacie</a>, <a ...>Tillie</a>]

print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

Output of the statements above (the HTML tags were stripped from this copy, so only each tag's text survives):

The Dormouse's story
title
The Dormouse's story
head

The Dormouse's story
['title']
The Dormouse's story

Elsie
[Elsie, Lacie, Tillie]
Tillie
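find_all also accepts attribute filters; a small sketch against the same soup object (class_, attrs, and limit are standard find_all keyword arguments):

# every <a> whose class is "sister"
print(soup.find_all("a", class_="sister"))

# the same filter written as an attrs dict, capped at two results
print(soup.find_all(attrs={"class": "sister"}, limit=2))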

for link in soup.find_all('a'):
    print(link.get('href'))

print(soup.get_text())

Output (the three href URLs printed by the loop were stripped from this copy; soup.get_text() then prints all of the document's text):

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie,
Tillie;
and they lived at the bottom of a well.
...

A BeautifulSoup blog-scraping example
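The example this heading refers to did not survive in this copy; the sketch below shows the usual shape of such a script, assuming a hypothetical blog index page whose post titles are <a> links inside <h2> tags:

import requests
from bs4 import BeautifulSoup

url = "https://example.com/blog"                 # hypothetical blog index page
headers = {"User-Agent": "Mozilla/5.0"}

html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, "html.parser")

# adjust the selector to the real page's structure
for a in soup.select("h2 a"):
    print(a.get_text(strip=True), a.get("href"))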

# To create a Scrapy project, open a terminal, change into the directory where the project should live, and run:
scrapy startproject <project name>

# for example (the spider below imports from tutorial.items, so this project was named tutorial):
scrapy startproject tutorial
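startproject generates a skeleton; for a project named tutorial the layout looks roughly like this (the file names are Scrapy's defaults):

tutorial/
    scrapy.cfg            # deploy configuration
    tutorial/             # the project's Python package
        __init__.py
        items.py          # item definitions (edited below)
        pipelines.py
        settings.py
        spiders/          # spider modules live here
            __init__.py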

import scrapy

class DmozItem(scrapy.Item):
    title = scrapy.Field()   # the name to scrape
    link = scrapy.Field()    # the link to scrape
    desc = scrapy.Field()    # the description to scrape

# for example:
import scrapy
from tutorial.items import DmozItem   # import the item class defined in items.py

class DmozSpider(scrapy.Spider):
    name = "dmoz"                      # the spider's name
    allowed_domains = ["dmoz.org"]     # domains the spider is allowed to crawl
    start_urls = [
        "",                            # starting URLs (stripped from this copy)
        "",
    ]
    # the Chrome extension "XPath Helper" is recommended for working out XPath expressions

    def parse(self, response):
        for sel in response.xpath('//ul/li'):                 # every <li> under a <ul>
            item = DmozItem()                                 # one item per list entry
            item['title'] = sel.xpath('a/text()').extract()   # the <a> tag's text
            item['link'] = sel.xpath('a/@href').extract()     # the <a> tag's href
            item['desc'] = sel.xpath('text()').extract()      # the <li>'s own text
            yield item                                        # hand the item back to Scrapy
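With the item and the spider in place, the crawl is started from the project root; scrapy crawl takes the spider's name, and -o writes the scraped items to a file:

scrapy crawl dmoz -o items.json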

A Scrapy blog-scraping example
