Python爬蟲入門（四）re庫及某寶例項

import re
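# Basic usage of the re library.
# Patterns are written as raw strings (r'...'), in which backslashes are not
# treated as escape characters.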
def test_regex():
    """Demonstrate the six most commonly used functions of the ``re`` module.

    Each result is printed to stdout; the function returns nothing.
    """
    # NOTE(review): the page shows the pattern as r'[1-9]\d', which matches
    # only two digits. The sample data (100081) and the ':zipcode'
    # replacement point to a six-digit postal code, so the ``{5}`` quantifier
    # -- presumably lost when the page was extracted -- is restored here.
    zip_code = r'[1-9]\d{5}'

    # search: return the first match found anywhere in the string.
    match = re.search(zip_code, 'bit 100081 100081')
    if match:
        print(match.group(0))

    # match: succeed only if the pattern matches at the start of the string.
    match = re.match(zip_code, '100081 bit 100081')
    if match:
        print(match.group(0))

    # findall: return every matching substring as a list.
    ls = re.findall(zip_code, 'bit100081 tsu100084')
    print(ls)

    # split: cut the string at every match and return the remaining pieces.
    ls = re.split(zip_code, 'bit100081 tsu100084')
    print(ls)
    # maxsplit=1 stops after the first cut; the rest is kept as one piece.
    ls = re.split(zip_code, 'bit100081 tsu100084', maxsplit=1)
    print(ls)

    # finditer: iterate lazily over the matches; every item is a match
    # object, so no truthiness check is needed inside the loop.
    for m in re.finditer(zip_code, 'bit100081 tsu100084'):
        print(m.group(0))

    # sub: replace every match and return the resulting string.
    t = re.sub(zip_code, ':zipcode', 'bit100081 tsu100084')
    print(t)

    # A pattern can also be compiled once into a reusable pattern object:
    # regex = re.compile(zip_code)
# test_regex()
def test_match_object():
    """Print the main attributes and methods of a match object.

    Runs a single ``re.search`` over a fixed sample string and prints, in
    order: ``string``, ``re``, ``pos``, ``endpos``, ``group(0)``,
    ``start()``, ``end()`` and ``span()``. Returns nothing.
    """
    # NOTE(review): the page shows r'[1-9]\d'; the ``{5}`` quantifier is
    # restored on the assumption that it was lost in extraction (the sample
    # data are six-digit postal codes) -- confirm.
    m = re.search(r'[1-9]\d{5}', 'bit100081 tsu100084')

    # search() reports one match only: the first one in the string.
    print(m.string)    # the text that was searched
    print(m.re)        # the pattern object that produced the match
    print(m.pos)       # index where the search started
    print(m.endpos)    # index where the search was allowed to end
    print(m.group(0))  # the matched text
    print(m.start())   # index of the first matched character
    print(m.end())     # index just past the last matched character
    print(m.span())    # (start, end) as a tuple
# test_match_object()
def greed_and_min_match():
    """Contrast greedy and minimal (non-greedy) matching on one string.

    Prints the greedy match first, then the minimal one. Returns nothing.
    """
    # Quantifiers are greedy by default: '.*' takes as much as it can, so
    # the match runs to the last 'n' in the string.
    m = re.search(r'py.*n', 'pynabncdenfn')
    print(m.group())

    # Appending '?' makes the quantifier minimal: the match stops at the
    # first 'n' that completes the pattern.
    m = re.search(r'py.*?n', 'pynabncdenfn')
    print(m.group(0))


if __name__ == '__main__':
    greed_and_min_match()

import requests
import re
# **搜尋定向爬蟲
# 學會正規表示式
# cookies和ip問題
# NOTE(review): the base URL was blank in the source (start_url = '' + goods).
# The title of the article and the "view_price"/"raw_title" fields suggest the
# Taobao search page -- confirm before relying on it.
SEARCH_URL = 'https://s.taobao.com/search?q='

# Offset step added to the '&s=' query parameter for each successive page.
PAGE_STEP = 44


def find_html(url, kv, cookies):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        kv: Mapping of HTTP request headers.
        cookies: Mapping of cookie names to values.

    Returns:
        The response text, or ``""`` if the request fails or the server
        answers with an error status.
    """
    try:
        r = requests.get(url, cookies=cookies, headers=kv, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Best effort: a failed page is treated as an empty one.
        return ""


def _unquote(token):
    """Return the string value of a double-quoted token such as '"12.00"'."""
    import ast

    try:
        # literal_eval accepts literals only, unlike eval(), which would
        # execute arbitrary expressions found in the scraped page.
        return ast.literal_eval(token)
    except (ValueError, SyntaxError):
        # Malformed escape sequence: fall back to dropping the quotes.
        return token.strip('"')


def parse_page(ilt, html):
    """Extract ``[price, title]`` pairs from ``html`` and append them to ``ilt``.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product;
            it is modified in place.
        html: Page source containing ``"view_price":"..."`` and
            ``"raw_title":"..."`` fields.

    Prices and titles are paired by position. If ``html`` is not text,
    ``"****"`` is printed and ``ilt`` is left unchanged.
    """
    try:
        # "view_price":"<digits and dots>" -- the group keeps the quoted value.
        prices = re.findall(r'"view_price":("[\d.]*")', html)
        # "raw_title":"<shortest run of characters>".
        titles = re.findall(r'"raw_title":(".*?")', html)
    except TypeError:
        print("****")
        return

    # The quoted value is captured directly instead of splitting the match
    # on ':', which would truncate any title that itself contains a colon.
    for price, title in zip(prices, titles):
        ilt.append([_unquote(price), _unquote(title)])


def print_good_list(ilt):
    """Print a numbered, tab-separated table of ``[price, title]`` entries.

    Args:
        ilt: Sequence of ``[price, title]`` pairs, as filled by ``parse_page``.
    """
    # NOTE(review): the source template was "\t\t" with its placeholders
    # stripped; three plain fields are used here because the original field
    # widths are unknown. The price header was masked as "**" in the source.
    tplt = "{}\t{}\t{}"
    print(tplt.format("序號", "價格", "商品名稱"))
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))


def _parse_cookies(raw):
    """Convert a 'name=value; name=value' cookie string into a dict."""
    cookies = {}
    for fragment in raw.split(';'):
        # Each fragment has the form name=value; only the first '=' splits.
        name, sep, value = fragment.strip().partition('=')
        if sep:
            cookies[name] = value
        # Fragments without '=' (e.g. the empty placeholder) are skipped
        # rather than raising ValueError.
    return cookies


def main():
    """Search for one product, scrape ``depth`` result pages and print them."""
    goods = '鉛筆'
    depth = 3
    start_url = SEARCH_URL + goods
    infolist = []

    coo = ''  # Paste the cookie string of your own logged-in session here.
    cookies = _parse_cookies(coo)

    # NOTE(review): the header dict was blank in the source; a browser-like
    # user agent is assumed here -- confirm.
    kv = {'user-agent': 'Mozilla/5.0'}

    for i in range(depth):
        try:
            url = start_url + '&s=' + str(PAGE_STEP * i)
            html = find_html(url, kv, cookies)
            if i == 0:
                # Dump the first page so a login wall or block is visible.
                print(html)
            parse_page(infolist, html)
        except Exception as exc:
            # One bad page must not stop the remaining pages.
            print("page {} skipped: {}".format(i, exc))
            continue
    print_good_list(infolist)


if __name__ == '__main__':
    main()

python爬蟲 re庫（正則）

1. re.match：re.match 嘗試從字串的起始位置匹配一個模式，如果不是在起始位置匹配成功，就會返回 None。語法：re.match(pattern, string, flags=0)。 2. 最常規的匹配：import re content hello 123 4567 world this is a...

爬蟲之 re庫

a 表示正則的規則，b 表示字串。re.match 從字串開頭開始匹配，若開頭就匹配失敗，則返回 None：result = re.match(a, b)，再用 result.group() 取得結果。若 a 的規則中有用小括號圈起來的東西，可以按順序由 result.group(1)、result.group(2) 等匹配得到。掃描整個字串，返回第一個成...

Python爬蟲入門四urllib庫的高階用法

有些網站不會同意程式直接用上面的方式進行訪問，如果識別有問題，那麼站點根本不會響應，所以為了完全模擬瀏覽器的工作，我們需要設定一些 headers 的屬性。首先，開啟我們的瀏覽器，除錯瀏覽器 f12，開啟network，點登入之後，我們會發現登陸之後介面都變化了，出現一個新的介面，實質上這個頁面包含了...

Python爬蟲入門（四）re庫及某寶例項

python爬蟲 re庫（正則）

爬蟲 之 re庫

Python爬蟲入門四urllib庫的高階用法

相關推薦

爬蟲之 re庫