Crawler (5): Scraping Lagou (拉勾網) Job Data


import json
import math
import urllib
from urllib import parse, request

# Request headers. The header values in the original post were stripped by the
# page; a typical browser User-Agent and a Referer are assumed here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.lagou.com/jobs/list_python',
}

# Get the number of result pages for a keyword
def getpagenum(kw):
    # JSON endpoint. The URL was stripped from the original post; Lagou's
    # commonly used position-search endpoint is assumed here.
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # form data (assumed standard fields: first search, page 1, keyword)
    data = {'first': 'true', 'pn': 1, 'kd': kw}
    # URL-encode the form body
    data = urllib.parse.urlencode(data).encode('utf-8')
    # build the request; supplying a body makes this a POST request
    req = urllib.request.Request(url, data=data, headers=headers)
    # read the response
    response = urllib.request.urlopen(req).read().decode('utf-8')
    # parse the JSON
    data = json.loads(response)
    # total number of positions
    jobnum = data['content']['positionResult']['totalCount']
    print(jobnum)
    # positions per page
    pagesize = data['content']['pageSize']
    print(pagesize)
    # number of result pages
    totalpage = math.ceil(jobnum / pagesize)
    print(totalpage)
    return int(totalpage)
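For reference, the dictionary lookups in getpagenum match a response whose simplified shape is sketched below. The structure and field names are assumed from the keys the code reads (Lagou's camelCase JSON); the values are only illustrative.

# Assumed, simplified shape of the JSON response; only the fields the code reads are shown.
sample_response = {
    'content': {
        'pageSize': 15,                  # jobs returned per page
        'positionResult': {
            'totalCount': 450,           # total number of matching jobs
            'result': [                  # one dict per job posting (see getjobinfo below)
                {'city': '北京', 'companyFullName': '某某科技有限公司', 'education': '本科'},
            ],
        },
    },
}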

# Get the job details for every result page
def getjobinfo(kw, pagenum):
    # same JSON endpoint as in getpagenum (assumed; stripped from the original post)
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    for i in range(1, pagenum + 1):
        # form data for page i (assumed standard fields)
        data = {'first': 'false', 'pn': i, 'kd': kw}
        data = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(url, data=data, headers=headers)  # POST request
        response = urllib.request.urlopen(req).read().decode('utf-8')
        data = json.loads(response)
        joblist = data['content']['positionResult']['result']
        # print(joblist)
        for job in joblist:
            city = job['city']
            companyfullname = job['companyFullName']
            companylabellist = job['companyLabelList']
            companyshortname = job['companyShortName']
            companysize = job['companySize']
            district = job['district']
            education = job['education']
            firsttype = job['firstType']
            hitags = job['hitags']
            positionadvantage = job['positionAdvantage']
            positionlables = job['positionLables']
            print(city, companyfullname, companylabellist, companysize, district, education,
                  firsttype, hitags, positionadvantage, positionlables)
            # append the scraped record to pythonjob.txt
            with open('pythonjob.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((city, companyfullname, companylabellist, companysize, district,
                             education, firsttype, hitags, positionadvantage, positionlables)) + '\n')
                # flush the write buffer
                f.flush()
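Each line written to pythonjob.txt is the repr of a Python tuple, so the file can be parsed back later. A minimal readback sketch (assuming the file produced by getjobinfo above):

import ast

# Parse each saved line back into a tuple
with open('pythonjob.txt', encoding='utf-8') as f:
    records = [ast.literal_eval(line) for line in f if line.strip()]
print(len(records), 'records loaded')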

if __name__ == '__main__':
    totalpage = getpagenum('python')
    getjobinfo('python', totalpage)
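If the endpoint replies with messages like「您操作太頻繁,請稍後訪問」instead of job data, Lagou is rejecting the anonymous POST. A common workaround (a sketch only, not part of the original script, and the search-page URL is an assumption) is to visit the HTML search page first inside a requests.Session so that its cookies are sent along with the JSON POST:

import requests

def fetch_page(kw, pn):
    # Assumed URLs: the HTML search page and the JSON endpoint used above
    list_url = 'https://www.lagou.com/jobs/list_' + kw
    json_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    s = requests.Session()
    s.headers.update({'User-Agent': 'Mozilla/5.0', 'Referer': list_url})
    s.get(list_url)  # fetch the search page first to pick up the anti-crawl cookies
    resp = s.post(json_url, data={'first': 'true', 'pn': pn, 'kd': kw})
    return resp.json()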
