python爬取智聯招聘資訊

2021-08-20 02:16:02 字數 3456 閱讀 1578

import csv
import random
import re
from time import sleep

import requests
from tqdm import tqdm

import user_agents

def get_page(city, keyword, page):
    """Fetch one page of Zhilian search results as raw HTML.

    :param city: city name to search in (e.g. '成都')
    :param keyword: job keyword to search for (e.g. 'python')
    :param page: zero-based page index
    :return: the response body as text, or None on any failure
    """
    # Query parameters for the search request.
    # NOTE(review): the original parameter dict was lost when this listing
    # was scraped; these are the names the old zhaopin search URL used —
    # confirm against a saved request.
    paras = {
        'jl': city,      # job location
        'kw': keyword,   # keyword
        'p': page,       # page number
    }
    # Full search URL (the old sou.zhaopin.com endpoint).
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx'
    # Request headers: a User-Agent is needed or the site rejects the call.
    # NOTE(review): the original likely drew from the imported user_agents
    # module; restore that if its API is known.
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)',
        ]),
    }
    try:
        response = requests.get(url, params=paras, headers=headers)
        # Only a 200 status is treated as success.
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # Network/connection errors: report "no page" instead of crashing
        # (the original used a bare except, which also hides bugs).
        return None


def parse_page(html):
    """Yield one dict per job row parsed from a result page.

    :param html: raw HTML of a search-result page
    :return: generator of dicts keyed to match the CSV header row in main()
    """
    # Regex matching the old zhilian result-table markup.
    # NOTE(review): the HTML fragments were stripped from the original
    # listing; this pattern is reconstructed from the capture-group comments
    # and should be checked against a saved page.
    pattern = re.compile(
        '<td class="zwmc".*?href="(.*?)".*?>(.*?)</a>.*?'   # job link and job name
        '<td class="fk_lv".*?><span>(.*?)</span>.*?'        # feedback rate
        '<td class="gsmc".*?href="(.*?)".*?>(.*?)</a>.*?'   # company link and company name
        '<td class="zwyx">(.*?)</td>.*?'                    # monthly salary
        '<td class="gzdd">(.*?)</td>.*?'                    # work location
        '<td class="gxsj".*?><span>(.*?)</span>'            # publish date
        , re.S)  # re.S (the original's re.s is an AttributeError): '.' spans newlines
    # Match every job row on the page.
    data = re.findall(pattern, html)
    # Drop the pinned (promoted) rows at the top; the original comment says
    # to adjust this count manually per search.  Slicing avoids the
    # ValueError the starred unpacking raised when fewer than 4 rows match.
    items = data[4:]
    for item in items:
        # Strip the <b> highlight tags zhilian wraps around the keyword.
        job_name = item[1]
        job_name = job_name.replace('<b>', '')
        job_name = job_name.replace('</b>', '')
        yield {
            'zhiweilianjie': item[0],
            'jobname': job_name,
            'response rate': item[2],
            'gongshilianjie': item[3],
            'company': item[4],
            'salary': item[5],
            'address': item[6],
            'time': item[7],
        }

def write_file_header(file_name, headers):
    """Write the CSV header row (first line) to *file_name*.

    :param file_name: path of the CSV file (opened in append mode)
    :param headers: list of column names
    :return: None
    """
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        # csv.DictWriter, not csv.dictwriter: the class name is CamelCase,
        # so the original raised AttributeError.
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()

def write_file_rows(file_name, headers, rows):
    """Append data rows to the CSV file.

    :param file_name: path of the CSV file (opened in append mode)
    :param headers: list of column names; each row dict's keys must match
    :param rows: iterable of dicts to write
    :return: None
    """
    with open(file_name, 'a', encoding='utf-8', newline='') as f:
        # csv.DictWriter, not csv.dictwriter: fixes the AttributeError.
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)

def main(city, keyword, page):
    """Crawl *page* result pages for *keyword* in *city* into a CSV file.

    :param city: city to search, e.g. '成都'
    :param keyword: search keyword, e.g. 'python'
    :param page: number of result pages to fetch
    :return: None
    """
    # Output path: the original hard-coded the author's desktop.
    file_name = '/users/xiongxing/desktop/' + '智聯' + city + keyword + '.csv'
    headers = ['zhiweilianjie', 'jobname', 'response rate', 'gongshilianjie',
               'company', 'salary', 'address', 'time']
    write_file_header(file_name, headers)
    for i in tqdm(range(page)):
        html = get_page(city, keyword, i)
        # Small delay between requests so we don't hammer the server.
        sleep(0.1)
        if html is None:
            # get_page returns None on failure; skip this page instead of
            # letting parse_page crash on a None argument.
            continue
        # NOTE(review): the per-item accumulator was lost in scraping; all
        # rows parsed from one page are written in a single batch here.
        rows = list(parse_page(html))
        write_file_rows(file_name, headers, rows)

if __name__ == '__main__':
    # Search conditions (city, keyword, page count) can be changed here.
    main('成都', 'python', 1)

python爬取智聯招聘資訊

分享今天寫的一個爬取智聯招聘資訊的爬蟲,使用了requests和re模組,沒有寫注釋,但是 都比較簡單,不是太難,這是爬取的資訊 coding utf 8 import requests import re from itertools import izip from json import du...

Python爬取智聯招聘職位資訊

from urllib import request from urllib import parse from bs4 import beautifulsoup import csv 管理json資料的模組的 import json 定義智聯的爬蟲類 class zhilianspider obj...

python 爬取智聯招聘

一個爬取智聯的一個小爬蟲 python版本 python3.7 依賴模組 selenium pyquery 廢話少說,上 from selenium import webdriver from selenium.webdriver.chrome.options import options from...