Scraping job information from Boss直聘

2021-08-26 05:09:55

from bs4 import BeautifulSoup
import requests
import ip_proxy
from urllib import parse

# The headers dict is used by the requests below but was lost from the original
# post; a minimal User-Agent header is assumed here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

def get_boss_info(my_ip, detailed_url):
    # Scrape one job-detail page.
    # url = '...'  (the original commented-out URL was truncated in the post)
    # The proxy construction was also lost; this assumes ip_proxy exposes the
    # current proxy address as the (hypothetical) ip_proxy_str attribute.
    proxy = {'http': 'http://' + my_ip.ip_proxy_str}
    response = requests.get(detailed_url, headers=headers, proxies=proxy, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')

    # Job title
    title = soup.find('h1').text
    # div_ele = soup.find('div', class_="name")
    # print(div_ele)

    # Salary badge
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)

    # City / experience / education line
    gezhong_info = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(gezhong_info)

    # Job description text
    gangwei_info = soup.select('div.text')[0].text
    print(gangwei_info)
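The find/select calls above depend on Boss直聘's page markup at the time this was written. As a quick illustration of how they behave, here is a self-contained snippet run against a made-up HTML fragment that mimics that structure:

from bs4 import BeautifulSoup

# Made-up fragment mimicking the structure the selectors above expect.
html = '''
<h1>Python crawler engineer</h1>
<span class="badge">15k-25k</span>
<div class="info-primary"><p>Beijing 3-5 years Bachelor</p></div>
<div class="text">Job description: build and maintain crawlers...</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find('h1').text)                         # Python crawler engineer
print(soup.find('span', class_='badge').text)       # 15k-25k
print(soup.select('div.info-primary > p')[0].text)  # Beijing 3-5 years Bachelor
print(soup.select('div.text')[0].text)              # Job description: ...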

def get_detail_url(my_ip, url):
    # Collect the detail-page URLs from one listing (search results) page.
    # url = '...'  (the original commented-out URL was truncated in the post)
    proxy = {'http': 'http://' + my_ip.ip_proxy_str}  # assumed, see the note above
    response = requests.get(url, headers=headers, proxies=proxy, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')

    # a_ele_list = soup.select('h3.name > a')
    a_ele_list = soup.select('div.job-list > ul > li div.info-primary > h3 > a')
    for a_ele in a_ele_list:
        # Attribute values can be read with dict-style access.
        a_href = a_ele['href']
        # Join the relative href with the listing-page URL to get the detail-page link.
        href = parse.urljoin(url, a_href)
        print('detail page href: ' + href)

        # Retry up to three times, fetching a proxy to access Boss直聘;
        # if all three attempts fail, skip this detail page.
        for i in range(0, 3):
            try:
                # Scrape the detail-page information.
                get_boss_info(my_ip, href)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
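parse.urljoin is what turns the relative href taken from each link into a full detail-page URL. A quick, standalone example (the paths are made up, only the behaviour matters):

from urllib import parse

base = 'https://www.zhipin.com/c101010100/?page=1'
print(parse.urljoin(base, '/job_detail/abc123.html'))
# -> https://www.zhipin.com/job_detail/abc123.html
print(parse.urljoin(base, 'https://www.zhipin.com/job_detail/def456.html'))
# -> https://www.zhipin.com/job_detail/def456.html (absolute hrefs pass through unchanged)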

def get_all_info(my_ip):
    # The original base_url (a Boss直聘 search URL with two page placeholders)
    # was truncated in the post; the placeholder below keeps that gap visible.
    base_url = '...'
    for i in range(1, 4):
        # URL of each results page
        url = base_url % (i, i)
        # Retry loop: if the proxy stops working, switch to a new one;
        # if it still fails after four attempts, skip this page.
        # (Loop variable renamed to j so it does not shadow the page counter.)
        for j in range(0, 4):
            try:
                # Fetch this results page and walk its detail pages.
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
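The same retry-and-rotate-proxy pattern appears in both get_detail_url and get_all_info. Purely as a refactoring sketch (not part of the original script), it could be pulled out into a small helper:

def fetch_with_retries(my_ip, func, *args, retries=3):
    # Call func(my_ip, *args); on failure rotate the proxy and try again,
    # giving up after the given number of attempts.
    for attempt in range(retries):
        try:
            return func(my_ip, *args)
        except Exception as e:
            print(e)
            my_ip.update_ip_proxy_str()
    return None

# Equivalent to the inline loops above:
# fetch_with_retries(my_ip, get_boss_info, href)
# fetch_with_retries(my_ip, get_detail_url, url, retries=4)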

if __name__ == '__main__':
    my_ip = ip_proxy.ip_getter()
    # Get a proxy IP
    # proxy_str = '36.27.143.72:21450'
    # print(proxy_str)
    # Scrape all the Boss直聘 job information
    get_all_info(my_ip)
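ip_proxy is the author's own helper module and its source is not included in this post; all we can see above is that it provides an ip_getter class with an update_ip_proxy_str() method. A minimal stand-in consistent with those calls (the ip_proxy_str attribute and the hard-coded proxy pool are pure assumptions) would look roughly like this:

# ip_proxy.py -- minimal stand-in, assuming the interface used above.
import random

class ip_getter(object):
    def __init__(self):
        # In the real project this pool is presumably filled from a proxy
        # provider or a free-proxy site; the address here is a placeholder.
        self._pool = ['127.0.0.1:8080']
        self.ip_proxy_str = random.choice(self._pool)

    def update_ip_proxy_str(self):
        # Switch to another proxy address when the current one stops working.
        self.ip_proxy_str = random.choice(self._pool)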
