python丨Selenium爬取拉勾職位資訊

2021-09-17 18:19:19 字數 4497 閱讀 4007

第一頁職位資訊

from selenium import webdriver

from lxml import etree

import re

import time

''''''

class lagouspider(object):
    """Crawl Python job listings on Lagou with Selenium (first list page only).

    Loads the list page, collects every job-detail URL from it, visits each
    detail page in turn, and prints one dict of parsed fields per position.
    """

    def __init__(self):
        # The Selenium factory is webdriver.Chrome -- the original
        # webdriver.chrome() does not exist (blog platform lowercased it).
        self.driver = webdriver.Chrome()
        # Python job list page. The URL was stripped by the blog platform.
        # TODO: fill in the real Lagou list URL before running.
        self.url = ''
        # Accumulates one dict per parsed position (the original assignment
        # was truncated to "self.position =" by the blog platform).
        self.positions = []

    def run(self):
        """Entry point: open the list page and parse every job on it."""
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        """Extract all job-detail URLs from one list page and visit each."""
        # lxml's HTML parser is etree.HTML (etree.html is the mangled form).
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # Throttle requests so the site's anti-crawl checks are less
            # likely to block us.
            time.sleep(1)

    def request_detail_page(self, url):
        """Load a job-detail page and hand its source to the parser."""
        self.driver.get(url)
        # Grab the rendered source (includes JS-generated content).
        source = self.driver.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        """Parse one job-detail page, record the fields, and print them."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Lagou renders these fields with "/" separators and stray
        # whitespace, e.g. "北京 /"; strip both.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        # The dict literal was stripped by the blog platform; rebuilt here
        # from the fields parsed above.
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)

if __name__ == '__main__':
    # Build the spider and start crawling immediately.
    lagouspider().run()

import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

''''''

class lagouspider(object):
    """Crawl ALL pages of Python job listings on Lagou with Selenium.

    Pages through the list until the "next" button is disabled, opens each
    job detail in a new browser tab, parses it, then closes the tab and
    returns to the list page. Prints one dict of parsed fields per position.
    """

    def __init__(self):
        # The Selenium factory is webdriver.Chrome -- the original
        # webdriver.chrome() does not exist (blog platform lowercased it).
        self.driver = webdriver.Chrome()
        # Python job list page. The URL was stripped by the blog platform.
        # TODO: fill in the real Lagou list URL before running.
        self.url = ''
        # Accumulates one dict per parsed position (the original assignment
        # was truncated to "self.position =" by the blog platform).
        self.positions = []

    def run(self):
        """Entry point: loop over list pages until the pager is exhausted."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager to render before reading page_source, so
            # the source we parse is complete. (The original grabbed
            # page_source BEFORE this wait, which could parse a
            # half-rendered page.)
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")
                )
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            next_btn = self.driver.find_element_by_xpath(
                "//div[@class='pager_container']/span[last()]")
            # Lagou marks the last page by disabling the "next" button.
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            next_btn.click()
            # Throttle page turns to reduce anti-crawl blocking.
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract all job-detail URLs from one list page and visit each."""
        # lxml's HTML parser is etree.HTML (etree.html is the mangled form).
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        """Open a detail page in a NEW tab, parse it, then return to list.

        Using a second tab keeps the list page loaded, so pagination state
        is preserved across detail visits.
        """
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait for the job title before reading the rendered source.
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")
            )
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one job-detail page, record the fields, and print them."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Lagou renders these fields with "/" separators and stray
        # whitespace, e.g. "北京 /"; strip both.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        # The dict literal was stripped by the blog platform; rebuilt here
        # from the fields parsed above.
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)

if __name__ == '__main__':
    # Build the spider and start crawling immediately.
    lagouspider().run()

Python 爬蟲利器 Selenium

前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...

Python 爬蟲利器 Selenium

前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...

Python核心丨匿名函式

描述:匿名函式格式為 `lambda argument1, argument2, ..., argumentN: expression`。匿名函式的關鍵字是 lambda,之後是一系列的引數,然後用冒號隔開,最後則是由這些引數組成的表示式。例如 `square = lambda x: x ** 2`,則 `square(3)` 得到 9。寫成常規函式則是 `def squ...`