python丨Selenium爬取拉勾職位資訊

2021-09-17 18:19:19 字數 4497 閱讀 4007

第一頁職位資訊

from selenium import webdriver

from lxml import etree

import re

import time

''''''

class lagouspider(object):
    """Crawl Python job listings on Lagou with Selenium (first list page only).

    Loads the list page, collects every job-detail URL from it, visits each
    detail page in turn, and prints one dict of parsed fields per position.
    """

    def __init__(self):
        # The Selenium factory is webdriver.Chrome -- the original
        # webdriver.chrome() does not exist (blog platform lowercased it).
        self.driver = webdriver.Chrome()
        # Python job list page. The URL was stripped by the blog platform.
        # TODO: fill in the real Lagou list URL before running.
        self.url = ''
        # Accumulates one dict per parsed position (the original assignment
        # was truncated to "self.position =" by the blog platform).
        self.positions = []

    def run(self):
        """Entry point: open the list page and parse every job on it."""
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        """Extract all job-detail URLs from one list page and visit each."""
        # lxml's HTML parser is etree.HTML (etree.html is the mangled form).
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # Throttle requests so the site's anti-crawl checks are less
            # likely to block us.
            time.sleep(1)

    def request_detail_page(self, url):
        """Load a job-detail page and hand its source to the parser."""
        self.driver.get(url)
        # Grab the rendered source (includes JS-generated content).
        source = self.driver.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        """Parse one job-detail page, record the fields, and print them."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Lagou renders these fields with "/" separators and stray
        # whitespace, e.g. "北京 /"; strip both.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        # The dict literal was stripped by the blog platform; rebuilt here
        # from the fields parsed above.
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)

if __name__ == '__main__':
    # Build the spider and start crawling immediately.
    lagouspider().run()

import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

''''''

class lagouspider(object):
    """Crawl ALL pages of Python job listings on Lagou with Selenium.

    Pages through the list until the "next" button is disabled, opens each
    job detail in a new browser tab, parses it, then closes the tab and
    returns to the list page. Prints one dict of parsed fields per position.
    """

    def __init__(self):
        # The Selenium factory is webdriver.Chrome -- the original
        # webdriver.chrome() does not exist (blog platform lowercased it).
        self.driver = webdriver.Chrome()
        # Python job list page. The URL was stripped by the blog platform.
        # TODO: fill in the real Lagou list URL before running.
        self.url = ''
        # Accumulates one dict per parsed position (the original assignment
        # was truncated to "self.position =" by the blog platform).
        self.positions = []

    def run(self):
        """Entry point: loop over list pages until the pager is exhausted."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager to render before reading page_source, so
            # the source we parse is complete. (The original grabbed
            # page_source BEFORE this wait, which could parse a
            # half-rendered page.)
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")
                )
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            next_btn = self.driver.find_element_by_xpath(
                "//div[@class='pager_container']/span[last()]")
            # Lagou marks the last page by disabling the "next" button.
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            next_btn.click()
            # Throttle page turns to reduce anti-crawl blocking.
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract all job-detail URLs from one list page and visit each."""
        # lxml's HTML parser is etree.HTML (etree.html is the mangled form).
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        """Open a detail page in a NEW tab, parse it, then return to list.

        Using a second tab keeps the list page loaded, so pagination state
        is preserved across detail visits.
        """
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait for the job title before reading the rendered source.
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")
            )
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one job-detail page, record the fields, and print them."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Lagou renders these fields with "/" separators and stray
        # whitespace, e.g. "北京 /"; strip both.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        # The dict literal was stripped by the blog platform; rebuilt here
        # from the fields parsed above.
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)

if __name__ == '__main__':
    # Build the spider and start crawling immediately.
    lagouspider().run()

Python 爬蟲利器 Selenium

前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...

Python 爬蟲利器 Selenium

前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...

Python核心丨匿名函式

描述:匿名函式格式為 `lambda argument1, argument2, ..., argumentN: expression`。匿名函式的關鍵字是 lambda,之後是一系列的引數,然後用冒號隔開,最後則是由這些引數組成的表示式。例如 `square = lambda x: x ** 2`,則 `square(3)` 得到 9。寫成常規函式則是 `def squ...`