爬蟲 拉勾網 selenium

2022-02-24 15:35:49 字數 3658 閱讀 3199

使用 selenium 進行翻頁獲取職位鏈結,再對每條鏈結進行解析。

注意:會爬取到部分空列表,可能是網速太慢或觸發了網站的反爬機制;即使加了 time.sleep() 仍可能出現空列表。

# Standard library
import re
import time

# Third-party
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class LagouSpider(object):
    """Scrape job postings for the keyword "爬蟲" (crawler) from lagou.com.

    Uses Selenium to drive a headless Chrome through the paginated listing,
    collects detail-page links from each results page, then fetches and
    parses each detail page with requests + lxml.
    """

    def __init__(self):
        opt = webdriver.ChromeOptions()
        # Run Chrome in headless mode (no visible browser window).
        # NOTE(review): the original used opt.set_headless(), which is
        # deprecated in modern Selenium; --headless is the supported form.
        opt.add_argument('--headless')
        self.driver = webdriver.Chrome(options=opt)
        # Listing page: keyword "爬蟲", default ordering, city Beijing.
        # NOTE(review): the URL was truncated by the blog extraction; this is
        # the canonical Lagou listing URL shape — confirm before running.
        self.url = 'https://www.lagou.com/jobs/list_爬蟲?px=default&city=北京'
        # NOTE(review): the headers dict content was lost in extraction.
        # Lagou rejects requests without a realistic User-Agent/Referer
        # (often returning empty pages) — fill these in before use.
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/98.0.4758.102 Safari/537.36'),
            'Referer': 'https://www.lagou.com/',
        }

    def run(self):
        """Page through the listing, parsing every job link on each page.

        Stops when the "next page" button carries the disabled class.
        """
        self.driver.get(self.url)
        while True:
            html = self.driver.page_source
            links = self.get_one_page_links(html)
            for link in links:
                print('\n' + link + '\n')
                self.parse_detail_page(link)

            # Wait until the pager is present before probing its state.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'pager_next')))
            next_page_btn = self.driver.find_element_by_class_name('pager_next')

            # Lagou marks the last page by adding this class to the button.
            if 'pager_next_disabled' in next_page_btn.get_attribute('class'):
                break
            else:
                next_page_btn.click()
            # Brief pause so the next page can load (best-effort; see the
            # note at the top of the file about occasional empty results).
            time.sleep(1)

    def get_one_page_links(self, html):
        """Return the detail-page URLs of every job card on the current page.

        The ``html`` argument is unused: links are read live from the driver.
        """
        links = []
        hrefs = self.driver.find_elements_by_xpath('//a[@class="position_link"]')
        for href in hrefs:
            # NOTE(review): this line was lost in extraction; reconstructed
            # as the obvious attribute read on each anchor element.
            links.append(href.get_attribute('href'))
        return links

    def parse_detail_page(self, url):
        """Fetch one job detail page and print its extracted fields.

        Extracts name, description, address and salary via XPath and prints
        them as a dict. No value is returned.
        """
        job_information = {}
        response = requests.get(url, headers=self.headers)
        # Throttle detail-page requests to avoid being rate-limited.
        time.sleep(2)
        html = response.text
        html_element = etree.HTML(html)
        job_name = html_element.xpath('//div[@class="job-name"]/@title')
        job_description = html_element.xpath('//dd[@class="job_bt"]//p//text()')
        # Strip non-breaking spaces left over from the page markup.
        for index, i in enumerate(job_description):
            job_description[index] = re.sub('\xa0', '', i)
        job_address = html_element.xpath('//div[@class="work_addr"]/a/text()')
        job_salary = html_element.xpath('//span[@class="salary"]/text()')

        # String cleanup: drop the "view map" link text and empty fragments.
        for index, i in enumerate(job_address):
            job_address[index] = re.sub('檢視地圖', '', i)
        while '' in job_address:
            job_address.remove('')

        job_information['job_name'] = job_name
        job_information['job_description'] = job_description
        job_information['job_address'] = job_address
        job_information['job_salary'] = job_salary
        print(job_information)

7879

def main():
    """Create the spider and crawl all listing pages."""
    spider = LagouSpider()
    spider.run()

8384

# Only crawl when executed as a script, not on import.
if __name__ == '__main__':
    main()

執行結果

使用selenium爬拉勾網資料

usr bin env python encoding utf 8 description 使用selenium爬拉勾網資料 from selenium import webdriver from selenium.webdriver.support.ui import webdriverwait ...

node爬蟲抓取拉勾網資料

初始化 1.安裝了node 2.新建乙個資料夾 3.在該資料夾中初始化node應用 npm init安裝依賴 使用express框架 使用superagent庫 superagent 是乙個輕量級 漸進式的請求庫,內部依賴 nodejs 原生的請求 api,適用於 nodejs 環境 使用cheer...

selenium 爬取拉勾

用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...