蛋殼租房資訊的爬取

2021-09-09 05:22:43 字數 4006 閱讀 4879

import requests

from lxml import etree

import json

import csv

class danke(object):
    """Scraper for Danke apartment rental listings.

    Workflow: ``get_url_list`` walks the paginated listing index and collects
    the detail-page links, ``room_info`` extracts the fields of one detail
    page from ``self.html_doc``, and ``run`` ties both together and appends
    one CSV row per room to ``room_info1.csv``.

    NOTE(review): the published listing lost the literal values of the
    request headers and of the base URL (it shows ``self.headers =`` and
    ``self.base_url = ''``). They are therefore exposed as optional
    constructor arguments; supply real values before crawling.
    """

    # Seconds to wait for each HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, num, base_url='', headers=None):
        """Store the crawl configuration.

        Args:
            num: Number of listing pages to crawl (pages ``1..num``).
            base_url: Listing URL prefix; the page number is appended to it.
                Defaults to the empty string found in the original listing.
            headers: Optional mapping of HTTP request headers.
        """
        # Copy so that later changes to the caller's mapping do not leak in.
        self.headers = dict(headers) if headers else {}
        self.base_url = base_url
        self.num = num
        # Parsed tree of the most recently fetched page.
        self.html_doc = None
        # Record produced by the most recent ``room_info`` call.
        self.dict_new = {}

    @staticmethod
    def _pick(nodes, index=0):
        """Return ``nodes[index]``, or ``None`` when the result is too short."""
        try:
            return nodes[index]
        except IndexError:
            return None

    @staticmethod
    def _join_places(parts):
        """Join the address parts with spaces, skipping missing (None) ones.

        Returns ``None`` when every part is missing.
        """
        present = [part for part in parts if part is not None]
        return ' '.join(present) if present else None

    def get_url_list(self):
        """Fetch listing pages ``1..num`` and return the detail-page links.

        Returns:
            list: Detail-page hrefs in first-seen order, without duplicates.
        """
        detail_href_list = []
        # ``num`` is the number of pages requested, so the last page number
        # must be included in the range.
        for page in range(1, self.num + 1):
            url = self.base_url + str(page)
            response = requests.get(
                url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            self.html_doc = etree.HTML(response.text)
            # Collect the link of every room card on this listing page.
            detail_href_list.extend(
                self.html_doc.xpath(
                    "//div[@class='r_ls_box']/div[@class='r_lbx']/div[@class='r_lbx_cen']/div[@class='r_lbx_cena']/a/@href"
                )
            )
        # dict.fromkeys drops duplicates while keeping the crawl order.
        return list(dict.fromkeys(detail_href_list))

    def room_info(self, href):
        """Extract the fields of one room from ``self.html_doc``.

        The caller must already have parsed the detail page into
        ``self.html_doc``; this method performs no network access. A field
        that is absent from the page is stored as ``None``.

        Args:
            href: URL of the detail page being processed.

        Returns:
            dict: The extracted record, also stored in ``self.dict_new``.
        """
        doc = self.html_doc
        self.href = href

        # Monthly rent, and the discounted rent when the page shows one.
        self.price_num = self._pick(
            doc.xpath("//div[@class='room-price-num']/text()")
        )
        sale = self._pick(doc.xpath("//div[@class='room-price-sale']/text()"))
        self.price_sale = None if sale is None else sale.strip() + ' 元/月'

        # Address: up to three link texts taken from a single XPath query.
        place_nodes = doc.xpath(
            "//div[@class='room-list-box']/div[2]/div[@class='room-list'][3]//label/div/a/text()"
        )
        self.places1 = self._pick(place_nodes, 0)
        self.places2 = self._pick(place_nodes, 1)
        self.places3 = self._pick(place_nodes, 2)
        self.places = self._join_places(
            [self.places1, self.places2, self.places3]
        )

        # Floor area.
        self.room_area = self._pick(
            doc.xpath(
                "//div[@class='room-detail-box'][1]/div[@class='room-list'][1]/label/text()"
            )
        )
        # Room number.
        self.room_num = self._pick(
            doc.xpath(
                "//div[@class='room-detail-box'][1]/div[@class='room-list'][2]/label/text()"
            )
        )
        # Layout.
        style = self._pick(
            doc.xpath(
                "//div[@class='room-detail-box'][1]/div[@class='room-list'][3]/label/text()"
            )
        )
        self.room_style = None if style is None else style.strip()
        # Payment method.
        self.pay_method = self._pick(
            doc.xpath(
                "//div[@class='room-detail-box'][1]/div[@class='room-list'][4]/label/a/text()"
            )
        )
        # Floor: the label text is split on ':' and the second piece is kept.
        floor_text = self._pick(
            doc.xpath(
                "//div[@class='room-list-box']/div[2]/div[@class='room-list'][2]/label/text()"
            )
        )
        floor_parts = [] if floor_text is None else floor_text.split(':')
        self.floor_num = self._pick(floor_parts, 1)

        # NOTE(review): the original dict literal was lost from the listing;
        # the keys below are a reconstruction. Only the values (in this
        # order) are written to the CSV file.
        self.dict_new = {
            'href': self.href,
            'price_num': self.price_num,
            'price_sale': self.price_sale,
            'places': self.places,
            'room_area': self.room_area,
            'room_num': self.room_num,
            'room_style': self.room_style,
            'pay_method': self.pay_method,
            'floor_num': self.floor_num,
        }
        # Progress output: echo every field that was actually found.
        for value in self.dict_new.values():
            if value is not None:
                print(value)
        return self.dict_new

    def run(self):
        """Crawl every detail page and append one CSV row per room."""
        # 1. Build the list of detail-page URLs.
        detail_href_list = self.get_url_list()
        # Open the output once; newline='' is what the csv module expects and
        # UTF-8 keeps the Chinese text writable regardless of the locale.
        with open('room_info1.csv', 'a', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            # 2. Request each page, parse it and store the extracted record.
            for href in detail_href_list:
                response = requests.get(
                    url=href, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                ).text
                self.html_doc = etree.HTML(response)
                self.room_info(href)
                print('=' * 50)
                csv_writer.writerow(self.dict_new.values())

if __name__ == '__main__':
    # Ask how many listing pages to crawl, then run the whole pipeline.
    num = int(input('請輸入爬取頁面數量:'))
    # Use a separate name for the instance so the ``danke`` class itself is
    # not rebound to an object.
    spider = danke(num)
    spider.run()

爬蟲爬取趕集網租房資訊

如下 示例 import scrapy import numpy as np import pandas as pd import matplotlib.pyplot as plt 如下 示例 terminal 終端實現 cd 跳轉到上一層目錄 scrapy startproject booktop...

python爬蟲 爬取小豬網的租房資訊

PyCharm 簡介:PyCharm 是一種 Python IDE,帶有一整套可以幫助使用者在使用 Python 語言開發時提高其效率的工具,比如除錯、語法高亮、project 管理、跳轉、智慧提示、自動完成、單元測試、版本控制。此外,該 IDE 提供了一些高階功能,以用於支援 Django 框架下的專業 web 開發。...

使用多執行緒實現我愛我家租房資訊的爬取

我愛我家的租房 完整 import math import requests from lxml import etree import re from queue import queue import threading import time def request url url heade...