Crawling Guangzhou Lianjia second-hand housing listings and writing them to CSV

2021-08-21 06:33:28

Key points: multithreading, reading CSV, XPath
import csv
import json
import os
import threading

import requests
import lxml.etree

# Recursive lock that serialises CSV writes across worker threads
rlock = threading.RLock()

# The headers dict was truncated in the original post; a User-Agent
# header like this stand-in is the usual minimum for Lianjia requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

# Site domain (stripped from the URLs in the original post);
# gz.lianjia.com is the Guangzhou Lianjia site
BASE_URL = 'https://gz.lianjia.com'

# Fetch the district list from the second-hand listings entry page
def get_area_list(url):
    html = requests.get(url, headers=headers).text
    mytree = lxml.etree.HTML(html)
    # District links sit under the div with data-role="ershoufang"
    area_list = mytree.xpath('//div[@data-role="ershoufang"]//a')
    area_dict = {}
    for area in area_list:
        # District name
        area_name = area.xpath('./text()')[0]
        # District URL (hrefs are relative, so prepend the site domain)
        area_url = BASE_URL + area.xpath('./@href')[0]
        area_dict[area_name] = area_url
        print(area_name, area_url)
    return area_dict

# Get the number of result pages for a district, then scrape it
def get_area_page(area_url, area_name):
    html = requests.get(area_url, headers=headers).text
    mytree = lxml.etree.HTML(html)
    # The pager div stores its state as JSON in the page-data
    # attribute, e.g. {"totalPage":100,"curPage":1}
    page_data = mytree.xpath(
        '//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    page_num = int(json.loads(page_data)['totalPage'])
    get_house_info(area_url, area_name, page_num)

# Scrape the listings on every page of a district
def get_house_info(area_url, area_name, page_num):
    for page in range(1, page_num + 1):
        # Build the paginated URL, e.g. <area_url>pg2
        url = area_url + 'pg%d' % page
        html = requests.get(url, headers=headers).text
        mytree = lxml.etree.HTML(html)
        # Listing entries
        house_list = mytree.xpath('//ul[@class="sellListContent"]/li')
        for house in house_list:
            # Listing title
            house_title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # Listing URL
            house_url = house.xpath('.//div[@class="title"]/a/@href')[0]
            # Address info
            house_address = house.xpath('.//div[@class="houseInfo"]//text()')
            house_address = house_address[0] + house_address[1]
            # Position info
            position_info = house.xpath('.//div[@class="positionInfo"]//text()')
            position_info = position_info[0] + position_info[1]
            # Total price
            total_price = house.xpath('.//div[@class="totalPrice"]//text()')
            total_price = total_price[0] + total_price[1]
            # Unit price
            unit_price = house.xpath('.//div[@class="unitPrice"]//text()')[0]
            data = [house_title, house_address, position_info,
                    total_price, unit_price, house_url]
            print(data)
            # Write to CSV; the lock keeps rows from different
            # threads from interleaving
            with rlock:
                with open('./data/' + area_name + '.csv', 'a+',
                          newline='', encoding='utf-8') as f:
                    writer = csv.writer(f)
                    writer.writerow(data)

if __name__ == '__main__':
    # Make sure the output directory exists
    os.makedirs('./data', exist_ok=True)
    # Guangzhou second-hand listings entry page
    main_url = BASE_URL + '/ershoufang/'
    # Build the district dict
    area_dict = get_area_list(main_url)
    thread_list = []
    for area_name, area_url in area_dict.items():
        # Sequential alternative:
        # get_area_page(area_url, area_name)
        # One worker thread per district
        t = threading.Thread(target=get_area_page,
                             args=(area_url, area_name))
        t.start()
        thread_list.append(t)
    # Wait for every worker thread to finish
    for t in thread_list:
        t.join()
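The key points above also mention reading CSV. A minimal read-back sketch, assuming the column order used by the writer (title, address, position, total price, unit price, URL) and a hypothetical district file name:

import csv

# Read one district's file back; the file name here is hypothetical --
# actual names come from the scraped district link text
with open('./data/天河.csv', encoding='utf-8', newline='') as f:
    for row in csv.reader(f):
        title, address, position, total_price, unit_price, url = row
        print(title, total_price)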

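As a design note, one unmanaged thread per district is fine for Guangzhou's handful of districts, but concurrent.futures gives the same fan-out with a bounded worker count and implicit joining. A sketch reusing get_area_page from above:

from concurrent.futures import ThreadPoolExecutor

# Bounded pool instead of one thread per district; leaving the
# with-block waits for all submitted tasks, replacing the join loop
def crawl_all(area_dict, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for area_name, area_url in area_dict.items():
            pool.submit(get_area_page, area_url, area_name)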