python爬取鏈家網二手房資訊

2021-09-24 09:17:05 字數 3923 閱讀 2823

朋友請我幫忙做的期末作業,我自己不是愛說話,直接分享程式碼,可以直接執行的,其中用的是 python 3.6版本,導包的時候直接在cmd裡面用的pip install 包名,其中有的包安裝失敗,提示pip需要升級,可以看一下這個鏈結

下面是程式碼:

在這裡插入程式碼片

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# stdlib
import json
import re

# third-party (pip install requests beautifulsoup4 xlsxwriter)
import requests
import xlsxwriter
from bs4 import BeautifulSoup  # the class is CamelCase; `beautifulsoup` does not exist

def generate_allurl(user_in_nub, user_in_city):  # generate one URL per listing page
    """Yield the URL of each second-hand-listing page to crawl.

    user_in_nub: number of pages to crawl (string from input(), e.g. '3').
    user_in_city: Lianjia city sub-domain (e.g. 'bj' for Beijing).

    NOTE(review): the body of this function was lost in the scraped source;
    reconstructed from the tutorial this post copies. The listing URL pattern
    https://{city}.lianjia.com/ershoufang/pg{n}/ should be confirmed against
    the live site.
    """
    url = 'https://' + user_in_city + '.lianjia.com/ershoufang/pg{}/'
    for page in range(1, int(user_in_nub) + 1):
        yield url.format(page)
def get_allurl(generate_allurl, user_in_city):  # extract detail URLs from one listing page
    """Fetch one listing page and return the per-house detail-page URLs on it.

    generate_allurl: the URL of one listing page (a string, despite the name).
    Returns a list of URL strings, or None when the HTTP status is not 200.

    NOTE(review): the header dict and regex literal were eaten by the scrape;
    both are reconstructed from the well-known original of this tutorial —
    confirm against a live page before relying on them.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.108 Safari/537.36'}
    get_url = requests.get(generate_allurl, headers=head, timeout=5)
    if get_url.status_code == 200:
        # each house card links to its detail page via an <a class="img ..."> tag
        re_set = re.compile('<a class="img .*" href="(.*)" target="_blank"')
        re_get = re.findall(re_set, get_url.text)
        return re_get


def open_url(url, user_in_city):  # parse one detail page into an info dict
    """Fetch one house detail page and return its fields as a dict.

    Returns a dict whose keys match the columns written by pandas_to_xlsx
    (標題, 總價, ...), or None (after printing a message) on a non-200 response.

    NOTE(review): this function's `def` line and most of its body were lost in
    the scraped source (only the </span>-splitting tail survived at the end of
    get_allurl); reconstructed from the original tutorial. The CSS selectors
    and the [33:] slice for the listing id depend on Lianjia's current page
    layout — verify before use.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.108 Safari/537.36'}
    res = requests.get(url, headers=head, timeout=5)
    if res.status_code == 200:
        info = {}
        soup = BeautifulSoup(res.text, 'lxml')
        info['標題'] = soup.select('.main')[0].text
        info['總價'] = soup.select('.total')[0].text
        info['單位'] = soup.select('.unit')[0].text
        info['每平方售價'] = soup.select('.unitPriceValue')[0].text
        info['參考總價'] = soup.select('.taxtext')[0].text
        info['建造時間'] = soup.select('.subInfo')[2].text
        info['小區名稱'] = soup.select('.info')[0].text
        info['所在區域'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
        # listing id: strip the site prefix and the trailing '.html'
        info['鏈家編號'] = str(url)[33:].rsplit('.html')[0]
        # the "基本屬性" list renders as <li><span class="label">KEY</span>VALUE</li>;
        # key[24:] skips the '<li><span class="label">' prefix
        for i in soup.select('.base li'):
            i = str(i)
            if '</span>' in i or len(i) > 0:
                key, value = (i.split('</span>'))
                info[key[24:]] = value.rsplit('</li>')[0]
        # print(info)
        return info
    else:
        print('獲取詳細資訊失敗')

def writer_to_text(info):  # append one record to a text file
    """Append *info* as a single JSON line to 鏈家二手房.text (UTF-8).

    ensure_ascii=False keeps the Chinese keys/values human-readable
    (the scraped source had the NameError `false`).
    """
    with open('鏈家二手房.text', 'a', encoding='utf-8') as f:
        f.write(json.dumps(info, ensure_ascii=False) + '\n')
    # the with-statement closes the file; the original's explicit
    # f.close() was redundant and has been removed

def pandas_to_xlsx(info, row, worksheet, bold_format):  # save one record to xlsx
    """Write the header row plus one data row of *info* into *worksheet*.

    info: dict keyed by the Chinese column titles below (as built by open_url).
    row: 0-based data row index; callers start at 1 because row 0 is the header.
    worksheet / bold_format: an xlsxwriter worksheet and format object.

    Note: the header row is (re)written on every call, as in the original.
    """
    # Column order shared by the header row and each data row.
    headers = ['標題', '總價', '單位', '每平方售價', '參考總價', '建造時間',
               '小區名稱', '所在區域', '鏈家編號', '房屋戶型', '所在樓層',
               '建築面積', '套內面積', '房屋朝向', '產權年限']
    # Header row. The scraped source used lowercase cell refs like 'a1',
    # which XlsxWriter's A1-notation parser rejects; numeric (row, col)
    # coordinates are equivalent and unambiguous ('A1' == (0, 0)).
    for col, title in enumerate(headers):
        worksheet.write(0, col, title, bold_format)
    # Data row, one cell per column in the same order.
    for col, title in enumerate(headers):
        worksheet.write_string(row, col, info[title])

# def main(url,user_in_city,row):

# writer_to_text(open_url(url)) #儲存到text檔案

if __name__ == '__main__':
    user_in_city = input('輸入爬取城市:')
    user_in_nub = input('輸入爬取頁數:')

    # Save results to Excel. The scraped source had the AttributeError
    # `xlsxwriter.workbook`; the class is `Workbook`.
    workbook = xlsxwriter.Workbook('./鏈家二手房.xlsx')
    worksheet = workbook.add_worksheet()
    # Bold format for the header row. NOTE(review): the dict argument was
    # lost in the scrape; the surrounding comment named "bold:加粗", so
    # {'bold': True} was presumably intended — confirm.
    bold_format = workbook.add_format({'bold': True})
    # worksheet.set_column(1, 1, 15)  # widen column B (0-based) if desired

    page = 1
    row = 1  # data rows start at 1; row 0 holds the headers
    for i in generate_allurl(user_in_nub, user_in_city):
        print('開始獲取第', page, '頁資料.......\n')
        url = get_allurl(i, user_in_city)
        for j in url:
            pandas_to_xlsx(open_url(j, user_in_city), row, worksheet, bold_format)
            row += 1
        print('\n第', page, '頁獲取完畢')
        page += 1
    print("成功")
    workbook.close()

Python爬取鏈家二手房資訊

2 資料庫表結構 使用物件導向的方式,搭建專案框架 import requests from bs4 import beautifulsoup import pymysql class lianjiaspider mydb pymysql.connect localhost root 123456 ...

python爬蟲爬取鏈家二手房資訊

問題一 鏈家 也有反爬蟲策略和robots限制,robots限制忽略 不然沒法爬 另外頻繁爬取會直接導致被ban,需要隔天才會解禁止。防止被ban的方法有多種,1.禁止cookie 2.設定header 3.加大爬取間隔 4.使用 我只用了前三種方法,具體可以在settings.py 和middle...

python爬蟲爬取鏈家二手房資訊

coding utf 8 import requests from fake useragent import useragent from bs4 import beautifulsoup import json import csv import time 構建請求頭 useragent use...