python爬取鏈家網二手房資訊

2021-09-24 09:17:05 字數 3923 閱讀 2823

朋友請我幫忙做的期末作業,我自己不是愛說話,直接分享程式碼,可以直接執行的,其中用的是 python 3.6版本,導包的時候直接在cmd裡面用的pip install 包名,其中有的包安裝失敗,提示pip需要升級,可以看一下這個鏈結

下面是程式碼:

在這裡插入程式碼片

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# stdlib
import json
import re

# third-party (pip install requests beautifulsoup4 xlsxwriter)
import requests
import xlsxwriter
from bs4 import BeautifulSoup  # the class is CamelCase; `beautifulsoup` does not exist

def generate_allurl(user_in_nub, user_in_city):  # generate one URL per listing page
    """Yield the URL of each second-hand-listing page to crawl.

    user_in_nub: number of pages to crawl (string from input(), e.g. '3').
    user_in_city: Lianjia city sub-domain (e.g. 'bj' for Beijing).

    NOTE(review): the body of this function was lost in the scraped source;
    reconstructed from the tutorial this post copies. The listing URL pattern
    https://{city}.lianjia.com/ershoufang/pg{n}/ should be confirmed against
    the live site.
    """
    url = 'https://' + user_in_city + '.lianjia.com/ershoufang/pg{}/'
    for page in range(1, int(user_in_nub) + 1):
        yield url.format(page)
def get_allurl(generate_allurl, user_in_city):  # extract detail URLs from one listing page
    """Fetch one listing page and return the per-house detail-page URLs on it.

    generate_allurl: the URL of one listing page (a string, despite the name).
    Returns a list of URL strings, or None when the HTTP status is not 200.

    NOTE(review): the header dict and regex literal were eaten by the scrape;
    both are reconstructed from the well-known original of this tutorial —
    confirm against a live page before relying on them.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.108 Safari/537.36'}
    get_url = requests.get(generate_allurl, headers=head, timeout=5)
    if get_url.status_code == 200:
        # each house card links to its detail page via an <a class="img ..."> tag
        re_set = re.compile('<a class="img .*" href="(.*)" target="_blank"')
        re_get = re.findall(re_set, get_url.text)
        return re_get


def open_url(url, user_in_city):  # parse one detail page into an info dict
    """Fetch one house detail page and return its fields as a dict.

    Returns a dict whose keys match the columns written by pandas_to_xlsx
    (標題, 總價, ...), or None (after printing a message) on a non-200 response.

    NOTE(review): this function's `def` line and most of its body were lost in
    the scraped source (only the </span>-splitting tail survived at the end of
    get_allurl); reconstructed from the original tutorial. The CSS selectors
    and the [33:] slice for the listing id depend on Lianjia's current page
    layout — verify before use.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.108 Safari/537.36'}
    res = requests.get(url, headers=head, timeout=5)
    if res.status_code == 200:
        info = {}
        soup = BeautifulSoup(res.text, 'lxml')
        info['標題'] = soup.select('.main')[0].text
        info['總價'] = soup.select('.total')[0].text
        info['單位'] = soup.select('.unit')[0].text
        info['每平方售價'] = soup.select('.unitPriceValue')[0].text
        info['參考總價'] = soup.select('.taxtext')[0].text
        info['建造時間'] = soup.select('.subInfo')[2].text
        info['小區名稱'] = soup.select('.info')[0].text
        info['所在區域'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
        # listing id: strip the site prefix and the trailing '.html'
        info['鏈家編號'] = str(url)[33:].rsplit('.html')[0]
        # the "基本屬性" list renders as <li><span class="label">KEY</span>VALUE</li>;
        # key[24:] skips the '<li><span class="label">' prefix
        for i in soup.select('.base li'):
            i = str(i)
            if '</span>' in i or len(i) > 0:
                key, value = (i.split('</span>'))
                info[key[24:]] = value.rsplit('</li>')[0]
        # print(info)
        return info
    else:
        print('獲取詳細資訊失敗')

def writer_to_text(info):  # append one record to a text file
    """Append *info* as a single JSON line to 鏈家二手房.text (UTF-8).

    ensure_ascii=False keeps the Chinese keys/values human-readable
    (the scraped source had the NameError `false`).
    """
    with open('鏈家二手房.text', 'a', encoding='utf-8') as f:
        f.write(json.dumps(info, ensure_ascii=False) + '\n')
    # the with-statement closes the file; the original's explicit
    # f.close() was redundant and has been removed

def pandas_to_xlsx(info, row, worksheet, bold_format):  # save one record to xlsx
    """Write the header row plus one data row of *info* into *worksheet*.

    info: dict keyed by the Chinese column titles below (as built by open_url).
    row: 0-based data row index; callers start at 1 because row 0 is the header.
    worksheet / bold_format: an xlsxwriter worksheet and format object.

    Note: the header row is (re)written on every call, as in the original.
    """
    # Column order shared by the header row and each data row.
    headers = ['標題', '總價', '單位', '每平方售價', '參考總價', '建造時間',
               '小區名稱', '所在區域', '鏈家編號', '房屋戶型', '所在樓層',
               '建築面積', '套內面積', '房屋朝向', '產權年限']
    # Header row. The scraped source used lowercase cell refs like 'a1',
    # which XlsxWriter's A1-notation parser rejects; numeric (row, col)
    # coordinates are equivalent and unambiguous ('A1' == (0, 0)).
    for col, title in enumerate(headers):
        worksheet.write(0, col, title, bold_format)
    # Data row, one cell per column in the same order.
    for col, title in enumerate(headers):
        worksheet.write_string(row, col, info[title])

# def main(url,user_in_city,row):

# writer_to_text(open_url(url)) #儲存到text檔案

if __name__ == '__main__':
    user_in_city = input('輸入爬取城市:')
    user_in_nub = input('輸入爬取頁數:')

    # Save results to Excel. The scraped source had the AttributeError
    # `xlsxwriter.workbook`; the class is `Workbook`.
    workbook = xlsxwriter.Workbook('./鏈家二手房.xlsx')
    worksheet = workbook.add_worksheet()
    # Bold format for the header row. NOTE(review): the dict argument was
    # lost in the scrape; the surrounding comment named "bold:加粗", so
    # {'bold': True} was presumably intended — confirm.
    bold_format = workbook.add_format({'bold': True})
    # worksheet.set_column(1, 1, 15)  # widen column B (0-based) if desired

    page = 1
    row = 1  # data rows start at 1; row 0 holds the headers
    for i in generate_allurl(user_in_nub, user_in_city):
        print('開始獲取第', page, '頁資料.......\n')
        url = get_allurl(i, user_in_city)
        for j in url:
            pandas_to_xlsx(open_url(j, user_in_city), row, worksheet, bold_format)
            row += 1
        print('\n第', page, '頁獲取完畢')
        page += 1
    print("成功")
    workbook.close()

Python爬取鏈家二手房資訊

2 資料庫表結構 使用物件導向的方式,搭建專案框架 import requests from bs4 import beautifulsoup import pymysql class lianjiaspider mydb pymysql.connect localhost root 123456 ...

python爬蟲爬取鏈家二手房資訊

問題一 鏈家 也有反爬蟲策略和robots限制,robots限制忽略 不然沒法爬 另外頻繁爬取會直接導致被ban,需要隔天才會解禁止。防止被ban的方法有多種,1.禁止cookie 2.設定header 3.加大爬取間隔 4.使用 我只用了前三種方法,具體可以在settings.py 和middle...

python爬蟲爬取鏈家二手房資訊

coding utf 8 import requests from fake useragent import useragent from bs4 import beautifulsoup import json import csv import time 構建請求頭 useragent use...