python 爬蟲例項(四)

2022-05-21 09:42:15 字數 3778 閱讀 3520

環境:

os:Windows 10

python:3.7

爬取鏈家地產網站上的資料:分別從房源列表頁與房源詳情頁這兩個頁面取得資料。

效果:把下面兩個網頁中的資料取出來。

def __init__(self):
    """Initialise the crawler's configuration attributes.

    Attributes set:
        url: URL of the listing pages to crawl. It is empty in this listing
            and must be filled in; ``all_pages`` calls ``.format(page)`` on
            it and looks for the ``"pg"`` marker in it.
        path: A local directory path (not read by the code visible here).
        headers: HTTP headers that ``request`` sends with every download.
    """
    # The URL you want to crawl.
    self.url = ""
    self.path = r"c:\pythonproject\lianjia_house"
    # NOTE(review): the dict literal was cut off in this listing and its
    # original entries are lost. A browser-like User-Agent is supplied as a
    # placeholder so the literal is well-formed -- confirm the real values.
    self.headers = {
        "User-Agent": "Mozilla/5.0",
    }

def request(self, param, timeout=10):
    """Download ``param`` and return the response body as text.

    Args:
        param: URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``text`` attribute of the response.
    """
    # Without headers the site may answer 403, so always send headers that
    # make the request look like it comes from a browser.
    response = requests.get(param, headers=self.headers, timeout=timeout)
    # HTTP error statuses are intentionally not raised here
    # (raise_for_status() was left disabled in the original); the caller
    # receives whatever body came back.
    return response.text

def all_pages(self, pagecn):
    """Crawl ``pagecn`` listing pages and return the combined results.

    Args:
        pagecn: Number of pages to crawl. When it is exactly 1, the part of
            ``self.url`` before the ``"pg"`` page marker is fetched;
            otherwise ``self.url.format(i)`` is fetched for each page
            ``i`` in ``1..pagecn``.

    Returns:
        A list holding everything ``self.getdata`` returned for each page,
        in page order (empty when ``pagecn`` is less than 1).
    """
    datalista = []

    if pagecn == 1:
        # Strip the page marker for a single-page crawl. If the URL has no
        # "pg" marker, use it unchanged rather than slicing off its last
        # character (which is what str.find() returning -1 would cause).
        base, marker, _ = self.url.partition("pg")
        datalista.extend(self.getdata(base if marker else self.url))
        return datalista

    for i in range(1, pagecn + 1):
        datalista.extend(self.getdata(self.url.format(i)))
    return datalista

def getdata(self, url):
    """Scrape one listing page, print each entry and return the records.

    For every ``li`` under the listing container this prints the summary
    fields, downloads the page linked from the entry's title, prints the
    detail fields, and stores all of them in a dict.

    Args:
        url: URL of the listing page to scrape.

    Returns:
        A list with one dict per listing entry. A field is ``None`` when the
        corresponding node was not found. The list is empty when the listing
        container is missing from the page.
    """

    def _find_text(root, *class_names):
        # Follow nested find(class_=...) lookups; give up with None as soon
        # as one of them finds nothing instead of raising AttributeError.
        node = root
        for class_name in class_names:
            node = node.find(class_=class_name)
            if node is None:
                return None
        return node.get_text()

    datalist = []

    # NOTE(review): the CSS class names below are all-lowercase in this
    # listing; class matching is case-sensitive, so confirm them against
    # the live page markup.

    # ``with`` guarantees the semaphore is released even if parsing or a
    # download raises part-way through.
    with thread_lock:
        soup = BeautifulSoup(self.request(url), 'lxml')

        counthouse = soup.find(class_="total fl")
        if counthouse is not None:
            counthouse = counthouse.find("span")
        if counthouse is not None:
            print("共找到 ", counthouse.string, " 套大連二手房")

        listing = soup.find(class_="selllistcontent")
        if listing is None:
            return datalist

        for sell in listing.find_all("li"):
            title = sell.find(class_="title")
            link = title.find("a") if title is not None else None
            if link is None:
                # Entries without a title link are advertisements.
                print("広告です")
                continue

            print("------------------------概要--------------------------------------------")
            record = {"title": link.string}
            print("title:", record["title"])
            record["houseinfo"] = _find_text(sell, "houseinfo")
            print("houseinfo:", record["houseinfo"])
            record["positioninfo"] = _find_text(sell, "positioninfo")
            print("positioninfo:", record["positioninfo"])
            record["followinfo"] = _find_text(sell, "followinfo")
            print("followinfo:", record["followinfo"])

            print("------------------------詳細資訊--------------------------------------------")
            url_detail = link["href"]
            soup_detail = BeautifulSoup(self.request(url_detail), "lxml")

            total = soup_detail.find(class_="total")
            record["total"] = total.string if total is not None else None
            record["unit"] = _find_text(soup_detail, "unit")
            print("總價:", record["total"], record["unit"])

            record["unitprice"] = _find_text(soup_detail, "unitpricevalue")
            print("單價:", record["unitprice"])

            record["room"] = _find_text(soup_detail, "room", "maininfo")
            print("戶型:", record["room"])

            record["type"] = _find_text(soup_detail, "type", "maininfo")
            print("朝向:", record["type"])

            record["area"] = _find_text(soup_detail, "area", "maininfo")
            print("面積:", record["area"])

            datalist.append(record)

    return datalist

## def dataorganize(self, datalist):

## data2 = pd.dataframe(datalist)

# data2.to_csv(r'c:\users\peiqiang\desktop\lagoujob.csv', header=false, index=false, mode='a+')

# data3 = pd.read_csv(r'c:\users\peiqiang\desktop\lagoujob.csv', encoding='gbk')

# Module-level semaphore that getdata() holds while it scrapes a page;
# at most 100 holders at a time.
thread_lock = threading.BoundedSemaphore(value=100)

# NOTE(review): the class definition is not visible in this listing; the name
# is kept exactly as written here -- confirm its real capitalisation.
house_info = lianjiahouseinfo()

starttime = datetime.datetime.now()
house_info.all_pages(1)
endtime = datetime.datetime.now()

# total_seconds() covers the whole elapsed span, whereas .seconds silently
# wraps around once the run exceeds 24 hours.
print("実行時間:", int((endtime - starttime).total_seconds()))

執行之後的效果

Python爬蟲例項

中國大學排名專案 功能描述 輸出 大學排名資訊的螢幕輸出 排名,大學名稱,總分 技術路線 requests bs4 定向爬蟲 僅對輸入url進行爬取,不擴充套件爬取 程式的結構設計 步驟1 從網路上獲取大學排名網頁內容 步驟2 提取網頁內容中資訊到合適的資料結構 二維列表 步驟3 利用資料結構展示並...

python 爬蟲例項

coding utf 8 import re import sys import os from time import sleep from bs4 import beautifulsoup import requests reload sys sys.setdefaultencoding utf...

Python 爬蟲例項

下面是我寫的乙個簡單爬蟲例項 1.定義函式讀取html網頁的源 2.從源 通過正規表示式挑選出自己需要獲取的內容 3.序列中的htm依次寫到d盤 usr bin python import re import urllib.request 定義函式讀取html網頁的源 def gethtml url...