Scraping 58.com second-hand listings with Python


Module 1: grab all of the channel links

from bs4 import BeautifulSoup
import requests

# both URL strings were stripped when the post was published;
# start_url is presumably the 58.com second-hand index page and url_host the site root
start_url = ''
url_host = ''

def get_index_url(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # every channel link sits in the ul.ym-submnu menu on the index page
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_index_url(start_url)
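Module 1 only prints the channel URLs, but module 3 imports a channel_list name from channel_extact and later calls channel_list.split() on it; the post never shows that variable. A minimal sketch of what it presumably is: the printed URLs pasted back into channel_extact.py as one whitespace-separated string (the URLs below are placeholders, not real 58.com channels).

# hypothetical: channel URLs collected from get_index_url's printed output;
# channel_list.split() in module 3 then yields one channel URL per entry
channel_list = '''
    http://example.com/chan/diannao/
    http://example.com/chan/shouji/
    http://example.com/chan/jiadian/
'''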

#Module 2: grab all of the item links and their detail data

from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']
item_info = ceshi['item_info4']
#the name on the left is the Python object; the string is the collection name in the database

#spider 1
def get_links_from(channel, pages):
    #if the page has no td.t cell, we are past the last page, so stop
    list_view = '{}/pn{}/'.format(channel, str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if item_link != '':
                # the insert argument was stripped in the original post; module 3 reads
                # item['url'] back out, so the stored document must carry a 'url' key
                url_list.insert({'url': item_link})
            print(item_link)
        #return urls
    else:
        #it's the last page!
        pass
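Before handing spider 1 to a process pool, it can be sanity-checked on a single channel. The snippet below assumes the same pages_parsing module context, and the channel URL is a placeholder rather than a real 58.com address.

# quick manual test of spider 1: '{}/pn{}/'.format(channel, page) builds the
# paginated listing URL, e.g. <channel>/pn2/ for page 2
test_channel = 'http://example.com/chan/diannao/'   # placeholder URL
for page in range(1, 3):
    get_links_from(test_channel, page)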

#spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # the prefix string being compared was stripped in the original post (it told the two
    # page layouts apart), and so was the field-extraction code in both branches
    if url[:25] == '':
        data = {}
        item_info.insert(data)
    else:
        data = {}
        item_info.insert(data)
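Since the original field-extraction code was lost, here is a minimal, hypothetical sketch of what one detail parser could look like inside the same module (item_info, requests and BeautifulSoup as above); the .price selector is made up and would have to be replaced by the page's real class names.

# hypothetical detail parser, not the original post's code
def parse_item(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    price_tag = soup.select_one('.price')            # placeholder selector
    data = {
        'url': url,
        'title': soup.title.text.strip() if soup.title else None,
        'price': price_tag.text.strip() if price_tag else None,
    }
    item_info.insert(data)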

#Module 3: the main file, run it to start crawling

from multiprocessing import Pool
from pages_parsing import get_item_info, url_list, item_info, get_links_from
from channel_extact import channel_list

# subtract the URLs whose details are already in item_info from the ones queued
# in url_list, so an interrupted crawl can resume where it stopped
item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)
y = set(index_urls0)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)
    return rest_of_urls

if __name__ == '__main__':
    pool = Pool()
    #pool = Pool(processes=6)
    #pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, rest_of_urls)
    #count = 0
    #for url in rest_of_urls:
    #    print(url)
    #    count += 1
    #print(count)
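The main file is meant to be run in two passes: first with the get_all_links_from line uncommented so url_list fills up, then as shown above to scrape the details of every URL not yet in item_info. A sketch of the first pass under that assumption; close() and join() simply make the script wait for all worker processes to finish.

if __name__ == '__main__':
    pool = Pool()
    # phase 1: walk every channel's listing pages and queue the item URLs in MongoDB
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()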

#Module 4: watch the data flow in

import time
from pages_parsing import url_list

while True:
    print(url_list.find().count())
    time.sleep(5)
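Note that Cursor.count() has been removed from newer PyMongo releases (it is gone in PyMongo 4), so on a current driver the same progress monitor would use count_documents; a hedged equivalent:

import time
from pages_parsing import url_list

while True:
    # count_documents({}) counts every document in the url_list collection
    print(url_list.count_documents({}))
    time.sleep(5)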
