第一次爬蟲

2021-08-15 13:44:36 字數 2904 閱讀 1682

import time

import requests
# bs4's class is CamelCase "BeautifulSoup"; the lowercase name does not exist in the package.
from bs4 import BeautifulSoup

def get_item_info(url):
    """Scrape one item detail page and print its fields.

    Extracts category, title, view count, price and zone via CSS selectors,
    then prints them as a dict.  Sleeps 2 s after the request to throttle.
    """
    # Local import so this function is correct even if the top-of-file
    # import still uses the broken lowercase name.
    from bs4 import BeautifulSoup

    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)  # be polite: pause between page fetches
    cate = soup.select('span.crb_i > a')[-1].text.strip()
    title = soup.select('h1.info_titile')[0].text
    view = soup.select('span.look_time')[0].text
    price = soup.select('span.price_now')[0].text
    zone = soup.select('div.palce_li > span > i')[0].text
    # The original source had an incomplete "data=" line; reconstructed
    # as a dict collecting every scraped field.
    data = {
        'cate': cate,
        'title': title,
        'view': view,
        'price': price,
        'zone': zone,
    }
    print(data)

def get_all_items_info(url):
    """Collect the item-detail links from one listing page and scrape each.

    Follows only links whose href contains 'zhuanzhuan'; other anchors
    (ads, navigation) are skipped.
    """
    from bs4 import BeautifulSoup

    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    hrefs_list = soup.select('a.t')
    for href in hrefs_list:
        link = href.get('href')
        # Guard against anchors with no href attribute (get() returns None).
        if link and 'zhuanzhuan' in link:
            get_item_info(link)

def get_page_link(page_number):
    """Build the listing-page URL for pages 1..page_number-1 and scrape each.

    NOTE: range(1, page_number) deliberately stops before page_number,
    so get_page_link(3) fetches pages 1 and 2.
    """
    for each_number in range(1, page_number):
        # NOTE(review): the site's base URL was lost when this snippet was
        # pasted; '0/pn{}/' is only the path fragment. Prepend the real host
        # before running -- TODO confirm against the original source.
        full_url = '0/pn{}/'.format(each_number)
        get_all_items_info(full_url)


get_page_link(3)

總結:感覺重要的還是一個邏輯思路,函式與函式之間的聯絡。把關係弄清楚了,程式設計就簡單了

import time

import requests
# bs4's class is CamelCase "BeautifulSoup"; the lowercase name does not exist in the package.
from bs4 import BeautifulSoup

def get_item_info(url):
    """Scrape one item detail page and print its fields.

    Extracts category, title, view count, price and zone via CSS selectors,
    then prints them as a dict.  Sleeps 2 s after the request to throttle.
    """
    # Local import so this function is correct even if the top-of-file
    # import still uses the broken lowercase name.
    from bs4 import BeautifulSoup

    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)  # be polite: pause between page fetches
    cate = soup.select('span.crb_i > a')[-1].text.strip()
    title = soup.select('h1.info_titile')[0].text
    view = soup.select('span.look_time')[0].text
    price = soup.select('span.price_now')[0].text
    zone = soup.select('div.palce_li > span > i')[0].text
    # The original source had an incomplete "data=" line; reconstructed
    # as a dict collecting every scraped field.
    data = {
        'cate': cate,
        'title': title,
        'view': view,
        'price': price,
        'zone': zone,
    }
    print(data)

# Kept for reference: the listing-page logic as a separate function
# (version 1 of the script).  The rewrite below inlines it instead.
# def get_all_items_info(url):  # 2. collect the detail-page links from one listing page
#     wb_data = requests.get(url)
#     soup = BeautifulSoup(wb_data.text, 'lxml')
#     hrefs_list = soup.select('a.t')
#     for href in hrefs_list:
#         link = href.get('href')
#         if 'zhuanzhuan' in link:
#             get_item_info(link)


def get_page_link(page_number):
    """Scrape pages 1..page_number-1, inlining the per-page link collection.

    For each listing page, fetch it, pull every 'a.t' anchor, and scrape
    the detail page of each 'zhuanzhuan' link via get_item_info().
    """
    from bs4 import BeautifulSoup

    for each_number in range(1, page_number):
        # NOTE(review): the site's base URL was lost when this snippet was
        # pasted; '0/pn{}/' is only the path fragment. Prepend the real host
        # before running -- TODO confirm against the original source.
        full_url = '0/pn{}/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        hrefs_list = soup.select('a.t')
        for href in hrefs_list:
            link = href.get('href')
            # Guard against anchors with no href attribute (get() returns None).
            if link and 'zhuanzhuan' in link:
                get_item_info(link)


get_page_link(2)

這樣也可以,

爬蟲第一次

由於面試的需要,昨天看了下爬蟲,python的,原先一直以為很高階,但是才發現大體思路很清晰。1。連線到要抓取的某網 注意import urllib,比如這個樣子 def gethtml url page urllib.urlopen url html page.read return html 這...

記第一次爬蟲

出不來結果的時候,真是著急,期間犯了很多錯誤,這個過程痛苦並快樂著 哈哈哈哈哈,哈哈哈哈哈 import urllib.request import re import os url page urllib.request.urlopen url read page page.decode gbk ...

記第一次爬蟲

在學習爬蟲的過程中,我首先進行的是對豆瓣top250的爬取,步驟可分為如下幾步 第一步 抓包 url 第二步 請求url 第三步 解析,提取出我需要的資訊 第四步 存入檔案 首先我對豆瓣的網頁進行了分析,開啟要抓取的網頁,f12 f5,這樣你就可以看到網頁原始碼了,進入到network,找到要抓取的...