scrapy 5 爬取二級頁面的內容

2022-07-28 15:57:32 字數 3252 閱讀 7530

# -*- coding: utf-8 -*-

'''file: items.py

'''# define here the models for your scraped items

## see documentation in:

# import scrapy

class tupianprojectitem(scrapy.Item):
    """Container for the fields scraped from one image detail page.

    NOTE(review): the class name is kept lowercase to match the existing
    import ``from tupianproject.items import tupianprojectitem``; PEP 8
    would normally call this ``TupianProjectItem``.
    """
    # Bug fix: the pasted source used ``scrapy.item`` / ``scrapy.field``
    # (lowercase), which do not exist — the API names are Item / Field.

    # Image title
    title = scrapy.Field()
    # Publish time
    publish_time = scrapy.Field()
    # View count
    look = scrapy.Field()
    # Favorite (collect) count
    collect = scrapy.Field()
    # Download count
    download = scrapy.Field()
    # Image URL
    image_url = scrapy.Field()

# -*- coding: utf-8 -*-

import scrapy

from tupianproject.items import tupianprojectitem

class imagespider(scrapy.Spider):
    """Two-level spider: crawls image list pages on 699pic.com, follows
    each entry to its detail page, and yields one item per image.

    NOTE(review): class name kept lowercase to match the original source;
    PEP 8 would prefer ``ImageSpider``.
    """

    name = 'image'
    allowed_domains = ['699pic.com']
    # NOTE(review): the real start URL and the page-template URL were lost
    # in the original paste (empty strings); fill them in before running.
    # ``url`` is expected to contain a ``{}`` placeholder for the page
    # number, consumed by ``self.url.format(self.page)`` below.
    start_urls = ['']
    url = ''
    page = 1

    def parse(self, response):
        """Parse a list page: request every detail page, then paginate.

        Bug fix: the pasted source used ``scrapy.request`` (lowercase),
        which does not exist — the API name is ``scrapy.Request``.
        """
        image_detail_url_list = response.xpath('//div[@class="list"]/a/@href').extract()
        # Follow each detail-page link; parse_detail extracts the item.
        for image_detail_url in image_detail_url_list:
            yield scrapy.Request(url=image_detail_url, callback=self.parse_detail)
        # Pagination: request the next list page, stopping after page 3.
        if self.page <= 3:
            self.page += 1
            url = self.url.format(self.page)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse_detail(self, response):
        """Parse one image detail page into a ``tupianprojectitem``."""
        # Build one item per detail page.
        item = tupianprojectitem()
        # Title text
        item['title'] = response.xpath('//div[@class="photo-view"]/h1/text()').extract_first()
        # Publish time — ``string(.)`` flattens nested text nodes into one string.
        item['publish_time'] = response.xpath('//div[@class="photo-view"]/div/span[@class="publicityt"]')[0].xpath('string(.)').extract_first()
        # View count
        item['look'] = response.xpath('//div[@class="photo-view"]/div/span[@class="look"]/read/text()').extract_first()
        # Favorite (collect) count
        item['collect'] = response.xpath('//div[@class="photo-view"]/div/span[@class="collect"]')[0].xpath('string(.)').extract_first()
        # Download count, with surrounding newlines/tabs stripped.
        item['download'] = response.xpath('//div[@class="photo-view"]/div/span[@class="download"]')[0].xpath('string(.)').extract_first().strip('\n\t')
        # Full-size image URL
        item['image_url'] = response.xpath('//div[@class="huabu"]//img/@src').extract_first()
        # Hand the item to the pipeline.
        yield item

# -*- coding: utf-8 -*-

'''file: pipelines.py

'''

# define your item pipelines here

## don't forget to add your pipeline to the item_pipelines setting

# see:

import json

import urllib.request

import os

class tupianprojectpipeline(object):
    """Item pipeline: appends each item as one JSON line to ``tupian.json``
    and downloads the item's image into ``./people``.

    NOTE(review): class name kept lowercase to match the original source;
    PEP 8 would prefer ``TupianProjectPipeline``.
    """

    def open_spider(self, spider):
        # One JSON-lines output file kept open for the whole crawl;
        # closed in close_spider.
        self.fp = open('tupian.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        d = dict(item)
        # Bug fix: the pasted source had ``ensure_ascii=false`` (lowercase),
        # a NameError in Python — the constant is ``False``. False keeps
        # non-ASCII (Chinese) text readable in the output file.
        string = json.dumps(d, ensure_ascii=False)
        self.fp.write(string + '\n')
        self.download(item)
        return item

    def download(self, item):
        """Save the item's image as ``./people/<title>.<suffix>``."""
        dirname = './people'
        # Robustness fix: urlretrieve does not create the target directory;
        # without this the first download raises FileNotFoundError.
        os.makedirs(dirname, exist_ok=True)
        # Derive the file extension from the image URL.
        suffix = item['image_url'].split('.')[-1]
        filename = item['title'] + '.' + suffix
        filepath = os.path.join(dirname, filename)
        urllib.request.urlretrieve(item['image_url'], filepath)

    def close_spider(self, spider):
        self.fp.close()

Scrapy爬蟲框架 二 匯出爬取結果

功能描述 爬取 豆瓣 電影 top 250,爬取內容 電影標題,評分人數,評分 編輯 items.py 檔案 coding utf 8 import scrapy class doubanmovieitem scrapy.item 排名 ranking scrapy.field 電影名稱 title...

scrapy實現多級頁面爬取(初級練習題)

練習題 quotes to scrapes 諺語 等級 初級 爬取每條諺語的資訊 諺語 作者 標籤 作者出生日期 作者出事地點 作者基本描述 思路 2 得到初始url的response,傳遞給parse1函式 負責解析第一級頁面 解析response 4 parse2函式會解析每個二級頁面的url的...

scrapy爬取頁面不完全的解決辦法

最近在使用scrapy來製作爬蟲以爬取一些 上的資訊,但是卻出現了乙個很奇怪的問題,即在網頁中開啟待爬取的url,並在網頁源 中定位了某些待爬取的元素,但是當使用scrapy爬取資料時,卻發現報錯了,而錯誤竟然是所爬取到的網頁中並沒有我在瀏覽器中看到的元素,即對於同乙個url,爬取到的頁面和我在瀏覽...