scrapy 5 爬取二級頁面的內容

2022-07-28 15:57:32 字數 3252 閱讀 7530

# -*- coding: utf-8 -*-

'''file: items.py

'''# define here the models for your scraped items

## see documentation in:

# import scrapy

class tupianprojectitem(scrapy.Item):
    """Container for the fields scraped from one image detail page.

    NOTE(review): the class name is kept lowercase to match the existing
    import ``from tupianproject.items import tupianprojectitem``; PEP 8
    would normally call this ``TupianProjectItem``.
    """
    # Bug fix: the pasted source used ``scrapy.item`` / ``scrapy.field``
    # (lowercase), which do not exist — the API names are Item / Field.

    # Image title
    title = scrapy.Field()
    # Publish time
    publish_time = scrapy.Field()
    # View count
    look = scrapy.Field()
    # Favorite (collect) count
    collect = scrapy.Field()
    # Download count
    download = scrapy.Field()
    # Image URL
    image_url = scrapy.Field()

# -*- coding: utf-8 -*-

import scrapy

from tupianproject.items import tupianprojectitem

class imagespider(scrapy.Spider):
    """Two-level spider: crawls image list pages on 699pic.com, follows
    each entry to its detail page, and yields one item per image.

    NOTE(review): class name kept lowercase to match the original source;
    PEP 8 would prefer ``ImageSpider``.
    """

    name = 'image'
    allowed_domains = ['699pic.com']
    # NOTE(review): the real start URL and the page-template URL were lost
    # in the original paste (empty strings); fill them in before running.
    # ``url`` is expected to contain a ``{}`` placeholder for the page
    # number, consumed by ``self.url.format(self.page)`` below.
    start_urls = ['']
    url = ''
    page = 1

    def parse(self, response):
        """Parse a list page: request every detail page, then paginate.

        Bug fix: the pasted source used ``scrapy.request`` (lowercase),
        which does not exist — the API name is ``scrapy.Request``.
        """
        image_detail_url_list = response.xpath('//div[@class="list"]/a/@href').extract()
        # Follow each detail-page link; parse_detail extracts the item.
        for image_detail_url in image_detail_url_list:
            yield scrapy.Request(url=image_detail_url, callback=self.parse_detail)
        # Pagination: request the next list page, stopping after page 3.
        if self.page <= 3:
            self.page += 1
            url = self.url.format(self.page)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse_detail(self, response):
        """Parse one image detail page into a ``tupianprojectitem``."""
        # Build one item per detail page.
        item = tupianprojectitem()
        # Title text
        item['title'] = response.xpath('//div[@class="photo-view"]/h1/text()').extract_first()
        # Publish time — ``string(.)`` flattens nested text nodes into one string.
        item['publish_time'] = response.xpath('//div[@class="photo-view"]/div/span[@class="publicityt"]')[0].xpath('string(.)').extract_first()
        # View count
        item['look'] = response.xpath('//div[@class="photo-view"]/div/span[@class="look"]/read/text()').extract_first()
        # Favorite (collect) count
        item['collect'] = response.xpath('//div[@class="photo-view"]/div/span[@class="collect"]')[0].xpath('string(.)').extract_first()
        # Download count, with surrounding newlines/tabs stripped.
        item['download'] = response.xpath('//div[@class="photo-view"]/div/span[@class="download"]')[0].xpath('string(.)').extract_first().strip('\n\t')
        # Full-size image URL
        item['image_url'] = response.xpath('//div[@class="huabu"]//img/@src').extract_first()
        # Hand the item to the pipeline.
        yield item

# -*- coding: utf-8 -*-

'''file: pipelines.py

'''

# define your item pipelines here

## don't forget to add your pipeline to the item_pipelines setting

# see:

import json

import urllib.request

import os

class tupianprojectpipeline(object):
    """Item pipeline: appends each item as one JSON line to ``tupian.json``
    and downloads the item's image into ``./people``.

    NOTE(review): class name kept lowercase to match the original source;
    PEP 8 would prefer ``TupianProjectPipeline``.
    """

    def open_spider(self, spider):
        # One JSON-lines output file kept open for the whole crawl;
        # closed in close_spider.
        self.fp = open('tupian.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        d = dict(item)
        # Bug fix: the pasted source had ``ensure_ascii=false`` (lowercase),
        # a NameError in Python — the constant is ``False``. False keeps
        # non-ASCII (Chinese) text readable in the output file.
        string = json.dumps(d, ensure_ascii=False)
        self.fp.write(string + '\n')
        self.download(item)
        return item

    def download(self, item):
        """Save the item's image as ``./people/<title>.<suffix>``."""
        dirname = './people'
        # Robustness fix: urlretrieve does not create the target directory;
        # without this the first download raises FileNotFoundError.
        os.makedirs(dirname, exist_ok=True)
        # Derive the file extension from the image URL.
        suffix = item['image_url'].split('.')[-1]
        filename = item['title'] + '.' + suffix
        filepath = os.path.join(dirname, filename)
        urllib.request.urlretrieve(item['image_url'], filepath)

    def close_spider(self, spider):
        self.fp.close()

Scrapy爬蟲框架 二 匯出爬取結果

功能描述 爬取 豆瓣 電影 top 250,爬取內容 電影標題,評分人數,評分 編輯 items.py 檔案 coding utf 8 import scrapy class doubanmovieitem scrapy.item 排名 ranking scrapy.field 電影名稱 title...

scrapy實現多級頁面爬取(初級練習題)

練習題 quotes to scrapes 諺語 等級 初級 爬取每條諺語的資訊 諺語 作者 標籤 作者出生日期 作者出事地點 作者基本描述 思路 2 得到初始url的response,傳遞給parse1函式 負責解析第一級頁面 解析response 4 parse2函式會解析每個二級頁面的url的...

scrapy爬取頁面不完全的解決辦法

最近在使用scrapy來製作爬蟲以爬取一些 上的資訊,但是卻出現了乙個很奇怪的問題,即在網頁中開啟待爬取的url,並在網頁源 中定位了某些待爬取的元素,但是當使用scrapy爬取資料時,卻發現報錯了,而錯誤竟然是所爬取到的網頁中並沒有我在瀏覽器中看到的元素,即對於同乙個url,爬取到的頁面和我在瀏覽...