scrapy 爬取網頁(1)

2022-05-31 20:57:16 字數 2894 閱讀 6404



class cnblogsitem(scrapy.Item):
    """Item holding the data scraped for one listing entry.

    Every attribute is declared as a ``scrapy.Field`` so it can be set and
    read with dict-style access (``item['title']``).
    """

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Text of the entry's title link.
    title = scrapy.Field()

    # URL (href) the entry points to.
    link = scrapy.Field()

    # Short descriptive text scraped alongside the entry.
    desc = scrapy.Field()

    # URL of the listing page the entry was found on.
    listurl = scrapy.Field()



# Crawl rule: follow the paginated "newest" listing links and hand every
# matching page to ``parse_item1``.  The pattern is a raw string so that
# ``\?`` and ``\d`` reach the regex engine without invalid-escape warnings.
rules = [
    Rule(
        sle(allow=(r'/t/python\?type=newest&page=\d',)),
        follow=True,
        callback='parse_item1',
    ),
]

def parse_item1(self, response):
    """Extract one item per ``section`` element of a listing page.

    Args:
        response: The page response to parse.

    Returns:
        A list of ``cnblogsitem`` objects with ``title``, ``link``,
        ``listurl`` and ``desc`` populated.

    Raises:
        IndexError: If a section lacks one of the expected nodes, because
            the first match of every query is taken unconditionally.
    """
    sel = Selector(response)
    items = []
    base_url = get_base_url(response)

    # All the questions listed on the current page.
    # NOTE(review): the original chained an empty selector, css(''), in
    # front of this query; an empty CSS expression is rejected by the
    # selector parser, so a container selector was presumably lost.
    # Restore it here if it is known.
    posttitle = sel.css("section")

    # Title, URL and description are only loosely tied together here;
    # this structure could be improved later.
    for post in posttitle:
        item = cnblogsitem()
        item['title'] = post.css("a").xpath('text()').extract()[0]
        # NOTE(review): the empty prefix looks like a site base URL that
        # went missing; confirm whether hrefs need to be made absolute.
        item['link'] = '' + post.css("h2.title").css('a').xpath('@href').extract()[0]
        item['listurl'] = base_url
        item['desc'] = post.css("div.answers ").xpath("text()").extract()[0]
        # Collect the item; without this the method always returned nothing.
        items.append(item)

    return items



#! /usr/bin/python
'''Scrapy item pipeline that stores scraped items in MongoDB.

author fiz
'''
import pymongo

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class mongodbpipeline(object):
    """Validate scraped items and insert them into a MongoDB collection."""

    def __init__(self):
        """Open the MongoDB connection and select the target collection.

        The client is created with no arguments, i.e. with PyMongo's
        default connection parameters.
        """
        connection = pymongo.MongoClient()
        # NOTE(review): Scrapy only loads upper-case names from a settings
        # module; confirm these lower-case keys are really populated.
        db = connection[settings['mongodb_db']]
        self.collection = db[settings['mongodb_collection']]

    def process_item(self, item, spider):
        """Store ``item`` in MongoDB unless one of its fields is empty.

        Args:
            item: The scraped item; iterated as a mapping of field names.
            spider: The spider that produced the item (used for logging).

        Returns:
            The unchanged ``item`` so later pipelines can keep processing it.

        Raises:
            DropItem: If any populated field holds a falsy value.
        """
        # Check the field *values*: iterating the item yields field names,
        # which are never empty, so testing them directly would let every
        # item through.
        for field in item:
            if not item[field]:
                raise DropItem("missing {0}!".format(field))

        # NOTE(review): Collection.insert was removed in PyMongo 4; switch
        # to insert_one once the minimum PyMongo version allows it.
        self.collection.insert(dict(item))
        log.msg("question added to mongodb database!",
                level=log.DEBUG, spider=spider)
        return item


