# kaoyan.py

# -*- coding: utf-8 -*-

from copy import deepcopy

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class kaoyanspider(CrawlSpider):
    """Crawl listing pages on kaoyan365.cn and yield one record per university.

    Pages matched by ``rules`` are handed to ``parse_list``, which reads the
    university name and link from each table cell and requests the linked
    page. ``parse_university`` then attaches that page's text and yields the
    finished item (a plain dict with ``university``, ``href`` and ``content``).
    """

    name = 'kaoyan'

    allowed_domains = ['kaoyan365.cn']

    # NOTE(review): the start URL is blank in this file; it appears to have
    # been lost. TODO: restore the real listing URL before running the spider.
    start_urls = ['']

    rules = (
        # Extract the URL of each province page.
        # NOTE(review): the ``allow`` pattern is empty here, which presumably
        # places no restriction on the extracted links. TODO: confirm and
        # restore the intended expression.
        Rule(LinkExtractor(allow=r''), callback='parse_list'),
    )

    def parse_list(self, response):
        """Yield one request per table cell that links to a university page.

        Args:
            response: the listing page matched by ``rules``.

        Yields:
            A ``scrapy.Request`` for every cell that has a link. The partially
            filled item travels in ``meta["item"]``.
        """
        # Extract each university's name and link.
        td_list = response.xpath('//div[@class="zg_list_left01_cont"]//td')
        for td in td_list:
            item = {}
            item["university"] = td.xpath('.//text()').extract_first()
            item["href"] = td.xpath('./a/@href').extract_first()
            if item["href"]:
                yield scrapy.Request(
                    # Resolve against the current page so a relative href
                    # still produces a valid absolute request URL.
                    response.urljoin(item["href"]),
                    callback=self.parse_university,
                    # Hand the request its own copy of the item so later
                    # mutation cannot leak between pending requests.
                    meta={"item": deepcopy(item)},
                )

    def parse_university(self, response):
        """Attach the page text to the item carried in ``meta`` and yield it.

        Args:
            response: the university page requested by ``parse_list``.

        Yields:
            The item dict, with ``content`` set to the list of text nodes
            found under the ``zg_list_left01_cont`` container.
        """
        # Fetch the detailed page content.
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='zg_list_left01_cont']//text()").extract()
        yield item

# pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import re

class tiaojipipeline(object):
    """Item pipeline that appends each scraped record to a UTF-8 text file.

    For every item a header line ``***<university>:<href>`` is written,
    followed by one line per entry of the item's ``content`` list with
    ideographic spaces removed and surrounding whitespace stripped.
    """

    # Destination file. Kept as a class attribute so a subclass or instance
    # can redirect the output without touching the methods.
    output_path = "考研調劑資訊.txt"

    def process_item(self, item, spider):
        """Write one item to the output file and pass it on unchanged.

        Args:
            item: mapping with ``university``, ``href`` and (optionally)
                ``content`` keys.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can keep using it.
        """
        # A cell without text or link yields None; fall back to an empty
        # string so the concatenation below cannot raise TypeError.
        university = item.get("university") or ""
        href = item.get("href") or ""

        # Write the header line to the file.
        with open(self.output_path, "a", encoding="utf-8") as f:
            f.write("***" + university + ":" + href + "\n")

        # Clean the page content and append it below the header.
        self.clear_item(item.get("content") or [])

        return item

    def clear_item(self, content_list):
        """Append the cleaned entries of ``content_list`` to the output file.

        Each entry has every U+3000 (ideographic space) removed and is then
        stripped of leading/trailing whitespace. Entries that become empty
        are still written, as blank lines.

        Args:
            content_list: iterable of text fragments.
        """
        # Open the file once for the whole batch instead of once per line.
        with open(self.output_path, "a", encoding="utf-8") as f:
            f.writelines(
                content.replace("\u3000", "").strip() + "\n"
                for content in content_list
            )

# XPath 案例:全國城市名爬取

