Crawling with the Scrapy framework and writing to a database


items.py — define the fields the spider will fill:

import scrapy

class BossItem(scrapy.Item):
    # define the fields for your item here, like:
    # name = scrapy.Field()
    name = scrapy.Field()     # corresponds to an attribute of the entity
    salary = scrapy.Field()
    address = scrapy.Field()  # needed later when loading the JSON into MySQL

The spider (boss/spiders/zhipin.py) — parse the Lianjia listing page:

# -*- coding: utf-8 -*-
import scrapy
from boss.items import BossItem

class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'                    # spider name
    allowed_domains = ['lianjia.com']  # allowed crawl domain
    start_urls = ['']                  # start URL (left blank in the original)

    def parse(self, response):
        items = []
        posts = response.xpath("//div[@class='content__list--item--main']")
        for each in posts:
            item = BossItem()
            # paths are relative to the current post node (the original's
            # leading "//" would have searched the whole document every time)
            item["name"] = each.xpath("p[@class='content__list--item--title twoline']/a/text()").extract()[0]
            address = each.xpath("p[@class='content__list--item--des']/a[position()<4]/text()").extract()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract()[0]
            item["address"] = "".join(address)  # the first three <a> texts, joined
            print(item)
            items.append(item)
            # yield item
        return items

        # to test whether the page can be fetched at all:
        # with open("lianjia.html", "w", encoding="utf-8") as file:
        #     file.write(response.text)

settings.py — register the pipeline and relax the robots.txt check:

ITEM_PIPELINES = {
    # the value was elided in the original; this is the standard registration
    'boss.pipelines.BossPipeline': 300,
}

# browser ** (comment censored in the original)

# obey robots.txt rules
ROBOTSTXT_OBEY = False
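The censored "browser **" comment above most likely referred to setting a browser User-Agent so the target site does not reject the crawler. A minimal sketch; the UA string below is an assumed example, not taken from the original:

# assumed example: any real desktop-browser UA string works here
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/94.0.4606.81 Safari/537.36')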

pipelines.py — dump each item to lianjia.json, one JSON object per line:

import json

class BossPipeline(object):
    def __init__(self):
        # open the output file once, when the pipeline starts
        self.file = open("lianjia.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the file
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()

A small run script, placed in the project directory, starts the crawl:

from scrapy import cmdline

cmdline.execute("scrapy crawl zhipin".split())

After the run finishes, a lianjia.json file appears.
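Each line of that file is one item; a line might look like this (the values are made up for illustration):

{"name": "整租·某小区 2室1厅", "salary": "5000", "address": "拱墅-某商圈-某小区"}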

Create the database table and load the JSON (Python):

import json
import traceback

import pymysql

class MySQLHelper(object):
    # renamed: the original called this class "pymysql", which shadowed the
    # module and made pymysql.connect() fail with an AttributeError
    create_table = ('create table lianjia('
                    'id int not null primary key auto_increment,'
                    'name varchar(255) not null,'
                    'salary int,'
                    'address varchar(255))default charset=utf8')
    select = 'select * from lianjia'  # referenced but never defined in the original

    def __init__(self, host, user, pwd, db):
        self.conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        self.cursor = self.conn.cursor()

    def create_table_func(self):
        self.cursor.execute(MySQLHelper.create_table)
        print('table created')

    def insert_data(self, sql):
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()

    def select_data(self):
        self.cursor.execute(MySQLHelper.select)
        all_data = self.cursor.fetchall()
        for i in all_data:
            print('query result: {}'.format(i))

if __name__ == '__main__':
    my = MySQLHelper('localhost', 'root', '123456', 'pytest')
    # my.create_table_func()  # run once to create the table
    with open('../lianjia.json', 'r', encoding='utf-8') as f:
        for line in f:
            print(line)
            temp = json.loads(line)
            name = temp['name'].strip()
            salary = temp['salary']
            address = temp['address']
            sql = 'insert into lianjia(name,salary,address) values("%s","%s","%s")' % (name, salary, address)
            my.insert_data(sql)

Running this script writes the data into the database.
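The JSON file is only an intermediate step; the same write can be done straight from a Scrapy pipeline, skipping the file entirely. A minimal sketch, assuming the same lianjia table and local MySQL credentials as above; register it in ITEM_PIPELINES just like BossPipeline:

import pymysql

class MySQLPipeline(object):
    def open_spider(self, spider):
        # connection details assumed to match the loader script above
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', database='pytest')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query: the driver handles quoting and escaping
        sql = 'insert into lianjia(name,salary,address) values(%s,%s,%s)'
        try:
            self.cursor.execute(sql, (item['name'].strip(), item['salary'], item['address']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

Parameterized execution also avoids the breakage (and SQL injection risk) that the string-formatted insert above hits whenever a listing name contains a quote character.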
