Scraping Lagou job listings and storing them in MongoDB


import time
import pymongo
import requests
from bs4 import BeautifulSoup

# Establish the database connection
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
lagou = mydb['lagou']

# Request headers copied from the browser. The original dict was elided in
# the post; fill in your own values (DevTools -> Network -> Request Headers).
headers = {
    'User-Agent': 'Mozilla/5.0',  # placeholder, replace with your browser's UA
}

def get_page(url):
    web_data = requests.get(url, headers=headers)
    # web_data.text is already decoded, so from_encoding is not needed
    soup = BeautifulSoup(web_data.text, 'html.parser')

    # Each selector string below comes from the browser:
    # Inspect -> Copy -> Copy selector
    companynames = soup.select('.list_item_top > div.company > div.company_name > a')
    industrys = soup.select('div.list_item_top > div.company > div.industry')
    positions = soup.select('div.list_item_top > div.position > div.p_top > a > h3')
    addresses = soup.select('.list_item_top > div.position > div.p_top > a > span > em')
    moneys = soup.select('div.list_item_top > div.position > div.p_bot > div > span')
    advantages = soup.select('div.list_item_bot > div.li_b_r')

    # The selectors above return parallel lists; zip them into one record per
    # posting and insert each record into the database. (The dict literal was
    # elided in the post; the keys below reconstruct it from the loop names.)
    for companyname, industry, position, address, money, advantage in zip(
            companynames, industrys, positions, addresses, moneys, advantages):
        data = {
            'companyname': companyname.get_text().strip(),
            'industry': industry.get_text().strip(),
            'position': position.get_text().strip(),
            'address': address.get_text().strip(),
            'money': money.get_text().strip(),
            'advantage': advantage.get_text().strip(),
        }
        print(data)
        # insert() was removed in pymongo 4; insert_one() is the current API
        lagou_id = lagou.insert_one(data).inserted_id
        time.sleep(1)
        print(lagou_id)
        # output_html(data)
        print('----------------------------')
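As a quick sanity check of the Copy-selector workflow, the same select() call can be run against a small HTML snippet. The snippet below is a simplified mock of one list item, not Lagou's real markup, but its class structure matches the selectors used above:

from bs4 import BeautifulSoup

# Simplified mock of one job-listing item (assumed structure, for illustration)
html = '''
<div class="list_item_top">
  <div class="company"><div class="company_name"><a>ExampleCorp</a></div></div>
  <div class="position"><div class="p_top"><a><h3>Python Engineer</h3></a></div></div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
# select() returns a list of Tag objects; get_text() extracts the visible text
print(soup.select('.list_item_top > div.company > div.company_name > a')[0].get_text())
# -> ExampleCorp
print(soup.select('div.list_item_top > div.position > div.p_top > a > h3')[0].get_text())
# -> Python Engineer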

if __name__ == '__main__':
    # The URL template was elided in the post; substitute the Lagou listing
    # URL here, with {} as the page-number placeholder (pages 1 to 3)
    urls = [''.format(str(i)) for i in range(1, 4)]
    for url in urls:
        get_page(url)
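After a run, the inserted documents can be read back to confirm the pipeline worked end to end. A minimal check with pymongo, assuming the same connection parameters as above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
lagou = client['mydb']['lagou']

# Count the stored postings and print a few of them
print(lagou.count_documents({}))
for doc in lagou.find().limit(3):
    print(doc)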
