python 資料庫 爬蟲

2021-09-25 15:17:21 字數 3924 閱讀 9415

python3 和 pip3 安裝

安裝 selenium 

配置驅動的環境變數,或者將驅動放到已經配置好的資料夾中,  類似 window 的 cmd的目錄  

Windows: C:\Windows\System32    Linux: /usr/bin; /usr/local/bin

安裝 pyquery

安裝pymysql

# -*- coding: utf-8 -*-
"""Created on Wed Jul 24 12:07:25 2019

@author: icheng
"""
import threading
import time

import pymysql
from pyquery import PyQuery as pq
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

class spider:
    """Headless-Firefox scraper that reads enrollment counts from course pages.

    Lifecycle: getbatchdata() launches the browser, scrapes each url in its
    batch, accumulates (number, id) pairs, then quits the browser.
    """

    def __init(self):
        # Manual initializer (deliberately NOT __init__): getbatchdata() calls
        # it so the browser is only launched inside the worker thread, not
        # when the instance is constructed in the main thread.
        self.__data = []  # collected (number, id) pairs for the DB update
        self.__options = Options()
        self.__options.add_argument('-headless')  # run Firefox without a UI
        self.__driver = Firefox(options=self.__options)
        print('init')

    # Quit the headless browser.
    def __close(self):
        self.__driver.quit()
        print("close")

    # Fetch and render a page (JavaScript included) through the headless
    # browser; returns the page source, or '' if every attempt failed.
    def pageparsing(self, url):
        html = ''  # safe default so a total failure still returns a string
        try:
            self.__driver.get(url)
            time.sleep(1)  # give page scripts a moment to populate the DOM
            html = self.__driver.page_source
        except ConnectionError:
            print('connectionerror')
            print('attempting to reconnect')
            # Exponential back-off: 1s, 2s, 4s, ... capped at 50s total delay.
            # (Starting at 0 would never grow, since 0 * 2 == 0.)
            time_sleep = 1
            while True:
                time.sleep(time_sleep)
                try:
                    self.__driver.get(url)
                    html = self.__driver.page_source
                    break
                except ConnectionError:
                    time_sleep = time_sleep * 2
                    if time_sleep > 50:
                        print('pageparsing wrong exit')
                        break
        return html

    # Scrape a single course page; return its enrollment count, or -1
    # when the count could not be extracted after 5 retries.
    def getinfo(self, url):
        selector = '.course-enroll-info_course-enroll_price-enroll_enroll-count'
        html = self.pageparsing(url)
        text = pq(html)(selector).text()
        trynumber = 0
        # The element is rendered by JS, so it may be empty on a fast fetch;
        # retry a few times before giving up.
        while text == '' and trynumber < 5:
            html = self.pageparsing(url)
            text = pq(html)(selector).text()
            trynumber += 1
        if trynumber == 5:
            print('geting', url.split('/')[-1])
            return -1
        # Extract the digits from the '已有###人參加' ("### enrolled") string.
        return int(text[2:-3])

    # Return the (number, id) pairs gathered by getbatchdata().
    def getdata(self):
        return self.__data

    # Scrape a batch of work items; each item is [url, id].  Failed pages
    # are skipped so one bad url does not lose the whole batch.
    def getbatchdata(self, urls):
        self.__init()
        for url in urls:
            number = self.getinfo(url[0])
            if number == -1:
                continue
            self.__data.append((number, url[1]))  # matches 'set number=%s where id=%s'
        self.__close()

# 主程序 分成多個程序來爬取 加快速度

# Split the url list across several worker threads to speed up scraping.
# urls: list of [url, id] items.  Returns the concatenated (number, id)
# pairs from every worker.
def run(urls):
    threadnumber = 10
    num = len(urls)
    count = num // threadnumber
    threads = []   # worker threads, started/joined together below
    spiders = []   # one spider instance per thread; results read back after join
    for i in range(threadnumber):
        # Use a distinct local name: 'spider = spider()' would shadow the class.
        s = spider()
        spiders.append(s)
        t = threading.Thread(target=s.getbatchdata,
                             args=(urls[count * i:count * (i + 1)],))
        threads.append(t)
        time.sleep(1)  # stagger browser start-up to lower the load spike
    if count * threadnumber < num:
        # Leftover urls when len(urls) does not divide evenly by threadnumber.
        s = spider()
        spiders.append(s)
        t = threading.Thread(target=s.getbatchdata,
                             args=(urls[count * threadnumber:num],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block the main thread until every worker has finished
    data = []
    for s in spiders:
        data += s.getdata()
    return data

def update():
    """Re-scrape every course's enrollment count and write it back to MySQL."""
    times = time.time()
    # Connect to the database -- change these credentials to match the
    # machine this script runs on.
    db = pymysql.connect(host='localhost', user='root',
                         password='123456', database='world')
    cursor = db.cursor()
    # Only id and url are needed here.
    select = 'select id,url from c'
    cursor.execute(select)
    # Each work item is [url, id]: getinfo() consumes the url (item[0]) and
    # the UPDATE below consumes the id (item[1]).
    urls = [[row[1], row[0]] for row in cursor]
    print('running...')
    data = run(urls)  # list of (number, id) pairs
    print('data acquisition success')
    # Update statement -- executemany binds each (number, id) pair in data.
    sql = 'update c set number=%s where id=%s'
    cursor.executemany(sql, data)
    db.commit()
    cursor.close()
    db.close()
    print('successful database update')
    times2 = time.time()
    print(times2 - times)  # elapsed wall-clock seconds for the whole run

# Script entry point: run one full scrape-and-update cycle.
if __name__ == '__main__':
    update()

python 網路爬蟲 與資料庫

這是乙個簡單的爬取豆瓣電影top250的 爬去了每一條電影的18個維度的資料,並且將他們儲存在本地的mysql資料庫中.詳細 如下.requests 請求網頁,獲取網頁資料 lxml 使用xpath語法快速解析網頁資料 coding utf 8 created on tue jan 22 20 55...

python 爬蟲 xpath 儲存到資料庫

參考 安裝 lxml 庫 import pymysql import requests from lxml import etree def get movies page url page 獲取url中的內容 response requests.get url html content respo...

Python爬蟲之四倉庫(資料庫)

第三方庫名 sqlite3import sqlite3 建立資料庫連線物件 conn sqlite3.connect my data.db 建立資料庫操控物件 control conn.cursor 查詢 返回可迭代物件 info control.execute select from novel ...