Crawling the whole 全書網 site with a multi-threaded Python spider

2021-08-21 12:59:23

    return page_data, all_link_name

def parsing_html(self, response, name):
    # Parse a category page and pull out the book links, cover images, titles and authors
    html = etree.HTML(response)
    url = html.xpath('//a[@class="l mr10"]/@href')
    img = html.xpath('//a[@class="l mr10"]/img/@src')
    title = html.xpath('//span[@class="l"]/a/@title')
    # The author regex is truncated in the original post; this pattern is an assumed stand-in
    author = re.findall(r'作者:(.*?)</a>', response)
    # Assumed wiring: hand the extracted lists on to the MySQL writer
    self.write_to_mysql(title, url, img, author, name)

def write_to_mysql(self, title, url, img, author, mysql_name):
    # Write the scraped fields of one category into its own MySQL table
    j = 0
    db = pymysql.connect(host="localhost", user="root", password='123456',
                         db="quanshu_mysql", charset="utf8")
    cursor = db.cursor()
    try:
        cursor.execute("create table " + mysql_name +
                       "(id int primary key auto_increment not null,"
                       "title varchar(50),url varchar(100) not null,"
                       "img varchar(100),author varchar(40));")
    except:
        # The table probably exists already; keep going and just insert into it
        pass
    print(mysql_name, "table is ready!")
    while j < len(url):
        try:
            print("writing record:", title[j])
            cursor.execute('insert into ' + mysql_name +
                           '(title,url,img,author) values("%s","%s","%s","%s")'
                           % (title[j], url[j], img[j], author[j]))
            db.commit()
            print(title[j], "written successfully")
        except IndexError:
            print(mysql_name, "has had all of its records written!")
            break
        j += 1
    db.close()
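The insert above splices the scraped values straight into the SQL string, so a title containing a double quote breaks the statement and the code is open to SQL injection. Below is a minimal sketch of the same loop using pymysql's parameter binding, assuming the table layout stays the same; the name write_to_mysql_safe is hypothetical.

def write_to_mysql_safe(self, title, url, img, author, mysql_name):
    # Same job as write_to_mysql above, but lets pymysql escape the values
    db = pymysql.connect(host="localhost", user="root", password='123456',
                         db="quanshu_mysql", charset="utf8")
    cursor = db.cursor()
    # Only the values can be bound; the table name still has to be concatenated
    sql = ('insert into ' + mysql_name +
           ' (title, url, img, author) values (%s, %s, %s, %s)')
    # zip() stops at the shortest list, so no IndexError handling is needed
    for row in zip(title, url, img, author):
        cursor.execute(sql, row)
    db.commit()
    db.close()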

# Traverse the queue and hand every item in it to request_url
for t in threads:
    t.start()  # start each worker thread
for t in threads:
    t.join()   # wait for every worker to finish
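The comment above mentions a queue, yet the loops start one thread per entry in threads, which can mean dozens of simultaneous requests. A gentler option is a fixed-size pool of workers draining a queue.Queue. The sketch below assumes that shape; run_with_worker_pool, worker_count and the placeholder URL are illustrative, and the print stands in for the real request_url call.

import threading
from queue import Empty, Queue

def run_with_worker_pool(link_queue, worker_count=8):
    # Each worker keeps pulling (link, name) pairs until the queue runs dry
    def worker():
        while True:
            try:
                link, name = link_queue.get_nowait()
            except Empty:
                return
            print("crawling", name, "at", link)  # a real crawler would call request_url here

    pool = [threading.Thread(target=worker) for _ in range(worker_count)]
    for t in pool:
        t.start()
    for t in pool:
        t.join()

# Example: queue up (link, name) pairs and let the workers drain them
q = Queue()
q.put(("http://www.example.com/list/1.html", "fantasy"))  # placeholder data
run_with_worker_pool(q)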

if __name__ == "__main__":
    url = ""  # start URL of the site to crawl
    c = qs_spider(url)
    c.start_spider()
    print("it's ok!")
