On crawling novels while first learning web scraping

2021-09-01 10:55:13 · 4176 words · 9018 views

I have just finished the basics of web scraping. Since this code was written before I had learned any scraping framework, it may come across as a bit verbose, but it carries many of my own ideas; feel free to use it as a reference and to offer suggestions.

It was also written in a hurry, without applying object-oriented design, so everything here is plain functions.

import json
import os
import re
import time
import urllib.parse
import urllib.request
from pprint import pprint

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# The novel content on the target site is loaded by JS, so I use Selenium to
# drive a headless browser instead of plain urllib.
def gethtml(url):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'e:\pycharm\課件\chromedriver_win32\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    driver.get(url)
    time.sleep(7)  # crude fixed wait for the JS-rendered content to appear
    # pprint(driver.page_source)
    return driver.page_source
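A note for anyone running this today: Selenium 4 removed the executable_path and chrome_options keyword arguments used above, so on a current install the same fetch looks roughly like this sketch (driver path reused from above; an explicit wait would also be sturdier than the fixed sleep):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def gethtml_v4(url):
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    service = Service(r'e:\pycharm\課件\chromedriver_win32\chromedriver.exe')
    driver = webdriver.Chrome(service=service, options=opts)
    try:
        driver.get(url)
        time.sleep(7)  # same crude wait; WebDriverWait would be preferable
        return driver.page_source
    finally:
        driver.quit()  # always release the browser process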

    time.sleep(1)
    print('Finished crawling %s' % timu)
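The two indented lines above are the surviving tail of get_contents(), whose body was cut off when the post was saved. From the call site further down it takes a chapter URL, a target directory, and a title, fetches the chapter, and saves it to disk. A minimal sketch of what it plausibly did; the use of gethtml() (suggested by the JS-loading note above), the '.chapter-content' selector, and the file naming are all my assumptions:

def get_contents(href, dirpath, timu):
    # chapter text is JS-rendered, hence gethtml() rather than urllib here
    html = gethtml(href)
    soup = BeautifulSoup(html, 'lxml')
    body = soup.select_one('.chapter-content')  # placeholder selector
    text = body.get_text('\n', strip=True) if body else ''
    with open(os.path.join(dirpath, timu + '.txt'), 'w', encoding='utf8') as f:
        f.write(text)
    time.sleep(1)  # throttle so consecutive chapter requests are spaced out
    print('Finished crawling %s' % timu)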

def get_data(content, dirname, inovel_name):
    soup = BeautifulSoup(content, 'lxml')
    mulu_list = soup.select('#catalog-panel > .yofiction-volume')
    # print(mulu_list)
    mululist = []  # leftover from an earlier draft; never used below
    for mulu in mulu_list:
        dirnames = mulu.select('.volume-header > h3')[0].string
        print('Crawling %s--%s' % (inovel_name, dirnames))
        dirpath = os.path.join(dirname, dirnames)
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        # print(filename)
        mulu_name = mulu.select('.clearfix > li > a')
        # print(mulu_name)
        for i in mulu_name:
            timu = i.string
            href = '' + i['href']  # the base URL was redacted in the post
            # print(href)
            # print(timu)
            # exit()
            # print(content)
            # fetch the article behind each title
            get_contents(href, dirpath, timu)
        print('Finished crawling %s' % dirnames)

# stray tail of the truncated get_label_list(); see its sketch after main()
#     return label_lists, label_href
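One hardening worth noting for the directory handling above: volume and chapter titles can contain characters that Windows forbids in paths, and os.makedirs(..., exist_ok=True) folds the exists-check and mkdir into a single call. A small helper sketch (safe_dirpath is my own name, not from the original):

def safe_dirpath(parent, name):
    # replace characters Windows rejects in file and directory names
    name = re.sub(r'[\\/:*?"<>|]', '_', name).strip()
    path = os.path.join(parent, name)
    os.makedirs(path, exist_ok=True)  # idempotent, no exists()/mkdir() race
    return path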
def parse_wenku(wenku_href, wenku_dirname):
    request = set_request(wenku_href)
    html = urllib.request.urlopen(request).read().decode('utf8')
    soup = BeautifulSoup(html, 'lxml')
    # The commented-out block below was an attempt to also collect the URL of
    # every page; I ran out of time, shelved it, and switched to typing page
    # numbers by hand and concatenating them.  (The regex bodies were lost
    # when the post was rendered and are shown as '...' placeholders.)
    # pattern = re.compile(r'...')
    # inovel_names = soup.select('.yofiction-book-description > a > h2')
    # inovel_name_list = []
    # for inovel in inovel_names:
    #     inovel_name_list = inovel_name_list[:-1]
    # inovel_hrefs = pattern.findall(html)
    # # print(inovel_hrefs)
    # inovel_href_list = []
    # for inovel in inovel_hrefs:
    #     inovel_href_list.append('...' + inovel)
    # # print(len(inovel_href_list), len(inovel_name_list))
    # # return inovel_name_list, inovel_href_list
    inovel_list = soup.select('.yofiction-book-description')
    for inovel in inovel_list:
        inovel = str(inovel)
        # the regex body here was also eaten by the renderer; it captured
        # (href, name) from each book card -- this pattern is a stand-in
        pattern = re.compile(r'<a href="(.*?)".*?<h2>(.*?)</h2>', re.S)
        inovel_info = pattern.findall(inovel)[0]
        # print(inovel_info)
        inovel_name = inovel_info[1]
        inovel_href = '' + inovel_info[0]  # base URL redacted in the post
        # create a directory for each novel
        inovel_dirpath = os.path.join(wenku_dirname, inovel_name)
        if not os.path.exists(inovel_dirpath):
            os.mkdir(inovel_dirpath)
        # fetch the novel's catalog page
        request = set_request(inovel_href)
        html = urllib.request.urlopen(request).read().decode('utf8')
        get_data(html, inovel_dirpath, inovel_name)
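As a side note, each inovel starts out as a bs4 Tag before the str() conversion, so the href/name pair can be pulled out with tag navigation instead of a regex. Assuming each book card is an <a> wrapping an <h2> (my reading of the selectors used above), roughly:

for inovel in soup.select('.yofiction-book-description'):
    link = inovel.select_one('a')  # first anchor inside the book card
    if link is None:
        continue
    inovel_href = link.get('href', '')
    h2 = link.select_one('h2')
    inovel_name = h2.get_text(strip=True) if h2 else ''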

def main():
    # create the root directory for the whole library
    dirname = '***文庫'  # the site name was redacted in the original post
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    url = '/'  # the site's domain was redacted as well
    # build the request object
    request = set_request(url)
    # fetch the site's front-page html
    html = urllib.request.urlopen(request).read().decode('utf8')
    # print(content)
    # collect the library labels
    label_list, label_href_list = get_label_list(html)
    # if start_page == 1:
    #     start_page += 1
    for i in range(len(label_list)):
        # create a folder for each library
        wenku_dirname = label_list[i]
        wenku_dirpath = os.path.join(dirname, wenku_dirname)
        if not os.path.exists(wenku_dirpath):
            os.mkdir(wenku_dirpath)
        wenku_href = label_href_list[i]
        # enter the library and fetch all of its novels
        parse_wenku(wenku_href, wenku_dirpath)

if __name__ == '__main__':
    main()
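Two helpers the code calls, set_request() and get_label_list(), were also cut off when the post was saved. Minimal sketches of what they plausibly did; the User-Agent string and the '.nav a' selector are my assumptions:

def set_request(url):
    # wrap the url in a Request carrying a browser-like User-Agent,
    # so the site does not reject bare urllib traffic outright
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    return urllib.request.Request(url=url, headers=headers)

def get_label_list(html):
    # pull each library label and its link off the front page
    soup = BeautifulSoup(html, 'lxml')
    label_lists, label_href = [], []
    for a in soup.select('.nav a'):  # selector is a guess
        label_lists.append(a.string)
        label_href.append(a['href'])
    return label_lists, label_href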
