練 python爬取小說

2021-07-29 03:36:11 字數 2576 閱讀 1347

# -*- coding:utf-8 -*-

''' 爬取** by @asdfv

將每部**的章節內容儲存至本地

'''
import re
import threading
import urllib2

from bs4 import BeautifulSoup

def get_html_content(url):
    """Request a page and return its raw source.

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded response body exactly as the server sent it.

    Raises:
        urllib2.URLError: If the request cannot be completed.
    """
    # NOTE(review): the original header dict did not survive in this copy of
    # the script; this generic User-Agent is a placeholder -- restore the
    # real headers if the target site requires them.
    header = {'User-Agent': 'Mozilla/5.0'}
    # The request class is spelled ``Request``; ``urllib2.request`` does not
    # exist and raised AttributeError.
    request = urllib2.Request(url=url, headers=header)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

# Accumulated [novel name, novel URL] pairs.
novel_list = []


def get_novels_list(html):
    """Collect the novels listed on an index page.

    Args:
        html: Markup of the index page.

    Returns:
        The module-level ``novel_list``, extended with one ``[name, url]``
        entry per element matched by the two nested searches.
    """
    soup_novels = BeautifulSoup(html, 'html.parser')
    for section in soup_novels.find_all(attrs="l"):
        for link in section.find_all(attrs="clearfix stitle"):
            # NOTE(review): the original loop body was lost in this copy of
            # the script. It is rebuilt from how download_novel consumes the
            # entries (item[0] is a UTF-8 encoded name, item[1] a URL);
            # confirm the tag text/href choice against the site's markup.
            novel_list.append(
                [link.get_text().encode('utf-8'), link.get('href')]
            )
    return novel_list

def turn2novel(novel_chapters_url):
    """Return the link that leads from a novel's page to its chapter list.

    Args:
        novel_chapters_url: URL of the novel's page.

    Returns:
        The ``href`` of the first element matched by ``attrs="reader"``, or
        ``None`` when the page is empty or contains no such element.
    """
    html = get_html_content(novel_chapters_url)
    if not html:
        return None
    soup_novel = BeautifulSoup(html, 'html.parser')
    reader_link = soup_novel.find(attrs="reader")
    # find() yields None when nothing matches; calling .get on it would raise
    # AttributeError, so report "no link" instead.
    if reader_link is None:
        return None
    return reader_link.get('href')  # chapter-list URL

def novel_chapters_content(chapter):
    """List every chapter of one novel with its link.

    Args:
        chapter: URL of the novel's chapter-list page.

    Returns:
        A list of ``(link, chapter name)`` tuples in page order; an empty
        list when the page could not be read.
    """
    html = get_html_content(chapter)
    if not html:
        return []
    # NOTE(review): the HTML around the capture groups was stripped from this
    # copy of the script. The pattern is rebuilt from the caller, which reads
    # element [0] as the link and [1] as the chapter name -- confirm the
    # exact tags against the site's markup.
    reg = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>')
    return reg.findall(html)

def get_chapter_novel_content(chapter_txt_url):
    """Download one chapter page and return its text.

    Args:
        chapter_txt_url: URL of the chapter page.

    Returns:
        The chapter text as UTF-8 encoded bytes, or ``None`` when the page
        could not be read.

    Raises:
        IndexError: If the pattern matches fewer than two times.
    """
    html = get_html_content(chapter_txt_url)
    if not html:
        return None
    # The page is re-encoded from GBK to UTF-8 before matching.
    html = html.decode('gbk').encode('utf-8')
    # NOTE(review): the HTML delimiters around the capture group were
    # stripped from this copy of the script; the surviving pattern is kept
    # verbatim and must be restored before it can match real chapter text.
    reg = re.compile(r'(.*?)')
    content = re.findall(reg, html)[1]  # chapter text
    # NOTE(review): both replacement arguments were also lost (the second
    # string literal was broken across lines). They are presumed to have been
    # a run of four &nbsp; entities and a <br> tag -- confirm.
    content = content.replace('&nbsp;&nbsp;&nbsp;&nbsp;', ' ')
    return re.sub(r'<br\s*/?>', '\n', content)

def download_novel(url, save_dir='f:\\paqu\\txt\\'):
    """Save every novel listed at ``url`` to a local text file.

    One ``<name>.txt`` file is opened in append mode per novel, and each
    chapter is written as its title followed by its text, both GBK encoded.

    Args:
        url: URL of the index page that lists the novels.
        save_dir: Directory prefix for the output files. It is concatenated
            directly with the file name, so it must end with a path
            separator. Defaults to the location the script always used.
    """
    html = get_html_content(url)
    for item in get_novels_list(html):
        name, novel_url = item[0], item[1]
        chapters_url = turn2novel(novel_url)
        path = save_dir + name.decode('utf-8').encode('gbk') + '.txt'
        with open(path, 'a') as f:
            print(name)
            try:
                for item_chapter in novel_chapters_content(chapters_url):
                    txt_url = chapters_url + '/' + item_chapter[0]
                    text = get_chapter_novel_content(txt_url)
                    # write(), not writelines(): the argument is one string,
                    # which writelines() would iterate character by character.
                    f.write(
                        ' ' + item_chapter[1] + '\n\n'
                        + text.decode('utf-8').encode('gbk') + '\n\n'
                    )
            except Exception:
                # Best effort: a failure abandons the rest of this novel and
                # moves on to the next one. Catching Exception rather than
                # using a bare except lets Ctrl-C still stop the run.
                print('----error here!----')

def th(event):
    """Run ``download_novel(event)`` on a background thread.

    Args:
        event: Index-page URL handed to ``download_novel``.

    Returns:
        The started ``threading.Thread``, so callers may ``join()`` it.
    """
    # Pass the callable and its argument separately. Writing
    # ``target=download_novel(event)`` ran the whole download synchronously
    # and then gave the thread ``None`` as its target.
    thr = threading.Thread(target=download_novel, args=(event,))
    thr.start()
    return thr

# Start URL handed to the downloader; it is an empty string in this copy of
# the script.
url = ''

# Hand the start URL to th().
th(url)

Python爬取小說

感覺這個夠蛋疼的,因為你如果正常寫的話,前幾次執行沒問題,之後你連程式碼都沒改,再執行就出錯了。其實這可能是網路請求失敗,或者有反爬蟲的東西吧。但這就會讓你寫的時候非常苦惱,所以這東西,健壯性極其重要!import requests from bs4 import BeautifulSoup impo...

python 爬取小說

前些天突然想看一些小說,可能是因為壓力大,所以就要有補償機制吧。為了節省流量,就想著把內容爬下來,然後就可以在路上看了。於是有了下面的指令碼。#!/usr/bin/env python # -*- coding: utf-8 -*- import requests from lxml import etree # 為了解決unicod...

python爬取小說

一、準備:安裝 requests、pyquery 庫。二、使用:定義了 search 類,初始化時傳入第一章 url 和小說名即可,再呼叫 all_content 方法即可。# coding: utf8 import re import requests from requests.exceptions import...