Python 爬蟲:筆趣閣獲取小說例子

2021-10-02 07:16:20 字數 3181 閱讀 1376

# 筆趣閣

# 1.模擬搜尋

# 2.圖書查詢-章節

# 3.獲取章節-內容

# 4.本地儲存:txt、mysql、

def searchbook():
    """Ask the user for a book title, search the Biquge site for it, and
    cache every hit as ``book_dict[title] = url``.

    Relies on module-level names defined elsewhere in the file:
    ``url_one`` (search endpoint), ``head`` (HTTP request headers) and
    ``book_dict`` (title -> book-page-URL cache).
    """
    bookname = input("請輸入圖書的名稱: ")
    # The target site is GBK-encoded; encode the query so Chinese text
    # survives URL quoting instead of being mojibake'd.
    bookname = bookname.encode("gbk")
    # NOTE(review): the params dict was lost when this post was scraped;
    # "searchkey" is the conventional Biquge search parameter -- confirm
    # against the actual site.
    resp = requests.get(url=url_one, params={"searchkey": bookname},
                        headers=head, timeout=10)
    if resp.status_code == 200:
        resp.encoding = "gbk"
        # html.parser: stdlib parser, no extra dependency needed.
        soup = BeautifulSoup(resp.text, "html.parser")
        # The attrs filter was also lost in scraping; each search result is
        # presumably a <div> with a distinctive class -- TODO confirm selector.
        div_list = soup.find_all(name="div", attrs={"class": "bookbox"})
        for div in div_list:
            # Rows that are adverts or headers may lack these child tags;
            # guard each lookup so a missing tag yields None, not a crash.
            h4a = div.h4.a if div.h4 is not None else None
            bookname = h4a.string if h4a is not None else None
            bookurl = h4a.attrs.get("href") if h4a is not None else None
            bookauthor = div.small.string if div.small is not None else None
            bookdir = div.p.string if div.p is not None else None
            # All four fields must be present before the row is usable.
            if None not in (bookname, bookurl, bookauthor, bookdir):
                # str.replace returns a NEW string; the original post
                # discarded the result, so the stripping never took effect.
                bookname = bookname.replace(" ", "")
                bookurl = bookurl.replace(" ", "")
                bookauthor = bookauthor.replace(" ", "")
                bookdir = bookdir.replace(" ", "")
                print(bookname + "\n", bookurl + "\n",
                      bookauthor + "\n", bookdir + "\n")
                # Cache for getbookchapter() lookups.
                book_dict[bookname] = bookurl
    else:
        print("錯誤!重新開始")
        # Best-effort retry on HTTP failure, as in the original.
        searchbook()

def getbookchapter():
    """Ask for a previously-found book title and fetch each of its chapters.

    Looks the title up in the module-level ``book_dict`` populated by
    ``searchbook``; ``head`` (headers) and ``time`` (timeout value) are
    module-level globals defined elsewhere in the file.
    """
    bookname = input("請輸入已找到的圖書的名稱: ")
    # Membership test works directly on the dict -- no .keys() needed.
    if bookname in book_dict:
        resp = requests.get(url=book_dict[bookname], headers=head, timeout=time)
        if resp.status_code == 200:
            resp.encoding = "gbk"
            soup = BeautifulSoup(resp.text, "html.parser")
            # soup.title.string is already a plain string; the original
            # called .string on it AGAIN, which raises AttributeError.
            title = soup.title.string
            print(title)
            # The attrs filter was lost when the post was scraped; chapter
            # links on Biquge usually sit in bare <dd> tags, so search
            # without an attribute filter -- TODO confirm selector.
            dd_list = soup.find_all(name="dd")
            for dd in dd_list:
                try:
                    chapter = dd.a.attrs.get("title")
                    chapterurl = dd.a.attrs.get("href")
                    print(chapter, chapterurl)
                    bookurl = book_dict[bookname]
                    getbookchaptercontent(chapter, chapterurl, bookurl, bookname)
                except AttributeError:
                    # A <dd> without an <a> child -- skip it; do not let a
                    # broad except hide real download errors.
                    continue
    else:
        print("錯誤!重新開始")
        # Unknown title: prompt again, as in the original.
        getbookchapter()

def getbookchaptercontent(chapter, chapterurl, bookurl, bookname):
    """Download one chapter page, extract its text and append it to a
    per-book txt file.

    NOTE(review): the tail of this function (the whitespace-cleanup
    string literal and the save step) was cut off when the post was
    scraped -- the reconstruction below should be confirmed against the
    original site's markup.
    """
    # Chapter hrefs are often site-relative; join onto the book URL when
    # needed (presumably what the original "判斷是否存在url,進行拼接"
    # comment handled -- TODO confirm).
    if not chapterurl.startswith("http"):
        chapterurl = bookurl.rstrip("/") + "/" + chapterurl.lstrip("/")
    resp = requests.get(url=chapterurl)
    # Original had a typo here: resp.ststus_code (AttributeError at runtime).
    if resp.status_code == 200:
        resp.encoding = "gbk"
        soup4 = BeautifulSoup(resp.text, "html.parser")
        # find() returns a single Tag object (or None), not a list; the
        # attrs filter was lost in scraping -- Biquge conventionally keeps
        # the chapter body in <div id="content"> -- TODO confirm selector.
        div = soup4.find(name="div", attrs={"id": "content"})
        text = div.text if div is not None else ""
        if text:
            # Replace the non-breaking-space paragraph indentation the
            # site uses with plain newlines (reconstructed literal).
            text = text.replace("\xa0\xa0\xa0\xa0", "\n")
            # Append this chapter to the book's txt file; the context
            # manager guarantees the handle is closed.
            with open(bookname + ".txt", "a", encoding="utf-8") as f:
                f.write(chapter + "\n" + text + "\n")

爬蟲筆趣閣例子

from lxml import etree; from selenium.webdriver.support.wait import WebDriverWait; from selenium.webdriver.support import expected_conditions as es; from...

c 筆趣閣小說爬蟲

流年似水,回想上一次博文發表,好像已經是一年多以前,差點就忘了自己是個文件攻城獅的本質,罪過啊。最近在研究爬蟲,python用的不太習慣,還是回歸老本行c 比較好一點,個人又比較喜歡看 所以就選取筆大大做個白老鼠 默哀 寫個爬蟲玩完,迷茫啊。這個專案有幾個比較重要的點 一 正規表示式,參考 二 抓取...

初級爬蟲爬取筆趣閣小說

import requests; from pyquery import PyQuery as pq; def get_content(a): response = requests.get(a); response.encoding = "gbk"; doc = pq(response.text); text = doc; conten...