爬蟲爬取桌面桌布

2021-10-10 06:18:20 字數 4455 閱讀 7489

import os
import random
import time

import requests
# bs4 exports the class as `BeautifulSoup`; alias it so the lowercase name
# used throughout this (case-flattened) script keeps working.
from bs4 import BeautifulSoup as beautifulsoup

import useragent  # project-local module supplying request headers — not stdlib

interval = 3 # seconds to sleep between downloads (politeness delay)

firstdir = 'd:/netbian' # root directory for all downloaded wallpapers

classificationdict = {} # category name -> {'url': ..., 'path': ...}; filled by init_classification()

# 獲取頁面篩選後的內容列表

def screen(url, select):
    """Fetch *url* and return the list of tags matching CSS selector *select*.

    Picks request headers from the project-local ``useragent`` module,
    decodes the page as GBK (the site's encoding) and runs the selector
    with BeautifulSoup/lxml.  Returns whatever ``soup.select`` yields,
    possibly an empty list.
    """
    # NOTE(review): the original headers assignment was lost when this
    # article was scraped. `useragent` is project-local; assumed to expose
    # get_headers() returning a dict with a randomised User-Agent — confirm
    # its real API before running.
    headers = useragent.get_headers()
    html = requests.get(url=url, headers=headers)
    html.encoding = 'gbk'  # the site serves GBK-encoded pages
    html = html.text
    soup = beautifulsoup(html, 'lxml')
    return soup.select(select)

# Get the last page number of a category listing.
# NOTE(review): this definition was destroyed by scraping; reconstructed from
# the call site `int(screenpage(secondurl, '#main > div.page > span.slh'))` —
# the matched span is the "..." placeholder and its next sibling anchor holds
# the last page number. Confirm against the live site.
def screenpage(url, select):
    """Return the last-page number (as a string) found next to *select* on *url*."""
    return screen(url, select)[0].next_sibling.text


# NOTE(review): the `def` line and opening statements of this function were
# lost in scraping; reconstructed from the surviving body and the call
# `download(src, name, path)` in handleimgs().
def download(src, name, path):
    """Download the image at *src* into directory *path* as ``name.jpg``.

    If the target filename already exists, a random digit is appended
    until a free name is found (matches the surviving dedup loop).
    """
    response = requests.get(src)
    path = path + '/' + name + '.jpg'
    while os.path.exists(path):  # de-duplicate repeated titles
        path = path.split(".")[0] + str(random.randint(2, 17)) + '.' + path.split(".")[1]
    with open(path, 'wb') as pic:
        for chunk in response.iter_content(128):
            pic.write(chunk)

def handleimgs(links, path):
    """Download every wallpaper behind *links* into directory *path*.

    Each listing entry needs two jumps: listing page -> picture page
    (where the 1920x1080 link lives) -> full-size image page. Entries
    that are ads, missing, login-only or 404 are skipped.
    """
    for link in links:
        href = link.get('href')
        # Ad filter. NOTE(review): the compared URL was scrubbed to '' in the
        # scraped source — confirm the real ad-link value.
        if href == '':
            continue
        # First jump: href may already be absolute, otherwise prefix the
        # site root. NOTE(review): this condition was lost in scraping and
        # has been reconstructed from the surviving if/else skeleton.
        if href.startswith('http'):
            url = href
        else:
            url = index + href
        # Locate the 1920x1080 download link on the picture page.
        select = 'div#main div.endpage div.pic div.pic-down a'
        link = screen(url, select)
        if not link:  # original compared against an empty list (garbled in source)
            print(url + ' 無此,爬取失敗')
            continue
        href = link[0].get('href')
        # Second jump: to the page embedding the full-size image.
        url = index + href
        select = 'div#main table a img'
        link = screen(url, select)
        if not link:
            print(url + " 該需要登入才能爬取,爬取失敗")
            continue
        # Strip characters that are illegal in Windows filenames
        # (equivalent to the original chain of .replace() calls).
        name = link[0].get('alt')
        for ch in '\t|:\\/*?"<>':
            name = name.replace(ch, '')
        src = link[0].get('src')
        if requests.get(src).status_code == 404:
            print()  # NOTE(review): failure message text was scrubbed in the source
            continue
        print()  # NOTE(review): progress message text was scrubbed in the source
        download(src, name, path)
        time.sleep(interval)  # politeness delay between downloads

def select_classification(choice):
    """Crawl every listing page of category *choice* and download its images.

    Looks the category's URL and target directory up in classificationdict,
    creates the directory if needed, reads the last page number, then walks
    page 1..last handing each page's links to handleimgs().
    """
    banner = '--------------' + choice + '-------------'
    print('---------------------------')
    print(banner)
    print('---------------------------')
    info = classificationdict[choice]
    secondurl = info['url']
    seconddir = info['path']
    if not os.path.exists(seconddir):
        os.mkdir(seconddir)  # per-category directory
    # The span matched below sits next to the last-page number.
    lastpagenum = int(screenpage(secondurl, '#main > div.page > span.slh'))
    for page in range(1, lastpagenum + 1):
        # Page 1 is the bare category URL; later pages use index_<n>.htm.
        url = secondurl if page == 1 else secondurl + 'index_%d.htm' % page
        print('--------------' + choice + ': ' + str(page) + '-------------')
        links = screen(url, 'div#main div.list ul li a')
        handleimgs(links, seconddir)

def ui():
    """Print the category menu, read a choice, and dispatch the crawl.

    '全部' crawls every category; an unknown name re-prompts recursively.
    """
    print('--------------netbian-------------')
    print('全部', end=' ')
    for name in classificationdict:
        print(name, end=' ')
    print()
    choice = input('請輸入分類名:')
    if choice == '全部':
        for name in classificationdict:
            select_classification(name)
    elif choice in classificationdict:
        select_classification(choice)
    else:
        print("輸入錯誤,請重新輸入!")
        print('----')
        ui()  # re-prompt on bad input

# 將分類子頁面資訊存放在字典中

def init_classification():
    """Populate classificationdict from the site's header navigation.

    Each entry maps a category name to its listing URL and the local
    directory its images will be saved under.
    """
    select = '#header > div.head > ul > li:nth-child(1) > div > a'
    # '#header'        -> element with id="header"
    # 'div.head'       -> div with class="head"
    # 'li:nth-child(1)'-> first child list item of its parent
    classifications = screen(index, select)
    for c in classifications:
        href = c.get('href')  # relative address of the category page
        text = c.string       # category display name
        if text == '4k桌布':  # skipped: cannot be crawled due to permissions
            continue
        seconddir = firstdir + '/' + text  # local directory for this category
        url = index + href                 # absolute category URL
        # NOTE(review): the dict value was truncated in the scraped source;
        # reconstructed from the reads in select_classification()
        # (info['url'] and info['path']).
        classificationdict[text] = {'url': url, 'path': seconddir}

def main():
    """Entry point: ensure the root directory exists, build the category map, show the menu."""
    root_missing = not os.path.exists(firstdir)
    if root_missing:
        os.mkdir(firstdir)  # create the root download directory on first run
    init_classification()
    ui()

# Script entry point — crawl only when executed directly, not on import.
# (Indentation of the guarded call was lost in scraping; restored here.)
if __name__ == '__main__':
    main()

python爬取彼岸桌面桌布

1.目標站點分析 進入 經過f12分析,url都儲存在 2.選擇爬取工具,這裡網頁比較簡單,就採用requests庫和正則.import requests import osimport reimport time 主頁 main urls headers ifnot os.path.exists ...

Python 爬取高畫質桌面桌布

今天寫了一個指令碼用來爬取zol桌面桌布 的高畫質 如下 coding utf 8 import urllib import re import time class spider baseurl pic index 0 itemgrouppic def init self,page count t...

框架 MFC 修改桌面 桌布

功 能 使用 iactivedesktop 介面獲取 設定和重新整理桌面背景 桌布 開發環境 vc vs2005 vs2008 vs2010 vs2012 vs2013 新建專案 mfc應用程式 基於對話方塊 include include shlobj.h shlwapi.h 包含了對檔案判別的a...