Python Crawler Learning (7): Problem Notes


1. Proxy IP quality issues

Fetched proxy IPs can go stale, so their quality needs to be checked repeatedly, and URLs that fail should be recorded and re-requested.

Validate when fetching:

# fetch the list of usable proxy IPs (the fetching code itself was stripped from the page)
print(self.urlproxylist)
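Since the fetching code is missing, here is a minimal sketch of what fetch-time validation might look like. It assumes a hypothetical fetchproxycandidates() helper (not in the post) that yields dicts like {'http': 'http://1.2.3.4:8080'}, and the same check_url attribute used later:

import requests

def buildproxylist(self):
    # probe every candidate against a known-good URL; keep only the
    # proxies that answer within the timeout
    self.urlproxylist = []
    for proxy in self.fetchproxycandidates():  # hypothetical helper, not in the post
        try:
            requests.get(self.check_url, proxies=proxy, timeout=2)
        except requests.RequestException:
            continue  # stale or dead proxy: drop it
        self.urlproxylist.append(proxy)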

Validate again at use time:

# imports used by the snippets in this post
import random
import time
import requests
from urllib import request
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

def getusefulproxy(self):
    proxy = random.choice(self.urlproxylist)
    # the original header dict was stripped from the page; placeholder UA
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(self.check_url, headers=header, proxies=proxy, timeout=2)
    except requests.RequestException:
        # this proxy failed the probe: recurse and pick another one
        return annualreport.getusefulproxy(self)
    else:
        return proxy
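One design note: the recursive retry above adds a stack frame per dead proxy, so a mostly-dead pool can hit Python's recursion limit. An iterative variant with a retry cap avoids that; a sketch under the same attribute assumptions:

import random
import requests

def getusefulproxy_iterative(self, max_tries=10):
    # loop instead of recursing so a bad pool cannot blow the stack
    for _ in range(max_tries):
        proxy = random.choice(self.urlproxylist)
        try:
            requests.get(self.check_url, proxies=proxy, timeout=2)
            return proxy
        except requests.RequestException:
            continue  # try another random proxy
    return None  # caller should fall back to a direct request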

2. Without rotating the User-Agent or using IP proxies, repeated requests, or requests with too short an interval, will get your IP detected and banned by some sites.
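A minimal sketch of rotating the User-Agent and jittering the request interval; the UA strings and the politeget name are illustrative, not from the original post:

import random
import time
import requests

# a small hand-picked UA pool; real projects usually rotate many more
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

def politeget(url):
    header = {'User-Agent': random.choice(USER_AGENTS)}
    # randomize the pause so the request rhythm looks less mechanical
    time.sleep(random.uniform(1.0, 2.0))
    return requests.get(url, headers=header, timeout=5)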

3. When crawling with multiple threads, watch out for several threads operating on the same file; a lock sketch follows below.
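If several workers must append to one shared file, a lock keeps their writes from interleaving. A minimal sketch (the appendline helper is illustrative):

import threading

write_lock = threading.Lock()  # one lock shared by all worker threads

def appendline(path, line):
    # only one thread may hold the lock at a time, so lines never interleave
    with write_lock:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(line + '\n')

The code later in this post sidesteps the issue differently: each task derives its own file name from the report URL, so no two threads ever write to the same file.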

4. Combine proxy and non-proxy requests

If you need the full data set and proxies inevitably fail now and then, catch the exception and fall back to a no-proxy method. Have the thread sleep 1-2 s between requests so the site's security measures don't ban you.

def getreportbythreadpool(self):
    print('get reports start...')
    executor = ThreadPoolExecutor(max_workers=4)
    # submit one task per report URL, then wait for every task in the pool to finish
    all_task = [executor.submit(annualreport.getreport, self, reporturl) for reporturl in self.reportlist]
    wait(all_task, return_when=ALL_COMPLETED)
    print('finished fetching the annual reports of the listed companies')

"""

獲取年報列表中的所有年報並寫入txt

"""def getreport(self,reporturl):

try:

# 設定get請求的user-agent,用於偽裝瀏覽器ua

header =

time.sleep(1.5)

response = request.request(reporturl, headers=header)

# 使用proxyhandler方法生成處理器物件

proxy = random.choice(self.urlproxylist)

proxy_handler = request.proxyhandler(proxy)

#建立**ip的opener例項

opener = request.build_opener(proxy_handler)

req = opener.open(response,timeout=2)

#req = request.urlopen(response,timeout=3)

html = req.read().decode("gb18030")

bf = beautifulsoup(html, 'html.parser')

texts = bf.find_all('div', id='content')

file_name = reporturl[reporturl.index('stockid=')+8:reporturl.index('&id=')]

txt_path = '/users/wecash/desktop/report/'+file_name+'.txt'

txt_before = '

' txt_after = '

' with open(txt_path, 'w') as f:

f.write(txt_before)

f.write(str(texts[0]))

f.write(txt_after)

print('使用**獲取年報txt成功')

text2 = bf.find_all('a')

for i in text2:

if i.string == '公告原文':

self.pdfurl = i.get('href')

download_dir = '/users/wecash/desktop/report/'+file_name+'.pdf'

r = requests.get(self.pdfurl, stream=true)

with open(download_dir, 'wb') as f:

for chunk in r.iter_content(chunk_size=1024):

if chunk:

f.write(chunk)

f.flush()

print('使用**獲取年報pdf成功')

return

except baseexception as e:

print('使用**獲取年報失敗:'+str(e))

self.getreportwithoutproxy(reporturl)

else:

print('獲取年報成功')

def getreportwithoutproxy(self, reporturl):
    """Fallback: fetch the report directly, without a proxy."""
    try:
        header = {'User-Agent': 'Mozilla/5.0'}  # placeholder; original value stripped from the page
        response = request.Request(reporturl, headers=header)
        time.sleep(1.5)
        req = request.urlopen(response, timeout=3)
        html = req.read().decode("gb18030")
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', id='content')
        file_name = reporturl[reporturl.index('stockid=') + 8:reporturl.index('&id=')]
        txt_path = '/Users/wecash/Desktop/report/' + file_name + '.txt'
        # empty placeholders again for the stripped wrapper strings
        txt_before = ''
        txt_after = ''
        with open(txt_path, 'w') as f:
            f.write(txt_before)
            f.write(str(texts[0]))
            f.write(txt_after)
        print('fetched annual report txt without proxy')
        text2 = bf.find_all('a')
        for i in text2:
            if i.string == '公告原文':
                self.pdfurl = i.get('href')
        download_dir = '/Users/wecash/Desktop/report/' + file_name + '.pdf'
        r = requests.get(self.pdfurl, stream=True)
        with open(download_dir, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        print('fetched annual report pdf without proxy')
        return
    except BaseException as e:
        print('failed to fetch the annual report: ' + str(e))
    else:
        print('fetched annual report successfully')  # note: unreachable, the try block returns first
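For completeness, a hypothetical driver, assuming annualreport's constructor (not shown in the post) populates check_url, urlproxylist, and reportlist:

if __name__ == '__main__':
    spider = annualreport()
    spider.getreportbythreadpool()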

json是輕量級的資料互動格式 給使用者看的,展示資料的 簡單理解就是乙個字點或者list 書寫格式 不能寫注釋 key value 必須都是雙引號 末尾不能寫逗號 整個檔案有且僅有乙個或 字串 loads coding gbk import json 1 字串和dic list轉換 字串 json...