Python之爬蟲學習（四）

import os
import random
import time
from urllib import error
from urllib import request

from bs4 import BeautifulSoup
class annualreport(object):
    """Fetch annual-report pages and save their main content as text files.

    ``getlist`` requests every URL in ``companylist``; ``getreport`` requests
    every URL in ``reportlist`` and writes the first ``<div id="content">``
    element of each page to a numbered ``.txt`` file in ``output_dir``.

    NOTE(review): this class was reconstructed from a copy that had lost its
    indentation, identifier casing and several expressions. The list
    initialisers, the ``User-Agent`` header and the ``except`` clause of
    ``getlist`` are restored from the surrounding comments and from the
    parallel code in ``getreport`` -- confirm against the original file.
    """

    def __init__(self, output_dir='/users/wecash/desktop/annualreport/'):
        """Initialise empty URL lists, the file counter and the User-Agent pool.

        Args:
            output_dir: Directory that ``getreport`` writes ``<n>.txt`` files
                into. Defaults to the path that was hard-coded originally.
        """
        self.base = ''
        self.companylist = []  # company page URLs consumed by getlist()
        self.reportlist = []   # report page URLs consumed by getreport()
        self.count = 0         # index used to name the next output file
        self.output_dir = output_dir
        # Pool of User-Agent strings; one is picked at random per request.
        self.user_agents = [
            "mozilla/4.0 (compatible; msie 6.0; windows nt 5.1; sv1; acoobrowser; .net clr 1.1.4322; .net clr 2.0.50727)",
            "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; acoo browser; slcc1; .net clr 2.0.50727; media center pc 5.0; .net clr 3.0.04506)",
            "mozilla/4.0 (compatible; msie 7.0; aol 9.5; aolbuild 4337.35; windows nt 5.1; .net clr 1.1.4322; .net clr 2.0.50727)",
            "mozilla/5.0 (windows; u; msie 9.0; windows nt 9.0; en-us)",
            "mozilla/5.0 (compatible; msie 9.0; windows nt 6.1; win64; x64; trident/5.0; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 2.0.50727; media center pc 6.0)",
            "mozilla/5.0 (compatible; msie 8.0; windows nt 6.0; trident/4.0; wow64; trident/4.0; slcc2; .net clr 2.0.50727; .net clr 3.5.30729; .net clr 3.0.30729; .net clr 1.0.3705; .net clr 1.1.4322)",
            "mozilla/4.0 (compatible; msie 7.0b; windows nt 5.2; .net clr 1.1.4322; .net clr 2.0.50727; infopath.2; .net clr 3.0.04506.30)",
            "mozilla/5.0 (windows; u; windows nt 5.1; en-us; rv:1.8.1.2pre) gecko/20070215 k-ninja/2.1.1",
            "mozilla/5.0 (windows; u; windows nt 5.1; zh-cn; rv:1.9) gecko/20080705 firefox/3.0 kapiko/3.0",
            "mozilla/5.0 (x11; linux i686; u;) gecko/20070322 kazehakase/0.4.5",
            "mozilla/5.0 (x11; u; linux i686; en-us; rv:1.9.0.8) gecko fedora/1.9.0.8-1.fc10 kazehakase/0.5.6",
            "opera/9.80 (macintosh; intel mac os x 10.6.8; u; fr) presto/2.9.168 version/11.52",
        ]

    def getiplist(self):
        """Placeholder for the proxy-IP setup step called by the entry point.

        NOTE(review): only the comment header of this step survives in the
        available copy; its body is missing. The request code below does not
        use a proxy, so this is deliberately a no-op -- restore the real
        implementation if proxies are required.
        """

    def getcompanylist(self):
        """Placeholder for the step that fills ``self.companylist``.

        NOTE(review): only the comment header of this step survives in the
        available copy; its body is missing. Until it is restored, populate
        ``self.companylist`` directly before calling ``getlist``.
        """

    def _build_request(self, url):
        """Return a GET request for *url* with a randomly chosen User-Agent.

        Raises:
            ValueError: If *url* is not a valid URL (raised by ``Request``).
        """
        header = {'User-Agent': random.choice(self.user_agents)}
        return request.Request(url, headers=header)

    def getlist(self):
        """Request every URL in ``self.companylist`` and report the outcome.

        Each failure is printed together with the offending URL and the loop
        continues with the next URL; nothing is raised to the caller.

        NOTE(review): the surviving code only opens each page; whatever
        extracted report links into ``self.reportlist`` is not present in the
        available copy.
        """
        for companyurl in self.companylist:
            try:
                req = self._build_request(companyurl)
                time.sleep(1)  # throttle to roughly one request per second
                # Open and immediately close so the connection is not leaked.
                with request.urlopen(req):
                    pass
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C still stops the run.
                print('獲取年報列表失敗：'+str(e))
                print(companyurl)
            else:
                print('請求成功通過。')

    def getreport(self):
        """Download every URL in ``self.reportlist`` and save it as text.

        Each page is decoded as GB18030 and parsed with ``html.parser``. The
        first ``<div id="content">`` element is written to
        ``<output_dir>/<count>.txt`` as UTF-8. ``self.count`` advances only
        after a successful write, so failed downloads leave no gaps in the
        numbering. Each failure is printed together with the offending URL
        and the loop continues; nothing is raised to the caller.
        """
        for reporturl in self.reportlist:
            try:
                req = self._build_request(reporturl)
                time.sleep(1)  # throttle to roughly one request per second
                with request.urlopen(req) as resp:
                    html = resp.read().decode("gb18030")
                bf = BeautifulSoup(html, 'html.parser')
                texts = bf.find_all('div', id='content')
                if not texts:
                    # Fail before opening the file so no empty file is left.
                    raise ValueError('no <div id="content"> element found')
                txtname = os.path.join(self.output_dir, str(self.count) + '.txt')
                # Explicit encoding: the platform default may not be able to
                # represent the Chinese text being written.
                with open(txtname, 'w', encoding='utf-8') as f:
                    f.write(str(texts[0]))
                self.count = self.count + 1
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C still stops the run.
                print('獲取年報失敗：'+str(e))
                print(reporturl)
            else:
                print('獲取年報成功')
if __name__ == "__main__":
    # Script entry point: run the crawl steps in order.
    ob = annualreport()
    ob.getiplist()       # proxy-IP setup step
    ob.getcompanylist()  # company-list step; getlist() iterates ob.companylist
    ob.getlist()         # request every company page
    ob.getreport()       # download every report page and write it to disk

python爬蟲之urllib 四

每個網站都會定義 robots.txt 檔案，這個檔案可以告訴網路爬蟲爬取該網站時存在哪些限制。作為良好網民，也為了其他人的利益，一般應遵從這些限制。如何檢視這個檔案？可以通過在目標站點或網域名稱後面加上 /robots.txt 進行訪問。例如目標站點的 robots.txt 檔案就是 robots.tx...

Python網路爬蟲學習筆記（四）

鏈結爬蟲實現思路：1. 確定好要爬取的入口鏈結；2. 根據需求構建好鏈結提取的正規表示式；3. 模擬成瀏覽器並爬取對應網頁；4. 根據第 2 步中的正規表示式提取出該網頁中包含的鏈結；5. 過濾掉重複的鏈結；6. 後續操作，例如列印這些鏈結。以下程式是獲取網頁上的所有鏈結： import re import urllib.request i...

爬蟲 Python爬蟲學習筆記之Urllib庫

1. urllib.request：開啟和讀取 url；2. urllib.error：包含 urllib.request 各種錯誤的模組；3. urllib.parse：解析 url；4. urllib.robotparser：解析 robots.txt 檔案。傳送 get 請求：引入 urlopen 庫用於開啟網頁 from u...

Python之爬蟲學習（四）

python爬蟲之urllib 四

Python網路爬蟲學習筆記（四）

爬蟲 Python爬蟲學習筆記之Urllib庫

相關推薦