Crawling all HDU problems with Python


The detailed information for each individual problem has not been crawled yet; to be updated (a rough sketch of that step is included at the end of this post).

# -*- coding:utf-8 -*-
# Crawl the HDU problem list and save it to a local Excel file
import requests
from bs4 import BeautifulSoup
import xlsxwriter

# 2-D list that stores the information of every problem
allproblem = []

# Create the Excel file "hduproblem.xlsx"
workxlsx = xlsxwriter.Workbook('hduproblem.xlsx')
# Create a worksheet
worksheet = workxlsx.add_worksheet()
# Set the sheet formats (the specific format options below are assumed)
worksheet.set_column(1, 1, 74)
ratio_format = workxlsx.add_format({'num_format': '0.00%'})
title_format = workxlsx.add_format({'bold': True, 'align': 'center'})
head_format = workxlsx.add_format({'bold': True})
worksheet.merge_range(0, 0, 0, 2, 'hdu online judge problem', title_format)

# Fetch the HTML source of a page
def gethtmltext(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        # r.encoding = 'utf-8'
        return r.text
    except:
        return ''

# Use BeautifulSoup to walk the tag tree and pull out the information we need
def fillproblemlist(soup):
    oneproblemlist = []
    # the problem-list table (the attribute filter here is an assumption)
    table = soup.find('table', {'class': 'table_text'})
    problemlist = table.get_text()
    # drop the header text that precedes the first problem entry
    problemlist = problemlist[47:]
    # split the page text into one entry per problem
    temponepageproblemlist = problemlist.split(';')
    # an HDU page does not always hold a full 100 problems; a few are missing in between
    pageproblemnum = len(temponepageproblemlist)
    for problem in range(pageproblemnum - 1):
        temponeproblemlist = temponepageproblemlist[problem][4:-1]
        # split one problem entry into its individual fields
        oneproblemlist = temponeproblemlist.split(',')
        oneproblemlist[2] = oneproblemlist[2][1:-1]
        # clean up the fields in a loop (compute the acceptance ratio); a title may
        # contain an unknown number of ',' and therefore be split across several fields
        flag = 1
        while flag:
            try:
                oneproblemlist[3] = float(oneproblemlist[3]) / float(oneproblemlist[4])
                flag = 0
            except:
                oneproblemlist[2] = oneproblemlist[2] + oneproblemlist[3]
                del oneproblemlist[3]
                try:
                    oneproblemlist[3] = float(oneproblemlist[3]) / float(oneproblemlist[4])
                    flag = 0
                except:
                    pass
        oneproblemlist.pop()
        allproblem.append(oneproblemlist)  # keep the cleaned record so main() can write it

def main(pages):
    # sheet header: problem id, (skipped), title, acceptance ratio
    sheethead = ['題號', '', '題目', '正確率']
    headcol = 0
    for head in range(len(sheethead)):
        if head == 1:
            continue
        worksheet.write(1, headcol, sheethead[head], head_format)
        headcol += 1
    # as of 2023-06-14 there are 53 pages of problems
    for page in range(1, pages + 1):
        # HDU problem-list URL (vol = page number)
        url = 'http://acm.hdu.edu.cn/listproblem.php?vol=' + str(page)
        html = gethtmltext(url)
        soup = BeautifulSoup(html, 'html.parser')
        fillproblemlist(soup)
    # write the information into the sheet
    row = 2
    for problem in allproblem:
        col = 0
        for i in range(0, len(problem)):
            if i == 1:
                continue
            elif i == 3:
                worksheet.write(row, col, problem[i], ratio_format)
            else:
                worksheet.write(row, col, problem[i])
            col += 1
        row += 1
    workxlsx.close()

if __name__ == '__main__':
    main(53)
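The least obvious part of the listing is the while loop in fillproblemlist: because each entry is split on ',', a problem title that itself contains commas gets scattered across several fields, and the loop keeps gluing field 3 back onto the title until field 3 parses as a number (the accepted count). Below is a minimal standalone illustration of that repair; the record is made up purely for demonstration and simply mirrors the field order the crawler assumes (id, an ignored field, a quoted title, accepted, submitted).

# Standalone illustration of the comma-repair loop from fillproblemlist.
# The record below is invented for demonstration only.
fields = '1024,20,"Hello, World",321,4567'.split(',')
fields[2] = fields[2][1:-1]      # strip the surrounding quotes from the title

flag = 1
while flag:
    try:
        # succeeds once fields[3] really is the accepted count
        fields[3] = float(fields[3]) / float(fields[4])
        flag = 0
    except ValueError:
        # fields[3] is still a piece of the title: merge it back and retry
        fields[2] = fields[2] + fields[3]
        del fields[3]

fields.pop()                     # drop the raw submission count
print(fields)                    # title pieces merged, fields[3] is now the ratio

In the real crawler the same loop runs on every record, so titles without commas simply pass through on the first iteration.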

Run result:
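As noted at the top, the per-problem details are not crawled yet. Here is a minimal sketch of how a single problem page could be fetched, assuming the usual HDU URL pattern showproblem.php?pid=N; the <h1> and the 'panel_content' class used to pick out the title and sections are guesses and may need adjusting against the real page.

# Rough sketch for the planned per-problem crawl (not part of the listing above).
import requests
from bs4 import BeautifulSoup

def getproblemdetail(pid):
    # assumed URL pattern for a single HDU problem page
    url = 'http://acm.hdu.edu.cn/showproblem.php?pid=' + str(pid)
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1')                                        # assumed: title in the first <h1>
    panels = soup.find_all('div', {'class': 'panel_content'})      # assumed class name for the sections
    return {
        'pid': pid,
        'title': title.get_text(strip=True) if title else '',
        'sections': [p.get_text(strip=True) for p in panels],
    }

if __name__ == '__main__':
    print(getproblemdetail(1000))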
