Fetching All Campus News

2022-06-04 18:21:09 · 4,538 characters · 4,039 views

1. Extract all the news items on a single news list page, wrapped up as a function.

2. Get the total number of news articles and work out the total number of list pages (a rough sketch of this arithmetic follows the list).

3. Fetch the full details of every news item on every list page.

4. Pick a topic you are personally interested in, crawl its data, and run a word-segmentation analysis on it. It must not duplicate any other student's work.
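For item 2, the page count follows from a ceiling division of the article total by the page size. A minimal sketch, assuming ten articles per list page (the per-page count is an assumption, not something stated in the post):

import math

def total_pages(total_news, per_page=10):
    # per_page is an assumed page size; change it to whatever the list page really shows
    return math.ceil(total_news / per_page)

print(total_pages(238))   # 238 articles at 10 per page -> 24 list pages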

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import jieba

def countword(content):
    # word-frequency statistics over the article text, using jieba for segmentation
    wordlist = list(jieba.cut(content))
    worddict = {}
    for word in wordlist:
        if len(word) == 1:  # skip single-character tokens
            continue
        worddict[word] = wordlist.count(word)
    wordlistsort = sorted(worddict.items(), key=lambda d: d[1], reverse=True)
    return wordlistsort
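A side note on countword: calling wordlist.count(word) inside the loop rescans the whole token list for every distinct word, which gets slow on long articles. A hedged alternative sketch using collections.Counter (the name count_word_fast is mine, not from the original) gives the same ranking in a single pass:

from collections import Counter
import jieba

def count_word_fast(content):
    # one-pass frequency count; single-character tokens are dropped, as in countword above
    words = [w for w in jieba.cut(content) if len(w) > 1]
    return Counter(words).most_common()

# count_word_fast(text)[:5] gives the five most frequent words, like countword(text)[0:5]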

def getclickcount(newsurl):  # get the click count of one article
    newid = re.search('_(.*)/(.*).html', newsurl).group(2)
    clickurl = ''.format(newid)  # click-count API URL omitted in the original post; it took newid via a {} placeholder
    clickstr = requests.get(clickurl).text
    return re.search(r"hits'\).html\('(.*)'\);", clickstr).group(1)

def getnewsdetail(newsurl):
    # fetch one article page, print its details and return them as a dict
    newsdist = {}
    resp = requests.get(newsurl)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')

    title = soup.select('.show-title')[0].text
    showinfo = soup.select('.show-info')[0].text
    time = showinfo.lstrip('發布時間:')[0:19]
    dtime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')

    author = 'none'
    if showinfo.find('作者:') > 0:
        author = showinfo[showinfo.find('作者:'):].split()[0].lstrip('作者:')
    audit = 'none'
    if showinfo.find('審核:') > 0:
        audit = showinfo[showinfo.find('審核:'):].split()[0].lstrip('審核:')
    origin = 'none'
    if showinfo.find('來源:') > 0:
        origin = showinfo[showinfo.find('來源:'):].split()[0].lstrip('來源:')
    photography = 'none'
    if showinfo.find('攝影:') > 0:
        photography = showinfo[showinfo.find('攝影:'):].split()[0].lstrip('攝影:')

    content = soup.select('#content')[0].text.strip().replace('\u3000', '').replace('\n', '').replace('\r', '')
    clicks = getclickcount(newsurl)

    print('\n\n' + '*' * 10 + '分界線' + '*' * 10)
    print('標題:' + title)
    print('發布時間:{}'.format(dtime))
    print('作者:' + author)
    print('審核:' + audit)
    print('來源:' + origin)
    print('攝影:' + photography)
    print('點選次數:' + clicks)
    print('新聞正文:' + content)
    print('詞頻分析:' + str(countword(content)[0:5]))

    newsdist['title'] = title
    newsdist['audit'] = audit
    newsdist['origin'] = origin
    newsdist['photography'] = photography
    newsdist['clicktime'] = clicks
    newsdist['content'] = content
    return newsdist
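The author/audit/origin/photography branches in getnewsdetail all repeat the same find-then-strip pattern. A possible refactoring sketch (the helper name extract_field is mine, not from the original) that reads any labelled field out of the show-info text:

def extract_field(showinfo, label, default='none'):
    # return the text that follows `label` up to the next whitespace, or `default` if the label is absent
    pos = showinfo.find(label)
    if pos < 0:
        return default
    return showinfo[pos:].split()[0][len(label):]

# author = extract_field(showinfo, '作者:')
# audit  = extract_field(showinfo, '審核:')

Slicing by len(label) also sidesteps the lstrip pitfall above, where lstrip('作者:') strips any leading 作, 者 or : characters individually rather than the exact prefix.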

def getfirstpage(soup):
    # parse the already-fetched first list page
    fistdistlist = []
    newspage = soup.select('.news-list > li')
    for news in newspage:
        newsurl = news.select('a')[0].attrs['href']
        fistdistlist.append(getnewsdetail(newsurl))  # collect the detail dict of every linked article
    return fistdistlist

def getpage(pageurl):
    # fetch one list page by URL and parse it the same way
    disrlist = []
    pageresp = requests.get(pageurl)
    pageresp.encoding = 'utf-8'
    pagesoup = BeautifulSoup(pageresp.text, 'html.parser')
    newspage = pagesoup.select('.news-list > li')
    for news in newspage:
        newsurl = news.select('a')[0].attrs['href']
        disrlist.append(getnewsdetail(newsurl))  # collect the detail dict of every linked article
    # print(disrlist)
    return disrlist
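getfirstpage and getpage do the same parsing work, one on a soup that is already in hand and one on a URL. A possible consolidation (the name parse_list_page is mine) so the per-page logic lives in one place:

def parse_list_page(pagesoup):
    # collect the detail dict of every article linked from one list page
    details = []
    for news in pagesoup.select('.news-list > li'):
        newsurl = news.select('a')[0].attrs['href']
        details.append(getnewsdetail(newsurl))
    return details

# getfirstpage(soup) then reduces to parse_list_page(soup), and getpage(pageurl)
# to fetching the page, building the soup, and calling parse_list_page on it.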

firstpageurl = ''  # URL of the first news list page (omitted in the original post)
firstpageresp = requests.get(firstpageurl)
firstpageresp.encoding = 'utf-8'
firstpagesoup = BeautifulSoup(firstpageresp.text, 'html.parser')

totledistlist = []
totledistlist.extend(getfirstpage(firstpagesoup))

# the second-to-last link in the pager holds the number of the last list page
pagemaxnum = int(firstpagesoup.select('#pages > a')[len(firstpagesoup.select('#pages > a')) - 2].text)

for i in range(2, 10):  # only pages 2-9 here; range(2, pagemaxnum + 1) would cover every page
    pageurl = '{}.html'.format(i)  # list-page URL prefix omitted in the original post
    totledistlist.extend(getpage(pageurl))
# print(totledistlist)
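For item 4, the collected dictionaries can be filtered by a topic keyword and fed back through the same jieba counter. A rough sketch, assuming totledistlist has been filled as above (the keyword '體育', sports, is only an example; pick your own topic):

topic = '體育'  # example keyword only
topic_text = ''.join(d['content'] for d in totledistlist if topic in d['content'])
if topic_text:
    print('topic word frequencies:', countword(topic_text)[0:10])
else:
    print('no articles matched the topic keyword')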

The results are as follows:
