獲取全部校園新聞

2022-06-02 06:06:07 字數 4751 閱讀 4640

1.取出一個新聞列表頁的全部新聞,包裝成函式。

2.獲取總的新聞篇數,算出新聞總頁數。

3.獲取全部新聞列表頁的全部新聞詳情。

import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

獲取新聞點選次數

def getnewsid(url):
    """Return the click count for one news article, fetched from the count API."""
    # The article id is the last 4 characters of the "_<id>.html" part of the URL.
    newsid = re.findall(r'\_(.*).html', url)[0][-4:]
    # NOTE(review): the count-API URL template was stripped by the blog platform;
    # it must contain a '{}' placeholder for the id -- restore before running.
    clickurl = ''.format(newsid)
    clickres = requests.get(clickurl)
    # The API returns a jQuery snippet like "...hits').html('123');" --
    # extract the number with a regex.
    clickcount = int(re.search(r"hits'\).html\('(.*)'\);", clickres.text).group(1))
    return clickcount

def getnewsdetail(newsurl):
    """Fetch one news article and print its metadata and body text."""
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#content')[0].text
    info = soupd.select('.show-info')[0].text
    # Click count via the count API.
    count = getnewsid(newsurl)
    # Publication timestamp, e.g. "2018-04-04 11:22:33".
    date = re.search(r'(\d.\d.\d\s\d.\d.\d)', info).group(1)
    # Optional metadata fields.  Default to '' so the print below never raises
    # NameError when a field is absent -- the original left them unbound.
    # NOTE(review): the field labels and capture patterns were stripped by the
    # blog platform (presumably '作者:', '審核:', '來源:') -- restore before running.
    author = check = sources = ''
    if info.find('') > 0:
        author = re.search('', info).group(1)
    if info.find('') > 0:
        check = re.search('', info).group(1)
    if info.find('') > 0:
        sources = re.search('', info).group(1)
    # Parse the timestamp.  Use a local name instead of shadowing the imported
    # `datetime` class as the original did; format directives restored to the
    # correct case (the blog platform lowercased '%Y-%m-%d %H:%M:%S').
    dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    # NOTE(review): the output template was stripped by the blog platform.
    print(''.format(dt, author, check, sources, count))
    print(content)

def getlistpage(listurl):
    """Print title/description/URL for news items on one list page and show
    each article's details via getnewsdetail()."""
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('li'):
        # Only <li> elements that carry a news title are actual articles.
        if len(new.select('.news-list-title')) > 0:
            title = new.select('.news-list-title')[0].text
            description = new.select('.news-list-description')[0].text
            newsurl = new.select('a')[0]['href']
            # NOTE(review): the output template was stripped by the blog platform.
            print(''.format(title, description, newsurl))
            getnewsdetail(newsurl)
            break  # the original stops after the first article on the page

# NOTE(review): the list-page base URL was stripped by the blog platform.
listurl = ''
getlistpage(listurl)

# Work out the total number of list pages from the article count shown in the
# '.a1' element (e.g. "123條" -> 13 pages of 10 articles each).
res = requests.get(listurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
listcount = int(soup.select('.a1')[0].text.rstrip('條')) // 10 + 1

# Crawl the remaining pages (page 1 was handled above).
for i in range(2, listcount):
    # NOTE(review): only the '{}.html' tail of the page-URL template survived
    # the blog platform -- restore the base-URL prefix before running.
    listurl = '{}.html'.format(i)
    getlistpage(listurl)

4.找一個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

# NOTE(review): the topic list-page URL was stripped by the blog platform --
# fill it in before running.
newsurl = ''

def sort(text):
    """Strip punctuation, segment *text* with jieba and print the five most
    frequent words."""
    # Characters removed before segmentation.  Named so it does not shadow the
    # builtin `str` as the original did.
    punctuation = '''一!「」,。?;』"',.、:\n'''
    for ch in punctuation:
        text = text.replace(ch, '')
    wordlist = list(jieba.cut(text))
    # NOTE(review): the exclusion-set literal was stripped by the blog
    # platform; an empty set preserves the surviving code's behavior.
    exclude = set()
    distinct = set(wordlist) - exclude
    # Word -> frequency map (does not shadow the builtin `dict`).
    counts = {}
    for word in distinct:
        counts[word] = wordlist.count(word)
    ranked = list(counts.items())
    # `true` in the original was a lowercasing artifact of the blog platform.
    ranked.sort(key=lambda item: item[1], reverse=True)
    print("")
    for i in range(5):
        print(ranked[i])

def getcontent(url):
    """Print the author and body of one article page, then run the
    word-frequency analysis on the body."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("作者", author)
    # Remove the trailing ad-tracker snippet.  The original used rstrip(),
    # which strips any run of *characters from the set*, not the suffix
    # string, so it could eat legitimate trailing text.
    suffix = 'ad_survey_add_adpos("7000531");'
    content = soup2.select('.la_con')[0].text
    if content.endswith(suffix):
        content = content[:-len(suffix)]
    print("正文:", content)
    sort(content)

def getnewdetails(newsurl):
    """Print title/link/brief/time for every '.item' on the topic list page
    and fetch each article's content via getcontent()."""
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        title = news.select('a')[0].attrs['title']
        link = news.select('a')[0].attrs['href']
        # Remove the trailing "[詳細]" marker.  The original's rstrip()
        # removed any trailing characters from that set, not the suffix.
        brief = news.select('h5')[0].text
        if brief.endswith('[詳細]'):
            brief = brief[:-len('[詳細]')]
        pub_time = news.select('h6')[0].text
        # Format directives restored to the correct case ('%Y-%m-%d %H:%M');
        # the blog platform's lowercased '%h' is not a valid directive and
        # would raise ValueError.
        dt = datetime.strptime(pub_time, '%Y-%m-%d %H:%M')
        # NOTE(review): most print labels were stripped by the blog platform.
        print("", title)
        print("", link)
        print("內容簡介:", brief)
        print("", dt)
        getcontent(link)
        print('\n')

# Entry point.  getnewdetails() fetches and parses the page itself, so the
# extra request/parse the original performed here (into unused `res`/`soup`
# variables) was redundant work and is dropped.
getnewdetails(newsurl)

獲取全部校園新聞

1.取出乙個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。4.找乙個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。import requests from bs4 import beautifulsoup from...

獲取全部校園新聞

1.取出乙個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。import requests from bs4 import beautifulsoup from datetime import datetime importre 獲得新聞...

獲取全部校園新聞

1.取出乙個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。4.找乙個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。import requests from bs4 import beautifulsoup from...