資料結構化與儲存

2022-08-22 21:36:08 字數 2852 閱讀 6358

# --- Script setup: fetch the news list page and parse it ---
import requests
from bs4 import BeautifulSoup  # bs4 exports `BeautifulSoup`, not `beautifulsoup`
from datetime import datetime
import re
import pandas
import openpyxl
import sqlite3

# NOTE(review): `url` was stripped when this post was scraped; fill in the
# news-list page URL before running.
url = ""

res = requests.get(url)
res.encoding = "utf-8"  # force UTF-8 so Chinese text decodes correctly
soup = BeautifulSoup(res.text, "html.parser")

def writenewsdetails(contents):
    """Append *contents* to gzccnews.txt (UTF-8).

    The original opened and closed the handle manually; `with` guarantees
    the file is closed even if the write raises.
    """
    with open('gzccnews.txt', "a", encoding="utf-8") as f:
        f.write(contents)

def getclickcount(newurl):
    """Query the click-count endpoint for the article at *newurl* and return the count as int.

    NOTE(review): the counter endpoint's URL prefix was stripped from the
    original post (the format string starts with a bare " {}&modelid=80");
    restore the real URL before use.
    """
    # Pull the id between '_' and '.html', e.g. ..._123/11029.html -> 11029.
    # Raw string + escaped dot: the original "\_(.*).html" relied on '.'
    # matching a literal dot by accident.
    newsid = re.findall(r"_(.*)\.html", newurl)[0].split("/")[-1]
    res = requests.get(" {}&modelid=80".format(newsid))
    # The response is JSONP-like text ending in "...html('1234');" — slice
    # out the number between the quotes.
    return int(res.text.split(".html")[-1].lstrip("('").rsplit("');")[0])

# Fetch one news detail page (獲取新聞詳情)
def getnewdetails(newsdetailurl):
    """Fetch a news detail page and return a dict with title/source/content/click.

    Side effects: appends the article body to gzccnews.txt via
    writenewsdetails, and issues two HTTP requests (page + click counter).

    NOTE(review): the `if` branch that parsed the source out of *info* was
    lost when this post was scraped (only the dangling `else:` survived);
    the check below is a best-effort reconstruction — confirm against the
    page's ".show-info" text format.
    """
    detail_res = requests.get(newsdetailurl)
    detail_res.encoding = "utf-8"
    detail_soup = BeautifulSoup(detail_res.text, "html.parser")

    news = {}
    news['title'] = detail_soup.select(".show-title")[0].text
    info = detail_soup.select(".show-info")[0].text
    # Reconstructed: take the token after the '來源:' label when present,
    # otherwise fall back to 'none' (the surviving else-branch).
    if '來源:' in info:
        news['source'] = info[info.find('來源:'):].split()[0].lstrip('來源:')
    else:
        news['source'] = 'none'
    news['content'] = detail_soup.select("#content")[0].text
    # Original called the misspelled `writedetailnews`; the helper defined
    # earlier in this file is `writenewsdetails`.
    writenewsdetails(news['content'])
    news['click'] = getclickcount(newsdetailurl)
    return news

# Compute the total number of list pages (獲取總頁數)
def getpagen(url):
    """Return the number of list pages at *url*, assuming 10 items per page."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # ".a1" holds text like "1234條" (N items); strip the unit, then derive
    # the page count with integer division (+1 for the partial last page).
    return int(soup.select(".a1")[0].text.rstrip("條")) // 10 + 1

# Collect every news record on one list page (獲取新聞一頁的所有資訊)
def getlistpage(url):
    """Return a list of news dicts for each linked article on the list page.

    NOTE(review): as scraped, the loop body appended nothing (the
    `newslist.append(...)` line was lost) and the function parses the
    module-level `soup` rather than fetching *url* — both reconstructed
    conservatively here; confirm against the original tutorial.
    """
    newslist = []
    for news in soup.select("li"):
        # Skip decorative <li> items that carry no news title.
        if len(news.select(".news-list-title")) > 0:
            detail_url = news.select('a')[0].attrs['href']
            newslist.append(getnewdetails(detail_url))
    return newslist

# --- Crawl every list page, then build and export a DataFrame ---
newstotal = []
totalpagenum = getpagen(url)
# NOTE(review): the first-page URL was stripped from the original post.
firstpageurl = ""
newstotal.extend(getlistpage(firstpageurl))
# As scraped, the loop was range(totalpagenum, totalpagenum + 1) — visiting
# only the last page — and discarded getlistpage's return value. Crawling
# pages 2..totalpagenum and collecting the results is the evident intent.
for num in range(2, totalpagenum + 1):
    listpageurl = "{}.html".format(num)
    newstotal.extend(getlistpage(listpageurl))
print(newstotal)

# 3. Build a DataFrame from the collected records (class is DataFrame,
#    not `dataframe` as in the scraped text).
df = pandas.DataFrame(newstotal)
print(df)

# 4. Persist the extracted data to an Excel file (uses openpyxl).
df.to_excel('gzcss.xlsx')

# 5. Data analysis with pandas:
#    - first 6 rows of title / click count / source
#    - news from 學校綜合辦 with more than 3000 clicks
#    - news published by 國際學院 or 學生工作處
#    - news from March 2018 (the scraped comment said 2023年3月, but the
#      code selects '2018-03')
# NOTE(review): getnewdetails stores the count under the key 'click', not
# 'clickcount' as the scraped text had — column names below are aligned
# with the crawler. The 'time' column is never populated by the crawler
# above, so set_index('time') will raise KeyError until a publish-time
# field is extracted from the detail page.
print(df[['title', 'click', 'source']][:6])
print(df[(df['click'] > 3000) & (df['source'] == '學校綜合辦')])

sou = ['國際學院', '學生工作處']
print(df[df['source'].isin(sou)])

df1 = df.set_index('time')
print(df1['2018-03'])

資料結構化與儲存

1.將新聞的正文內容儲存到文字檔案。soup beautifulsoup res.text,html.parser content soup.select show content 0 text f open news.txt w encoding utf 8 f.write content f.c...

資料結構化與儲存

作業是 同學的,因為沒有對新聞資訊做提取,所以無法新增新聞資訊到字典。已練習pandas庫的相關使用方法,匯出excel檔案。ps 自己的 會盡快修改!import requests from bs4 import beautifulsoup from datetime import datetim...

資料結構化與儲存

1.將新聞的正文內容儲存到文字檔案。newscontent soup.select show content 0 text f open news.txt w f.write newscontent f open news.txt r print f.read 3.安裝pandas,用pandas....