爬取新浪網頁

# 唯一性的用id表示，id前面需要加#

例如：使用 select 找出所有 id 為 title 的元素

# select() always returns a list of matches, even for a unique id, so .text
# has to be read from each matched element rather than from the list itself.
alink = soup.select('#title')
print(alink)
for element in alink:
    print(element.text)

# 有相同的用class表示，class前面需要加.

例如：使用 select 找出所有 class 為 link 的元素

# Print every element carrying class "link", followed by its text content.
for link in soup.select('.link'):
    print(link)
    print(link.text)

#使用select 找出所有a tag 的href鏈結

# A bare tag name selects <a> elements ('.a' would match class="a" instead).
# The [href] attribute filter skips anchors that have no href, which would
# otherwise raise KeyError on link['href'].
alinks = soup.select('a[href]')
for link in alinks:
    print(link['href'])

#!/usr/bin/env python
#-*- coding: utf-8 -*-
import json
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup
# jd =json.loads(comments.text.strip('var data='))
# jd['result']['count']['total']
# class 時，加. ;id 時，加#
# soup.select('.article-editor')[0].text.lstrip('責任編輯')
# soup.select('#commentcount1')
# NOTE(review): this template contains no '{}' placeholder, so the .format()
# call below returns it unchanged. The URL looks truncated -- restore the full
# comment-endpoint address (with a '{}' slot for the news id) before use.
commenturl = ' size=20'


def getcommentcount(newsurl):
    """Return the total comment count reported for a news article.

    Args:
        newsurl: Article URL containing a ``doc-i<id>.shtml`` segment.

    Returns:
        The value stored under ``result -> count -> total`` in the JSON
        returned by the comment endpoint.

    Raises:
        ValueError: If no news id can be extracted from ``newsurl``.
    """
    m = re.search(r'doc-i(.*)\.shtml', newsurl)
    if m is None:
        raise ValueError('no news id found in URL: %r' % (newsurl,))
    newsid = m.group(1)
    comments = requests.get(commenturl.format(newsid))
    # The response is expected to start with a 'var data=' assignment.
    # Remove that exact prefix: str.strip() treats its argument as a set of
    # characters, which only works by accident for this payload.
    payload = comments.text.strip()
    prefix = 'var data='
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    jd = json.loads(payload)
    return jd['result']['count']['total']
# 將抓取內文資訊方法整理成一函式
def getnewsdetail(newsurl):
    """Fetch one news article page and extract its main fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``newssource``, ``dt`` (a
        ``datetime``), ``article`` and ``comments``.

    Raises:
        IndexError: If one of the expected elements is missing from the page.
        ValueError: If the timestamp text does not match the expected format.
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'  # decode the body as UTF-8 regardless of headers
    soup = BeautifulSoup(res.text, 'html.parser')  # or 'lxml'
    # NOTE(review): selectors are kept exactly as written; confirm their
    # letter case against the live page markup.
    result['title'] = soup.select('#artibodytitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    # The first child node of .time-source holds the publication timestamp.
    timesource = soup.select('.time-source')[0].contents[0].strip()
    # %Y = four-digit year, %H = hour, %M = minute (format codes are
    # case-sensitive).
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    # The last paragraph is skipped -- presumably the editor credit; confirm.
    result['article'] = ''.join(
        [p.text.strip() for p in soup.select('#artibody p')[:-1]]
    )
    result['comments'] = getcommentcount(newsurl)
    return result
#找到分頁鏈結
# 1.選擇network頁籤
# 2.點選js
# 3.點找到鏈結 (包含page = ?)
# 剖析分頁資訊
# Fetch one page of the paginated listing and print each article link.
# The URL and the stripped wrapper strings are abbreviated placeholders: the
# real values differ per site and are found in the browser's network panel.
res = requests.get('http://.....page=1&....')
# Remove the non-JSON text surrounding the payload before parsing.
jd = json.loads(res.text.lstrip('...').rstrip('....'))
for ent in jd['result']['data']:
    print(ent['url'])  # link of each listed article
# 建立剖析清單鏈結函式
def _unwrap_jsonp(text):
    """Return the JSON payload inside a ``callback(...);`` wrapper, if any."""
    text = text.strip()
    if text.endswith(';'):
        text = text[:-1].rstrip()
    # Only unwrap when the text really ends with a call's closing parenthesis;
    # plain JSON is returned untouched.
    if text.endswith(')') and '(' in text:
        text = text[text.index('(') + 1:-1]
    return text


def parselistlinks(url):
    """Fetch one listing page and return the details of every article on it.

    Args:
        url: URL of a listing endpoint that answers with JSON wrapped in a
            JavaScript callback such as ``newsloadercallback(...);``.

    Returns:
        A list holding one ``getnewsdetail`` result for each entry found
        under ``result -> data`` in the response.
    """
    newsdetails = []
    res = requests.get(url)
    jd = json.loads(_unwrap_jsonp(res.text))
    for ent in jd['result']['data']:
        newsdetails.append(getnewsdetail(ent['url']))
    return newsdetails
# 批次抓取每頁新聞內文
# Save the data to an Excel workbook.
# NOTE(review): `df` is not defined in the visible code; presumably a pandas
# DataFrame built from the parsed news details -- confirm.
df.to_excel('news.xlsx')
# 將資料儲存至資料庫
import sqlite3
from contextlib import closing

# sqlite3's own context manager only wraps a transaction and leaves the
# connection open; closing() makes sure the handle is released afterwards.
with closing(sqlite3.connect('news.sqlite')) as db:
    with db:  # commit on success, roll back if the write raises
        df.to_sql('news', con=db)
# 從資料庫讀取
import sqlite3
from contextlib import closing

# sqlite3's own context manager does not close the connection on exit;
# closing() releases the handle once the query result has been loaded.
with closing(sqlite3.connect('news.sqlite')) as db:
    df2 = pandas.read_sql_query('select * from news', con=db)

新浪網 sina 新聞鏈結爬取

一新聞爬蟲需求分析二實現思路三專案實現 1.首先解析網頁,檢視各條新聞儲存位置 2.通過正規表示式獲取新聞鏈結,依次爬取各新聞並儲存到本地正規表示式,寫出每條新聞對應的鏈結 coding utf 8 import urllib.request import re data urllib...

HTML CSS實戰訓練（仿新浪網頁）

學了兩天的html css了，這是第一次參與實戰做網頁，發現將零碎知識點串在一起很重要，要不然還是不知道怎麼寫，而且深刻感受到了前端的不易，雖然前端邏輯簡單，但實際上手卻有很多疑問和考慮的點，不過幸好通過不斷的嘗試做出來了。在用css選擇器時，一定要注意是塊級元素還是行內元素，它們之間的特點，mar...

爬取新浪微博

學到的東西。1 習慣用logger，而不是用print self.logger.debug 開始解析 format response.url 2 習慣用正規表示式這是在pipeline清理資料時用到的 s 5分鐘前 if re.match d 分鐘前 s minute re.match d s g...

爬取新浪網頁

新浪網 sina 新聞鏈結爬取

HTML CSS實戰訓練（仿新浪網頁）

爬取新浪微博

相關推薦