Python爬蟲入門案例

2021-08-07 20:29:17 字數 2631 閱讀 1224

寫入資料庫版本:

import requests
from bs4 import BeautifulSoup  # class is BeautifulSoup, not "beautifulsoup"
import numpy as np
import pandas as pd
import sqlalchemy
import pymysql  # noqa: F401 — driver used implicitly by the "mysql+pymysql" URL

# Database connection: MySQL via the PyMySQL driver, utf8 charset.
pymysql_engine = sqlalchemy.create_engine(
    'mysql+pymysql://root:1234@localhost/sampledb?charset=utf8')

# Base URL of the news listing page — fill in before running.
# NOTE(review): the original post blanked this value out; the crawl cannot
# work until a real listing URL is supplied.
root_url = ''

for page in range(1, 4):  # crawl only the first three listing pages
    url = root_url + "?page={}".format(page)
    print("抓取頁面: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    # Container holding the story teasers on the listing page.
    items = soup.find("div", class_="items")

    one_page_news = []  # rows for this page; initializer was missing in the original
    # Only the first four stories per page; renamed loop var so it no longer
    # shadows the outer page index.
    for item in items("div", class_="item-inner")[:4]:
        title = item.h2.a.string  # headline text
        # Full-article link: drop the "#..." fragment (ad anchor) and request
        # the single-page view with ?full=y.
        tmp_url = "" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新聞: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            # Article body as newline-joined stripped strings.
            text = "\n".join(
                tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            # find() returned None → page has no story body; treat as
            # non-text news and store a placeholder.
            text = "新聞"
        # Teaser/summary directly under the item (recursive=False: direct child only).
        lead = item.find("div", class_="item-lead", recursive=False).string
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)  # original built the row but never kept it

    # Persist this page's rows to the database.
    print("to database...")
    data = np.array(one_page_news)
    df = pd.DataFrame(data, columns=['title', 'lead', 'tmp_url', 'text'])
    # Original created the DataFrame but never wrote it; append per page.
    df.to_sql('news', pymysql_engine, if_exists='append', index=False)

json版本:

import requests
from bs4 import BeautifulSoup  # class is BeautifulSoup, not "beautifulsoup"
import json

all_page_news = []  # accumulates rows across all pages; initializer was missing
# Base URL of the news listing page — fill in before running.
# NOTE(review): the original post blanked this value out.
root_url = ''

for page in range(1, 4):  # crawl only the first three listing pages
    url = root_url + "?page={}".format(page)
    print("抓取頁面: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")

    one_page_news = []  # rows for this page; initializer was missing
    # First four stories per page; loop var renamed so it no longer shadows
    # the outer page index.
    for item in items("div", class_="item-inner")[:4]:
        title = item.h2.a.string  # headline text
        # Full-article link: drop the "#..." fragment and request full view.
        tmp_url = "" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新聞: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            text = "\n".join(
                tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            # No story body on the page → placeholder for non-text news.
            text = "新聞"
        lead = item.find("div", class_="item-lead", recursive=False).string
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)  # original never collected the row
    all_page_news.extend(one_page_news)

# Dump everything to a JSON file; ensure_ascii=False keeps the Chinese
# text readable instead of \uXXXX escapes.
with open("ftnews.json", 'w', encoding="utf8") as file:
    json.dump(all_page_news, file, ensure_ascii=False)

# Read back:
# with open("ftnews.json", 'r', encoding="utf8") as file:
#     data_in = json.load(file)

python爬蟲案例 Python爬蟲案例集合

在python2.x裡面有urllib和urllib2；在python3.x裡面就把urllib和urllib2合成一個urllib。urllib3是在python3.x裡面新增的第三方擴充套件。import urllib.request 向指定的url位址傳送請求,並返回伺服器響應的類檔案物件 ...

python案例 Python爬蟲案例集合

urllib2 在python2.x裡面有urllib和urllib2；在python3.x裡面就把urllib和urllib2合成一個urllib。urllib3是在python3.x裡面新增的第三方擴充套件。urllib2 官方文件 urllib2 原始碼 urllib2 在 python3....

python爬蟲案例講解 Python爬蟲案例集合

伺服器返回的類檔案物件支援python檔案物件的操作方法 read 方法就是讀取檔案裡的全部內容,返回字串 html response.read 列印響應內容 我們需要稍微偽裝下,要不然第一步就會被反爬蟲發現 usr bin env python coding utf 8 import urllib...