Crawling trending Weibo posts with Python and storing them in MySQL


Target site: m.weibo.cn

The request URL can be found in the browser's developer tools (F12), under the Network tab's XHR filter.
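In the script below the URL template is left blank, exactly as in the original post. For orientation, the paginated XHR request for the trending feed generally looks something like this sketch; the path and containerid are assumptions here, so substitute whatever your own Network/XHR capture shows:

# Hypothetical shape of the m.weibo.cn XHR URL; confirm the exact
# containerid and parameters in your own F12 Network/XHR capture.
url_template = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&page={}'
print(url_template.format(1))  # page 1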

weibo_demo.py:

import requests
import json
from w3lib.html import remove_tags
from mysqlhelper import mysqlhelper
import time

helper = mysqlhelper()
max_page = 50

# set the request headers (contents elided in the original; typically a User-Agent)
headers = {}

def get_one_page_info(url):
    # fields to store in MySQL: text, comments_count, attitudes_count,
    # reposts_count, created_at, source
    response = requests.get(url=url, headers=headers)
    # json.loads() turns the JSON string into a dict
    res_dict = json.loads(response.text)
    cards_list = res_dict['data']['cards']
    # extract the concrete fields from each card
    for card in cards_list:
        if 'mblog' in card:
            text = remove_tags(card['mblog']['text'])
            comments_count = card['mblog']['comments_count']
            attitudes_count = card['mblog']['attitudes_count']
            reposts_count = card['mblog']['reposts_count']
            created_at = card['mblog']['created_at']
            source_a = card['mblog']['source']
            # print(text, comments_count, attitudes_count, reposts_count, created_at, source_a)
            insert_sql = 'insert into weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count) values (%s, %s, %s, %s, %s, %s)'
            data = (source_a, created_at, text, comments_count, attitudes_count, reposts_count)
            helper.execute_insert_sql(insert_sql, data)
            # time.sleep(1)

# create table weibo_test(id int primary key auto_increment, source_a varchar(50), created_at varchar(40), `text` text, comments_count int, attitudes_count int, reposts_count int) default charset=utf8;
# truncate table <table_name>  -- empties the table

if __name__ == '__main__':
    for i in range(max_page):
        url = ''.format(i + 1)  # URL template elided in the original; see the note above
        get_one_page_info(url)
        print('page ' + str(i + 1) + ' has done!')
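get_one_page_info assumes every card that carries a post has an mblog object holding the six fields read above. A trimmed, hypothetical example of that shape (the field values are made up; real responses contain many more keys):

# Illustrative card only; values invented for the sketch.
card = {
    'mblog': {
        'text': '<a href="/status/...">#熱門#</a> 正文內容',  # HTML, hence remove_tags()
        'comments_count': 12,
        'attitudes_count': 34,
        'reposts_count': 5,
        'created_at': '04-30',
        'source': '微博 weibo.com',
    }
}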

mysqlhelper.py:

import pymysql

class mysqlhelper(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306, db='wb',
                                    user='root', passwd='123456', charset='utf8')
        # the cursor fetches rows lazily, much like a yield-based generator
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        self.cursor.execute(sql, data)
        self.conn.commit()

    def __del__(self):
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    # instantiate the helper and insert one test row
    helper = mysqlhelper()
    insert_sql = 'insert into weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count) values (%s, %s, %s, %s, %s, %s)'
    data = ('mi', '2020-4-22', '今天天氣好', 2, 3, 5)
    helper.execute_insert_sql(insert_sql, data)

Execution result: (shown as a screenshot in the original post)

Error encountered on 2020-4-30:

While running the crawler later on, it raised pymysql.err.InternalError: (1366, "Incorrect string value: '\\xf0\\x9f\\x98\\xb7 ' for column 'text' at row 1"). This error occurs when characters outside MySQL's 3-byte utf8 range, such as emoji, are written into the text column. The fix is to add self.cursor.execute("set names utf8mb4;") in mysqlhelper.py.
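A minimal sketch of where that statement goes in mysqlhelper.__init__, right after the cursor is created. Passing charset='utf8mb4' to pymysql.connect() has the same effect; either way, the text column itself may also need an utf8mb4 charset (e.g. alter table weibo_test convert to character set utf8mb4) if the table was created with plain utf8:

def __init__(self):
    self.conn = pymysql.connect(host='localhost', port=3306, db='wb',
                                user='root', passwd='123456', charset='utf8')
    self.cursor = self.conn.cursor()
    # switch the session to 4-byte UTF-8 so emoji survive the insert
    self.cursor.execute("set names utf8mb4;")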
