A Python crawler for Sina News (concise version)

2021-10-03 10:59:35

Tags: crawler, Python

The comments are fairly detailed, so here is the full code directly. Feedback and corrections are welcome.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import requests
import csv

# Create a headless browser instance
chrome_options = Options()
# Run Chrome without a visible window
chrome_options.add_argument('--headless')
# This line is needed when running on Windows
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)
# Set a 10-second implicit wait
browser.implicitly_wait(10)
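A quick note: on Selenium 4 the chrome_options= keyword is deprecated in favor of options=, and the find_elements_by_xpath helper used below has been removed. A minimal sketch of the equivalent Selenium 4 setup (assuming chromedriver is on your PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

opts = Options()
opts.add_argument('--headless')           # no visible window
opts.add_argument('--disable-gpu')        # still recommended on Windows
browser = webdriver.Chrome(options=opts)  # Selenium 4 style
browser.implicitly_wait(10)

# Selenium 4 replacement for find_elements_by_xpath:
next_btn = browser.find_elements(By.XPATH, '//span[@class="pagebox_next"]')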

# Use headless Chrome to render the dynamically loaded JS
def start_get(url):
    try:
        browser.get(url)
        a = []
        for one in range(1, 100):
            sleep(0.5)
            # Scroll to the bottom of the page
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(0.5)
            # Scroll to the bottom again
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(1)
            # Grab the rendered page source and collect it
            source = browser.page_source
            a.append(source)  # collect each page's HTML for parsing later
            print("Page source fetched")
            next_btn = browser.find_elements_by_xpath('.//span[@class="pagebox_next"]')
            if next_btn:
                next_btn[0].click()
            else:
                break  # no next-page button left: stop paging
        return a
    except Exception as e:
        print(e)
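The fixed sleep() calls work, but they either waste time or fire too early on slow pages. An explicit wait on the pagination button is the usual alternative; here is a small sketch using Selenium's WebDriverWait (the XPath is the same one assumed above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next(browser, timeout=10):
    # Wait until the next-page button is clickable, then click it.
    # Returns False when the button never appears (i.e. the last page).
    try:
        btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, '//span[@class="pagebox_next"]'))
        )
        btn.click()
        return True
    except Exception:
        return False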

# Parse a news list page
def parse_page(html):
    # Build an etree object
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@id="subShowContent1_static"]')  # note: adjust this selector to the target page
    for one_new in new_lst:
        title = one_new.xpath('.//h2/a/text()')[0]
        link = one_new.xpath('.//h2/a/@href')[0]
        write_in(title, link)
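Since the list-page selector has to be adjusted per page (as the comment above says), it helps to sanity-check the XPath logic offline first. A self-contained toy example with hypothetical HTML (the real Sina markup will differ):

from lxml import etree

sample = '''
<div id="subShowContent1_static">
  <div class="news-item"><h2><a href="https://example.com/a.html">Headline A</a></h2></div>
  <div class="news-item"><h2><a href="https://example.com/b.html">Headline B</a></h2></div>
</div>
'''
tree = etree.HTML(sample)
for h2 in tree.xpath('//div[@id="subShowContent1_static"]//h2'):
    print(h2.xpath('./a/text()')[0], h2.xpath('./a/@href')[0])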

# Write one news article to the file
def write_in(title, link):
    print('Writing article: {}'.format(title))
    browser.get(link)
    sleep(1)
    source = browser.page_source
    tree = etree.HTML(source)
    con_link = link
    content_lst = tree.xpath('.//div[@class="article"]//p')
    con = ''
    for one_content in content_lst:
        if one_content.text:
            con = con + '\n' + one_content.text.strip()
    post_time = tree.xpath('.//span[@class="date"]')[0].text
    post_source = tree.xpath('.//a[@class="source"]')[0].text
    tiecount = tree.xpath('.//a[@data-sudaclick="comment_sum_p"]')[0].text
    tiejoincount = tree.xpath('.//a[@data-sudaclick="comment_participatesum_p"]')[0].text
    # assumed field order; the original list literal was truncated in the published listing
    alist = [title, post_time, post_source, tiecount, tiejoincount, con_link, con]
    # 1. Create a file object (the file name is a placeholder; the original was lost)
    f = open('sina_news.csv', 'a', encoding='utf-8', newline='')
    # 2. Build a CSV writer on top of the file object
    csv_writer = csv.writer(f)
    csv_writer.writerow(alist)
    f.close()
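One design note: write_in() reopens and closes the CSV for every single article, and no header row is ever written. A common pattern is to open the file once up front and write the header first; a sketch (the file name and column names are my assumptions, not from the original post):

import csv

COLUMNS = ['title', 'post_time', 'post_source',
           'comment_count', 'participant_count', 'link', 'content']

f = open('sina_news.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(f)
writer.writerow(COLUMNS)  # header row, written once
# ... then call writer.writerow(alist) for each article, and finally:
f.close()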

if __name__ == '__main__':
    urls = ['', '', '', '', '']  # the Sina list-page URLs were stripped from the original post; fill them in
    for url in urls:
        html_list = start_get(url)
        for h in html_list:
            try:
                parse_page(h)
            except Exception as e:
                print(e)
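Also worth noting: the script never calls browser.quit(), so a headless Chrome process is left behind on every run. A try/finally around the main loop fixes that; a sketch of the same entry point with cleanup added (the "or []" guards against start_get returning None after an exception):

if __name__ == '__main__':
    urls = ['', '', '', '', '']  # fill in the Sina list-page URLs
    try:
        for url in urls:
            for h in start_get(url) or []:  # start_get returns None on error
                try:
                    parse_page(h)
                except Exception as e:
                    print(e)
    finally:
        browser.quit()  # always shut the headless Chrome down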

The results are as follows:

Note: this article is for technical exchange only and must not be used for commercial purposes. Anyone who violates this does so without any connection to the author.
