Python 獲取頁面文字資訊

2022-05-07 06:51:08 字數 3852 閱讀 6237

# -*- coding: utf-8 -*-

import html.parser
import os
import random
import re
import threading
import time
import tkinter.messagebox
import traceback
import urllib.request
from tkinter import *
from tkinter import ttk

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

def _scroll_page(driver, times):
    """Scroll the page ``times`` times, pausing 3 seconds in each round."""
    for i in range(times):
        print(f'第{i}次點選')
        driver.execute_script(f"window.scrollTo(0, {1000 * i});")
        time.sleep(3)
        # NOTE(review): the source had lost its indentation, so it is unclear
        # whether this jump to the bottom ran inside or after the loop. It is
        # kept inside the loop here -- confirm.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")


def gethtml(questionid, page):
    """Scrape one page of answers for a question and save the long ones.

    Opens the answer listing in headless Chrome, scrolls the page twelve
    times, collects the text of every matched answer element longer than 300
    characters and writes the result to ``<questionid>/page_<page>.txt``.

    Args:
        questionid: Question identifier as a string. It is also the name of
            the output directory, which must already exist.
        page: Page number of the answer listing.

    Returns:
        The ``BeautifulSoup`` parse of the page source taken after scrolling.
    """
    chrome_options = webdriver.ChromeOptions()
    # Maximized window; per the original author, element lookups fail without it.
    chrome_options.add_argument('--start-maximized')
    # Hide the "browser is being controlled by automated software" banner.
    chrome_options.add_argument('--disable-infobars')
    # Incognito (private) mode.
    chrome_options.add_argument('--incognito')
    # Run without a visible browser window.
    chrome_options.add_argument('--headless')

    # NOTE(review): ``executable_path`` is the Selenium 3-era way to locate
    # chromedriver; newer Selenium releases expect a ``Service`` object.
    driver = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)
    try:
        # NOTE(review): the URL prefix was blank in the source (apparently
        # stripped when the post was scraped). The standard Zhihu question
        # URL is assumed here -- confirm.
        driver.get(
            "https://www.zhihu.com/question/"
            + questionid
            + "/answers/updated?page="
            + str(page)
        )

        # Simulate a user scrolling through the page.
        _scroll_page(driver, 12)

        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # "class name" is the value of ``By.CLASS_NAME``; the generic
        # ``find_elements`` call works on both old and new Selenium releases.
        # NOTE(review): the class name was lower-cased in the source; Zhihu's
        # ``RichContent-inner`` casing is assumed -- confirm against the page.
        answers = driver.find_elements("class name", "RichContent-inner")
        texts = [answer.text for answer in answers]
        txt = "start\n" + "".join(
            text + "\n-----------我是分隔符------\n"
            for text in texts
            if len(text) > 300
        )

        # The target directory must have been created beforehand.
        out_path = questionid + "/page_" + str(page) + ".txt"
        with open(out_path, 'w', encoding="utf-8") as zhpage:
            zhpage.write(txt)
        print("爬取回答頁面成功!!!")
        return result_soup
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

def readtxt(path):
    """Return the full contents of the UTF-8 text file at ``path``.

    Args:
        path: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    # The context manager closes the file even if reading raises.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

def main(questionid, startpage, endpage):
    """Scrape answer pages ``startpage`` up to, but excluding, ``endpage``.

    Creates the output directory named after the question, then fetches each
    page with ``gethtml``. A failure on one page is logged and does not stop
    the remaining pages.

    Args:
        questionid: Question identifier; also used as the output directory.
        startpage: First page to fetch (int, or a string holding an int).
        endpage: Page at which to stop; this page itself is not fetched.

    Raises:
        ValueError: If ``startpage`` or ``endpage`` cannot be read as an int.
    """
    # Values read from GUI entry fields can arrive as strings, which
    # ``range`` rejects; normalise before doing any work.
    first_page = int(startpage)
    stop_page = int(endpage)
    questionid = str(questionid)

    mkdir([questionid])
    for page in range(first_page, stop_page):
        try:
            gethtml(questionid, page)
            # Random 5-7 second pause between successful page fetches.
            time.sleep(random.choice(range(5, 8)))
        except Exception:
            # Best effort: report the failure and carry on with the next page.
            traceback.print_exc()

def mkdir(paths):
    """Create each directory in ``paths`` unless something already exists there.

    Args:
        paths: Iterable of directory paths. Parent directories are not
            created.
    """
    for path in paths:
        # Attempt the creation and ignore "already exists" rather than
        # checking first, so there is no gap between the check and the create.
        try:
            os.mkdir(path)
        except FileExistsError:
            pass

def getanswer():
    """Read the question id and page range from the form and run ``main``.

    Reads the module-level ``var_id``, ``var_start`` and ``var_end``
    variables.

    Raises:
        ValueError: If the start or end page field does not hold an integer.
    """
    # Entry-backed variables may hand back strings; convert explicitly so
    # ``main`` receives a string id and integer page bounds.
    questionid = str(var_id.get()).strip()
    start = int(var_start.get())
    end = int(var_end.get())
    main(questionid, start, end)

if __name__ == '__main__':
    # Hard-coded run: question 308829198, pages 101 to 199.
    main(str(308829198), 101, 200)

    # Build the input form: question id, start page, end page and a button.
    tk = Tk()
    tk.title('獲取知乎問題所有答案')
    tk.geometry('600x150')
    frame = Frame(tk)

    Label(
        tk,
        text='問題標識:(例:324405640/answer/720532471中的324405640 )',
        width=200,
        anchor=W,
        justify=LEFT,
    ).place(x=10, y=10)
    var_id = Variable()
    question_id = Entry(tk, textvariable=var_id, width=30)
    question_id.place(x=10, y=40)

    Label(tk, text='開始頁:').place(x=230, y=40)
    var_start = Variable()
    Entry(tk, textvariable=var_start, width=10).place(x=290, y=40)
    var_start.set(1)

    Label(tk, text='結束頁:').place(x=360, y=40)
    var_end = Variable()
    Entry(tk, textvariable=var_end, width=10).place(x=420, y=40)
    var_end.set(10)

    Button(tk, text="獲取答案", command=getanswer).place(x=200, y=80)

    # NOTE(review): the event loop call was commented out in the original, so
    # the window built above is never displayed. Re-enable it to use the GUI.
    # tk.mainloop()

獲取頁面尺寸文字資訊

獲取介面資訊時,我們常用的方法有:size(獲取元素尺寸)、text(獲取元素文字)、get_attribute(name)(獲取屬性值)、is_displayed()(判斷元素是否使用者可見)。如下:from selenium import webdriver;driver = webdriver.Chrome();drive...

JS 獲取當前頁面 URL 資訊

URL 即統一資源定位符(Uniform Resource Locator,URL)。完整的 URL 由這幾個部分構成:scheme://host:port/path?query#fragment。scheme:通訊協議,常用的有 http、ftp、mailto 等。設定或獲取 URL 從頭到埠號部分。url windo...

python獲取本機資訊

Python 的確簡單方便,庫函式完成了許多可能用到的功能。今天學習到的是獲取本機資訊的功能:import socket;def get_host_ip():(查詢本機 IP 位址,return ip)try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM);s....