Python 微信爬蟲例項（Python WeChat crawler example）

2021-10-11 10:06:46 字數 3809 閱讀 2904

import queue
import re
import sys
import threading
import time
import urllib.error
import urllib.parse
import urllib.request

# Module-level HTTP setup: install a global opener carrying a browser-like
# User-Agent so the target site does not reject requests as bot traffic.
# NOTE(review): the original User-Agent literal was lost during extraction;
# any realistic UA string serves the same purpose -- confirm against the
# original article if exact reproduction matters.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
           "(KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

operner = urllib.request.build_opener()
operner.addheaders = [headers]
urllib.request.install_opener(operner)

# Queue shared between the URL-producer thread and the content-consumer
# thread.  (Original said queue.queue() -- the class is queue.Queue.)
urlque = queue.Queue()

# Collected article URLs; original line was truncated mid-assignment, the
# only sensible initial value is an empty list.
list_url = []

###使用代理伺服器獲取網頁url內容（use a proxy/opener to fetch page content）

def use_proxy(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns None when the request fails: errors are printed rather than
    raised, preserving the original best-effort behaviour.
    """
    try:
        # Re-install an opener with a browser-like User-Agent for this
        # request.  NOTE(review): the original UA literal was lost in
        # extraction; any realistic UA string works -- confirm if needed.
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/80.0.3987.122 Safari/537.36")
        operner = urllib.request.build_opener()
        operner.addheaders = [headers]
        urllib.request.install_opener(operner)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:  # original lowercase urlerror was a NameError
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:  # original lowercase exception was a NameError
        print("exception" + str(e))
        # Back off briefly after an unexpected failure before the caller retries.
        time.sleep(1)

###獲取文章的url連線,並將連線加入到佇列

###獲取文章的url連線,並將連線加入到佇列
class get_url(threading.Thread):
    """Producer thread: search for *key* on result pages
    pagestart..pageend and push every article URL found onto *urlque*.
    (Original subclassed threading.thread -- the class is Thread.)
    """

    def __init__(self, key, pagestart, pageend, urlque):
        threading.Thread.__init__(self)
        self.pagestart = pagestart  # first result page to scan (inclusive)
        self.pageend = pageend      # last result page to scan (inclusive)
        self.key = key              # search keyword (unencoded)
        self.urlque = urlque        # shared queue consumed by get_url_content

    def run(self):
        try:
            keycode = urllib.parse.quote(self.key)
            for page in range(self.pagestart, self.pageend + 1):
                # NOTE(review): the original URL template was lost in
                # extraction (the line read url = "" % (keycode, page)).
                # This is the Sogou Weixin search URL used by this family
                # of tutorials -- confirm before relying on it.
                url = ("http://weixin.sogou.com/weixin?type=2&query=%s&page=%s"
                       % (keycode, page))
                data = use_proxy(url)
                print("data1的內容是", data)
                # NOTE(review): the original regex was destroyed by HTML
                # escaping; this pattern captures quoted http links inside
                # result headings -- verify against the live page markup.
                listurl_pattern = '<h3>.*?("http://.*?)</h3>'
                result = re.compile(listurl_pattern, re.S).findall(data)
                print(result)
                if len(result) == 0:
                    print("沒有可用的url")
                    sys.exit()  # in a thread this only raises SystemExit here
                for i in range(len(result)):
                    # Strip HTML entity残留 ("amp;") and surrounding quotes.
                    res = (result[i].replace("amp;", "")
                           .split(" ")[0].replace("\"", ""))
                    self.urlque.put(res)  ##加入佇列
                    self.urlque.task_done()
        except urllib.error.URLError as e:  # was lowercase urlerror (NameError)
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:  # was lowercase exception (NameError)
            print("exception:", e)

##根據url獲取文章內容

##根據url獲取文章內容
class get_url_content(threading.Thread):
    """Consumer thread: pull article URLs from *urlque*, download each
    article and append its title and body to d:\\python-script\\1.html.
    (Original subclassed threading.thread -- the class is Thread.)
    """

    def __init__(self, urlque):
        threading.Thread.__init__(self)
        self.urlque = urlque  # shared queue filled by the get_url producer

    def run(self):
        # The original wrote an undefined variable `html1` to the file here,
        # which raised NameError on the thread's first statement.  The only
        # grounded intent is "start with a fresh output file", so create it
        # empty, then reopen in append mode for the loop below.
        fh1 = open("d:\\python-script\\1.html", 'wb')
        fh1.close()
        fh = open("d:\\python-script\\1.html", 'ab')
        while True:  # original `while true` was a NameError
            try:
                url = self.urlque.get()
                data_content = use_proxy(url)
                # NOTE(review): both regexes were destroyed by extraction;
                # these are conventional patterns for this page layout
                # (<title> tag and the js_content article div) -- verify
                # against the live markup.
                title_pattern = '<title>(.*?)</title>'
                result_title = re.compile(title_pattern, re.S).findall(data_content)
                ##標題
                res_title = result_title[0].strip()
                content_pattern = 'id="js_content">(.*?)</div>'
                content = re.compile(content_pattern, re.S).findall(data_content)
                fh.write(res_title.encode("utf-8"))
                for i in content:
                    fh.write(i.strip().encode("utf-8"))
            except UnicodeEncodeError:  # was lowercase unicodeencodeerror (NameError)
                continue
        # Unreachable: the loop above never exits (behaviour kept from the
        # original; the contrl thread is what terminates the program).
        fh.close()

class contrl(threading.Thread):
    """Monitor thread: poll the shared queue and finish once it drains.

    Bug fixed: the original ran the polling loop inside __init__, so merely
    constructing contrl(...) blocked the main thread forever and start()
    was never reached.  The loop belongs in run().
    """

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue  # queue shared with producer/consumer threads

    def run(self):
        while True:  # original `while true` was a NameError
            print("程式正在執行")
            if self.urlqueue.empty():
                # Grace period: give the producer a chance to refill before
                # declaring the run finished.
                time.sleep(3)
                print("程式執行完畢")
                # Ending run() ends this thread; the original exit() only
                # raised SystemExit inside this thread anyway.
                return

if __name__ == '__main__':
    pagestart = 1
    pageend = 2
    key = "人工智慧"
    # Bug fixed: the original rebound the class names themselves
    # (get_url = get_url(...)), shadowing the classes with their instances.
    # Distinct instance names keep the classes reachable.
    url_producer = get_url(key, pagestart, pageend, urlque)
    url_producer.start()
    get_content = get_url_content(urlque)
    get_content.start()
    cntrol = contrl(urlque)
    cntrol.start()

python微信爬蟲

import urllib.request import re import time import urllib.error 自定義函式,功能為使用 伺服器爬乙個 def use proxy proxy addr,url 異常處理機制 try req urllib.request.request ...

微博爬蟲python 微博爬蟲 python

本文爬取的是m站的微博內容,基於python 2.7 一 微博內容爬取 1.要爬取的微博首頁 2.手機微博是看不到翻頁,是一直往下載入的,但是其json格式的資料仍然以翻頁的形式呈現。3.開啟開發者工具,向下翻頁面,可以在network下的xhr的響應檔案中,找到json檔案的 如 通過分析發現每個...

關於微信指數爬蟲

1,普通條件欄位很好理解,就是size,page,keyword之類的,大多是控制資料庫的查詢條件,並且明文傳輸沒有加密。2,所以加密條件欄位就應該是有過加密的字段,例如passwd e10adc3949ba59abbe56e057f20f883e,密碼通常是要加密的,而且理論上應該是使用不可逆的加...