python簡單爬蟲

2022-09-16 16:54:16 字數 2560 閱讀 9995

目錄

# requests庫

## requests.get(url) 模擬瀏覽器開啟網頁

# re庫

import requests

import re

# Scrape a listing page and print each article title next to its content.
#
# NOTE(review): the URL is empty in this copy of the script (it appears to
# have been stripped when the article was published); requests cannot fetch
# an empty URL, so fill in the real address before running.
response = requests.get('')  # fetch the page the way a browser would
# print(response.status_code)  # 200 = success; 301, 404 = page missing
# print(response.encoding)  # utf-8
data = response.text
# print(data)

# In a regex, `.` matches any character and `*` repeats the preceding token
# zero or more times.
# NOTE(review): both patterns look like they lost their surrounding HTML tag
# delimiters; they are reproduced as found and need the real tags restored.
# The first pattern used to be split across physical lines, which is a syntax
# error inside a single-quoted string, so the line breaks are now escapes.
content_res = re.findall('(.*?)\n\n', data)
title_res = re.findall('(.*?)', data)

# Keep only matches 10..59 of the title list.
# NOTE(review): presumably this window skips navigation entries that are not
# articles -- confirm against the live page.
title_res = title_res[10:60]
# print(title_res)

# zip() pairs the i-th kept title with the i-th content entry and stops at
# the shorter list, so a short content list no longer raises IndexError.
title_content_dic = dict(zip(title_res, content_res))
# print(title_content_dic)

for title, content in title_content_dic.items():
    # The f-string previously had no placeholders and printed only ' | '.
    print(f'{title} | {content}')

import requests

import re

# Scrape a listing page and collect (content, description) per article title.
#
# NOTE(review): the URL is empty in this copy of the script (it appears to
# have been stripped when the article was published); fill it in before
# running.
response = requests.get('')  # fetch the page the way a browser would
data = response.text

# NOTE(review): the patterns below look like they lost their surrounding HTML
# tag delimiters; they are reproduced as found and need the real tags
# restored. Patterns that used to span physical lines (a syntax error inside
# a single-quoted string) now carry the line breaks as escapes.
res = re.findall('(.*?)', data)

title_content_desc_dic = {}
for item in res:
    content_matches = re.findall('(.*?)\n\n', item)
    title_matches = re.findall('(.*?)', item)
    desc_matches = re.findall('(04月.*?)\n\n', item)
    # Skip entries missing any field instead of failing on `[0]`.
    if not (content_matches and title_matches and desc_matches):
        continue
    title_content_desc_dic[title_matches[0]] = (
        content_matches[0],
        desc_matches[0],
    )

for title, content_desc in title_content_desc_dic.items():
    # The f-string previously had no placeholders and printed only ' | '.
    print(f'{title} | {content_desc}')

import requests

import re

# Download every image referenced by a `data-src` attribute on the page and
# save each one in the current directory under the last segment of its URL.
#
# NOTE(review): the URL is empty in this copy of the script (it appears to
# have been stripped when the article was published); fill it in before
# running.
response = requests.get('')
data = response.text
# print(data)

img_url_res = re.findall('data-src="(.*?)"', data)

for img_url in img_url_res:
    # Text after the final '/' becomes the local file name.
    img_name = img_url.split('/')[-1]
    if not img_name:
        # A URL ending in '/' yields an empty name, which open() cannot use.
        continue
    img_response = requests.get(img_url)
    img_data = img_response.content  # raw bytes, not decoded text
    # The context manager flushes and closes the file even if the write
    # fails; the original left every handle open.
    with open(img_name, 'wb') as f:
        f.write(img_data)

import requests

import re

# Follow each video detail page linked from a listing page, locate the .mp4
# URL on it, and save the video bytes to 'test.mp4'.
#
# NOTE(review): the listing URL, the base URL prefix and the listing regex
# are all empty in this copy of the script (they appear to have been stripped
# when the article was published); restore them before running.
response = requests.get('')
# response.encoding = 'utf8'
data = response.text
# print(data)

mp4_res2 = re.findall('', data)

for i in mp4_res2:  # type: str
    page_matches = re.findall('(.*?htm)', i)
    if not page_matches:
        # No detail-page link in this entry; skip it instead of failing on [0].
        continue
    res = '' + page_matches[0]  # base URL prefix + relative detail-page path

    response = requests.get(res)
    data = response.text

    # This lookup was commented out, which left `url_res` undefined and made
    # the request below raise NameError.
    # NOTE(review): the pattern is restored exactly as found in the comment
    # and may itself be damaged -- verify it against the live page.
    url_matches = re.findall('//video (.*?.mp4)', data)
    if not url_matches:
        continue
    url_res = url_matches[0]

    mp4_response = requests.get(url_res)
    mp4_data = mp4_response.content  # raw bytes, not decoded text

    # NOTE(review): every iteration writes the same 'test.mp4', so only the
    # last video survives; add a `break` here to stop after the first one.
    # The context manager closes the handle the original left open.
    with open('test.mp4', 'wb') as f:
        f.write(mp4_data)

''''''

python爬蟲簡單 python爬蟲 簡單版

學過python的帥哥都知道,爬蟲是python的非常好玩的東西,而且python自帶urllib urllib2 requests等的庫,為爬蟲的開發提供大大的方便。這次我要用urllib2,爬一堆風景。先上重點 1 response urllib2.urlopen url read 2 soup...

簡單python爬蟲

一段簡單的 python 爬蟲程式,用來練習挺不錯的。讀出一個url下的a標籤裡href位址為.html的所有位址 一段簡單的 python 爬蟲程式,用來練習挺不錯的。讀出一個url下的a標籤裡href位址為.html的所有位址 usr bin python filename test.py im...

Python簡單爬蟲

一.獲取整個頁面的資料 coding utf 8 import urllib defgethtml url page urllib.urlopen url html page.read return html html gethtml print html 二.篩選需要的資料 利用正規表示式來獲取想...