python實現簡單爬蟲 Python實現簡單爬蟲

2021-10-19 02:29:33 字數 3970 閱讀 8023

簡介

爬蟲架構

1)url管理器

2)網頁下載器

3)網頁分析器

4)爬蟲呼叫器

5)價值資料使用

爬蟲實現

1)排程器實現# coding:utf-8

import url_manager

import html_downloader

import html_parser

import html_outputer

import url_manager

class spidermain(object):
    """Crawler scheduler: wires together the URL manager, downloader,
    parser and outputer, and drives the crawl loop.

    Collaborators come from the article's sibling modules
    (url_manager, html_downloader, html_parser, html_outputer).
    """

    def __init__(self):
        self.urls = url_manager.urlmanager()              # new/seen URL sets
        self.downloader = html_downloader.htmldownloader()  # fetches pages
        self.parser = html_parser.htmlparser()            # extracts links + data
        self.outputer = html_outputer.htmloutputer()      # accumulates results

    def craw(self, root_url):
        """Crawl breadth-first from root_url, visiting at most 1000 pages,
        then write the collected records out as HTML."""
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Hard cap so a link-rich site cannot crawl forever.
                if count == 1000:
                    break
                count = count + 1
            except Exception:
                # One failed page should not abort the whole crawl;
                # narrowed from a bare except so SystemExit etc. still propagate.
                print("craw failed")
        self.outputer.output_html()

if __name__ == "__main__":
    # Script entry point: seed the crawl (the article leaves the seed
    # URL empty for the reader to fill in) and run the scheduler.
    seed_url = ""
    spider = spidermain()
    spider.craw(seed_url)

# 2) URL manager: tracks the set of URLs still to crawl and the set
# already crawled, so no page is fetched twice.
class urlmanager(object):

    def __init__(self):
        self.new_urls = set()  # URLs queued and not yet handed out
        self.old_urls = set()  # URLs already handed out via get_new_url

    def add_new_url(self, url):
        """Queue one URL; ignores None and anything already known."""
        if url is None:  # was lowercase `none` (NameError) in the scraped text
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; None or empty batches are ignored."""
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """True while at least one URL is still pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop an arbitrary pending URL and mark it as crawled."""
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

# 3) HTML downloader: fetches the raw body of a page over HTTP.
# (The class name was masked to `html**********` in the scraped article;
# reconstructed as `htmldownloader` from the scheduler's usage.)
class htmldownloader(object):

    def download(self, url):
        """Return the response body for url, or None for a None url
        or a non-200 HTTP status."""
        if url is None:  # was lowercase `none` (NameError) in the scraped text
            return None
        # Local import keeps this article snippet self-contained;
        # urllib2 (used in the original Python 2 code) has no import anywhere.
        import urllib.request
        response = urllib.request.urlopen(url)
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # The original leaked the connection; always close it.
            response.close()

4)url解析器實現from bs4 import BeautifulSoup

5)價值資料輸出顯示# coding:utf-8

# 5) Outputer: accumulates the crawled records and renders them as an
# HTML table in output.html. The tag literals were stripped from the
# scraped article; the markup below reconstructs the conventional layout.
class htmloutputer(object):

    def __init__(self):
        # Each record is a dict; output_html reads its
        # 'url', 'title' and 'summary' keys.
        self.datas = []  # was `self.datas =` (syntax error) in the scraped text

    def collect_data(self, data):
        """Store one parsed record; None records are ignored.
        (The scraped original dropped the append and discarded every record.)"""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write every collected record to output.html as one table row."""
        # `with` + utf-8 replaces the original's manual close and the
        # Python 2 .encode('utf-8') calls on title/summary.
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")
            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'])
                fout.write("<td>%s</td>" % data['summary'])
                fout.write("</tr>")
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

執行

Python實現簡單爬蟲

簡單爬蟲構架 時序圖 管理待抓取url集合和已抓取url集合 通過兩個列表 已抓取url列表,未抓取url的列表 防止重複抓取 防止迴圈抓取 request.add header user agent mozilla 5.0 偽裝成火狐瀏覽器 urllib2.install opener opene...

Python實現簡單爬蟲

簡單爬蟲構架 時序圖 管理待抓取url集合和已抓取url集合 通過兩個列表 已抓取url列表,未抓取url的列表 防止重複抓取 防止迴圈抓取 request.add header user agent mozilla 5.0 偽裝成火狐瀏覽器 urllib2.install opener opene...

python實現簡單爬蟲功能

我們最常規的做法就是通過滑鼠右鍵,選擇另存為。但有些滑鼠右鍵的時候並沒有另存為選項,還有辦法就通過就是通過截圖工具擷取下來,但這樣就降低的清晰度。好吧 其實你很厲害的,右鍵檢視頁面源 我們可以通過python 來實現這樣乙個簡單的爬蟲功能,把我們想要的 爬取到本地。下面就看看如何使用 python ...