python爬豆瓣 Python 爬一下豆瓣電影

2021-10-11 11:57:50 字數 2986 閱讀 5457

簡介

純屬python小練習

檔案結構

html_downloader.py - 下載器:下載網頁內容

#!/usr/bin/python

# -*- coding: utf-8 -*-

import urllib2

class htmldownloader(object):
    """Fetch the raw body of a web page.

    The identifier is masked with asterisks in the published listing; the
    ten masked characters are restored here as ``downloader``.
    """

    def downlod(self, url):
        """Return the body of ``url``, or ``None`` when it cannot be used.

        The method name keeps its original spelling because the crawler
        calls it as ``downlod``.

        Args:
            url: Address to fetch. ``None`` short-circuits before any
                network access happens.

        Returns:
            Whatever ``response.read()`` yields, or ``None`` when ``url`` is
            ``None`` or the response status code is not 200.
        """
        if url is None:
            return None

        response = urllib2.urlopen(url)
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Release the connection on every path, including the early
            # return for a non-200 status.
            response.close()

html_outputer.py - 輸出結果到檔案中

#!/usr/bin/python

# -*- coding: utf-8 -*-

class htmloutputer(object):
    """Append scraped movie records to ``output.html`` as comma-separated rows."""

    def collect_data(self, movie_data):
        """Echo every movie to stdout and append it to ``output.html``.

        Args:
            movie_data: Iterable of dicts holding text values under the
                keys ``name``, ``rate`` and ``actor``. ``None`` is ignored
                and nothing is written.

        Returns:
            None.
        """
        if movie_data is None:
            return

        # Imported here so the method is self-contained; io.open gives the
        # same explicit-encoding text mode on Python 2 and Python 3.
        import io

        # The context manager closes the file even if a record is malformed.
        with io.open('output.html', 'a', encoding='utf-8') as fout:
            for data in movie_data:
                fields = (data['name'], data['rate'], data['actor'])
                # Single-argument print works as a statement (Python 2) and
                # as a function (Python 3) with identical output.
                print(u'%s| %s| %s \n' % fields)
                fout.write(u'%s,%s,%s\n' % fields)

html_parser.py - 解析器:解析 HTML 的 DOM 樹

#!/usr/bin/python

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

class htmlparser(object):
    """Extract movie records from the HTML of a movie listing page.

    The selectors below are tied to one specific page layout (presumably a
    Douban movie chart page -- confirm against the URL actually crawled).
    """

    def parser_html(self, cnt):
        """Parse page markup and return the movies found in it.

        Args:
            cnt: Raw HTML of the listing page, or ``None``.

        Returns:
            A list of dicts as built by ``get_movie_name``, or ``None``
            when ``cnt`` is ``None``.
        """
        if cnt is None:
            return None

        soup = BeautifulSoup(cnt, 'html.parser', from_encoding='utf-8')
        return self.get_movie_names(soup)

    def get_movie_names(self, soup):
        """Return one record per movie ``<table>`` in the parsed page.

        The published listing had lost both the list initialiser and the
        per-movie call inside the loop; they are restored here so the
        method actually collects what ``get_movie_name`` extracts.

        Args:
            soup: Parsed document for the whole listing page.

        Returns:
            A list of dicts with the keys ``name``, ``actor`` and ``rate``.
        """
        # Movie tables sit in the second <div> sibling that follows the
        # first <table> after <div class="article">.
        movie_all = (
            soup.find('div', class_='article')
            .find_next('table')
            .find_next_sibling('div')
            .find_next_sibling('div')
            .find_all('table')
        )
        return [self.get_movie_name(movie_one) for movie_one in movie_all]

    def get_movie_name(self, cnt):
        """Extract name, cast line and rating from one movie table.

        Args:
            cnt: The ``<table>`` element (or its markup) for a single movie.

        Returns:
            A dict with ``name`` and ``actor`` (newlines and spaces
            stripped) and ``rate`` (text of the rating span).
        """
        soup = BeautifulSoup(str(cnt), 'html.parser', from_encoding='utf-8')
        # The details live in the second cell of the "item" row.
        movie_one = (
            soup.find('tr', class_='item')
            .find_next('td')
            .find_next_sibling('td')
            .find('div', class_='pl2')
        )

        name = movie_one.find('a').get_text()
        actor = movie_one.find('p', class_='pl').get_text()
        rate = (
            movie_one.find('div', class_='star clearfix')
            .find('span', class_='rating_nums')
            .get_text()
        )
        return {
            'name': name.replace("\n", "").replace(" ", ""),
            'actor': actor.replace("\n", "").replace(" ", ""),
            'rate': rate,
        }

spider_main.py - 主函式

#!/usr/bin/python

# -*- coding: utf-8 -*-

# Module and attribute names masked with asterisks in the published listing
# are restored as "downloader".
import html_downloader
import html_outputer
import html_parser


class spidermain(object):
    """Wire the downloader, parser and outputer into a single crawl step."""

    def __init__(self):
        """Create one instance of each collaborator."""
        self.parser = html_parser.htmlparser()
        self.outputer = html_outputer.htmloutputer()
        self.downloader = html_downloader.htmldownloader()

    def craw(self, url):
        """Download ``url``, parse the page and pass the result to the outputer.

        Args:
            url: Address of the listing page to crawl.
        """
        html_cnt = self.downloader.downlod(url)
        movie_data = self.parser.parser_html(html_cnt)
        self.outputer.collect_data(movie_data)

if __name__ == '__main__':
    # The start URL is empty in the published listing; fill in the address
    # of the listing page to crawl before running the script.
    url = ''
    spider = spidermain()
    spider.craw(url)

綜述

其實就是使用了 urllib2 和 BeautifulSoup 庫,沒啥好說的。你也可以直接改 url,然後更改 html_parser.py 檔案來滿足你自己的爬蟲需求。當然也可以更改 html_outputer.py 來定義儲存格式,目前是 csv。

python爬取豆瓣影評

看的別人的 爬取某部影片的影評 沒有模擬登入只能爬6頁 encoding utf 8 import requests from bs4 import beautifulsoup import re import random import io import sys import time 使用se...

python爬取資料豆瓣讀書

xpath爬取指令碼 from urllib import request from lxml import etree base url response request.urlopen base url html response.read decode utf 8 htmls etree.ht...

python爬取豆瓣網頁短評實戰!

首先我們開啟我的父親母親的網頁介面 鏈結 可以觀察到如下介面以及讀者對本書的評價 接下來我們直接附上 書名 我的父親母親 出版社 南海出版公司 原作名 alfred and emily 譯者 匡詠梅 出版年 2013 1 頁數 238 定價 29.50元 裝幀 精裝 叢書 新經典文庫 萊辛作品 is...