lxml爬取豆瓣

2021-08-15 22:22:39 字數 3421 閱讀 1651

#coding:utf-8

#__author__='wang'

#fake-useragent:第三方庫user-agent模組,它提供了最新的,最全面的user-agent瀏覽器標識,支援谷歌,火狐,IE,Opera等主流瀏覽器的user-agent值

#安裝方法:pip install fake-useragent

import codecs
import csv
import re

import requests
# Third-party library that supplies random User-Agent header values.
from fake_useragent import UserAgent
from lxml import etree

class dbmovie(object):
    """Scrape a paginated movie ranking with lxml and save it to ``movie.csv``.

    Typical use is ``open_file()`` followed by ``get_page_code()``.  Each page
    is fetched, parsed, written to the CSV and then the "next" link is
    followed, until a page without a next link is reached.

    Targets Python 3: text is kept as ``str`` and encoded once, by the file
    object, instead of being encoded field by field.
    """

    # Column order of the CSV file; also the keys of every movie dict.
    FIELDNAMES = ['movie_rank', 'movie_name', 'movie_member', 'movie_star',
                  'movie_comment', 'movie_quote']

    # First run of digits inside the "N people rated" text.
    _DIGITS = re.compile(r'(\d+)')

    def __init__(self, base_url=''):
        """Create a scraper.

        :param base_url: address of the first listing page; every page URL is
            built as ``base_url + relative_url``.
        """
        # NOTE(review): the URL literal is empty in the published source and
        # was probably stripped when the article was extracted.  The XPath
        # selectors look like the Douban Top 250 listing -- confirm and pass
        # that listing's URL here.
        self.base_url = base_url
        self.ua = UserAgent()
        self.html_obj = None
        self.writer = None
        self._csv_file = None

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath result, or ``default`` when nothing matched."""
        return nodes[0] if nodes else default

    @classmethod
    def _extract_count(cls, text):
        """Return the first run of digits in ``text``, or ``''`` if there is none."""
        match = cls._DIGITS.search(text)
        return match.group(1) if match else ''

    def get_page_code(self, url=""):
        """Download one page, parse it and hand over to ``get_content``.

        :param url: part appended to ``self.base_url`` (empty for the first page)
        :return: None
        :raises requests.RequestException: on network errors, timeouts or a
            non-2xx response
        """
        # Build the complete address of the requested page.
        abs_url = self.base_url + str(url)
        # A fresh random User-Agent per request.
        headers = {'User-Agent': self.ua.random}
        response = requests.get(abs_url, headers=headers, timeout=10)
        # Fail loudly instead of parsing an error page as an empty listing.
        response.raise_for_status()
        self.html_obj = etree.HTML(
            response.content, parser=etree.HTMLParser(encoding='utf-8'))
        self.get_content()

    def _parse_item(self, item_tag):
        """Turn one ``<div class="item">`` element into a CSV row dict.

        Missing nodes yield empty strings rather than raising ``IndexError``.
        """
        # The title is spread over several <span> elements; join the pieces.
        title_parts = item_tag.xpath('.//div[@class="hd"]/a/span/text()')
        count_text = self._first(
            item_tag.xpath('.//div[@class="star"]/span[last()]/text()'))
        return {
            # Rank shown inside the <em> tag.
            'movie_rank': self._first(item_tag.xpath('.//em/text()')),
            'movie_name': ''.join(part.strip() for part in title_parts),
            # First text node of the cast/crew paragraph.
            'movie_member': self._first(
                item_tag.xpath('.//p[@class=""]/text()')).strip(),
            'movie_star': self._first(
                item_tag.xpath('.//span[@class="rating_num"]/text()')),
            'movie_comment': self._extract_count(count_text),
            # One-line quote; not every entry has one.
            'movie_quote': self._first(
                item_tag.xpath('.//span[@class="inq"]/text()')),
        }

    def get_content(self):
        """Extract every movie on the current page and write them out.

        Reads the parsed document from ``self.html_obj`` and passes the list
        of row dicts to ``write_movie_info``.

        :return: None
        """
        item_div = self.html_obj.xpath('//div[@class="item"]')
        movie_list = [self._parse_item(item_tag) for item_tag in item_div]
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """Write the current page's rows to the CSV, then go to the next page.

        :param movie_list: list of dicts keyed by ``FIELDNAMES``
        :return: None
        """
        self.writer.writerows(movie_list)
        # Flush per page so rows already scraped survive a later failure.
        handle = getattr(self, '_csv_file', None)
        if handle is not None:
            handle.flush()
        self.get_next_page_url()

    def open_file(self):
        """Open ``movie.csv`` for writing and emit the header row."""
        # newline='' is what the csv module expects; UTF-8 keeps the Chinese
        # text intact regardless of the platform default encoding.
        self._csv_file = open('movie.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self._csv_file, fieldnames=self.FIELDNAMES)
        self.writer.writeheader()

    def close_file(self):
        """Close the CSV file if it is open; safe to call more than once."""
        handle = getattr(self, '_csv_file', None)
        if handle is not None:
            handle.close()
            self._csv_file = None

    def get_next_page_url(self):
        """Follow the "next" link of the current page.

        When there is no such link, prints a notice, closes the CSV file and
        returns; otherwise continues the crawl via ``get_page_code``.

        :return: None
        """
        hrefs = self.html_obj.xpath('//span[@class="next"]/a/@href')
        if not hrefs:
            print('最後一頁了')
            self.close_file()
            return
        self.get_page_code(hrefs[0])

    def get_content_by_css(self, html_obj):
        """Select the movie entries with a CSS selector instead of XPath.

        :param html_obj: parsed document (lxml element) to search; its
            ``cssselect`` method needs the ``cssselect`` package installed
        :return: whatever ``html_obj.cssselect('.item')`` returns
        """
        return html_obj.cssselect('.item')

if __name__ == '__main__':
    # Script entry point: crawl the listing into movie.csv.
    movie_obj = dbmovie()
    # Open the CSV first; the crawl writes rows through the writer set up here.
    movie_obj.open_file()
    # No argument: start at the base URL; later pages are chained from there.
    movie_obj.get_page_code()

python爬取豆瓣影評

看的別人的 爬取某部影片的影評 沒有模擬登入只能爬6頁 encoding utf 8 import requests from bs4 import beautifulsoup import re import random import io import sys import time 使用se...

nodejs爬取豆瓣影評

爬取豆瓣心靈奇旅影評,包括使用者主頁頭像 let request require request let fs require fs const path require path var startnum 0 起始爬取位置 傳送請求 function reqdata url else 請求處理 a...

豆瓣熱門電影爬取

import requests import json import csv defgetonepagedata page start url headers params response requests.get start url,headers headers,params params i...