爬取foxconn傳媒網獲取電影資訊

2021-09-04 10:55:01 字數 2607 閱讀 1589

#!/usr/bin/env python

# coding=utf-8

用正規表示式取資料

import json
import re
import time

import requests
from bs4 import BeautifulSoup

class downmovie(object):
    """Scrape a movie list page and append each movie's details to a file.

    ``get_html`` fills ``names``, ``urls`` and ``nums`` from the list page,
    ``get_content`` parses one detail page into ``content``, and ``write_m``
    appends one record to a text file.
    """

    # Seconds to wait for the site before a request is abandoned; without a
    # timeout a stalled connection would block the scraper indefinitely.
    REQUEST_TIMEOUT = 10

    # Captures seven (label, value) pairs, 14 groups in total.
    # NOTE(review): apart from 'class="div180">' there is no literal markup
    # between the groups, so the pattern looks like it lost its HTML anchors
    # when the listing was published -- restore them from the target page
    # before relying on the extracted values.
    _INFO_PATTERN = re.compile(
        ''
        + '(.*?)(.*?).*?'  # rating
        + '(.*?)(.*?).*?'  # duration
        + '(.*?)(.*?).*?'  # producer
        + '(.*?)(.*?).*?'  # editor
        + '(.*?)(.*?).*?'  # screenplay
        + '(.*?)(.*?).*?class="div180">'  # actor
        + '(.*?)(.*?)',  # introduction
        re.S,
    )

    def __init__(self):
        # NOTE(review): the site host is empty and start_url has no scheme or
        # host in this listing; both must be filled in before requests can
        # fetch anything.
        self.server = ''
        self.start_url = '/video/list/99?page=1'
        self.proxy = 'username:password@host:port'
        # requests expects a scheme -> proxy URL mapping.
        # NOTE(review): the original right-hand side was missing; this mapping
        # is derived from self.proxy -- confirm it matches the proxy in use.
        self.proxies = {
            'http': 'http://' + self.proxy,
            'https': 'https://' + self.proxy,
        }
        self.names = []    # movie names
        self.urls = []     # detail-page URL of each movie
        self.content = {}  # details of the most recently parsed movie
        self.nums = 0      # number of movies collected

    def get_html(self):
        """Fetch the list page and collect movie names and detail-page URLs.

        Fills ``self.names`` and ``self.urls`` (parallel lists) and sets
        ``self.nums`` to their length.

        Returns:
            None
        """
        html = requests.get(
            self.start_url,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): no attribute filter is given, so every <a> on the
        # page is considered; presumably a class filter is needed here.
        anchors = soup.find_all('a')

        self.names = []
        self.urls = []
        # NOTE(review): the loop body was missing from the listing; it is
        # reconstructed from how names/urls are consumed by the entry point
        # (indexed in parallel, urls passed straight to get_content).
        for each in anchors:
            href = each.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue
            self.names.append(each.get_text(strip=True))
            self.urls.append(self.server + href)
        # Count only the entries that were kept so that indexing
        # names/urls up to nums can never run past the end.
        self.nums = len(self.urls)

    def get_content(self, url):
        """Fetch a movie detail page and extract its labelled fields.

        Parameters:
            url - link to the movie's detail page

        Returns:
            (content, userurl): ``content`` maps each captured label to its
            value and is also stored on ``self.content``; it is empty when
            the page does not match the pattern. ``userurl`` is ``url`` with
            the ``self.server`` prefix removed.
        """
        html = requests.get(
            url,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        ).text

        # Build a fresh dict per page so fields from a previous movie never
        # leak into this one.
        content = {}
        match = self._INFO_PATTERN.search(html)
        if match is not None:
            fields = match.groups()
            # Groups alternate label, value, label, value, ...
            for label, value in zip(fields[0::2], fields[1::2]):
                content[label.strip()] = value.strip()
        self.content = content

        # NOTE(review): the listing returned an undefined name `userurl` and
        # computed an unused list of <script> tags, so the step that derived
        # it is missing. Fall back to the page path relative to self.server,
        # which write_m re-prefixes with self.server.
        if self.server and url.startswith(self.server):
            userurl = url[len(self.server):]
        else:
            userurl = url
        return self.content, userurl

    def write_m(self, filename, movie_name, content, dlurl):
        """Append one movie record to a UTF-8 text file.

        Parameters:
            filename - path of the output file (string)
            movie_name - movie title (string)
            content - movie details; must be JSON-serializable
            dlurl - URL path written after being prefixed with self.server

        Returns:
            None
        """
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(movie_name + '\n')
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            fp.write(json.dumps(content, ensure_ascii=False) + '\n')
            fp.write(self.server + dlurl)
            fp.write('\n\n')

# Script entry point: collect the movie list, then fetch and save each movie.
if __name__ == '__main__':
    dl = downmovie()
    dl.get_html()
    # names and urls are parallel lists; pairing them directly avoids
    # indexing past the end if the two ever differ in length.
    for name, url in zip(dl.names, dl.urls):
        content, dlurl = dl.get_content(url)
        dl.write_m('movies.txt', name, content, dlurl)
        # Pause between detail pages to avoid hammering the site.
        time.sleep(1)

爬取豆瓣網電影資訊

coding utf 8 import urllib2 import bs4 from bs4 import beautifulsoup 爬取豆瓣網電影簡介,包括電影名,導演,評分以及介紹等 class dbtop def init self self.usr agent mozilla 5.0 w...

Scrapy爬取1905電影網電影資料

import scrapy class movie1905item scrapy.item define the fields for your item here like name scrapy.field 電影名稱 movie name scrapy.field 評分rating scrapy...

python爬蟲 爬取豆瓣網電影資訊

豆瓣網 如下 import requests import urllib.request if name main 指定ajax get請求的url 通過抓包進行獲取 url 定製請求頭資訊,相關的頭資訊必須封裝在字典結構中 headers import requests import urllib...