爬取糗事百科段子

2021-09-02 17:33:56 字數 2814 閱讀 4905

#!/usr/bin/env python

# coding=utf-8

# @author: holley

# @file: baike1.py

# @datetime: 4/12/2018 14:32

'''description:

'''
import csv
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree

class crawler(object):
    """Scrape joke posts from listing pages and append them to a CSV file.

    NOTE(review): this class was reconstructed from a copy whose indentation,
    identifier casing and several literals (URLs, request headers, HTML
    attribute filters, the gender regex) had been stripped.  Every value
    flagged with NOTE(review) below is a best-effort reconstruction and must
    be confirmed before the crawler is used.
    """

    def __init__(self):
        # NOTE(review): both URLs are empty in the available copy.
        self.base_url = ''
        self.start_url = ''
        # Output file that write_csv() appends to.
        self.csvfilename = 'qiushibaike.csv'
        # NOTE(review): the original header dict was lost; an empty dict
        # makes requests fall back to its default headers.
        self.headers = {}
        # get_html() reads self.proxies, which the original never defined.
        # None tells requests to use no explicit proxy mapping.
        self.proxies = None

    # Disabled "next page" helper, kept for reference.
    # NOTE(review): the attribute filter of the first find() was stripped.
    # def get_urls(self, url):
    #     html = requests.get(url, proxies=self.proxies).text
    #     soup = BeautifulSoup(html, 'lxml')
    #     button = soup.find('ul', ).find_all('li')[-1]
    #     url = button.find('a')['href']
    #     return self.base_url + url

    def get_html(self, url):
        """Download *url* and return the response body as text.

        Raises whatever ``requests.get`` raises (connection errors, timeouts).
        """
        response = requests.get(
            url,
            headers=self.headers,
            proxies=self.proxies,
            timeout=10,  # without a timeout requests can block indefinitely
        )
        return response.text

    def get_contents(self, html):
        """Parse one listing page into rows of post data.

        Args:
            html: markup of a listing page, as returned by get_html().

        Returns:
            A list of ``[user_id, gender, age, joke]`` rows, one per post
            that contains a joke body.
        """
        selector = etree.HTML(html)
        data_list = []
        parts = selector.xpath('//*[@id="content-left"]/div')

        # NOTE(review): the original pattern was stripped.  This one assumes
        # markup such as <div class="articleGender manIcon">25</div> and
        # captures the gender word directly, so the original follow-up
        # split('gender ') / split('icon') step is no longer needed.
        gender_pattern = re.compile(
            r'<div class="articleGender (.*?)Icon">', re.S)

        # Handle each post separately so its fields stay aligned in one row.
        for part in parts:
            raw = etree.tostring(part)
            soup = BeautifulSoup(raw, 'lxml')
            string_soup = str(raw, encoding="utf-8")

            try:
                # User name.
                user_id = soup.find('h2').string.strip()
                # User gender.
                gender = gender_pattern.search(string_soup).group(1)
                # User age.
                # NOTE(review): class filter reconstructed, see above.
                age = soup.find('div', class_='articleGender').string
            except AttributeError:
                # A lookup returned None: treat the post as anonymous.
                user_id, gender, age = '匿名使用者', ' ', ' '

            # Joke text.
            # NOTE(review): class filter reconstructed; confirm on the site.
            joke_div = soup.find('div', class_='content')
            joke_span = joke_div.find('span') if joke_div is not None else None
            if joke_span is None:
                # Not a joke entry (or the markup changed): skip it rather
                # than crash on None.
                continue
            joke = joke_span.get_text().strip()

            # NOTE(review): the original went on to extract more fields here,
            # but that code is missing from the available copy.
            data_list.append([user_id, gender, age, joke])

        return data_list

    def write_csv(self, data):
        """Append every row of *data* to ``self.csvfilename``.

        The file is opened in append mode because the script calls this
        method once per listing page.

        NOTE(review): the original ``def`` line and file handling were lost;
        the signature follows the ``c.write_csv(data)`` call in the script
        and UTF-8 is an assumed encoding.
        """
        with open(self.csvfilename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for items in data:
                writer.writerow(items)

if __name__ == '__main__':
    # Script entry point: fetch listing pages 1..13, parse each one and
    # append its rows to the CSV file.
    # NOTE(review): base_url is empty in the available copy; the listing URL
    # prefix must be filled in before this can fetch anything.
    base_url = ''
    c = crawler()
    for i in range(1, 14):
        # Page URLs have the form <base_url><page number>/.
        start_url = base_url + str(i) + '/'
        html = c.get_html(start_url)
        data = c.get_contents(html)
        c.write_csv(data)

Scrapy 爬取糗事百科段子

1.python爬蟲實戰一之爬取糗事百科段子 2.在工作目錄建立myproject:scrapy startproject myproject 3.編寫 myproject/myproject/items.py:# -*- coding: utf-8 -*- # Define here the models for your ...

爬取糗事百科,朗讀段子

一閒下來就不務正業了,寫個爬蟲,聽段子。額,mac自帶的語音朗讀,windows我就不知道啦,有興趣的可以去研究一下哈。環境:python 2.7、mac os 10.12。使用朗讀的方法:from subprocess import call; call(["say", "hello pengge"])。當然了,聽起來...

爬取糗事百科段子內容

import requests,sqlite3,re class processdatatool object 資料處理的工具類 工具類中一般不寫 init 初始化屬性,只封裝工具方法對資料進行操作。工具類中的方法一般是以工具類居多。classmethod def process data cls,...