python爬蟲入門簡單爬蟲

2021-07-23 03:58:21 字數 3718 閱讀 6297

# -*- coding: utf-8 -*-

import os
import sys
import time

from bs4 import BeautifulSoup, SoupStrainer
from threading import Lock, Thread
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen, urlretrieve

begin = ''

picset = set()

meiziset = set()

look = lock()

其中我將程式碼分成了幾個函式。本來是想用物件導向的方式進行的,但是想了半天不知道如何動手,所以就直接寫成方法了。

def

gethtml

(url):

''' @url:需要獲取html文件的url

'''f = urlopen(url)

if f.geturl() != url:

print

'chong ding xiang ',f.geturl

return

none

data = f.read()

f.close()

return data

獲取url中的鏈結

def

geturllist

(data,so=none,finds=none,ss = 'href'):

''' @so:soupstrainer的乙個例項

@finds:查詢的內容的標籤

@ss:需要從標籤中獲取的內容

'''soup = beautifulsoup(data,'html.parser')

links = soup.find(so)

for x in links.find_all(finds):

yield x[ss]

#return set(x[ss]for x in links.find_all(finds))

defdownload

(url):

''' @url:的src

'''site = urlparse(url).netloc.split('@')[-1].split(':')[0]

if site.startswith('www'):

print

'skipping this url'

return

path = url[-18:]

dirs = '/home/young/mei/'

name = dirs+path.replace('/','_')

ifnot os.path.exists(name):

data = urlretrieve(url,name)

else:

print

'cunzai'

defgetpageurl

():'''

'''global begin

global picset

data = gethtml(begin)

so = soupstrainer('div',class_="tags")

for cs in set(geturllist(data,so,'a')):

print

'\nfrom ',cs,"get html"

data = gethtml(cs)

so = soupstrainer(class_='wp-list clearfix')

s = geturllist(data,so,'a')

with look:

picset.update(s)

so = soupstrainer('div',id='wp_page_numbers')

numset = set(urljoin('',x)for x in geturllist(data,so,'a'))

print

'there are ',len(numset),'numbers'

for nu in numset:

print nu

data = gethtml(nu)

so = soupstrainer(class_='wp-list clearfix')

lists = geturllist(data,so,'a')

with look:

picset.update(lists)

ef getpicurl():

''''''

global picset

while

true:

with look:

try:

url = picset.pop()

except keyerror:

print

'pic is empty'

break

print

'from picset ',url

data = gethtml(url)

so = soupstrainer('div',class_="postcontent")

lists = geturllist(data,so,'img','src')

with look:

meiziset.update(lists)

defgetpic

():'''

'''global meiziset

while

true:

with look:

try:

url = meiziset.pop()

except keyerror:

print

'download error'

break

print

'download ',url

download(url)

defmain

():print

'begin page_thread'

page_thread = thread(target=getpageurl)

page_thread.start()

time.sleep(20)

print

'begin url_thread'

url_thread = thread(target=getpicurl)

url_thread.start()

time.sleep(40)

print

'begin pic_thread'

pic_thread = thread(target=getpic).start()

time.sleep(60)

print

'\n start two threading'

pic_thread1 = thread(target=getpic).start()

pic_thread3 = thread(target=getpic).start()

time.sleep(60)

print

'\n start two threading'

pic_thread2 = thread(target=getpic).start()

pic_thread4 = thread(target=getpic).start()

python爬蟲簡單入門

coding utf 8 from bs4 import beautifulsoup,soupstrainer from threading import lock,thread import sys,time,os from urlparse import urlparse,urljoin fro...

python爬蟲簡單 python爬蟲 簡單版

學過python的帥哥都知道,爬蟲是python的非常好玩的東西,而且python自帶urllib urllib2 requests等的庫,為爬蟲的開發提供大大的方便。這次我要用urllib2,爬一堆風景。先上重點 1 response urllib2.urlopen url read 2 soup...

Python簡單爬蟲入門二

上一次我們爬蟲我們已經成功的爬下了網頁的源 那麼這一次我們將繼續來寫怎麼抓去具體想要的元素 首先回顧以下我們beautifulsoup的基本結構如下 usr bin env python coding utf 8 from bs4 import beautifulsoup import reques...