shikaobang Python crawler script


"""

"""import pandas as pd

import urllib

import urllib2

from bs4 import beautifulsoup

import codecs

import re

a1 = 101500 #需要自己修改起始值

urlname_list =

url_name_start = u'/questionbank/5ymjvwgym6' #填入查詢到開始的urlname

url_name_end = u'/questionbank/g5mbgom1ax' #填入查詢到最後的urlname

a = 1

b = 1

while true:

url_name = "" + url_name_start

user_agent = "mozilla/5.0 (x11; u; linux x86_64; zh-cn; rv:1.9.2.10) gecko/2011122 ubuntu/10.10 (m**erick) firefox/2.5.1"

request = urllib2.request(url_name, headers=)

html = urllib2.urlopen(request)

html_data = beautifulsoup(html,"html.parser")

if html_data.find(name='a') is none:

urlname_list.pop()

url_name_start = urlname_list[-1]

continue

for m in html_data.find_all(href=re.compile("/questionbank/")) :

if m['href'] == url_name_end:

break

else:

a = a + 1

url_name_start = urlname_list[-1]

if url_name_end == url_name_start:

break

print u"查詢結果共" + str(a) + u"條"

print u"最終查詢結果共" + str(a) + u"條"

print u'開始爬取網頁'

# Step 2: download every collected page and save the raw HTML to disk.
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import time

time_start = time.time()  # set but never read in the surviving code

# Adjust the page numbering that corresponds to the questions.
a2 = a1  # remember where this batch's numbering starts
for i in urlname_list:
    try:
        url_name = "" + i  # base URL blanked out in the original post
        user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
        request = urllib2.Request(url_name, headers={'User-Agent': user_agent})
        html = urllib2.urlopen(request)
        f = codecs.open('html/sz_' + str(a1), 'w')
        f.write(html.read())
        f.close()
        a1 = a1 + 1
    except:
        print i  # log the urlname that failed to download

# Round up to the next hundred, e.g. a1 = 101637 gives 101700.
print "Use this as the starting value next time: " + str((int(a1/100)+1)*100)
print "Crawling finished; starting to process the text"

# Parse one saved question page. The attrs={...} selector dicts were
# stripped when the post was published; the empty dicts below are
# placeholders that must be filled in by hand.
def html_chuli(html):
    html_data = BeautifulSoup(html, "html.parser")
    t_miaosu = html_data.find(attrs={})['content']  # question description (selector lost)
    t_news_title = html_data.find_all(attrs={})     # related-news titles (selector lost)
    t_news_typs = html_data.find_all(attrs={})      # related-news types (selector lost)
    t_news_time = html_data.find_all(attrs={})      # related-news timestamps (selector lost)
    tdata1 = html_data.find("div", attrs={})        # grab the first question frame (selector lost)
    if tdata1:
        t_leixing = tdata1.select('span')[0].string                # question type
        t_content = tdata1.select('div.question-title')[0].string  # question body (in CSS selectors, # is an id and . is a class)
        t_xueze = tdata1.select('div.question-item')               # all answer options
        x_abcd = []     # option letters A/B/C/D
        x_content = []  # text of each option
        z_xueze = []    # correct option(s)
        # The bodies of the three loops below were lost in the original post;
        # they are restored to match how the results are consumed later on.
        for item in t_xueze:
            item_middle = item.get_text().split()
            x_abcd.append(item_middle[:1])     # first token: the option letter
            x_content.append(item_middle[1:])  # remaining tokens: the option text
        for item in tdata1.select('label.actives'):  # multiple-choice answers
            z_xueze.append(item.get_text())
        for item in tdata1.select('div.question-item.correct i'):  # true/false answers
            z_xueze.append(item.get_text())
        return t_miaosu, t_leixing, t_content, x_abcd, x_content, z_xueze, t_news_title, t_news_typs, t_news_time
    else:
        return '0'
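For reference, here is a guess at what the stripped selectors might have looked like. The `['content']` lookup suggests t_miaosu came from a meta tag; every attribute value below (description, news-title, question-box, and so on) is a hypothetical stand-in, not recovered from the actual site:

    # Hypothetical reconstruction -- the real attribute values were lost.
    t_miaosu = html_data.find(attrs={'name': 'description'})['content']  # <meta name="description" content="...">
    t_news_title = html_data.find_all(attrs={'class': 'news-title'})     # hypothetical class
    t_news_typs = html_data.find_all(attrs={'class': 'news-type'})       # hypothetical class
    t_news_time = html_data.find_all(attrs={'class': 'news-time'})       # hypothetical class
    tdata1 = html_data.find("div", attrs={'class': 'question-box'})      # hypothetical class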

# Step 3: text processing -- parse every saved HTML file and write one
# '#'-separated line per question.
import pandas as pd
import urllib
import urllib2
import re
import json
import random
from bs4 import BeautifulSoup
import codecs

# Adjust the file-number range to match the batch extracted above.
for i in range(a2, a1):
    try:
        with open('html/sz_' + str(i), 'r') as f:
            s_1 = ""  # all options, joined
            s_2 = ""  # correct answers, joined
            t_n = ""  # related news, joined
            contents = f.read().decode("utf-8", "ignore")  # drop undecodable bytes
            t_miaosu, t_leixing, t_content, x_abcd, x_content, z_xueze, t_news_title, t_news_typs, t_news_time = html_chuli(contents)
            for m in range(len(x_abcd)):
                if x_abcd[m][0]:
                    s1 = x_abcd[m][0]
                else:
                    s1 = ""
                if x_content[m][0]:
                    s2 = x_content[m][0]
                else:
                    s2 = ""
                s_1 = s_1 + s1 + ":" + s2 + " "
            for n in range(len(z_xueze)):
                s_2 = s_2 + z_xueze[n].strip()
            for z in range(len(t_news_title)):
                if t_news_title[z]:
                    new1 = t_news_title[z].text
                else:
                    new1 = ""
                if t_news_typs[z]:
                    new2 = t_news_typs[z].text
                else:
                    new2 = ""
                if t_news_time[z]:
                    new3 = t_news_time[z].text
                else:
                    new3 = ""
                t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"
            if t_leixing is None:
                continue
            k1 = str(i) + "#" + t_miaosu.replace("\n", "") + "#" + t_leixing + "#" + t_content.replace(" ", "").replace("\n", "") + "#" + s_1.replace("\n", "") + "#" + s_2.replace("\n", "") + "#" + t_n.replace("\n", "")
            f1 = codecs.open(u'out/時政202011-20210325.txt', 'a', encoding="utf-8")  # change the output file name for each run
            f1.write(k1 + "\n")
            f1.close()  # closed inside the loop; the original closed f1 only once, after the loop
    except:
        f2 = codecs.open('out/fail_num.txt', 'a', encoding="utf-8")
        f2.write(str(i) + "\n")
        f2.close()
        print str(i) + u": failed to import this HTML file!"

print u"Done! Before running again, change the output file name, save the .py file, and start over!!!"

This code is kept here for archival purposes only; it no longer works.
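Since urllib2 exists only in Python 2 and the site has changed, the script above cannot run as-is. For anyone revisiting the idea, here is a minimal Python 3 sketch of the link-walking step using requests; BASE_URL is left blank just as in the original, and the function is an illustration under those assumptions rather than a drop-in replacement:

    import re
    import requests
    from bs4 import BeautifulSoup

    BASE_URL = ''  # the site's base URL, blanked out as in the original
    HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def collect_urlnames(start, end):
        """Follow /questionbank/ links from start until end is seen."""
        urlnames = [start]
        while urlnames[-1] != end:
            page = requests.get(BASE_URL + urlnames[-1], headers=HEADERS)
            soup = BeautifulSoup(page.text, 'html.parser')
            links = [a['href'] for a in soup.find_all(href=re.compile('/questionbank/'))]
            if not links:  # dead page: back up one step, as the original did
                urlnames.pop()
                continue
            for href in links:
                urlnames.append(href)
                if href == end:
                    break
        return urlnames

    # Usage: collect_urlnames(u'/questionbank/5ymjvwgym6', u'/questionbank/g5mbgom1ax')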
