selenium爬取資料 儲存

2022-08-31 23:51:17 字數 3370 閱讀 5576

1 爬取資料

#coding=utf-8

# Standard library
import re
import sys

# Third-party: selenium webdriver plus explicit-wait helpers.
# (Class/module names restored to their correct casing: By, WebDriverWait,
# expected_conditions, TimeoutException, PyQuery.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# TimeoutException is raised when an explicit wait times out.
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq

# Local modules
from config import *
# Database helper (see mysqlop.py below).
import mysqlop

# Launch a Chrome browser instance (the class is webdriver.Chrome,
# not webdriver.chrome).
driver = webdriver.Chrome()

# Alternative headless driver (deprecated in modern selenium):
# driver = webdriver.PhantomJS()

# NOTE(review): the original URL was lost in extraction (driver.get("")).
# The selectors used below (#q, #J_TSearchForm, #mainsrp-*) belong to
# Taobao's search page -- confirm this is the intended target.
driver.get("https://www.taobao.com")
driver.set_window_size(1400, 900)

# Shared explicit wait: poll up to 10 seconds for each expected condition.
wait = WebDriverWait(driver, 10)

def search():
    """Submit a search for "美食" (food) and scrape the first result page.

    Retries itself once more on a wait timeout. Relies on the
    module-level ``driver`` / ``wait`` objects and on ``get_goods()``.
    """
    try:
        # BUG FIX: expected_conditions take ONE locator tuple
        # (By.CSS_SELECTOR, "#q"), not two positional arguments.
        # Also renamed `input` -> `search_box` to stop shadowing the builtin.
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
        search_box.clear()
        search_box.send_keys("美食")
        submit.click()
        # Scrape page 1 of the results.
        get_goods()
    except TimeoutException:
        # Page loaded too slowly -- retry the whole search.
        search()

#獲取總頁碼

def get_total():
    """Return the raw text of the "total pages" element.

    The caller extracts the integer with a regex, so the raw localized
    string (e.g. "共 100 頁") is returned unchanged.
    """
    total = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
    return total.text

# Jump to a given result page. (BUG FIX: the original line fused the
# comment and the def -- "#翻頁def next_page(page):" -- which is a syntax
# error; they are separated here.)
def next_page(page):
    """Navigate to result page ``page`` and scrape it.

    Returns the number of goods stored from that page. Retries itself
    on a wait timeout.
    """
    try:
        page_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        page_input.clear()
        page_input.send_keys(page)
        submit.click()
        # Wait until the highlighted pager item shows the requested page.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
            str(page)))
        count = get_goods()
    except TimeoutException:
        # BUG FIX: propagate the retry's result. The original fell
        # through to `return count` with `count` unbound after a
        # timeout, raising UnboundLocalError.
        return next_page(page)
    return count

def get_goods():
    """Parse every goods item on the current result page, persist each
    one via ``mysqlop.mysqlop``, and return the number of items stored.
    """
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    # Parse the rendered DOM with PyQuery rather than locating each
    # field through the driver -- one page_source grab, many lookups.
    html = driver.page_source
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    count = 0
    for item in items:
        # NOTE(review): the original dict literal was lost during
        # extraction (the line read just "goods="). Reconstructed from
        # the columns mysqlop.mysqlop() inserts
        # (image, price, deal, title, shop, location) -- confirm the
        # sub-selectors against the live page markup.
        goods = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(goods)
        # Persist this row into MySQL.
        mysqlop.mysqlop(goods)
        count += 1
    return count

def main():
    """Drive the full crawl: run the search, read the page count, then
    walk every remaining page and report how many items were stored.
    """
    search()
    # Pull the first integer out of the localized "total pages" text.
    match = re.compile(r"(\d+)").search(get_total())
    page_count = int(match.group(1))
    print(page_count)
    grand_total = 0
    for page in range(2, page_count + 1):
        grand_total += next_page(page)
    print(grand_total)


if __name__ == "__main__":
    main()

2 存入到mysql中

建立一個mysqlop.py的檔案

#coding=utf-8

from pymysql import *

def mysqlop(goods):
    """Insert one goods record into the ``goods`` table.

    ``goods`` is a dict with keys image/price/deal/title/shop/location.
    Opens a fresh connection per call (simple but slow for bulk loads).

    BUG FIX: the original leaked the connection and cursor whenever the
    INSERT raised; both are now released in ``finally`` blocks.
    """
    conn = connect(host='127.0.0.1', port=3306, user='root',
                   passwd='1qaz2wsx#edc', db='taobao_meishi',
                   charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            # Parameterized query -- values are escaped by the driver,
            # so scraped text cannot inject SQL.
            cursor.execute(
                "insert into goods(image,price,deal,title,shop,location) "
                "values(%s,%s,%s,%s,%s,%s)",
                (goods['image'], goods['price'], goods['deal'],
                 goods['title'], goods['shop'], goods['location']))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

使用selenium爬拉勾網資料

usr bin env python encoding utf 8 description 使用selenium爬拉勾網資料 from selenium import webdriver from selenium.webdriver.support.ui import webdriverwait ...

selenium 爬取拉勾

用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...

selenium爬取拉勾

用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...