一,先看结果
二, 思路
1.使用selenium通过xpath定位并执行点击动作,获取网页源代码
2.将源代码用BeautifulSoup解析
3.使用mysql语法,创建数据库表的结构,实现自动化建表,一键入库
三,上源码
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
from bs4 import BeautifulSoup
import pymysql
import logging
'''
1.
1.selenium+PhantomJs+BeautifulSoup
'''
###################数据库
class Dandan(object):
    """Scrape Taobao search results with Selenium and store them in MySQL.

    Typical use is ``d = Dandan(); d.mysql(); d.get_url()``:

    * ``mysql`` prompts for a table name and creates the table if needed;
    * ``get_url`` prompts for a search keyword, walks the result pages,
      parses each page with BeautifulSoup and inserts one row per listing.

    ``get_url`` closes both the database connection and the browser when it
    finishes, so an instance is good for a single crawl.
    """

    # Locator of the "next page" control in the search-result pager.
    _NEXT_PAGE_XPATH = "//div[@id='mainsrp-pager']/div/div/div/ul/li[8]/a/span"

    def __init__(self, host='x', user='xx', password='xx', port=3306,
                 database='xx'):
        """Open the MySQL connection and start the headless browser.

        Args:
            host: MySQL server host. The default is the placeholder from the
                original listing and must be replaced with a real value.
            user: MySQL user name (placeholder default).
            password: MySQL password (placeholder default).
            port: MySQL TCP port. The original listing passed an undefined
                name here; 3306 is used as the default instead.
            database: Name of the database to connect to (placeholder
                default).
        """
        self.url = 'http://www.taobao.com'
        # Back-quoted table name; set by mysql() and read by get_url().
        self.table = None
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    port=port, database=database,
                                    charset='utf8')
        try:
            self.browser = webdriver.PhantomJS()
        except Exception:
            # Do not leak the connection when the browser cannot start.
            self.conn.close()
            raise

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a back-quoted MySQL identifier.

        Embedded back-ticks are doubled so the value cannot terminate the
        identifier and inject SQL.

        Raises:
            ValueError: if *name* is empty or only whitespace.
        """
        cleaned = name.strip()
        if not cleaned:
            raise ValueError('table name must not be empty')
        return '`{}`'.format(cleaned.replace('`', '``'))

    def mysql(self):
        """Prompt for a table name and create that table if it is missing.

        The quoted name is remembered on the instance so that ``get_url``
        inserts into the same table.

        Raises:
            ValueError: if the entered name is empty.
        """
        # NOTE: the prompt says "database name" but the value is used as the
        # table name in CREATE TABLE.
        dataname = input('请输入数据库名字:')
        table = self._quote_identifier(dataname)
        sql = ('CREATE TABLE IF NOT EXISTS {}(ID INT(10) NOT NULL PRIMARY KEY AUTO_INCREMENT,'
               'A TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,'
               'B VARCHAR(255),'
               'C VARCHAR(255),'
               'D VARCHAR(255),'
               'E VARCHAR(255))ENGINE = INNODB DEFAULT CHARSET=utf8').format(table)
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
        self.table = table
        print('创建成功!!')

    def get_url(self, max_pages=3):
        """Search Taobao for a prompted keyword and store the listings.

        Args:
            max_pages: Number of result pages to scrape (default 3, the
                value that was hard-coded originally).

        Errors raised while scraping the result pages are logged and end the
        crawl; errors during setup (opening the site, locating the search
        box) propagate. In every case the database connection and the
        browser are closed before returning.
        """
        browser = self.browser
        try:
            if self.table is None:
                # mysql() was not called first; ask for the table now.
                self.mysql()
            browser.set_window_size(900, 900)
            browser.get(self.url)
            search_box = browser.find_element(By.ID, 'q')
            inputname = input('请输入你想要的商品:')
            search_box.send_keys(inputname)
            search_box.send_keys(Keys.ENTER)
            time.sleep(4)  # fixed wait for the result page to load
            try:
                with self.conn.cursor() as cursor:
                    for page_no in range(1, max_pages + 1):
                        self._scroll_page()
                        items = self._parse_items(browser.page_source)
                        self._save_items(cursor, items)
                        print('导入成功!!')
                        if page_no < max_pages:
                            # Only advance when another page will be read.
                            browser.find_element(
                                By.XPATH, self._NEXT_PAGE_XPATH).click()
                            print('下一页')
                            time.sleep(5)
            except Exception:
                # Best-effort crawl: keep what was committed, report the rest.
                logging.exception('Scraping aborted')
        finally:
            self._close()

    def _scroll_page(self):
        """Scroll the current page down in steps, pausing after each step."""
        # presumably this gives lazily loaded listings time to render
        for offset in range(0, 6000, 1000):
            self.browser.execute_script(
                'window.scrollBy(0,{})'.format(offset))
            time.sleep(3)

    @staticmethod
    def _parse_items(html):
        """Extract ``(store, price, pay)`` text tuples from a result page."""
        soup = BeautifulSoup(html, 'lxml')
        prices = [tag.get_text() for tag in
                  soup.find_all('div', class_='price g_price g_price-highlight')]
        pays = [tag.get_text() for tag in
                soup.find_all('div', class_='deal-cnt')]
        stores = [tag.get_text().strip().replace('\n\n', '') for tag in
                  soup.find_all('div', class_='row row-3 g-clearfix')]
        return list(zip(stores, prices, pays))

    def _save_items(self, cursor, items):
        """Print each item and insert all of them into ``self.table``."""
        if not items:
            return
        rows = []
        for store, price, pay in items:
            print('\n\n店面/城市:{}\n价钱:{}付款人数:{}'.format(store, price, pay))
            rows.append(('店面/城市:' + store, '价钱:' + price,
                         '付款人数:' + pay))
        # Values are passed as parameters so the driver escapes them. A
        # literal '%' in the table name must be doubled because the driver
        # %-formats the statement with the escaped values.
        sql = 'INSERT INTO {} (B, C, D) VALUES (%s, %s, %s)'.format(
            self.table.replace('%', '%%'))
        cursor.executemany(sql, rows)
        self.conn.commit()

    def _close(self):
        """Close the database connection and quit the browser."""
        try:
            self.conn.close()
            print('关闭数据库')
        finally:
            # Quit the browser even if closing the connection failed.
            self.browser.quit()
            print('关闭浏览器')
if __name__ == '__main__':
    # Script entry point: build the scraper, create the target table,
    # then run the crawl.
    scraper = Dandan()
    scraper.mysql()
    scraper.get_url()