这两天想买几本关于Python的书,自然是到各网上书店里找,挑最便宜的一家下手。可是发现很麻烦:需要在每个网站里把每本书都搜一遍,搜完还得算个总价,看看谁家便宜。所以想到了用Python的爬虫技术做一个工具,到各网上书店里找书并计算总价。
京东的搜索很烂,结果是一大堆无关的东西,未找到好方法解决,只好先放弃。目前实现了当当网和亚马逊搜书并找出最低价和各书的地址,将其保存在results.txt中,并显示最低总价。
最好是做成一个web页面,可以接受输入书名,并且在页面中直观地显示各网站书的总价,还要能一键放入购物车。
代码如下:
# -*- coding:utf-8 -*-
"""在当当和亚马逊中找书,输出最低价格"""
import requests, datetime, threading
from urllib.request import quote
from lxml import etree
# Titles to look up; each title is split on spaces into keywords that a
# listing's title is matched against.
books = (
    '流畅的python',
    'Python编程快速上手 让繁琐工作自动化',
    '编写高质量Python代码的59个有效方法',
)
def d(book, book_ifos):
    """Search Dangdang for *book* and record the first matching listing.

    The search URL requests ``sort_type=sort_xlowprice_asc`` (presumably
    ascending price -- confirm against the site), so the scan stops at the
    first listing that matches and has a price.

    A listing matches when its title contains EVERY space-separated keyword
    of *book* (case-insensitive) and does not contain '旧书' (second-hand).

    Args:
        book: Title to search for.
        book_ifos: Shared result mapping. Exactly one dict with the keys
            'title', 'price' and 'url' is appended to
            ``book_ifos['dangdang']`` per call. When no listing matches, a
            placeholder with the searched title, price 0 and an empty URL is
            appended, so every site ends up with one entry per book.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3'}
    d_url = 'http://search.dangdang.com/?key={}&act=input&sort_type=sort_xlowprice_asc#J_tab'
    # The keyword is percent-encoded as GBK for this endpoint.
    search_url = d_url.format(quote(book, encoding='gbk'))
    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(search_url, headers=headers, timeout=30)
    root = etree.HTML(r.text)
    results = root.xpath('//li[starts-with(@class,"line")]')

    keywords = book.lower().split(' ')
    found = None
    for result in results:
        links = result.xpath('a')
        if not links:
            continue
        link = links[0]
        title = link.get('title', '').strip()
        # Skip second-hand copies.
        if '旧书' in title:
            continue
        # All keywords must be present; the previous loop only remembered
        # whether the LAST keyword matched.
        lowered = title.lower()
        if not all(kw in lowered for kw in keywords):
            continue
        price_nodes = result.xpath('p/span[@class="search_now_price"]')
        if not price_nodes or not price_nodes[0].text:
            continue
        try:
            # Drop the leading currency sign before converting.
            price = float(price_nodes[0].text[1:])
        except ValueError:
            continue
        found = {'title': title, 'price': price, 'url': link.get('href', '')}
        break

    if found is None:
        found = {'title': book, 'price': 0, 'url': ''}
    book_ifos['dangdang'].append(found)
def z(book, book_ifos):
    """Search Amazon.cn for *book* and record the cheapest matching listing.

    A listing matches when its title contains EVERY space-separated keyword
    of *book* (case-insensitive). Listings without a readable price are
    skipped (the original note says e-books expose no price here).

    Args:
        book: Title to search for.
        book_ifos: Shared result mapping. Exactly one dict with the keys
            'title', 'price' and 'url' is appended to ``book_ifos['Amazon']``
            per call. When no priced listing matches, a placeholder with the
            searched title, price 0 and an empty URL is appended.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3'}
    z_url = 'https://www.amazon.cn/s/ref=nb_sb_ss_ime_c_1_5?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords={}'
    # Percent-encode the keyword so characters such as '&' or '#' in a title
    # cannot break the query string.
    search_url = z_url.format(quote(book))
    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(search_url, headers=headers, timeout=30)
    r.encoding = 'utf-8'
    root = etree.HTML(r.text)
    results = root.xpath('//li[starts-with(@id,"result_")]')

    keywords = book.lower().split(' ')
    cheapest = None
    for result in results:
        # The anchor wrapping the <h2> carries both the title and the link.
        anchors = result.xpath('div/div/div/a/h2/..')
        if not anchors:
            continue
        anchor = anchors[0]
        title = anchor.get('title', '').strip()
        # All keywords must be present; the previous loop only remembered
        # whether the LAST keyword matched.
        lowered = title.lower()
        if not all(kw in lowered for kw in keywords):
            continue
        spans = result.xpath('div/div/a/span')
        price_str = spans[1].text if len(spans) > 1 else None
        if not price_str:
            continue
        try:
            # The text looks like "¥222.22": drop the currency sign.
            price = float(price_str[1:])
        except ValueError:
            continue
        # Results are not assumed to be sorted, so track the minimum.
        if cheapest is None or price < cheapest['price']:
            cheapest = {'title': title, 'price': price, 'url': anchor.get('href', '')}

    # Previously an empty match list was indexed here and raised IndexError.
    if cheapest is None:
        cheapest = {'title': book, 'price': 0, 'url': ''}
    book_ifos['Amazon'].append(cheapest)
if __name__ == '__main__':
    # Script entry point: query every site for every book concurrently,
    # print each site's total price, and dump the raw results to results.txt.
    start_time = datetime.datetime.now()
    book_ifos = {'dangdang': [], 'Amazon': []}

    # One thread per (book, site) pair: d queries Dangdang, z queries Amazon.
    threads = []
    for book in books:
        for search in (d, z):
            t = threading.Thread(target=search, args=(book, book_ifos))
            t.start()
            threads.append(t)

    # Wait for every lookup to finish before totalling.
    for t in threads:
        t.join()

    # Total price per site.
    for site, entries in book_ifos.items():
        total_price = sum((entry['price'] for entry in entries), 0.0)
        print(site, '\t', round(total_price, 2))
    print('spend time:', str(datetime.datetime.now() - start_time)[:10])

    # Explicit UTF-8: the titles are Chinese, and the platform default
    # encoding may be unable to represent them.
    with open('results.txt', 'w', encoding='utf-8') as f:
        f.write(str(book_ifos))
如果本文对您有帮助,请给我留个言。谢谢!