本次目标是爬取商品名称、售价、促销价以及简介,并导出至 CSV 文件,同时将商品封面图片保存到本地。
源代码
import requests
from lxml import etree
import csv
# Scrape the Klook Seoul activity listing (pages 1-20, up to 15 cards each).
# For every card: collect name, prices, intro text and cover-image URL into
# thing_list, save the cover image as "<id>.jpg", and finally dump all rows
# to thing.csv.
thing_list = []
thing_id = 0
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# Seconds to wait on any single HTTP request, so one stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 15


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


for p in range(1, 21):
    url = 'https://www.klook.com/zh-CN/city/13-seoul/?city_id=13&limit=15&template_ids=&tag_ids=&instant=0&sort=&page='+str(p)
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as err:
        # Best effort: one unreachable listing page must not discard the
        # rows already collected from the other pages.
        print('Skipping listing page ' + str(p) + ': ' + str(err))
        continue
    selector = etree.HTML(res)
    for n in range(1, 16):
        # Every field of the n-th card hangs off this anchor element.
        card = '//*[@id="filter-card-content"]/div[' + str(n) + ']/a'
        thing_name = selector.xpath(card + '/ul/li[1]/h3/text()')
        thing_site = selector.xpath(card + '/@href')
        if not thing_name or not thing_site:
            # The page holds fewer than 15 cards (or this slot is not a
            # product card); indexing [0] here used to crash the script.
            continue
        # Only real products consume an id, so ids and image file names
        # stay contiguous.
        thing_id = thing_id + 1

        price_now_text = _first(selector.xpath(card + '/ul/li[3]/p[2]/span[1]/b/text()')).strip()
        thing_price_pre = selector.xpath(card + '/ul/li[3]/p[1]/del/text()')
        if not thing_price_pre:
            # No strikethrough price: the single displayed price is recorded
            # in the price_pre column and price_now is left blank.
            th_price_pre = price_now_text
            th_price_now = ''
        else:
            # NOTE(review): only the strikethrough price gets a "¥ " prefix;
            # presumably the current-price text already carries its own
            # currency symbol - TODO confirm against the live markup.
            th_price_pre = "¥ " + thing_price_pre[0].strip()
            th_price_now = price_now_text

        thing_img = _first(selector.xpath(card + '/div/@data-original'))

        # The intro text lives on the product's own detail page.
        thing_url = "https://www.klook.com" + thing_site[0]
        try:
            thing_res = requests.get(thing_url, headers=headers, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as err:
            print('No intro for item ' + str(thing_id) + ': ' + str(err))
            thing_intro = []
        else:
            selector_th = etree.HTML(thing_res)
            thing_intro = selector_th.xpath('//*[@id="description"]/div[1]/p/text()')
        th_intro = _first(thing_intro)

        # Thousands separators are stripped from both price strings.
        thing_list.append([thing_name[0], thing_id, str(th_price_now).replace(',', ''), str(th_price_pre).replace(',', ''), th_intro, thing_img])

        if thing_img:
            try:
                pic = requests.get(thing_img, timeout=REQUEST_TIMEOUT)
                # Without this an HTTP error page would be saved as a ".jpg".
                pic.raise_for_status()
            except requests.RequestException as err:
                print('No image for item ' + str(thing_id) + ': ' + str(err))
            else:
                with open(str(thing_id) + '.jpg', 'wb') as file:
                    file.write(pic.content)

# Six titles for six row fields: the image-URL column previously had no header.
title_list = ["name", "id", "price_now", "price_pre", "intro", "img"]
# Explicit UTF-8 so Chinese text and the "¥" sign are written the same way on
# every platform instead of depending on the locale's default encoding.
with open("thing.csv", "w", newline='', encoding='utf-8') as t:
    writer = csv.writer(t)
    writer.writerow(title_list)
    writer.writerows(thing_list)