前面第一节,我们把node的框架搭好了,这一节,我们去天猫上爬数据
首先,这里我们需要用到node的几个库
npm install request iconv-lite cheerio bluebird --save
我来介绍一下
request是网络请求库
iconv-lite是编码转换库
cheerio是html处理库,可以使用类似jQuery的语法来查找dom元素
bluebird是Promise库
安装完以后,创建一个http.js文件,这个文件是我们处理网络请求的库,我们对request进行封装一下
'use strict'
import request from 'request'
import Promise from 'bluebird'
import iconv from 'iconv-lite'
/**
 * Fetch a URL with a GET request and resolve with its body.
 *
 * The request is issued with `encoding: null`, which per the request library
 * delivers the body undecoded (as a Buffer) so that it can be converted from
 * a non-UTF-8 character set afterwards.
 *
 * @param {string} url - Address to request.
 * @param {string} [encode] - Character set handed to `iconv.decode`
 *   (for example 'gbk'). When omitted, the raw body is returned untouched.
 * @returns {Promise} Resolves with the decoded string, or with the raw body
 *   when no `encode` is given. Rejects with the request error, or with the
 *   error raised while decoding.
 */
export default function requestAsync(url, encode){
  return new Promise((resolve, reject) => {
    request.get({url, encoding: null}, (error, response, body) => {
      // Stop here on failure: there is no usable body, and trying to decode it
      // would throw inside this callback where nothing can catch it
      if(error){
        reject(error)
        return
      }
      try{
        // Convert the body only when the caller asked for a specific charset
        resolve(encode ? iconv.decode(body, encode) : body)
      }catch(decodeError){
        // Report decoding problems through the promise instead of throwing
        reject(decodeError)
      }
    })
  })
}
接下来,我们在main.js中加入
'use strict'
import requestAsync from './http'
const url = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.Im2OsQ&s=60&q=19.9&sort=s&style=g&from=mallfp..pc_1_searchbutton&active=2&type=pc#J_Filter';

// Download the search page, decode it from GBK and print the HTML;
// any failure is printed instead.
const fetchAndPrint = async () => {
  try{
    console.log(await requestAsync(url, 'gbk'))
  }catch(error){
    console.log(error)
  }
}
fetchAndPrint();
运行一下
npm start
看,控制台应该就有天猫网站的html数据输出了
接下来就是处理html数据了
再创建一个parse.js文件,这里我们放入处理天猫的逻辑
'use strict'
import Promise from 'bluebird'
import cheerio from 'cheerio'
/**
 * Extract the product list from the HTML of a Tmall search-result page.
 *
 * Every `.product` element yields one record with its title, link, image
 * address and price.
 *
 * @param {string} html - Markup of the search-result page.
 * @returns {Promise} Resolves with an array of `{title, url, img, price}`
 *   objects; rejects with whatever error is thrown while parsing.
 */
export default function tmallParse(html){
  return new Promise((resolve, reject) => {
    try{
      const $ = cheerio.load(html, {decodeEntities: false})
      // Collected product records, in document order
      const items = []
      $('.product').each((index, element) => {
        const node = $(element)
        const image = node.find('.productImg-wrap img')
        items.push({
          title: node.find('.productTitle').text().trim(),
          url: node.find('.productTitle a').attr('href'),
          // Use the data-ks-lazyload address when present, otherwise fall back to src
          img: image.attr('data-ks-lazyload') || image.attr('src'),
          price: node.find('.productPrice em').attr('title')
        })
      })
      resolve(items)
    }catch(error){
      reject(error)
    }
  })
}
好了,处理逻辑写完,我们来试试对不对
在 main.js中加入
....
import tmallParse from './parse'
...
// Fetch the search page, turn it into product records and print them;
// any failure is printed instead.
const crawl = async () => {
  try{
    const html = await requestAsync(url, 'gbk')
    console.log(await tmallParse(html))
  }catch(error){
    console.log(error)
  }
}
crawl();
运行一下
npm start
如果没有报错的话,控制台应该会输出类似下面这样的数据
[ { title: '官网乐扣乐扣保鲜盒耐热玻璃便当盒水果饭盒LLG205 19.9低价换购',
url: '//detail.tmall.com/item.htm?id=19255882590&skuId=30422954199&user_id=1024055617&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: undefined,
price: '41.80' },
{ title: '小柄钻直柄钻头19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9',
url: '//detail.tmall.com/item.htm?id=45709408163&skuId=87371450407&user_id=919575892&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: undefined,
price: '53.80' },
{ title: '【第二件19.9】佳利麦墨西哥牛油果智利牛油果新鲜水果160g*3个装',
url: '//detail.tmall.com/item.htm?id=7366904296&skuId=4611686025794292200&user_id=496514980&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: undefined,
price: '19.90' },
{ title: 'BSO小魔怪2016新品夏季女装圆领短袖T恤 19.9包邮女装款',
url: '//detail.tmall.com/item.htm?id=533799375853&skuId=3184954928617&user_id=2284318191&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: undefined,
price: '19.90' },
{ title: '包邮 横开口 金属铁网状 资料收纳架 多层随意叠加文件盘19.9元层',
url: '//detail.tmall.com/item.htm?id=16074699708&skuId=4611686034502087612&user_id=857093887&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: undefined,
price: '19.90' },
{ title: '【包邮19.9】男士职业红色商务正装6CM结婚新郎领带蓝色条纹',
url: '//detail.tmall.com/item.htm?id=520167716998&skuId=3100802610661&user_id=1699787468&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: '//img.alicdn.com/bao/uploaded/i4/TB1VFpNIpXXXXXsXVXXXXXXXXXX_!!0-item_pic.jpg_b.jpg',
price: '25.00' },
{ title: '3包19.9网尚新奥尔良腌料140克微辣烧烤烤鸡翅油炸猪排烤肉烧烤料',
url: '//detail.tmall.com/item.htm?id=41131848051&skuId=4611686059559235955&user_id=1696169910&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: '//img.alicdn.com/bao/uploaded/i3/TB1YVtsLXXXXXbWXVXXXXXXXXXX_!!0-item_pic.jpg_b.jpg',
price: '8.80' },
{ title: '【19.9元换购】韩国HOLIKA 天然92%芦荟沐浴凝露55ml三只装',
url: '//detail.tmall.com/item.htm?id=539864655275&skuId=3199916890481&user_id=2820479942&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: '//img.alicdn.com/bao/uploaded/i2/TB1mvAkOXXXXXcYXXXXXXXXXXXX_!!0-item_pic.jpg_b.jpg',
price: '19.90' },
{ title: '逸佳户外 烧烤炉烧烤工具套装 19.9元基础套餐',
url: '//detail.tmall.com/item.htm?id=520184728205&skuId=4611686538612116109&user_id=1705256849&cat_id=2&is_b=1&rn=84b2fa84dbb5923100950f78d786559e',
img: '//img.alicdn.com/bao/uploaded/i2/TB18dGcIpXXXXaTXpXXXXXXXXXX_!!0-item_pic.jpg_b.jpg',
price: '19.90' },
...
好了,这一章的处理写完了,下一章我们把这些数据存到mongodb数据库里