今天接了个爬虫任务,主要是从网页上将数据爬下来,规整后导出到Excel。以前工作中的爬虫都是基于HttpClient+jsoup,很早就知道Nodejs有cheerio,HTML和JavaScript天生的一对,拿Nodejs去做网页爬虫很简单,有多简单呢?就这么说吧,和你用jQuery没什么两样。所以选择了Nodejs。
不涉及保密内容,故源码已托管至GitHub
开工
mkdir crawler && cd crawler
npm init
MacBook-Pro:crawler$ npm init
This utility will walk you through creating a package.json file.
It only covers the most common items, and tries to guess sensible defaults.
See `npm help json` for definitive documentation on these fields
and exactly what they do.
Use `npm install <pkg> --save` afterwards to install a package and
save it as a dependency in the package.json file.
Press ^C at any time to quit.
name: (crawler) crawler
version: (1.0.0)
description: Nodejs crawler
entry point: (index.js)
test command:
git repository:
keywords:
author: jarvan4dev@163.com
license: (ISC)
About to write to /Users/jarvan4dev/Documents/test/crawler/package.json:
{
"name": "crawler",
"version": "1.0.0",
"description": "Nodejs crawler",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "jarvan4dev@163.com",
"license": "ISC"
}
Is this ok? (yes) yes
npm install - 安装依赖包
npm i cheerio --save
npm i excel-export --save
npm i request --save
简要介绍下这几个依赖包:
- cheerio,类似于Java中的jsoup,API与jQuery基本一致,具体用法可参考其官方文档。
- excel-export,Excel操作工具
- request,网络请求工具
小试牛刀
拿富德生命开刀
/**
 * Created by jarvan4dev on 2017/2/28.
 * Fude Life crawler: scrapes branch contact info from the public listing
 * page and exports it to an Excel file.
 */
const http = require('http');
const cheerio = require('cheerio');
const exportUtils = require('../utils/exportUtils'); // data-export helper
const fileName = 'fude.xlsx';
const companyName = '富德生命人寿保险股份有限公司';
const subName = '分公司';
// Excel column definitions: province, branch name, address, phone.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '分公司名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];
let rows = [];
http.get('http://www.sino-life.com/publicinfo/fzjgyycsjlxdh/', res => {
    let html = '';
    res.setEncoding('utf-8'); // avoid garbled Chinese text
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        let $ = cheerio.load(html); // parse the page with cheerio
        $('.cantactTel').each((index, element) => {
            // Section heading reads "<company><province>分公司"; strip the fixed parts.
            let province = $(element).find('h3').eq(0).text().replace(companyName, '').replace(subName, '');
            let subComName, location, tel;
            $(element).find('td').each((i, tdEle) => {
                // Table cells repeat in groups of three: name, address, phone.
                switch (i % 3) {
                    case 0:
                        subComName = $(tdEle).text();
                        break;
                    case 1:
                        location = $(tdEle).text();
                        break;
                    case 2:
                        tel = $(tdEle).text();
                        // FIX: push exactly once per completed triple. The old
                        // `if (subComName && location && tel)` check after the
                        // switch fired on *every* td once the first triple was
                        // complete (the variables were never reset), emitting
                        // rows that mixed a new name with stale address/phone.
                        rows.push([province, subComName, location, tel]);
                        break;
                }
            });
        });
        // Safe to export here: this single request has no nested async work.
        exportUtils.excelWrite(headers, rows, fileName);
    })
});
可以看出Node原生的网络请求是异步的,这为后面的任务挖了个坑。
拿人保寿险深入
直接上代码
/**
 * Created by jarvan4dev on 2017/2/28.
 * PICC Life crawler — first (flawed) attempt.
 *
 * NOTE(review): this version is intentionally buggy and kept to illustrate
 * the problem discussed below: both the outer and the nested http.get are
 * asynchronous, so excelWrite at the bottom runs before the nested
 * per-branch requests finish, and the exported file misses their rows.
 */
const http = require('http');
const cheerio = require('cheerio');
const request = require('request');
const companyName = '人保寿险';
const subName = '分公司';
// Excel column definitions: province, org name, address, zip code, phone.
const headers = [{
caption: '省',
type: 'string'
}, {
caption: '机构名称',
type: 'string'
}, {
caption: '营业场所',
type: 'string'
},{
caption: '邮编',
type: 'string'
}, {
caption: '电话',
type: 'string'
}];
const fileName = 'renbao.xlsx';
let rows = [];
const exportUtils = require('../utils/exportUtils'); // data-export helper
http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
let html = '';
res.setEncoding('utf-8'); // avoid garbled Chinese text
res.on('data', data => {
html += data;
});
res.on('end', () => {
let $ = cheerio.load(html); // parse the page with cheerio
// Each .fgs_nr entry is a provincial branch whose link leads to a page
// listing that branch's sub-organizations.
$('.fgs_nr').each((index, element) => {
let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // company name
let province = subComName.replace(companyName, '').replace(subName, '').trim();
let location = $(element).find('.fgs_add_qg').attr('title').trim();
let zipCode = $(element).find('.fgs_zc_qg').text().trim();
let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
rows.push([province, subComName, location, zipCode, tel]);
// Nested async request for the branch's detail page.
http.get(nextLink, res => {
let nextHtml = '';
res.setEncoding('utf-8'); // avoid garbled Chinese text
res.on('data', data => {
nextHtml += data;
});
res.on('end', () => {
let $$ = cheerio.load(nextHtml);
$$('.fgs_nr').each((i, ele) => {
subComName = $$(ele).find('.fgs_mc_qg2').attr('title').trim(); // company name
location = $$(ele).find('.fgs_add_qg2').attr('title').trim();
zipCode = $$(ele).find('.fgs_zc_qg').text().trim();
tel = $$(ele).find('.fgs_tel_qg').attr('title').trim();
rows.push([province, subComName, location, zipCode, tel]);
});
});
});
});
// BUG: runs before the nested requests above complete — rows is incomplete.
exportUtils.excelWrite(headers, rows, fileName);
});
});
是不是觉得大功告成?Too young too simple!别忘了http请求是异步的(request也是),这就明显有问题,在外层for循环中第一次执行rows.push([province, subComName, location, zipCode, tel])
时,由于http请求是异步的,所以完全可能第二次请求还没完成,就执行了exportUtils.excelWrite(headers, rows, fileName)
(ps: 在这里吐槽下简书代码不带行号的问题)。
下面看下我的改造。我的想法很简(chun)单(ben):既然第一个for循环内部可能存在异步的http请求,那就把这些请求从循环中脱离出来,先让第一个for循环完全执行结束,再统一发起请求,注意看links变量。
/**
 * Created by jarvan4dev on 2017/2/28.
 * PICC Life crawler — second attempt: collect the detail-page links first,
 * then request them in a separate loop.
 *
 * Still flawed on purpose (discussed in the text below): the per-link
 * requests are asynchronous, so excelWrite fires before they complete.
 */
const http = require('http');
const cheerio = require('cheerio');
const request = require('request');
const companyName = '人保寿险';
const subName = '分公司';
// Excel column definitions: province, org name, address, zip code, phone.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];
const fileName = 'renbao.xlsx';
let rows = [];
let links = [];
const exportUtils = require('../utils/exportUtils');
http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
    let html = '';
    res.setEncoding('utf-8'); // avoid garbled Chinese text
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        let $ = cheerio.load(html); // parse the page with cheerio
        $('.fgs_nr').each((index, element) => {
            let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
            let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // company name
            let province = subComName.replace(companyName, '').replace(subName, '').trim();
            let location = $(element).find('.fgs_add_qg').attr('title').trim();
            let zipCode = $(element).find('.fgs_zc_qg').text().trim();
            let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
            links.push({'province': province, 'nextLink': nextLink});
            rows.push([province, subComName, location, zipCode, tel]);
        });
        // FIX: Array#forEach passes (element, index) — the original
        // `(index, link)` parameter order made `link` the numeric index and
        // handed the link *object* to http.get as the URL. Also the URL is
        // link.nextLink, not the object itself.
        links.forEach(link => {
            http.get(link.nextLink, res => {
                let html = '';
                res.setEncoding('utf-8'); // avoid garbled Chinese text
                res.on('data', data => {
                    html += data;
                });
                res.on('end', () => {
                    let $ = cheerio.load(html);
                    $('.fgs_nr').each((i, element) => {
                        let subComName = $(element).find('.fgs_mc_qg2').attr('title').trim(); // company name
                        let location = $(element).find('.fgs_add_qg2').attr('title').trim();
                        let zipCode = $(element).find('.fgs_zc_qg').text().trim();
                        let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
                        rows.push([link.province, subComName, location, zipCode, tel]);
                    });
                });
            });
        });
        // BUG (kept deliberately — discussed below): the requests above are
        // asynchronous, so this still runs before they finish.
        exportUtils.excelWrite(headers, rows, fileName);
    });
});
其实然并卵... 关于第二种写法,我想了一个解决办法:
借助node的EventEmitter,在第一个forEach结束的时候记录下第二个forEach应该执行的次数,即 变量
links
的长度,记做全局变量count
。在第二层for循环的res.on('end')
中每完整执行一次网络请求并正确解析数据后,count--
,当count减为0时触发一个事件,在外部监听这个事件,然后对数据进行处理即可。参考网上解决方案:
function walk (path, handleFile, callback) {
    // Counter-based async flow control: `pending` tracks outstanding
    // files/directories (starting at 1 for the root entry) and `depth`
    // records which directory level is currently being expanded.
    let pending = 1;
    let depth = 0;
    const done = () => {
        // One task finished; fire the final callback once nothing is left.
        if (--pending === 0) {
            callback();
        }
    };
    const onStatError = (err) => {
        // stat failed: report it and count the entry as finished.
        console.log('stat error');
        done();
    };
    const enterDirectory = (dirPath) => {
        depth++;
        fs.readdir(dirPath, (err, files) => {
            if (err) {
                console.log('read dir error');
                done(); // still counts as finished
                return;
            }
            pending += files.length; // each child becomes an outstanding task
            for (const filename of files) {
                visit(dirPath + '/' + filename); // recurse into children
            }
            done(); // the directory entry itself is finished
        });
    };
    const handleOne = (filePath) => {
        // Plain file: hand it to the caller along with the current depth.
        handleFile(filePath, depth);
        done();
    };
    const visit = (entryPath) => {
        fs.stat(entryPath, (err, stats) => {
            if (err) {
                onStatError(err);
                return;
            }
            if (stats.isDirectory()) {
                enterDirectory(entryPath);
                return;
            }
            handleOne(entryPath);
        });
    };
    visit(path);
}
其实方式二就是自己实现异步流程控制,其实有更好的方法 --- async。
祭出杀器 --- async
使用async做异步流程控制,代码会优雅很多。
/**
 * Created by jarvan4dev on 2017/2/28.
 * PICC Life crawler — working version using async.each for flow control.
 *
 * async.each starts one task per link and invokes the final callback only
 * after every task has called its own callback, so excelWrite is guaranteed
 * to see the complete rows array.
 */
const http = require('http');
const cheerio = require('cheerio');
const async = require('async');
const companyName = '人保寿险';
const subName = '分公司';
// Excel column definitions: province, org name, address, zip code, phone.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];
const fileName = 'renbao.xlsx';
let rows = [];
let links = [];
const exportUtils = require('../utils/exportUtils');
http.get('http://www.picclife.com/aboutUsBranch.jhtml', res => {
    let html = '';
    res.setEncoding('utf-8'); // avoid garbled Chinese text
    res.on('data', data => {
        html += data;
    });
    res.on('end', () => {
        let $ = cheerio.load(html); // parse the page with cheerio
        $('.fgs_nr').each((index, element) => {
            let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
            let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // company name
            let province = subComName.replace(companyName, '').replace(subName, '').trim();
            let location = $(element).find('.fgs_add_qg').attr('title').trim();
            let zipCode = $(element).find('.fgs_zc_qg').text().trim();
            let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
            links.push({'province': province, 'nextLink': nextLink});
            rows.push([province, subComName, location, zipCode, tel]);
        });
        async.each(links, (link, callback) => {
            http.get(link.nextLink, res => {
                let html = '';
                res.setEncoding('utf-8'); // avoid garbled Chinese text
                res.on('data', data => {
                    html += data;
                });
                res.on('end', () => {
                    let $ = cheerio.load(html);
                    $('.fgs_nr').each((i, element) => {
                        let subComName = $(element).find('.fgs_mc_qg2').attr('title').trim(); // company name
                        let location = $(element).find('.fgs_add_qg2').attr('title').trim();
                        let zipCode = $(element).find('.fgs_zc_qg').text().trim();
                        let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
                        rows.push([link.province, subComName, location, zipCode, tel]);
                    });
                    callback(); // this link is done
                });
            }).on('error', callback); // FIX: without this, a failed request
                                      // never calls back and excelWrite
                                      // would never run
        }, err => {
            // All tasks finished (on error, rows may be partial — export
            // best-effort and log the failure).
            if (err) console.error(err);
            exportUtils.excelWrite(headers, rows, fileName);
        });
    });
}).on('error', err => console.error(err)); // FIX: surface outer request errors
或者
/**
 * Created by jarvan4dev on 2017/2/28.
 * PICC Life crawler — request + async.eachSeries variant.
 *
 * eachSeries processes the branch links one at a time (serially), which
 * preserves row order; the final callback fires only after the last link.
 */
const cheerio = require('cheerio');
const request = require('request');
const async = require('async');
const companyName = '人保寿险';
const subName = '分公司';
// Excel column definitions: province, org name, address, zip code, phone.
const headers = [{
    caption: '省',
    type: 'string'
}, {
    caption: '机构名称',
    type: 'string'
}, {
    caption: '营业场所',
    type: 'string'
}, {
    caption: '邮编',
    type: 'string'
}, {
    caption: '电话',
    type: 'string'
}];
const fileName = 'renbao.xlsx';
let rows = [];
const exportUtils = require('../utils/exportUtils');
request('http://www.picclife.com/aboutUsBranch.jhtml', (error, response, body) => {
    // FIX: bail out on a failed request instead of feeding undefined to cheerio.
    if (error) {
        console.error(error);
        return;
    }
    let $ = cheerio.load(body); // parse the page with cheerio
    async.eachSeries($('.fgs_nr'), (element, callback) => {
        let nextLink = $(element).find('.fgs_mc_qg a').attr('href').trim();
        let subComName = $(element).find('.fgs_mc_qg').attr('title').trim(); // company name
        let province = subComName.replace(companyName, '').replace(subName, '').trim();
        let location = $(element).find('.fgs_add_qg').attr('title').trim();
        let zipCode = $(element).find('.fgs_zc_qg').text().trim();
        let tel = $(element).find('.fgs_tel_qg').attr('title').trim();
        rows.push([province, subComName, location, zipCode, tel]);
        request(nextLink, (err, res, subBody) => {
            // FIX: propagate the error — the original would crash on
            // cheerio.load(undefined) and never reach the final callback.
            if (err) {
                return callback(err);
            }
            let $$ = cheerio.load(subBody);
            $$('.fgs_nr').each((i, ele) => {
                subComName = $$(ele).find('.fgs_mc_qg2').attr('title').trim(); // company name
                location = $$(ele).find('.fgs_add_qg2').attr('title').trim();
                zipCode = $$(ele).find('.fgs_zc_qg').text().trim();
                tel = $$(ele).find('.fgs_tel_qg').attr('title').trim();
                rows.push([province, subComName, location, zipCode, tel]);
            });
            callback();
        });
    }, err => {
        // Reached after every link is processed (or on the first error,
        // in which case rows is partial — export best-effort).
        if (err) console.error(err);
        console.log(rows.length);
        exportUtils.excelWrite(headers, rows, fileName);
    });
});
这两种写法的区别只是:方式一用的是原生http模块,方式二用的是request包。另外请注意each和eachSeries的区别:后者是串行执行的,能够保证顺序。更多关于async的内容请参看官方文档
参考文档:
Nodejs异步流程控制Async
关于excel-export
贴出我的导出文件的工具类吧!
/**
* Created by jarvan4dev on 2017/2/28.
*/
const excelExport = require('excel-export');
const fs = require('fs');
const path = require('path');
// 导出Excel
exports.excelWrite = (headers, rows, fileName) => {
let conf ={};
conf.name = fileName;
conf.cols = [];
for(let i = 0; i < headers.length; i++){
let col = {};
col.caption = headers[i].caption;
col.type = headers[i].type;
conf.cols.push(col);
}
conf.rows = rows;
let result = excelExport.execute(conf);
let filePath = path.join('/Users/jarvan4dev/Documents', fileName);
// appendFile 可以当文件不存在的时候自动创建
fs.appendFile(filePath, result, 'binary',function(err){
if(err){
console.log(err);
}
console.log('saved')
});
};
源码放在GitHub上,nodejs-crawler,动动手指,star一下!