本文爬取的是豆瓣网(豆瓣电影排行榜页面 https://movie.douban.com/chart)。
本次将会用到两个库:superagent 和 cheerio。
其中 superagent 是用来请求目标页面的,用法也很简单,除了 get 还有 post 等方法:
// Request the chart page; the `end` callback receives either an error or the response.
superagent.get('https://movie.douban.com/chart').end(function (err, res) {
  if (err) {
    // Log the error object too, so the cause of the failure is visible.
    console.error('请求失败', err);
    return;
  }
  // Hand the response to the parsing helper and keep the extracted list.
  movies = getMovies(res);
});
注:这里需要封装一个函数(如 getMovies),用来从响应中提取数据。
cheerio 是用来解析页面并提取数据的,这个库很好用,可以说是 Node 版的 jQuery。
爬取的数据将会放在data.json里面
代码
const express = require('express');
const superagent = require('superagent');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const app = express();
// Most recent scrape result; stays empty until the request below succeeds.
let movies = [];
// Fetch the chart page when the script starts and cache the parsed list in `movies`.
superagent.get('https://movie.douban.com/chart').end((err, res) => {
  if (err) {
    // Report the underlying error instead of only a fixed string.
    console.error('fail', err);
    return;
  }
  movies = getMovies(res);
});
/**
 * Build the movie list from a fetched chart page.
 *
 * Every `td` directly under an `.item` element is visited. The picture URL is
 * read from that cell's grandchildren; the remaining fields are read from the
 * cell that follows it. Cells that yield an empty title are dropped.
 *
 * @param {{text: string}} res - Response whose `text` holds the page HTML.
 * @returns {Array<{picture: (string|undefined), title: string, content: string, score: string, count: string}>}
 *   One entry per cell that produced a non-empty title.
 */
let getMovies = (res) => {
  // Strip line breaks and every space character from a text fragment.
  const squash = (text) => text.replace(/[\r\n]/g, "").replace(/ +/g, "");

  const $ = cheerio.load(res.text);
  const movies = [];

  $('.item>td').each((index, ele) => {
    // Resolve the shared traversals once instead of repeating them per field.
    const details = $(ele).next().children();
    const star = details.children('.star');

    const movie = {
      picture: $(ele).children().children().attr('src'),
      title: squash(details.children("a").text()),
      content: squash(details.children('.pl').text()),
      score: star.children('.rating_nums').text(),
      count: star.children('.pl').text(),
    };

    // A cell with no following cell (or no link text there) gives an empty title.
    if (movie.title !== '') {
      movies.push(movie);
    }
  });

  return movies;
};
// GET / — write the cached movie list to data.json and return it to the client.
app.get('/', (req, res) => {
  const target = path.resolve(__dirname, 'data.json');
  fs.writeFile(target, JSON.stringify(movies), (err) => {
    if (err) {
      // Only claim success when the write actually succeeded.
      console.error('保存失败', err);
      return;
    }
    console.log("保存成功");
  });
  // The response is sent right away; it does not wait for the file write.
  res.send(movies);
});
// Listen on port 4000 and print the local address once the server is up.
app.listen(4000, () => {
  console.log('http://localhost:4000');
});
运行 node movies.js,然后在浏览器中访问 http://localhost:4000(访问该地址时才会把数据写入 data.json)
爬取的数据data.json
注:因为我安装了 JSONView 插件,所以数据在浏览器里看起来很规整;也可以在 VS Code 中打开 data.json(数据写入的文件),依次按 Ctrl+A、Ctrl+K、Ctrl+F 来格式化。