// 我们学校的所有通知是在同一个网站上发布的,每天都会有很多条。我这个人比较懒,不喜欢经常翻看校内通知网,所以经常会错过某些重要的校内通知。所以最近突发奇想,我干脆做一个爬虫帮我每天看看都有哪些对我有用的通知吧。
const utils = require('../../myUtils')
const Crawler = require("crawler")
const oa = require('../../mongodb/jlu.edu.cn/oa')
// Base URL used to resolve the relative hrefs found in listing items.
const rootUrl = 'https://oa.jlu.edu.cn/defaultroot/'
// Paginated listing page of campus notifications (see getPageUrl).
const homepageUrl = 'https://oa.jlu.edu.cn/defaultroot/PortalInformation!jldxList.action'
// Running totals updated by handlePage and printed in its progress log.
let numOfNotifications = 0
let numOfPages = 0
/**
 * Build the URL of one listing page.
 * @param {number} num - 1-based page index.
 * @returns {string} Full listing URL carrying the startPage query parameter.
 */
function getPageUrl(num)
{
  const query = `1=1&startPage=${num}`;
  return `${homepageUrl}?${query}`;
}
/**
 * Crawler callback: parse one listing page and persist every notification.
 *
 * Fixes over the original version:
 *  - done() is now invoked on the error path as well; previously the early
 *    return skipped it, permanently leaking one of the crawler's
 *    maxConnections slots per failed request.
 *  - The per-item async work is collected and awaited with Promise.all;
 *    previously an async callback was passed to cheerio's .each(), whose
 *    returned promises were silently discarded, so done() and the progress
 *    log could fire before any notification had actually been handled.
 *
 * @param {Error|null} error - request error, if any.
 * @param {object} res - crawler response; res.$ is the cheerio handle.
 * @param {Function} done - must always be called to release the crawler slot.
 */
function handlePage(error, res, done)
{
  if (error)
  {
    console.log(error)
    done()
    return
  }
  const $ = res.$
  const tasks = []
  $("#itemContainer>div").each(function(index, element)
  {
    // Re-parse the item's inner HTML so selectors are scoped to this item only.
    const $item = $.load($(element).html())
    const notification = {
      href: rootUrl + $item(".font14").attr("href"),
      title: $item(".font14").text().replace('[置顶]', ''),
      releaser: $item(".column").text(),
      // NOTE(review): /今天*/ also matches a bare "今" — presumably the site
      // always renders "今天" ("today") for fresh items; confirm before tightening.
      date: $item(".time").text().replace(/今天*/, utils.date()),
    }
    tasks.push(
      handleNotification(notification).then(() => { numOfNotifications++ })
    )
  })
  Promise.all(tasks)
    .then(() =>
    {
      numOfPages++
      console.log(`crawled ${numOfPages} pages, ${numOfNotifications} notifications`)
    })
    .catch(console.log)
    .finally(done)
}
/**
 * Persist a notification only when it has not been stored before.
 * @param {{href: string, title: string, releaser: string, date: string}} notification
 */
async function handleNotification(notification)
{
  const existing = await oa.findOneNotification(notification)
  if (existing) return
  await handleNewNotification(notification)
}
/**
 * Store a newly discovered notification.
 * @param {object} notification - record to upsert into the database.
 * @returns {Promise<*>} result of the upsert operation.
 */
async function handleNewNotification(notification)
{
  // TODO: send some message to me
  const result = await oa.upsertNotification(notification)
  return result
}
// Shared crawler instance: up to 20 concurrent connections,
// every fetched page is handed to handlePage for parsing.
const crawler = new Crawler({
  maxConnections: 20,
  callback: handlePage,
})
/**
 * Queue the first listing page for crawling.
 */
function homepage()
{
  const firstPage = getPageUrl(1)
  console.log(`crawling page: ${firstPage}`)
  crawler.queue(firstPage);
}
// Crawl once at startup, then re-crawl every hour.
homepage()
setInterval(homepage, 60 * 60 * 1000)