SPA pages can't be crawled by search engine spiders, so how do you handle SEO?
One option is to build a separate set of server-side rendered pages just for spiders, and route spider traffic to them with an nginx reverse proxy. This leaves the existing code untouched; it only adds some duplicate work. The officially recommended SSR route works too, but in practice the code changes are substantial and the pitfalls are many. If time is tight, this approach is far simpler and clearer; otherwise you may grind yourself to exhaustion on an SSR migration and still not ship. A minimal nginx config that matches spider user agents:
upstream spider_server {
    server localhost:3000;  # the prerender service described below
}

server {
    listen 80;
    server_name example.com;

    location / {
        proxy_set_header Host $host:$proxy_port;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        # Route known spiders to the prerender service; everything else
        # falls through to however the SPA is normally served.
        if ($http_user_agent ~* "Baiduspider|bingbot|Googlebot|360Spider") {
            proxy_pass http://spider_server;
        }
    }
}
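nginx's own documentation discourages `if` inside `location` blocks ("if is evil"). If that bothers you, a `map` in the `http` context can pick the backend instead. A sketch, assuming the SPA itself is also reachable as an upstream (the `spa_server` name and its port are my assumption, matching the dev server used later in app.js):

upstream spa_server    { server localhost:4203; }  # hypothetical: where the SPA runs
upstream spider_server { server localhost:3000; }  # the prerender service

map $http_user_agent $render_backend {
    default                                      spa_server;     # normal visitors
    "~*Baiduspider|bingbot|Googlebot|360Spider"  spider_server;  # crawlers
}

server {
    listen 80;
    server_name example.com;
    location / {
        proxy_pass http://$render_backend;  # resolved against the upstream blocks above
    }
}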
There is also a lazier option that doesn't require building a second set of pages: use the puppeteer headless browser to load the page, wait until its HTML content has fully rendered, and return that HTML to the spider.
Scaffold an Express app locally; the generated routes and controllers can all be deleted.
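If you're starting from nothing, express-generator produces the skeleton, and the only extra dependencies this example needs are puppeteer and request (the latter is deprecated but still works; the project name here is arbitrary):

npx express-generator spider-render
cd spider-render && npm install
npm install puppeteer request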
app.js:
const createError = require('http-errors');
const express = require('express');
const request = require('request');
const ssr = require('./ssr.js');

const app = express();

// The real SPA, which puppeteer will render on the spider's behalf.
const host = 'http://localhost:4203';

// Pass static assets and the favicon straight through to the SPA server.
app.get('/assets/*', (req, res) => {
  request(`${host}${req.url}`).pipe(res);
});
app.get('/favicon.ico', (req, res) => {
  request(`${host}${req.url}`).pipe(res);
});

// Every other GET request gets prerendered by puppeteer.
app.get('*', async (req, res) => {
  console.log(req.originalUrl);
  const { html, ttRenderMs } = await ssr(`${host}${req.originalUrl}`);
  res.set('Server-Timing', `Prerender;dur=${ttRenderMs};desc="Headless render time (ms)"`);
  return res.status(200).send(html); // serve the prerendered page as the response
});

// Catch 404 and forward to the error handler.
app.use(function (req, res, next) {
  next(createError(404));
});

// Error handler (assumes the generator's view setup is still in place).
app.use(function (err, req, res, next) {
  // Set locals, only exposing the error in development.
  res.locals.message = err.message;
  res.locals.error = req.app.get('env') === 'development' ? err : {};
  // Render the error page.
  res.status(err.status || 500);
  res.render('error');
});

module.exports = app;
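Note that app.js only exports the app (the express-generator pattern, where bin/www binds the port). If you skipped the generator, a minimal launcher works too; port 3000 is assumed here to match the nginx upstream above:

// server.js: hypothetical entry point for the prerender service
const app = require('./app');
app.listen(3000, () => {
  console.log('prerender service listening on http://localhost:3000');
});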
One thing to watch out for: the spider only needs the HTML skeleton, so apart from the essential CSS and JS, don't let the renderer fetch the static resources under assets; otherwise you'll keep hitting timeout errors.
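Puppeteer's request interception API is one way to enforce that. A sketch of what could be dropped into ssr() below, just before page.goto (the set of blocked resource types is my choice, not from the original):

// Abort heavy asset requests so networkidle0 fires quickly.
await page.setRequestInterception(true);
page.on('request', (interceptedReq) => {
  const type = interceptedReq.resourceType();
  if (type === 'image' || type === 'font' || type === 'media') {
    interceptedReq.abort();    // the HTML skeleton doesn't need these
  } else {
    interceptedReq.continue(); // keep document/script/stylesheet/xhr
  }
});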
ssr.js:
const puppeteer = require('puppeteer');

// In-memory cache of rendered pages, keyed by URL.
const RENDER_CACHE = new Map();

async function ssr(url) {
  if (RENDER_CACHE.has(url)) {
    return { html: RENDER_CACHE.get(url), ttRenderMs: 0 };
  }

  const start = Date.now();
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  let html;
  try {
    // networkidle0 waits until the network is idle (no requests for 500ms).
    console.log('render url>>>', url);
    await page.goto(url, { waitUntil: 'networkidle0' });
    await page.waitForSelector('body'); // ensure the DOM has a body to serialize
    html = await page.content(); // serialized HTML of the page DOM
  } catch (err) {
    console.error(err);
    throw new Error('page.goto/waitForSelector timed out.');
  } finally {
    await browser.close(); // always release Chromium, even on failure
  }

  const ttRenderMs = Date.now() - start;
  console.info(`Puppeteer rendered page: ${url} in: ${ttRenderMs}ms`);

  RENDER_CACHE.set(url, html); // cache the rendered page
  return { html, ttRenderMs };
}

module.exports = ssr;
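Two caveats with ssr() as written: it launches a fresh Chromium for every uncached request, and RENDER_CACHE never expires, so updated pages are served stale forever. A sketch of a cheaper variant that shares one browser and expires cache entries; the TTL value is an arbitrary assumption:

const puppeteer = require('puppeteer');

let browserPromise = null; // one Chromium, launched lazily and shared
function getBrowser() {
  if (!browserPromise) browserPromise = puppeteer.launch();
  return browserPromise;
}

const CACHE = new Map(); // url -> { html, at }
const TTL_MS = 10 * 60 * 1000; // hypothetical: re-render after 10 minutes

async function ssrCached(url) {
  const hit = CACHE.get(url);
  if (hit && Date.now() - hit.at < TTL_MS) {
    return { html: hit.html, ttRenderMs: 0 };
  }
  const start = Date.now();
  const browser = await getBrowser();
  const page = await browser.newPage();
  try {
    await page.goto(url, { waitUntil: 'networkidle0' });
    const html = await page.content();
    CACHE.set(url, { html, at: Date.now() });
    return { html, ttRenderMs: Date.now() - start };
  } finally {
    await page.close(); // close the tab, keep the browser alive
  }
}

module.exports = ssrCached;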
Finally, verify it with curl by spoofing a spider's user agent:
curl -H 'User-Agent: Baiduspider' https://xxx.com
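For comparison, the same URL without the spoofed UA should return the bare SPA shell, confirming that only spider traffic hits the prerender service:
curl https://xxx.com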