网络爬虫主要功能就是对网页内容进行爬取,然后根据特定需求对内容进行过滤分析。
针对网页内容爬取,假设需求如下:对一个网站进行全站爬取,将爬取到的文件按类型保存到本地磁盘,并支持配置网站爬取的最大层次、最大链接数、爬取类型范围等参数。
这里使用Kafka主题作为爬虫队列,并基于Spring Boot做了一个简单的实现。
任务创建接口
这里提供了两个接口:一个根据输入的网站爬取配置创建爬取任务,另一个根据任务id查询任务状态。没有提供详细结果的查询接口,结果可直接在数据库中查看。
/**
 * REST endpoints for creating a website crawl task and for querying a task by id.
 */
@RestController
public class CrawlerTaskController {

    @Autowired
    private WebsiteTaskService websiteTaskService;

    @Autowired
    private WebsiteTaskDao websiteTaskDao;

    @Autowired
    private TaskProducer taskProducer;

    /**
     * Registers a website crawl task and queues its root URL as the first URL task.
     *
     * @param item crawl configuration bound from the request parameters
     * @return a map holding the task {@code id} and a confirmation {@code message}
     */
    @PostMapping("task/add")
    @ResponseBody
    public Map<String, Object> addWebsiteTask(WebsiteTask item) {
        // The root URL is the only known task at creation time.
        item.setTaskCount(1);
        websiteTaskService.put(item);

        UrlTask task = new UrlTask();
        task.setUrl(item.getUrl());
        task.setParentId(-1); // the root page has no parent task
        // Link the URL task to its website task. The previous code copied the URL task's own
        // (still unset) rootId onto itself, so the root task never referenced the website task.
        task.setRootId(item.getId());
        task.setLevel(0);
        taskProducer.sendUrlTask(task);

        Map<String, Object> map = new HashMap<>();
        map.put("id", item.getId());
        map.put("message", "爬虫任务添加成功!");
        return map;
    }

    /**
     * Looks up a website crawl task by id.
     *
     * @param id website task id
     * @return the stored task
     * @throws java.util.NoSuchElementException if no task with the given id exists
     */
    @PostMapping("task/get")
    @ResponseBody
    public WebsiteTask getWebsiteTask(int id) {
        // Same exception type as an unchecked Optional.get(), but with a message naming the id.
        return websiteTaskDao.findById(id)
                .orElseThrow(() -> new java.util.NoSuchElementException("No website task with id " + id));
    }
}
网站任务实体
/**
 * JPA entity describing one website crawl task: its entry URL, crawl limits,
 * progress counters and state. Part of the class body is elided in this listing.
 */
@Entity
@EntityListeners(AuditingEntityListener.class)
public class WebsiteTask {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private int id;
@Column(length = 1024)
private String url;// website URL, usually the home page link
private int maxLevel;// maximum crawl depth
private int maxCount;// maximum number of links to crawl
private int outerLevel;// maximum depth at which external links are still crawled
private String range;// range of content types to crawl
private int taskCount;// number of URL tasks
private int finishCount;// number of finished URL tasks
private int state = 1;// state: 1 = running; 2 = finished
@CreatedDate
private Date createTime;// creation time
private Date finishTime;// finish time
// Not persisted (@Transient). NOTE(review): presumably the parsed form of `range` - confirm.
@Transient
private List<String> ranges;
......
}
url任务实体
/**
 * JPA entity describing the crawl of a single URL: where it sits in the crawl tree
 * (parent/root/level) and the outcome of the download. Part of the class body is elided
 * in this listing.
 */
@Entity
@EntityListeners(AuditingEntityListener.class)
public class UrlTask {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private int id;
private int parentId;// id of the parent page's task
private int rootId;// id of the website task
@Column(length = 1024)
private String url;
private String contentType;// content type of the page
private long contentLength;// content length
private int level;// current depth
private long useMillis;// time spent crawling
private int respCode;// response status code
private String remark;// remark
private String filePath;// path of the file saved on disk
@CreatedDate
private Date createTime;// creation time
......
}
爬取的网页模型定义
/**
 * Result holder for one downloaded page: response status, content metadata,
 * the parsed document (HTML only) and the location of the saved file.
 */
public class WebPageModel {
    public int respCode = 200; // response status code
    public String message; // error description
    public Document document; // Document object of the HTML page
    public String encoding; // page encoding
    public String contentType; // content type of the page
    public long contentLength; // content length
    public String filePath; // file path
    public String fileExt; // file extension
    public PageFormat format = PageFormat.OTHER;

    public enum PageFormat {
        HTML, IMAGE, AUDIO, VIDEO, TXT, WORD, EXCEL, PPT, PDF, COMPRESS, APK, IPA, OTHER
    }

    /**
     * Derives {@link #format} from the content type. For the generic octet-stream type the
     * content type is looked up from the file extension instead. When no category matches,
     * {@code format} keeps its current value.
     */
    public void updateFormat() {
        String effectiveType = ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(contentType)
                ? ContentTypeUtil.getContentType(fileExt)
                : contentType;
        PageFormat detected = detectFormat(effectiveType);
        if (detected != null) {
            format = detected;
        }
    }

    /** Returns the first category the content type belongs to, or {@code null} if none matches. */
    private static PageFormat detectFormat(String type) {
        if (ContentTypeUtil.isHtml(type)) {
            return PageFormat.HTML;
        }
        if (ContentTypeUtil.isImage(type)) {
            return PageFormat.IMAGE;
        }
        if (ContentTypeUtil.isAudio(type)) {
            return PageFormat.AUDIO;
        }
        if (ContentTypeUtil.isVideo(type)) {
            return PageFormat.VIDEO;
        }
        if (ContentTypeUtil.isTxt(type)) {
            return PageFormat.TXT;
        }
        if (ContentTypeUtil.isWord(type)) {
            return PageFormat.WORD;
        }
        if (ContentTypeUtil.isExcel(type)) {
            return PageFormat.EXCEL;
        }
        if (ContentTypeUtil.isPpt(type)) {
            return PageFormat.PPT;
        }
        if (ContentTypeUtil.isPdf(type)) {
            return PageFormat.PDF;
        }
        if (ContentTypeUtil.isCompress(type)) {
            return PageFormat.COMPRESS;
        }
        if (ContentTypeUtil.isApk(type)) {
            return PageFormat.APK;
        }
        if (ContentTypeUtil.isIpa(type)) {
            return PageFormat.IPA;
        }
        return null;
    }
}
url去重
/**
 * Remembers the URLs already accepted and rejects repeats, blank URLs, and any new URL once
 * the configured number of distinct URLs has been reached.
 */
public class UrlDuplicateFilter {
    private final Object mutex = new Object();
    private final Set<String> seenUrls = new HashSet<>();
    private final int maxCount; // maximum number of distinct URLs

    public UrlDuplicateFilter(int maxCount) {
        this.maxCount = maxCount;
    }

    /**
     * Filters out duplicate URLs.
     *
     * @param url candidate URL
     * @return {@code true} if the URL was not seen before and has now been recorded;
     *         {@code false} if it is blank, already recorded, or the limit is reached
     */
    public boolean filter(String url) {
        if (StringUtils.isBlank(url)) {
            return false;
        }
        synchronized (mutex) {
            // Set.add reports whether the element was newly inserted, which covers the
            // "already seen" case; the size check comes first so nothing is added at the limit.
            return seenUrls.size() < maxCount && seenUrls.add(url);
        }
    }
}
一条url的爬取执行过程
/**
 * Crawls a single URL: downloads it, stores the outcome on the {@link UrlTask}, queues
 * newly discovered child links and updates the counters of the owning website task.
 */
public class CrawlerTask implements Runnable {
    private static final Logger LOG = LoggerFactory.getLogger(CrawlerTask.class);

    private final UrlTask task;
    private final WebsiteTaskService websiteTaskService;
    private final UrlTaskDao urlTaskDao;
    private final TaskProducer taskProducer;

    public CrawlerTask(UrlTask task, WebsiteTaskService websiteTaskService, UrlTaskDao urlTaskDao,
            TaskProducer taskProducer) {
        this.task = task;
        this.websiteTaskService = websiteTaskService;
        this.urlTaskDao = urlTaskDao;
        this.taskProducer = taskProducer;
    }

    @Override
    public void run() {
        long millis = System.currentTimeMillis();
        try {
            WebsiteTask website = websiteTaskService.getWebsiteTask(task.getRootId());
            // Download the content behind the link.
            WebPageModel page = PageDownloadUtil.executeGet(task.getUrl(), website.getRanges());
            task.setContentLength(page.contentLength);
            task.setContentType(page.contentType);
            task.setRespCode(page.respCode);
            task.setRemark(page.message);
            task.setFilePath(page.filePath);
            task.setUseMillis(System.currentTimeMillis() - millis);
            urlTaskDao.saveAndFlush(task);
            // PageDownloadUtil only sets page.document for an HTML page that was downloaded and
            // parsed; failed or non-HTML downloads have no document to extract links from.
            if (page.document != null && task.getLevel() < website.getMaxLevel()) {
                submitChildTasks(website, page);
            }
        } finally {
            // Count this task as finished even if it failed, so the finished count can still
            // catch up with the task count of the website task.
            websiteTaskService.addFinishCount(task.getRootId());
        }
        LOG.info(String.format("爬取用时=%s,url=%s", System.currentTimeMillis() - millis, task.getUrl()));
    }

    /** Extracts the links of the downloaded page and queues each not-yet-seen one as a child task. */
    private void submitChildTasks(WebsiteTask website, WebPageModel page) {
        Set<String> childUrls = new UrlExtract(page.document, task.getUrl()).extractFromA().extractFromFrame()
                .extractFromIframe().extractFromImg().getUrls();
        if (childUrls.isEmpty()) {
            return;
        }
        UrlDuplicateFilter dupFilter = websiteTaskService.getUrlDuplicateFilter(task.getRootId());
        // External links are only followed while the current depth is below the outer-link limit.
        boolean outerLinksAllowed = task.getLevel() < website.getOuterLevel();
        int addCount = 0;
        try {
            for (String childUrl : childUrls) {
                if (!outerLinksAllowed && CrawlerUtil.isOuterUrl(task.getUrl(), childUrl)) {
                    continue;
                }
                // De-duplicate the extracted child links.
                if (dupFilter.filter(childUrl)) {
                    UrlTask childTask = new UrlTask();
                    childTask.setUrl(childUrl);
                    childTask.setParentId(task.getId());
                    childTask.setRootId(task.getRootId());
                    childTask.setLevel(task.getLevel() + 1);
                    taskProducer.sendUrlTask(childTask);
                    addCount++;
                }
            }
        } finally {
            // Update the task count for every child already queued, even if queuing a later
            // child failed.
            websiteTaskService.addTaskCount(task.getRootId(), addCount);
        }
    }
}
网页爬取工具
/**
 * Downloads a URL with the shared HTTP client, classifies the content, and saves it to a
 * per-run folder on disk. HTML pages are additionally parsed into a Jsoup {@code Document}
 * and re-encoded as UTF-8 before saving.
 */
public class PageDownloadUtil {
    private static final Logger LOG = LoggerFactory.getLogger(PageDownloadUtil.class);
    private static final int MAX_HTML_LENGTH = 20 * 1024 * 1024; // HTML pages are limited to 20 MB
    private static final int MAX_FILE_LENGTH = 500 * 1024 * 1024; // other attachment types are limited to 500 MB
    private static final String FOLDER_NAME = "d:/temp/" + UUID.randomUUID().toString().replace("-", "") + "/";
    private static final AtomicInteger INDEX = new AtomicInteger();
    private static final CloseableHttpClient client = HttpClientUtil.createHttpClient();
    // Custom (negative) result codes and their descriptions.
    private static final Map<Integer, String> CODE_MAP = new HashMap<>();
    static {
        CODE_MAP.put(-501, "uri解析异常");
        CODE_MAP.put(-502, "网络协议异常");
        CODE_MAP.put(-503, "域名解析异常");
        CODE_MAP.put(-504, "http连接异常");
        CODE_MAP.put(-505, "网络IO异常");
        CODE_MAP.put(-506, "页面解析异常");
        CODE_MAP.put(-507, "编码格式异常");
        CODE_MAP.put(-508, "内容长度超出限制");
        CODE_MAP.put(-509, "网页类型超出可爬取范围");
    }

    /**
     * Fetches {@code url} with GET, following redirects and retrying after I/O errors.
     *
     * @param url    the URL to download
     * @param ranges names of the {@code WebPageModel.PageFormat} values that may be downloaded
     * @return the page model; {@code respCode} holds the HTTP status or one of the negative
     *         custom codes, and {@code message} describes a custom code
     */
    public static WebPageModel executeGet(String url, List<String> ranges) {
        WebPageModel page = new WebPageModel();
        // Shared budget for redirects and retries: a redirect costs 1, a failed attempt costs 2,
        // so at most 3 attempts are made after I/O errors.
        int redirectTimes = 0;
        boolean redirect;
        URI uri = CrawlerUtil.urlConvertToUri(url);
        if (uri == null) {
            fail(page, -501);
            return page;
        }
        do {
            redirectTimes++;
            redirect = false;
            // Drop the error text of a previous failed attempt so that a later successful
            // attempt does not report a stale message.
            page.message = null;
            HttpGet method = new HttpGet(uri);
            HttpClientUtil.setHeader(method, url);
            CloseableHttpResponse response = null;
            long millis = System.currentTimeMillis();
            try {
                response = client.execute(method);
                page.respCode = response.getStatusLine().getStatusCode();
                if (page.respCode == HttpStatus.SC_OK) {
                    download(page, url, response, ranges);
                } else if (page.respCode >= 300 && page.respCode < 400) { // page redirect
                    Header[] locationHeader = response.getHeaders("location");
                    if (locationHeader != null && locationHeader.length > 0) {
                        String redirectUrl = locationHeader[0].getValue();
                        if (StringUtils.isNotBlank(redirectUrl) && !url.equals(redirectUrl)) {
                            URI target = CrawlerUtil.urlConvertToUri(redirectUrl);
                            if (target == null) {
                                // An unparsable Location must not become the next request URI.
                                fail(page, -501);
                            } else {
                                // Resolve a relative Location against the current URI; an
                                // absolute target is returned unchanged by resolve().
                                uri = uri.resolve(target);
                                redirect = true;
                            }
                        }
                    }
                }
            } catch (ClientProtocolException e) {
                LOG.error("", e);
                fail(page, -502);
            } catch (UnknownHostException e) {
                LOG.error("", e);
                fail(page, -503);
            } catch (HttpHostConnectException e) {
                LOG.error("", e);
                fail(page, -504);
            } catch (IOException e) { // e.g. connection timeout: retry, up to 3 attempts in total
                redirectTimes++;
                redirect = true;
                LOG.error(String.format("第%s次链接失败,executeusetime=%s", redirectTimes / 2,
                        System.currentTimeMillis() - millis), e);
                fail(page, -505);
            } finally {
                if (response != null) {
                    EntityUtils.consumeQuietly(response.getEntity());
                    try {
                        response.close();
                    } catch (IOException e) {
                        LOG.error("responseclose", e);
                    }
                }
                method.releaseConnection();
            }
        } while (redirect && redirectTimes <= 5);
        return page;
    }

    /** Records a custom result code on the page together with its description. */
    private static void fail(WebPageModel page, int code) {
        page.respCode = code;
        page.message = CODE_MAP.get(code);
    }

    /**
     * Determines the content type and format of a 200 response and, if the format is within
     * {@code ranges}, saves the content to disk.
     */
    private static void download(WebPageModel page, String url, CloseableHttpResponse response,
            List<String> ranges) {
        HttpEntity entity = response.getEntity();
        page.contentLength = entity.getContentLength(); // unreliable, often -1; reassigned later for HTML
        // ContentType.getOrDefault(entity).getMimeType() may fail on an unsupported charset,
        // so the MIME type is extracted from the header by hand.
        Header header = entity.getContentType();
        if (header != null) {
            HeaderElement[] headerElements = header.getElements();
            if (headerElements != null && headerElements.length > 0) {
                page.contentType = headerElements[0].getName();
            }
        }
        if (ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(page.contentType)) {
            page.fileExt = HttpClientUtil.getOctetStreamFileExt(url, response);
        } else if (page.contentType == null) {
            // No content type in the header: fall back to the extension of the URL's last segment.
            if (url.lastIndexOf("/") > 8) {
                String name = url.substring(url.lastIndexOf("/"));
                if (name.contains(".")) {
                    page.contentType = ContentTypeUtil.getContentType(name.substring(name.lastIndexOf(".")));
                }
            }
        }
        page.updateFormat();
        if (!ranges.contains(page.format.toString())) {
            fail(page, -509);
            return;
        }
        if (page.format == WebPageModel.PageFormat.HTML) {
            downloadHtml(page, entity, header);
        } else { // non-HTML content is saved as-is
            if (page.contentLength == 0 || page.contentLength > MAX_FILE_LENGTH) {
                fail(page, -508);
                return;
            }
            if (page.fileExt == null) {
                page.fileExt = ContentTypeUtil.getExtendFileName(page.contentType);
            }
            createFilePath(page);
            HttpClientUtil.exportEntityAsFile(entity, page.filePath);
        }
    }

    /** Reads an HTML entity, detects its charset, parses it and saves it re-encoded as UTF-8. */
    private static void downloadHtml(WebPageModel page, HttpEntity entity, Header header) {
        if (page.contentLength == 0 || page.contentLength > MAX_HTML_LENGTH) {
            fail(page, -508);
            return;
        }
        try {
            String html;
            Document document;
            String charset = null;
            if (header != null) {
                charset = CrawlerUtil.judgeCharset(header.toString());
            }
            if (charset != null) {
                html = EntityUtils.toString(entity, charset);
                document = Jsoup.parse(html);
            } else {
                // No charset in the header: decode as UTF-8 first, then re-decode if the
                // document's meta information names a different charset.
                byte[] raw = EntityUtils.toByteArray(entity);
                html = new String(raw, CrawlerUtil.UTF_8);
                document = Jsoup.parse(html);
                charset = CrawlerUtil.getCharsetFromMeta(document);
                if (charset != null && !CrawlerUtil.UTF_8.equals(charset)) {
                    html = new String(raw, charset);
                    document = Jsoup.parse(html);
                }
            }
            byte[] data = html.getBytes(CrawlerUtil.UTF_8);
            page.contentLength = data.length;
            if (page.contentLength <= 0 || page.contentLength > MAX_HTML_LENGTH) {
                fail(page, -508);
                return;
            }
            page.encoding = CrawlerUtil.UTF_8;
            page.document = document;
            if (page.fileExt == null) {
                // Same lookup as for non-HTML content; without it the saved file name ended in
                // the literal text "null". NOTE(review): assumes getExtendFileName maps the HTML
                // content type to its extension - confirm.
                page.fileExt = ContentTypeUtil.getExtendFileName(page.contentType);
            }
            createFilePath(page);
            HttpClientUtil.exportDataAsFile(data, page.filePath);
        } catch (ParseException e) {
            LOG.error("", e);
            fail(page, -506);
        } catch (java.nio.charset.IllegalCharsetNameException | java.nio.charset.UnsupportedCharsetException e) {
            // EntityUtils.toString(entity, charsetName) reports an unknown charset with these
            // unchecked exceptions rather than UnsupportedEncodingException.
            LOG.error("", e);
            fail(page, -507);
        } catch (UnsupportedEncodingException e) {
            LOG.error("", e);
            fail(page, -507);
        } catch (IOException e) {
            LOG.error("", e);
            fail(page, -505);
        }
    }

    /** Creates the per-format folder if needed and assigns the page a unique file path in it. */
    private static void createFilePath(WebPageModel page) {
        String filePath = FOLDER_NAME + page.format.toString() + "/";
        File file = new File(filePath);
        file.mkdirs();
        // A missing extension yields a bare index instead of appending "null".
        String ext = page.fileExt == null ? "" : page.fileExt;
        page.filePath = filePath + INDEX.getAndIncrement() + ext;
    }
}
项目地址
详细项目代码可到我的GitHub上查看下载:https://github.com/DexterQY/website-crawler