思路就是整站抓取:获取到对应的 js、css、图片,全部下载到本地,然后把页面中的资源引用替换为本地路径;主要就是找到对应的权限校验 js,把它干掉,就 OK 了。
部分爬虫源码
/**
 * Entry point: fetches the start page, downloads every referenced script and
 * stylesheet to local disk, rewrites the document to point at the local
 * copies, then crawls the content pages via {@link #getContent(String)}.
 *
 * @param args unused command-line arguments
 * @throws IOException          if fetching the page or downloading a resource fails
 * @throws InterruptedException if the crawl is interrupted between requests
 */
public static void main(String[] args) throws IOException, InterruptedException {
    String link = "";
    Document document = getDocument(link);

    // Localize every external <script src="..."> reference.
    Elements scriptElements = document.select("script");
    for (Element element : scriptElements) {
        String src = element.attr("abs:src");
        // Inline <script> blocks have no src attribute; "abs:src" is then "".
        // The original code called substring(lastIndexOf("/")) on that empty
        // string, i.e. substring(-1) -> StringIndexOutOfBoundsException.
        if (src.isEmpty()) {
            continue;
        }
        // Keep the leading "/" so the rewritten src stays a root-relative path.
        String fileName = src.substring(src.lastIndexOf('/'));
        copyURLToFile(new URL(src), new File(fileName));
        element.attr("src", fileName);
    }

    // Localize relative <link href="..."> resources (CSS, icons, ...).
    Elements linkElements = document.select("link");
    for (Element element : linkElements) {
        String href = element.attr("href");
        // startsWith is the correct absolute-URL test. The original
        // `!contains("http") && !contains("https")` was redundant (the first
        // test already matches "https") and would also wrongly skip any
        // relative path that merely contains the substring "http".
        if (!href.startsWith("http")) {
            String absHref = element.attr("abs:href");
            copyURLToFile(new URL(absHref), new File(parent + href));
        }
    }

    getContent(link);
}
/**
 * Fetches the "/intro/" page under {@code startLink}, walks the sidebar
 * link list, and crawls each linked page via {@code start}.
 *
 * @param startLink base URL of the site being mirrored
 * @throws IOException          if a page fetch fails
 * @throws InterruptedException if the crawl is interrupted
 */
public static void getContent(String startLink) throws IOException, InterruptedException {
    String link = startLink + "/intro/";
    Document document = getDocument(link);

    Element aside = document.select("aside").first();
    // Guard against a missing sidebar (layout change, error page, ...);
    // the original would have thrown an NPE on aside.select(...) here.
    if (aside == null) {
        return;
    }
    Elements items = aside.select(".sidebar-links>li");
    for (Element item : items) {
        Element anchor = item.select("a").first();
        // Skip list entries that contain no link.
        if (anchor == null) {
            continue;
        }
        String href = anchor.attr("href");
        String absHref = anchor.attr("abs:href");
        System.out.println(anchor.text() + href);
        start(getDocument(absHref), href, getLeave(href));
    }
}
效果图
总结
有条件的还是开通知识星球吧,也不贵。我这里主要就是写着好玩,仅供娱乐参考。