"爬虫"的简单实现

该项目用于解析并下载某网页上的图片,主要利用了线程同步(生产者—消费者模型)与数据流读取技术。

/**
 * Entry point of a small image crawler built on a producer/consumer model.
 *
 * <p>Parser threads (producers) call {@code imgDownload.parseImageURL(page)} for a range of
 * list pages; downloader threads (consumers) repeatedly call
 * {@code imgDownload.downloadImage()}. All threads share one {@code imgDownload} instance.
 */
public class Test02 {

    /** Number of threads that download queued images. */
    private static final int DOWNLOADER_COUNT = 5;

    /**
     * Starts three parser threads covering pages 1 to 1499 and five downloader threads.
     *
     * <p>The downloader threads loop until interrupted, so the process does not exit on its
     * own once all pages have been parsed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Shared by every thread: parsers enqueue image URLs, downloaders dequeue them.
        final imgDownload imgDownload = new imgDownload();

        // Producers: three threads, each parsing a disjoint range of list pages.
        startParser(imgDownload, 1, 500);
        startParser(imgDownload, 500, 1000);
        startParser(imgDownload, 1000, 1500);

        // Consumers: five threads downloading the queued images.
        for (int i = 0; i < DOWNLOADER_COUNT; i++) {
            startDownloader(imgDownload);
        }
    }

    /**
     * Starts a thread that parses the pages {@code fromPage} (inclusive) to {@code toPage}
     * (exclusive). A failure on one page is reported and the next page is still attempted.
     */
    private static void startParser(final imgDownload downloader, final int fromPage,
            final int toPage) {
        new Thread(new Runnable() {

            @Override
            public void run() {
                for (int page = fromPage; page < toPage; page++) {
                    try {
                        downloader.parseImageURL(page);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and stop this worker.
                        Thread.currentThread().interrupt();
                        return;
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
        }).start();
    }

    /**
     * Starts a thread that downloads images until it is interrupted.
     *
     * <p>The try/catch sits inside the loop on purpose: a single failed download must not
     * terminate the worker, otherwise a handful of bad URLs would leave no consumers and the
     * parser threads would block forever waiting for the queue to drain.
     */
    private static void startDownloader(final imgDownload downloader) {
        new Thread(new Runnable() {

            @Override
            public void run() {
                while (!Thread.currentThread().isInterrupted()) {
                    try {
                        downloader.downloadImage();
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag and stop this worker.
                        Thread.currentThread().interrupt();
                        return;
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
        }).start();
    }
}

/**
 * Shared work queue of image URLs for the crawler.
 *
 * <p>Parser threads add URLs through {@link #parseImageURL(int)} and downloader threads take
 * them through {@link #downloadImage()}. The monitor of this object guards {@link #list} and
 * is held only while the queue is inspected or modified; network and file I/O run outside
 * the lock so that several threads can fetch pages and images at the same time.
 */
class imgDownload {

    /** Parsers pause while more than this many URLs are pending. */
    private static final int MAX_PENDING = 100;

    /** Directory (relative to the working directory) that receives the downloaded files. */
    private static final String OUTPUT_DIR = "img/";

    // Pending image URLs; access only while holding this object's monitor.
    LinkedList<String> list = new LinkedList<String>();

    /**
     * Fetches one list page and queues every image URL found on it.
     *
     * <p>Blocks while more than {@code MAX_PENDING} URLs are already pending. Because a whole
     * page is added at once, the queue can temporarily grow past that limit.
     *
     * @param page page number appended to the list URL
     * @throws Exception if the page cannot be fetched or the thread is interrupted while
     *         waiting
     */
    void parseImageURL(int page) throws Exception {
        synchronized (this) {
            // Loop, not "if": wait() may return spuriously or after another parser refilled
            // the queue.
            while (list.size() > MAX_PENDING) {
                wait();
            }
        }

        // Network access happens outside the lock so other threads are not stalled by it.
        String htmlUrl = "https://www.doutula.com/article/list/?page="
                + page;
        Document doc = Jsoup.connect(htmlUrl).get();
        // CSS selector: elements carrying any of the three classes.
        Elements els = doc.select(".lazy,.image_dtb,.img-responsive");

        LinkedList<String> found = new LinkedList<String>();
        for (Element el : els) {
            String imgUrl = el.attr("data-original");
            // Skip elements whose data-original attribute does not hold an http(s) URL.
            if (imgUrl.contains("http")) {
                found.add(imgUrl);
            }
        }

        synchronized (this) {
            list.addAll(found);
            // Wake the downloaders: there may be work available now.
            notifyAll();
        }
    }

    /**
     * Takes one URL from the queue and saves the image under {@code img/}, named after the
     * last path segment of the URL.
     *
     * <p>Blocks until a URL is available.
     *
     * @throws Exception if the download or the file write fails, or the thread is
     *         interrupted while waiting
     */
    void downloadImage() throws Exception {
        String imgUrl;
        synchronized (this) {
            // Loop, not "if": another downloader may have taken the URL we were woken for.
            while (list.isEmpty()) {
                wait();
            }
            imgUrl = list.removeFirst();
            if (list.size() <= MAX_PENDING) {
                // notifyAll rather than notify: a single notify could wake another
                // downloader instead of a parser that is waiting for room.
                notifyAll();
            }
        }

        String[] imageURLArr = imgUrl.split("/");
        String fileName = OUTPUT_DIR + imageURLArr[imageURLArr.length - 1];
        // Make sure the target directory exists; FileOutputStream does not create it.
        new java.io.File(OUTPUT_DIR).mkdirs();

        URL url = new URL(imgUrl);
        URLConnection con = url.openConnection();
        con.connect();
        InputStream stream = con.getInputStream();
        try {
            FileOutputStream fileOutputStream = new FileOutputStream(fileName);
            try {
                byte[] by = new byte[1024];
                int length;
                // Copy the response body to the local file.
                while ((length = stream.read(by)) != -1) {
                    fileOutputStream.write(by, 0, length);
                }
            } finally {
                fileOutputStream.close();
            }
        } finally {
            // Close the streams even when the copy fails part-way.
            stream.close();
        }
    }
}


最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容