Java爬虫程序简介：

/** * 程序名：Crawler * 作者：--------------------------------- * 编译环境：Microsoft Windows 7(64-bit)下的NetBeans IDE 7.3 * 源文件：Crawler.java, DownloadPage.java, Parser.java, SameFileName.java, TxtFileFilter.java, Work.java * 功能： * 1.多线程地连接互联网，获取页面源代码，在工作中随时可以停止或退出 * 2.通过正则表达式匹配，根据用户的选择可提取URL、电子邮箱、QQ号码、日期、电驴链接等信息 * 3.用户可自定义正则表达式，从页面源代码或在正文中提取信息 * 4.用户可自定义URL的正则表达式，当页面含有匹配的URL时，继续连接并提取信息 * 5.允许下载网页或URL指向的文件（如exe、mp3等） * 6.获取网页正文（去掉源代码中的html标签和js脚本等） * 7.下载并保存网页中的图片或网页中含有的自定义格式的文件 * 8.通过设置代理服务器连接互联网 * 9.请求网页时发送给定的Cookie */

此程序本是清华大学的课程设计（但是找不到链接网址了，当初下载下来没有记下网址）
优点：此程序比较适合作为爬虫的入门程序，便于初学者了解爬虫的实现流程；同时也适合用来学习Java基础，例如HTTP连接、字符串操作与正则匹配、IO流的概念以及文件读写等。
有程序的地方就有缺点，此程序也不例外。
- 在Work工作类中：“继续爬取新的网页”功能的实现，我在测试过程中始终没有完全理解其目的何在。
- DownloadPage线程实现细节中：
  - 1.存在太多重复的http连接，一个功能的实现需要一次连接。能否考虑一次连接就实现提取或保存不同信息的功能呢？是否可以考虑将该连接封装一下呢？我想或许可以。
  - 2.存在太多文件输入输出，文件读写问题也存在其它类中，是否可以考虑将文件读写单独封装一下呢？尽管可能每次需要的文件读写不同，有一些读写方式也确实特殊，无法合并，但总是有较大的一部分是相同的，这一部分共同点或许就是可以封装的点。
- 此程序针对一般的网页还可以，但是针对一些特制的网页（例如爱奇艺，腾讯视频等）或者动态加载机制的网页就失效了，此外程序对于下载的源文件或者文本文件并没有进行格式处理，会出现打不开或者特别乱等情况，下载的文件也需要文件链接的绝对路径，例如图片需要完整的路径格式才可正确下载，对于相对路径下的图片则无能为力。

学习准备：

一、学习此程序最好搭配JDK文档如JAVA-SE-8-API.chm（网上有资源）
二、关于正则表达式：
1. python正则表达式大全
 2. 最全的常用正则表达式大全——包括校验数字、字符、一些特殊的需求等等
三、关于文件读写相关内容：
UML结构图（用IDEA生成即可）：

程序运行截图：

程序实现：

一、主类：Crawler（画主界面，捕捉按键）：

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.awt.*;
import java.awt.event.*;
import java.io.*;
import javax.swing.*;

/**
 * Main class of the crawler: builds the Swing window and reacts to button presses.
 *
 * <p>All widgets are package-visible static fields because the worker thread
 * reads the user's choices directly from them. The window uses absolute
 * positioning (no layout manager) and a fixed size.
 */
public class Crawler extends JApplet {

    static JFrame frame; // main window
    static JTextField inputurljtf, importtxtjtf, proxyaddrjtf, proxyportjtf; // URL, import file, proxy address, proxy port
    static JTextField userdefsourcejtf, userdeftextjtf, continueurljtf, saveformatjtf, cookiejtf; // two custom regexes, follow-up URL regex, file formats, cookie
    static ButtonGroup bgrp; // groups the two radio buttons
    static JRadioButton inputurljrb, importtxtjrb; // "type a URL" / "import from txt"
    static JButton choosefilejb, startjb, stopjb; // browse, start, stop
    static JCheckBox arrjcb[], useproxyjcb, sendcookiejcb; // 18 feature options, use proxy, send cookie
    static JLabel proxyaddrjlb, proxyportjlb; // "address" and "port" captions
    static Work work = null; // the job currently running, if any

    /** Font shared by the option check boxes and the proxy captions. */
    private static final Font OPTIONFONT = new Font(Font.DIALOG, Font.PLAIN, 13);

    /** Height of every single-line widget except the two big buttons. */
    private static final int ROWHEIGHT = 20;

    /** Captions of the 18 feature check boxes, indexed like {@code arrjcb}. */
    private static final String[] OPTIONLABELS = {
        "提取URL", "提取电子邮箱地址", "提取ip地址",
        "提取手机号码", "提取电话号码", "提取QQ号码",
        "提取身份证号码", "提取日期", "提取时间",
        "提取电驴链接(ed2k://...)", "提取迅雷链接(thunder://...)",
        "从源代码提取以下内容", "在正文中提取以下内容", "继续爬以下网页",
        "保存目标", "保存网页正文", "下载网页图片",
        "下载以下格式的文件"
    };

    /** {x, y, width} of the 18 feature check boxes, indexed like {@code arrjcb}. */
    private static final int[][] OPTIONBOUNDS = {
        {0, 70, 92}, {135, 70, 132}, {270, 70, 98},
        {0, 95, 113}, {135, 95, 113}, {270, 95, 99},
        {0, 120, 119}, {135, 120, 93}, {270, 120, 93},
        {0, 145, 176}, {186, 145, 184},
        {0, 175, 160}, {0, 205, 160}, {0, 235, 119},
        {0, 265, 93}, {135, 265, 113}, {270, 265, 113},
        {0, 295, 142}
    };

    /**
     * Creates the window and all of its widgets. Nothing is shown until
     * {@link #start()} is called.
     */
    @Override
    public void init() {
        frame = new JFrame("Crawler");
        frame.setSize(406, 446);
        frame.setLocation(350, 130);
        frame.setResizable(false);
        frame.setLayout(null); // absolute positioning: every widget sets its own bounds

        /* Closing the window cleans up temporary files and exits the program. */
        frame.addWindowListener(new WindowAdapter() {
            @Override
            public void windowClosing(WindowEvent arg0) {
                deletetempfile();
                System.out.println("退出");
                System.exit(0);
            }
        });

        /* URL and import-file inputs */
        inputurljtf = place(new JTextField("", 8192), 110, 10, 286, ROWHEIGHT);
        inputurljtf.setBackground(Color.WHITE);
        importtxtjtf = place(new JTextField("", 8192), 110, 40, 205, ROWHEIGHT);
        importtxtjtf.setBackground(Color.WHITE);

        /* The two mutually exclusive input modes */
        bgrp = new ButtonGroup();
        inputurljrb = new JRadioButton("输入网页地址 ", true);
        importtxtjrb = new JRadioButton("从txt导入网址", false);
        bgrp.add(inputurljrb);
        bgrp.add(importtxtjrb);
        place(inputurljrb, 0, 10, 110, ROWHEIGHT);
        place(importtxtjrb, 0, 40, 110, ROWHEIGHT);

        choosefilejb = place(new JButton("浏览..."), 320, 40, 75, ROWHEIGHT);

        /* Text inputs that belong to options 11, 12, 13 and 17 */
        userdefsourcejtf = place(new JTextField("", 8192), 162, 175, 234, ROWHEIGHT);
        userdeftextjtf = place(new JTextField("", 8192), 162, 205, 234, ROWHEIGHT);
        continueurljtf = place(new JTextField("", 8192), 121, 235, 275, ROWHEIGHT);
        saveformatjtf = place(new JTextField("", 8192), 144, 295, 252, ROWHEIGHT);

        /* The 18 feature check boxes, driven by the tables above */
        arrjcb = new JCheckBox[OPTIONLABELS.length];
        for (int i = 0; i < arrjcb.length; i++) {
            JCheckBox option = new JCheckBox(OPTIONLABELS[i]);
            option.setFont(OPTIONFONT);
            arrjcb[i] = place(option, OPTIONBOUNDS[i][0], OPTIONBOUNDS[i][1], OPTIONBOUNDS[i][2], ROWHEIGHT);
        }

        /* Proxy row */
        useproxyjcb = place(new JCheckBox("使用代理服务器"), 0, 325, 119, ROWHEIGHT);
        useproxyjcb.setFont(OPTIONFONT);
        proxyaddrjlb = place(new JLabel("地址:"), 139, 325, 33, ROWHEIGHT);
        proxyaddrjlb.setFont(OPTIONFONT);
        proxyaddrjtf = place(new JTextField("", 256), 173, 325, 108, ROWHEIGHT);
        proxyportjlb = place(new JLabel("端口:"), 306, 325, 33, ROWHEIGHT);
        proxyportjlb.setFont(OPTIONFONT);
        proxyportjtf = place(new JTextField("", 16), 340, 325, 56, ROWHEIGHT);

        /* Cookie row */
        sendcookiejcb = place(new JCheckBox("发送Cookie"), 0, 355, 97, ROWHEIGHT);
        sendcookiejcb.setFont(OPTIONFONT);
        cookiejtf = place(new JTextField("", 8192), 101, 355, 295, ROWHEIGHT);

        /* Start / stop buttons; stop is greyed out until a job is running */
        startjb = place(new JButton("开 始 (Enter)"), 4, 386, 190, 23);
        stopjb = place(new JButton("停 止"), 205, 386, 190, 23);
        stopjb.setBackground(Color.LIGHT_GRAY);
    }

    /** Positions a widget absolutely, adds it to the window and hands it back. */
    private static <T extends JComponent> T place(T widget, int x, int y, int width, int height) {
        widget.setBounds(x, y, width, height);
        frame.add(widget);
        return widget;
    }

    /**
     * Registers the button and keyboard listeners and shows the window.
     */
    @Override
    public void start() {

        /* "Browse": pick a .txt file of URLs; ignored while a job is running */
        choosefilejb.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent ae) {
                if (work != null && work.isAlive()) {
                    return;
                }
                importtxtjrb.setSelected(true);

                JFileChooser jfc = new JFileChooser(".");
                jfc.setAcceptAllFileFilterUsed(false);
                jfc.addChoosableFileFilter(new TxtFileFilter());
                if (jfc.showOpenDialog(null) == JFileChooser.APPROVE_OPTION) {
                    importtxtjtf.setText(jfc.getSelectedFile().getAbsolutePath());
                }
            }
        });

        /* "Start": launch a new job unless one is already running */
        startjb.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent ae) {
                if (work == null || !work.isAlive()) {
                    System.out.println("开始");
                    work = new Work();
                    work.start();
                }
            }
        });

        /* "Stop": ask the running job to stop, then clean up once it has ended */
        stopjb.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent ae) {
                final Work running = work;
                if (running == null || !running.isAlive() || running.needstop) {
                    return; // nothing to stop, or a stop is already in progress
                }
                running.needstop = true; // the job forwards this flag to its download threads
                System.out.println("准备停止...");

                /*
                 * Wait off the event-dispatch thread: spinning here would freeze
                 * the whole UI until every download thread has wound down.
                 * Temporary files are removed only after the job has ended, so
                 * late writers cannot re-create them.
                 */
                Thread waiter = new Thread(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            running.join();
                        } catch (InterruptedException ex) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                        deletetempfile();
                        System.out.println("已停止");
                    }
                });
                waiter.setDaemon(true);
                waiter.start();
            }
        });

        /* Pressing Enter in any text input counts as clicking "Start" */
        JTextField[] inputs = {
            inputurljtf, importtxtjtf, userdefsourcejtf, userdeftextjtf, continueurljtf,
            saveformatjtf, proxyaddrjtf, proxyportjtf, cookiejtf
        };
        for (int i = 0; i < inputs.length; i++) {
            addenterlistener(inputs[i]);
        }

        frame.setVisible(true);
    }

    /**
     * Makes the Enter key in the given text field trigger the start button.
     *
     * @param jtf the text field to watch
     */
    void addenterlistener(JTextField jtf) {
        jtf.addKeyListener(new KeyAdapter() {
            @Override
            public void keyPressed(KeyEvent event) {
                // Compare key codes, not KeyEvent.getKeyText(): that text is localized.
                if (event.getKeyCode() == KeyEvent.VK_ENTER) {
                    startjb.doClick();
                }
            }
        });
    }

    /**
     * Deletes every temporary file a job may have left in the working
     * directory: {@code ~tmpN} and {@code ~Nsaveurls.txt} for N in 1..500.
     */
    void deletetempfile() {
        for (int i = 1; i <= 500; i++) {
            new File("~tmp" + i).delete();
            new File("~" + i + "saveurls.txt").delete();
        }
    }

    /**
     * Program entry point: builds and shows the window on the event-dispatch thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("欢迎使用java版网络爬虫Crawler");
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                JApplet applet = new Crawler();
                applet.init();
                applet.start();
            }
        });
    }
}

二、Work类（网络爬虫的工作类，启动DownloadPage线程，并保存要连接的URL的队列）：

从主页面获取网页URL或者选择从文件导入URL信息加入到待爬取队列中，同时从主页面获取一些基本信息，例如是否使用代理或者添加cookie等内容。

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.awt.Color;
import java.util.*;
import java.io.*;
import java.util.concurrent.*;
import java.util.regex.*;

/** * * @author 1100012773, 1100012778 */

/**
 * Worker thread of the crawler: owns the queue of URLs still to visit and
 * runs them in batches of {@link DownloadPage} threads.
 *
 * <p>Settings are read from the static widgets of {@code Crawler}. Setting
 * {@link #needstop} to {@code true} from another thread asks the job to stop.
 */
public class Work extends Thread {

    /** Upper bound on DownloadPage threads per batch; also bounds the temp-file ids. */
    private static final int MAXTASKS = 500;

    /** Number of feature check boxes in {@code Crawler.arrjcb}. */
    private static final int OPTIONCOUNT = 18;

    /** Index of the "keep crawling matching pages" option. */
    private static final int CONTINUEOPTION = 13;

    /** One decimal octet, 0-255. */
    private static final String OCTET = "(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])";

    /** Dotted-quad IPv4 address. */
    private static final Pattern IPV4 =
            Pattern.compile(OCTET + "\\." + OCTET + "\\." + OCTET + "\\." + OCTET);

    /** Stop request; written by the GUI thread and polled here, hence volatile. */
    volatile boolean needstop;

    /** Creates a job that has not been asked to stop. */
    public Work() {
        needstop = false;
    }

    /**
     * Runs the whole job: seeds the queue, then processes it batch by batch
     * until it is empty or a stop is requested.
     */
    @Override
    public void run() {
        /* Recolor the three buttons while the job is running */
        Crawler.startjb.setBackground(Color.LIGHT_GRAY);
        Crawler.stopjb.setBackground(null);
        Crawler.choosefilejb.setBackground(Color.LIGHT_GRAY);

        Set<String> visited = new HashSet<String>(); // every URL ever scheduled
        Queue<String> queue = new LinkedList<String>(); // URLs still to visit

        if (Crawler.inputurljrb.isSelected()) {
            queue.offer(Crawler.inputurljtf.getText());
        } else {
            importurls(Crawler.importtxtjtf.getText(), queue);
        }

        boolean useproxy = Crawler.useproxyjcb.isSelected() && proxysettingsvalid();
        boolean sendcookie = Crawler.sendcookiejcb.isSelected();

        while (!queue.isEmpty() && !needstop) {
            int batchsize = Math.min(queue.size(), MAXTASKS);

            boolean b[] = new boolean[OPTIONCOUNT];
            for (int i = 0; i < OPTIONCOUNT; i++) {
                b[i] = Crawler.arrjcb[i].isSelected();
            }

            /*
             * The latch must count exactly the URLs taken in this batch. Sizing
             * it with the whole queue would never reach zero when more than
             * MAXTASKS URLs are queued.
             */
            CountDownLatch runningthreadnum = new CountDownLatch(batchsize);
            DownloadPage task[] = new DownloadPage[batchsize + 1]; // 1-based: index == temp-file id

            for (int i = 1; i <= batchsize; i++) {
                if (needstop) {
                    stoptasks(task);
                    break;
                }
                String str = queue.poll().replace(" ", "");
                if (str.equals("")) {
                    runningthreadnum.countDown(); // blank entry: no thread will report for it
                    continue;
                }
                if (str.indexOf("://") == -1 || str.indexOf("://") > 12) {
                    str = "http://" + str; // no scheme given: assume http
                }
                visited.add(str);
                task[i] = new DownloadPage(str, i, b, runningthreadnum);
                if (useproxy) {
                    task[i].getproxyinfo(Crawler.proxyaddrjtf.getText(), Crawler.proxyportjtf.getText());
                }
                if (sendcookie) {
                    task[i].getcookiecontent(Crawler.cookiejtf.getText());
                }
                task[i].gettext(Crawler.userdefsourcejtf.getText(), Crawler.userdeftextjtf.getText(),
                        Crawler.continueurljtf.getText(), Crawler.saveformatjtf.getText());
                task[i].start();
            }

            waitfortasks(runningthreadnum, task);

            if (b[CONTINUEOPTION]) {
                collectfollowups(batchsize, visited, queue);
            }
        }

        /* Restore the button colors */
        Crawler.startjb.setBackground(null);
        Crawler.stopjb.setBackground(Color.LIGHT_GRAY);
        Crawler.choosefilejb.setBackground(null);
    }

    /** Reads one URL per line from the given text file into the queue. */
    private void importurls(String importfilename, Queue<String> queue) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(importfilename)));
            String readstr;
            while ((readstr = br.readLine()) != null) {
                queue.offer(readstr);
            }
        } catch (FileNotFoundException e) {
            System.out.println("找不到指定文件");
        } catch (IOException e) {
            System.out.println("读取文件失败");
        } finally {
            try {
                if (br != null) br.close(); // also closes the wrapped streams
            } catch (IOException ex) {
                System.out.println("关闭文件失败");
            }
        }
    }

    /** Checks the proxy address and port fields, reporting the first problem found. */
    private boolean proxysettingsvalid() {
        if (!IPV4.matcher(Crawler.proxyaddrjtf.getText()).matches()) {
            System.out.println("代理服务器地址格式错误");
            return false;
        }
        String port = Crawler.proxyportjtf.getText();
        // Up to five digits fits in an int; anything above 65535 is not a valid port.
        if (!Pattern.matches("[0-9]{1,5}", port) || Integer.parseInt(port) > 65535) {
            System.out.println("代理服务器端口格式错误");
            return false;
        }
        return true;
    }

    /** Forwards the stop request to every task that was created. */
    private void stoptasks(DownloadPage task[]) {
        for (int j = 1; j < task.length; j++) {
            if (task[j] != null) task[j].needstop = true;
        }
    }

    /**
     * Blocks until every task of the batch has reported in, checking for a
     * stop request every 200 ms. After a stop request the tasks get up to one
     * more second to wind down.
     */
    private void waitfortasks(CountDownLatch runningthreadnum, DownloadPage task[]) {
        try {
            while (!runningthreadnum.await(200, TimeUnit.MILLISECONDS)) {
                if (needstop) {
                    stoptasks(task);
                    runningthreadnum.await(1000, TimeUnit.MILLISECONDS);
                    return;
                }
            }
        } catch (InterruptedException ex) {
            System.out.println("线程中断异常");
            // Treat an interrupt as a stop request and keep the flag for callers.
            needstop = true;
            stoptasks(task);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Reads the per-task {@code ~NNsaveurls.txt} files of the finished batch
     * and queues every URL that has not been scheduled before. An empty
     * filter expression accepts every URL; otherwise the whole URL must match.
     */
    private void collectfollowups(int batchsize, Set<String> visited, Queue<String> queue) {
        String regex = Crawler.continueurljtf.getText();
        Pattern filter = null; // null means "accept everything"
        boolean regexvalid = true;
        if (!regex.equals("")) {
            try {
                filter = Pattern.compile(regex); // compiled once, not once per line
            } catch (PatternSyntaxException ex) {
                System.out.println("自定义正则表达式语法错误");
                regexvalid = false;
            }
        }

        for (int i = 1; i <= batchsize; i++) {
            if (needstop) break;
            File file = new File("~" + i + "saveurls.txt");
            if (!file.exists()) continue;
            if (regexvalid) {
                readfollowups(file, filter, visited, queue);
            }
            file.delete();
        }
    }

    /** Queues the not-yet-seen, filter-accepted lines of one saved-URL file. */
    private void readfollowups(File file, Pattern filter, Set<String> visited, Queue<String> queue) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String line;
            while ((line = br.readLine()) != null) {
                boolean pass = filter == null || filter.matcher(line).matches();
                if (pass && visited.add(line)) {
                    queue.offer(line);
                }
            }
        } catch (FileNotFoundException ex) {
            System.out.println("临时保存url的文件不存在");
        } catch (IOException ex) {
            System.out.println("读取临时保存url的文件失败");
        } finally {
            try {
                if (br != null) br.close(); // also closes the wrapped streams
            } catch (IOException ex) {
                System.out.println("关闭临时文件失败");
            }
        }
    }

}

三、DownloadPage类（给定URL等信息，连接互联网下载信息，交由Parser类提取数据，如果用户有需要则保存相应内容）：

public class DownloadPage extends Thread实则为线程类，实现网页获取下载以及保存到文件的各类细节，是爬虫的重要实现类：

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.*;

/** * * @author 1100012773, 1100012778 */

/** * DownloadPage类 * 功能：给定URL等信息，连接互联网下载信息，交由Parser类提取数据，如果用户有需要则保存相应内容 */
public class DownloadPage extends Thread {
   
    private Integer tempfileid; // sequence number of this task's temporary file
    private String url; // address to download
    private String tempfilename; // name of the temporary file
    private String objectname; // name the target is saved under when the page has no title (derived from the URL)
    private String userdefsource; // user regex matched against the page source
    private String userdeftext; // user regex matched against the page text
    private String continueurl; // regex selecting the URLs to keep crawling
    private String saveformat; // file formats to download
    private boolean useproxy = false, sendcookie = false; // whether to go through a proxy / send a cookie
    private String proxyaddr; // proxy address
    private String cookie; // cookie content
    private int proxyport; // proxy port
    private boolean need[]; // state of the 18 option check boxes
    private boolean error = false; // set once any step has failed
    private CountDownLatch runningthreadnum; // latch counted down when this task ends
    volatile boolean needstop; // stop request; set by another thread and polled here, hence volatile
    
    /**
     * Creates a download task.
     *
     * @param u   address to download
     * @param tfi sequence number used to name this task's temporary file
     * @param n   state of the 18 option check boxes; copied, not retained
     * @param rtn latch shared with the thread that launched this task
     */
    public DownloadPage(String u, Integer tfi, boolean n[], CountDownLatch rtn) {
        super();
        this.url = u;
        this.tempfileid = tfi;
        this.tempfilename = "~tmp"+tfi.toString();
        this.need = new boolean[18];
        System.arraycopy(n, 0, this.need, 0, 18);
        this.runningthreadnum = rtn;
        this.needstop = false;
    }
    
    /**
     * Enables the proxy for this task and stores its address and port.
     *
     * @param paddr proxy address
     * @param pp    proxy port as decimal text
     * @throws NumberFormatException if {@code pp} is not a decimal integer
     */
    public void getproxyinfo(String paddr, String pp) {
        this.useproxy = true;
        this.proxyaddr = paddr;
        this.proxyport = Integer.parseInt(pp);
    }
    
    /**
     * Stores the cookie text and turns cookie sending on for this task.
     *
     * @param cookie value for the {@code Cookie} request header
     */
    public void getcookiecontent(String cookie) {
        this.cookie = cookie;
        this.sendcookie = true;
    }
    
    
    /**
     * Stores the contents of the four free-text inputs for later use.
     *
     * @param userdefsource regex to match against the page source
     * @param userdeftext   regex to match against the page text
     * @param continueurl   regex selecting the URLs to keep crawling
     * @param saveformat    file formats to download
     */
    public void gettext(String userdefsource, String userdeftext, String continueurl, String saveformat) {
        this.saveformat = saveformat;
        this.continueurl = continueurl;
        this.userdeftext = userdeftext;
        this.userdefsource = userdefsource;
    }
    
    /**
     * Downloads the page, hands it to the parser and saves whatever the
     * selected options ask for.
     *
     * <p>The temporary file is removed and the shared latch is counted down
     * in a {@code finally} block, so the launching thread is released even
     * when this task fails with an unexpected exception.
     */
    @Override
    public void run() {
        try {
            System.out.println("准备连接 - "+url);
            String charcode = null;
            if (!needstop) charcode = getcharcodefromurl(); // detect the page encoding
            String content = null;
            if (!needstop) content = getcontentfromurl(charcode); // fetch the page
            if (!needstop) outputtotempfile(content, charcode); // keep a copy in the temporary file
            if (!error && !needstop) {
                Parser parser = new Parser(url, tempfileid, content, need);
                String title = parser.gettitle();
                if (title.equals("")) System.out.println("连接成功 - "+url);
                else System.out.println("连接成功 - "+title);
                parser.parse(userdefsource, userdeftext, continueurl, saveformat); // extract the requested data
                if (title.equals("")) System.out.println("提取信息成功 - "+url);
                else System.out.println("提取信息成功 - "+title);

                /* Save the target itself */
                if (need[14] && !needstop) {
                    if (!title.equals("")) {
                        /* A page with a title: the temporary file already holds it, so just rename */
                        File file = new File(tempfilename);
                        if (file.exists()) {
                            new File("下载\\目标").mkdirs();
                            // Resolve the name once so the printed and the actual name cannot differ.
                            String savedname = SameFileName.newfilename("下载\\目标\\", title+".html");
                            System.out.println("目标保存为 - "+savedname);
                            if (!file.renameTo(new File("下载\\目标\\"+savedname))) {
                                System.out.println("目标保存失败 - "+savedname);
                            }
                        }
                    }
                    else {
                        /* Not a page, or a page without a title: download again byte by byte,
                           because the text download may have lost special characters */
                        objectname = SameFileName.newfilename("下载\\目标\\", getobjname(url));
                        new File("下载\\目标").mkdirs();
                        System.out.println("目标保存为 - "+objectname);
                        downloadbybyte(url, "下载\\目标\\"+objectname);
                    }
                }

                /* Save the page text */
                if (need[15] && !needstop) {
                    String textname;
                    if (!title.equals("")) {
                        textname = title+".txt";
                    }
                    else {
                        textname = getobjname(url)+".txt";
                    }
                    new File("下载\\网页正文").mkdirs();
                    textname = SameFileName.newfilename(("下载\\网页正文\\"), textname);
                    System.out.println("网页正文保存为 - "+textname);
                    outputtext(content, charcode, "下载\\网页正文\\"+textname);
                }

                /* Download linked files with selected suffixes */
                if (need[16] || need[17]) {
                    Set<String> filesuffixset = new HashSet<String>();
                    if (need[16]) {
                        /* Images */
                        filesuffixset.add(".jpg");
                        filesuffixset.add(".gif");
                        filesuffixset.add(".png");
                        filesuffixset.add(".jpeg");
                        filesuffixset.add(".bmp");
                    }
                    if (need[17]) {
                        /* Every ".suffix" token the user typed */
                        Pattern pattern = Pattern.compile("\\.(\\w|\\-|\\_)+", Pattern.CASE_INSENSITIVE);
                        Matcher matcher = pattern.matcher(saveformat);
                        while (matcher.find()) {
                            filesuffixset.add(saveformat.substring(matcher.start(), matcher.end()).toLowerCase());
                        }
                    }
                    if (!filesuffixset.isEmpty() && !needstop) downloadfile(content, "下载 - "+title, filesuffixset);
                }

            }
        } catch (MalformedURLException ex) {
            System.out.println("url格式不对 - "+url);
            error = true;
        } catch (IOException ex) {
            System.out.println("网络连接异常 - "+url);
            error = true;
        } catch (RuntimeException ex) {
            // Thread boundary: report instead of dying silently.
            System.out.println("处理失败 - "+url+" ("+ex+")");
            error = true;
        } finally {
            File file = new File(tempfilename);
            if (file.exists()) file.delete(); // remove the temporary file
            runningthreadnum.countDown(); // always tell the launcher this task is done
        }
    }
    
    /** Matches {@code charset=name}, with optional quotes, in a header value or in markup. */
    private static final Pattern CHARSETDECL =
            Pattern.compile("charset\\s*=\\s*[\"']?\\s*([a-z0-9][a-z0-9._:+-]*)", Pattern.CASE_INSENSITIVE);

    /**
     * Detects the character encoding of the page at {@code url}.
     *
     * <p>The Content-Type response header is checked first; if it names no
     * usable charset, the body is scanned line by line for a
     * {@code charset=...} declaration (for example in a meta tag). A single
     * connection serves both steps and is released before returning.
     *
     * @return the lower-cased name of a charset this JVM supports, or
     *         {@code "UTF-8"} when none is found, on connection failure, or
     *         when a stop was requested
     */
    private String getcharcodefromurl() {
        HttpURLConnection conn = null;
        BufferedReader reader = null;
        try {
            URL u = new URL(url);
            if (useproxy) {
                Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyaddr, proxyport));
                conn = (HttpURLConnection)u.openConnection(proxy);
            } else {
                conn = (HttpURLConnection)u.openConnection();
            }
            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
            conn.connect();

            /* 1. Response header; getContentType() looks the header up by its standard name */
            String fromheader = extractcharset(conn.getContentType());
            if (fromheader != null) return fromheader;
            if (needstop) return "UTF-8";

            /* 2. Page body. ISO-8859-1 maps every byte to a char, so the ASCII
                  "charset=" token is found whatever the real encoding is. */
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "ISO-8859-1"));
            String line;
            while ((line = reader.readLine()) != null) {
                if (needstop) return "UTF-8";
                String frombody = extractcharset(line);
                if (frombody != null) return frombody;
            }
        } catch (MalformedURLException ignored) {
            // Best effort only: fall back to the default below.
        } catch (IOException ignored) {
            // Best effort only: fall back to the default below.
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful to do when closing fails
                }
            }
            if (conn != null) conn.disconnect();
        }

        return "UTF-8"; // default encoding
    }

    /**
     * Returns the first charset declared in {@code text} that this JVM
     * supports, lower-cased, or {@code null} when there is none.
     */
    private static String extractcharset(String text) {
        if (text == null) return null;
        Matcher declared = CHARSETDECL.matcher(text);
        while (declared.find()) {
            String name = declared.group(1).toLowerCase(Locale.ROOT);
            try {
                if (java.nio.charset.Charset.isSupported(name)) return name;
            } catch (IllegalArgumentException ignored) {
                // syntactically illegal charset name: keep looking
            }
        }
        return null;
    }
    
    /**
     * Downloads the page at {@code url} and decodes it with the given charset.
     *
     * <p>On failure a message is printed and {@code error} is set. The
     * response stream is closed on every path, including the early return
     * taken when a stop was requested.
     *
     * @param charcode name of the charset to decode the response with
     * @return the page content, or an empty string on failure or stop
     */
    private String getcontentfromurl(String charcode) {
        InputStream is = null;
        try {
            URL target = new URL(url);
            HttpURLConnection conn;
            if (useproxy) {
                /* Route the request through the configured HTTP proxy */
                Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyaddr, proxyport));
                conn = (HttpURLConnection)target.openConnection(proxy);
            } else {
                conn = (HttpURLConnection)target.openConnection();
            }

            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
            conn.connect();

            is = conn.getInputStream();
            if (needstop) return "";
            return readfromstream(is, charcode);
        } catch(MalformedURLException e) {
            System.out.println("url格式不对 - "+url);
            error = true;
        } catch (IOException ex) {
            System.out.println("网络连接异常 - "+url);
            error = true;
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // nothing useful to do when closing fails
                }
            }
        }
        return "";
    }
    
    /* 从网页给的输入流中获取内容并保存在String中 */
    /**
     * Reads the whole stream line by line and joins the lines with CRLF.
     *
     * The stream is not closed here. If {@code needstop} becomes true while reading, the
     * partial result is discarded and an empty string is returned.
     *
     * @param stream   input stream to read
     * @param charcode name of the charset used to decode the bytes
     * @return the decoded text with every line terminated by "\r\n"; an empty string when
     *         stopped or when the charset name is not supported (in which case {@code error} is set)
     * @throws IOException if reading from the stream fails
     */
    private String readfromstream(InputStream stream, String charcode) throws IOException {
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(stream, charcode));
            StringBuilder page = new StringBuilder();
            for (String row = in.readLine(); row != null; row = in.readLine()) {
                if (needstop) {
                    return "";
                }
                page.append(row).append("\r\n");
            }
            return page.toString();
        } catch (UnsupportedEncodingException unsupported) {
            System.out.println("无法识别的编码方式 - "+url);
            error = true;
            return "";
        }
    }
    
    /* 将网页内容输出到临时文件 */
    /**
     * Writes {@code content} to the file named by {@code tempfilename}, replacing any previous
     * content, encoded with the given charset.
     *
     * On failure a message is printed and {@code error} is set to true.
     *
     * @param content  text to write
     * @param charcode name of the charset used to encode the text
     */
    private void outputtotempfile(String content, String charcode) {
        File file = new File(tempfilename);
        FileOutputStream fos = null;
        Writer out = null;
        try {
            fos = new FileOutputStream(file, false);
            out = new OutputStreamWriter(fos, charcode);
            out.write(content);
            /*
             * The writer buffers, so without this flush a failing write would only surface in
             * close() below, where it is reported as a close problem and never sets error.
             */
            out.flush();
        } catch (FileNotFoundException ex) {
            System.out.println("临时文件名出错");
            error = true;
        } catch (IOException ex) {
            System.out.println("临时文件输出失败");
            error = true;
        } finally {
            /* Close the two objects independently so a failure on the writer cannot leak the file handle. */
            try {
                if (out != null) out.close();
            } catch (IOException ex) {
                System.out.println("无法关闭临时文件");
            }
            try {
                if (fos != null) fos.close();
            } catch (IOException ex) {
                System.out.println("无法关闭临时文件");
            }
        }
    }
    
    /* 逐字节地下载网页目标 */
    /**
     * Downloads the resource at {@code url} and stores its raw bytes in {@code savefile}.
     *
     * The connection goes through an HTTP proxy when {@code useproxy} is set and sends
     * {@code cookie} in a Cookie header when {@code sendcookie} is set. The download is
     * abandoned as soon as {@code needstop} is observed; bytes already written stay on disk.
     *
     * @param url      address of the resource to download
     * @param savefile path of the file to create or overwrite
     * @return true when the whole resource was written; false when stopped or on an I/O failure
     * @throws MalformedURLException if {@code url} cannot be parsed
     */
    private boolean downloadbybyte(String url, String savefile) throws MalformedURLException {
        /* Parsed outside the try block so a malformed URL reaches the caller instead of the IOException handler. */
        URL u = new URL(url);
        boolean succeed = true;
        InputStream in = null;
        FileOutputStream fos = null;
        try {
            if (needstop) return false;

            /* Proxy and cookie handling as for page downloads. */
            Proxy proxy = null;
            if (useproxy) {
                SocketAddress add = new InetSocketAddress(proxyaddr, proxyport);
                proxy = new Proxy(Proxy.Type.HTTP, add);
            }
            /*
             * Typed as URLConnection: the caller's URL pattern also accepts ftp:// and file://
             * links, for which openConnection() does not return an HttpURLConnection. Casting
             * would throw an uncaught ClassCastException.
             */
            java.net.URLConnection conn = useproxy ? u.openConnection(proxy) : u.openConnection();

            if (sendcookie) conn.setRequestProperty("Cookie", cookie);
            conn.connect();

            in = conn.getInputStream();
            if (needstop) return false;
            fos = new FileOutputStream(new File(savefile));
            byte buffer[] = new byte[65536];
            int length;

            /* Copy in chunks of at most 64 KiB, checking for a stop request after each chunk. */
            while ((length = in.read(buffer)) > 0) {
                fos.write(buffer, 0, length);
                if (needstop) return false;
            }
        } catch (IOException ex) {
            System.out.println("下载失败 - "+savefile.substring(savefile.lastIndexOf("\\")+1));
            succeed = false;
        } finally {
            /* Close both ends independently so a failure on one cannot leak the other. */
            try {
                if (in != null) in.close();
            } catch (IOException ex) {
                System.out.println("关闭下载的文件失败");
            }
            try {
                if (fos != null) fos.close();
            } catch (IOException ex) {
                System.out.println("关闭下载的文件失败");
            }
        }
        return succeed;
    }
    
    /* 根据URL获取目标名称 */
    /**
     * Derives a file name from a URL.
     *
     * The name is the part after the last '/' (a single trailing '/' is ignored). When that
     * part contains a dot after its first character, it is cut at the first '?', '%', '&',
     * '=', '+' or ':' found at or after the last dot, in that order. Characters that are
     * stripped afterwards are : &lt; &gt; ? | * / \ and ". Names longer than 127 characters
     * keep only their last 127 characters.
     *
     * @param url address to derive the name from
     * @return the derived name, or {@code url} unchanged when a stop was requested
     */
    private String getobjname(String url) {
        if (needstop) {
            return url;
        }

        String name = url;
        if (name.endsWith("/")) {
            name = name.substring(0, name.length() - 1);
        }
        // Keep only what follows the last slash.
        name = name.substring(name.lastIndexOf("/") + 1);

        final int dotat = name.lastIndexOf(".");
        if (dotat > 0) {
            // Drop query-string style tails that follow the extension.
            final String[] terminators = {"?", "%", "&", "=", "+", ":"};
            for (String terminator : terminators) {
                int cut = name.indexOf(terminator, dotat);
                if (cut > 0) {
                    name = name.substring(0, cut);
                }
            }
        }

        // Remove characters that are not allowed in file names.
        final String[] forbidden = {":", "<", ">", "?", "|", "*", "/", "\\", "\""};
        for (String symbol : forbidden) {
            name = name.replace(symbol, "");
        }

        // Keep the tail of over-long names.
        final int excess = name.length() - 127;
        if (excess > 0) {
            name = name.substring(excess);
        }
        return name;
    }
    
    /* 输出网页正文 */
    /**
     * Strips scripts, style sheets and HTML tags from {@code content}, decodes a handful of
     * character entities and writes the resulting text to {@code savefile}.
     *
     * On failure a message is printed and {@code error} is set to true. Nothing is done when
     * {@code needstop} is already set; if it becomes set after the file was opened, the file
     * is left empty.
     *
     * @param content  page source
     * @param charcode name of the charset used to encode the output
     * @param savefile path of the file to create or overwrite
     */
    private void outputtext(String content, String charcode, String savefile) {
        if (needstop) return;
        /*
         * Remove all script/style blocks and tags, then turn the supported entities back into
         * characters. "&amp;" is decoded last so that text such as "&amp;lt;" is not decoded twice.
         * NOTE(review): the "(\\s)+" step replaces every whitespace run, including the "\r\n"
         * produced for <br> and kept by the step before it, with a single space, so the output
         * ends up on one line. Left as is because callers may rely on it - confirm intent.
         */
        String text = content
                .replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>[\\s\\S]*?</(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)>", "")
                .replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>[\\s\\S]*?</(s|S)(t|T)(y|Y)(l|L)(e|E)>","")
                .replaceAll("<br>","\r\n")
                .replaceAll("<[\\s\\S]*?>", "")
                .replaceAll("(\r\n)+","\r\n")
                .replaceAll("(\\s)+"," ")
                .replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ")
                .replaceAll("\\&((lt)|(\\#60))(\\;)?","<")
                .replaceAll("\\&((gt)|(\\#62))(\\;)?",">")
                .replaceAll("\\&((quot)|(#34))(\\;)?","\"")
                .replaceAll("\\&((apos)|(\\#39))(\\;)?","'")
                .replaceAll("\\&copy(\\;)?","©")
                .replaceAll("\\&reg(\\;)?","®")
                .replaceAll("\\&((amp)|(#38))(\\;)?","&");
        File file = new File(savefile);
        FileOutputStream fos = null;
        Writer out = null;
        try {
            fos = new FileOutputStream(file, false);
            out = new OutputStreamWriter(fos, charcode);
            if (needstop) return;
            out.write(text);
            /*
             * The writer buffers, so without this flush a failing write would only surface in
             * close() below, where it is reported as a close problem and never sets error.
             */
            out.flush();
        } catch (FileNotFoundException ex) {
            System.out.println("网页正文文件名出错");
            error = true;
        } catch (IOException ex) {
            System.out.println("网页正文输出失败");
            error = true;
        } finally {
            /* Close the two objects independently so a failure on the writer cannot leak the file handle. */
            try {
                if (out != null) out.close();
            } catch (IOException ex) {
                System.out.println("无法关闭网页正文文件");
            }
            try {
                if (fos != null) fos.close();
            } catch (IOException ex) {
                System.out.println("无法关闭网页正文文件");
            }
        }
    }
    
    /* 下载指定格式的文件 */
    /**
     * Downloads every absolute URL found in {@code content} whose file name ends with one of
     * the given suffixes, saving the files under "下载\&lt;dir&gt;\".
     *
     * Each distinct URL (compared case-insensitively) is downloaded at most once per call.
     * Scanning stops as soon as {@code needstop} is observed.
     *
     * @param content   page source to scan for URLs
     * @param dir       name of the sub-directory of "下载" that receives the files
     * @param suffixset lower-case suffixes including the leading dot (e.g. ".jpg") to download
     * @throws MalformedURLException kept for callers; a URL that cannot be parsed is now
     *                               reported and skipped instead of aborting the scan
     * @throws IOException           kept for callers
     */
    void downloadfile(String content, String dir, Set<String> suffixset) throws MalformedURLException, IOException {
        Set<String> downloaded = new HashSet<String>();
        Pattern pattern = Pattern.compile("(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?", Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(content);

        /* Walk over every URL in the page source and keep those with a wanted suffix. */
        while (matcher.find()) {
            if (needstop) return;
            String url = matcher.group();
            String destfile = getobjname(url);
            destfile = SameFileName.newfilename("下载\\"+dir+"\\", destfile); // avoid overwriting an existing file
            int lastdotpos = destfile.lastIndexOf(".");
            if (lastdotpos <= 0) continue; // no usable extension
            if (!suffixset.contains(destfile.substring(lastdotpos).toLowerCase())) continue;
            if (!downloaded.add(url.toLowerCase())) continue; // already fetched during this call

            new File("下载\\"+dir).mkdirs();
            try {
                if (downloadbybyte(url, "下载\\"+dir+"\\"+destfile)) System.out.println("下载成功 - "+destfile);
            } catch (MalformedURLException ex) {
                /*
                 * The pattern above accepts schemes such as rstp:// and telnet:// that
                 * java.net.URL has no handler for. Skip such a link rather than letting it
                 * abort every remaining download on the page.
                 */
                System.out.println("下载失败 - "+destfile);
            }
        }
    }
    
}

四、辅助类：

1.SameFileName类（若某个目录下已存在某文件，更改新文件的文件名）：

（如 a.txt -> a[2].txt，没有后缀的 ab -> ab[2]；若 a[2].txt 也已存在，则继续尝试 a[3].txt，依此类推）

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.io.*;
/** * * @author 1100012773, 1100012778 */

/**
 * SameFileName: picks a file name that does not collide with an existing entry in a directory.
 * A counter in square brackets, starting at 2, is inserted before the extension
 * (a.txt -> a[2].txt) or appended when the name has no extension (ab -> ab[2]).
 */
public class SameFileName {

    /**
     * Returns {@code oldfilename} if nothing with that name exists in {@code dir}; otherwise
     * returns the first name of the form {@code stem[n]suffix}, n = 2, 3, ..., that does not exist.
     *
     * @param dir         directory prefix, concatenated directly in front of the file name
     *                    (so it must already end with a path separator)
     * @param oldfilename desired file name
     * @return a file name that does not currently exist under {@code dir}
     */
    public static String newfilename(String dir, String oldfilename) {
        if (!new File(dir + oldfilename).exists()) {
            return oldfilename;
        }

        // Split at the last dot; a name without a dot has an empty suffix.
        final int dotat = oldfilename.lastIndexOf(".");
        final String stem = (dotat == -1) ? oldfilename : oldfilename.substring(0, dotat);
        final String suffix = (dotat == -1) ? "" : oldfilename.substring(dotat);

        int counter = 2;
        String candidate = stem + "[" + counter + "]" + suffix;
        while (new File(dir + candidate).exists()) {
            counter++;
            candidate = stem + "[" + counter + "]" + suffix;
        }
        return candidate;
    }

}

2.TxtFileFilter类（文件筛选器，只用于点击"浏览"按钮弹出的选择文件的对话框）：

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.io.File;
import javax.swing.filechooser.*;

/** * * @author 1100012773, 1100012778 */

/**
 * TxtFileFilter: file filter that shows directories and .txt files only. Used solely by the
 * file-selection dialog opened from the "Browse" button.
 */
class TxtFileFilter extends FileFilter {

    /**
     * Accepts directories (so they stay visible in the dialog) and files whose name ends
     * with ".txt", ignoring case.
     *
     * @param f file or directory offered by the chooser
     * @return true if the entry should be listed
     */
    @Override
    public boolean accept(File f) {
        return f.isDirectory() || f.getName().toLowerCase().endsWith(".txt");
    }

    /**
     * @return the label shown for this filter in the chooser's file-type list
     */
    @Override
    public String getDescription() {
        final String label = "文本文件 (*.txt)";
        return label;
    }
}

3.Parser类（分析网页源文件，利用正则匹配等提取出需要的信息并输出到文件）（由每一个线程进行初始化，指定输出格式）：

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
package crawler;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/** * * @author 1100012773, 1100012778 */

/**
 * Parser: analyses a page's source, extracts the requested kinds of information with
 * regular expressions and writes each kind to its own output file.
 */
public class Parser {

    private String url; // page URL; used to turn root-relative links into absolute ones
    private String tempfilename; // temp file that receives the URLs collected for slot 13
    private String content; // page source
    private boolean need[]; // enabled extraction slots; corresponds to DownloadPage's need array

    /**
     * @param u   URL of the page
     * @param tfi number used to build the temp file name "~&lt;tfi&gt;saveurls.txt"
     * @param ct  page source
     * @param n   flags selecting the extraction slots; the first 17 entries are copied
     */
    public Parser(String u, Integer tfi, String ct, boolean n[]) {
        url = u;
        tempfilename = "~"+tfi.toString()+"saveurls.txt";
        content = ct;
        need = new boolean[18];
        for (int i = 0; i < 17; i++)
            need[i] = n[i];
    }

    /**
     * Runs every enabled extraction slot (0..13) and writes the distinct matches, one per line
     * and UTF-8 encoded, to an output file.
     *
     * Slots 0..12 write to "&lt;category&gt;\&lt;category&gt; - &lt;page title&gt;.txt" (a free name is chosen if that
     * file exists); slot 13 writes the URLs of the page to the temp file. Slots 3-6, 9, 10 and
     * 12 are matched against the tag-free text, the others against the raw source. Failures
     * are reported on standard output and end the run; they are not rethrown.
     *
     * @param userdefsource user-defined regular expression applied to the page source (slot 11)
     * @param userdeftext   user-defined regular expression applied to the page text (slot 12)
     * @param continueurl   not used by this method
     * @param saveformat    not used by this method
     */
    public void parse(String userdefsource, String userdeftext, String continueurl, String saveformat) {
        String title = gettitle();
        try {
            String text = plaintext();

            String urlregex = "(http://|ftp://|https://|rstp://|telnet://|file://)([\\w-]+\\.)+[\\w-]+(/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*)?";
            /* One IPv4 octet (or '*'). Was "(25)[0-5][0-5]", which required four digits and so never matched 250-255. */
            String octet = "((25[0-5])|(2[0-4][0-9])|(1[0-9][0-9])|([1-9]?[0-9])|(\\*))";

            /* regex[0]..regex[10]: URL, e-mail address, IP address, mobile number, phone number, QQ number, ID-card number, date, time, ed2k link, thunder link */
            String regex[] = new String[18];
            regex[0] = urlregex;
            regex[1] = "[\\w]+([\\.\\_\\-]*[\\w])*\\@([\\w]+[\\w\\-]*[\\w]+\\.)+[\\w]+";
            regex[2] = octet+"\\."+octet+"\\."+octet+"\\."+octet;
            regex[3] = "((\\+)?86(\\-)?)?(1)(((3|5|8)[0-9])|(47))[0-9]{8}";
            regex[4] = "(((0?)|[1-9])(0?|[1-9])([0-9][0-9])\\-)?[1-9]([0-9]{6})[0-9]?((\\-)[0-9]{1,4})?";
            regex[5] = "[1-9][0-9]{4,9}";
            /* The month alternation now includes 10, which was missing, so October birth dates are matched. */
            regex[6] = "(([1-5][1-9])|6[1-5]|(71)|(81)|(82))([0-9]{4})((18)|(19)|(20))([0-9]{2})((0[1-9])|(10)|(11)|(12))(([0-2][0-9])|30|31)[0-9]{3}([0-9]|x|X)";
            regex[7] = "([0-9]{2,4}(\\-)((0?[1-9])|(10)|(11)|(12))(\\-)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\.)((0?[1-9])|(10)|(11)|(12))(\\.)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|([0-9]{2,4}(\\/)((0?[1-9])|(10)|(11)|(12))(\\/)(([1-2][0-9])|(30)|(31)|((0)?[1-9])))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-((0?[1-9])|(10)|(11)|(12))\\-([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.((0?[1-9])|(10)|(11)|(12))\\.([0-9]{2,4}))|((([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/((0?[1-9])|(10)|(11)|(12))\\/([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\-(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\-([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\.(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\.([0-9]{2,4}))|(((0?[1-9])|(10)|(11)|(12))\\/(([1-2][0-9])|(30)|(31)|((0)?[1-9]))\\/([0-9]{2,4}))";
            regex[8] = "(((1[0-9])|(2[0-3])|(0?[0-9]))\\:([0-5][0-9])(\\:[0-5][0-9])?)|(24\\:00(\\:00)?)";
            regex[9] = "ed2k://\\|file\\|[\\w\\-\\%\\(\\)\\[\\]\\.\\!]*[\\w]+\\|[0-9]+\\|[0-9A-F]+\\|((p|h)\\=[0-9A-Z]+(\\|)?)?(\\/)?";
            regex[10] = "thunder://[\\w\\+\\/\\=]+";
            regex[11] = userdefsource; // user-defined expression 1 (page source)
            regex[12] = userdeftext; // user-defined expression 2 (page text)
            regex[13] = urlregex; // URLs again, written to the temp file

            /* Category names: used both as output directory and as file-name prefix. */
            String filetitleprefix[] = new String[18];
            filetitleprefix[0] = "URL";
            filetitleprefix[1] = "电子邮箱地址";
            filetitleprefix[2] = "ip地址";
            filetitleprefix[3] = "手机号码";
            filetitleprefix[4] = "电话号码";
            filetitleprefix[5] = "QQ号码";
            filetitleprefix[6] = "身份证号码";
            filetitleprefix[7] = "日期";
            filetitleprefix[8] = "时间";
            filetitleprefix[9] = "电驴链接";
            filetitleprefix[10] = "迅雷链接";
            filetitleprefix[11] = "自定义从源代码提取";
            filetitleprefix[12] = "自定义在正文中提取";

            for (int i = 0; i <= 13; i++) {
                /* A slot enabled without an expression has nothing to match; skip it instead of failing on null. */
                if (!need[i] || regex[i] == null) continue;

                Pattern pattern = Pattern.compile(regex[i], Pattern.CASE_INSENSITIVE);
                /* Mobile, phone, QQ and ID-card numbers, ed2k and thunder links and user expression 2 use the text; the rest use the source. */
                boolean fromtext = (i >= 3 && i <= 6) || (i >= 9 && i <= 10) || (i == 12);
                String matchstr = fromtext ? text : content;
                Matcher matcher = pattern.matcher(matchstr);
                Set<String> set = new HashSet<String>();
                while (matcher.find()) {
                    set.add(matcher.group());
                }

                /* URL slots additionally pick up root-relative "<a href=" links. */
                if (i == 0 || i == 13) addrelativelinks(set);

                File file;
                if (i != 13) {
                    new File(filetitleprefix[i]).mkdir();
                    file = new File(filetitleprefix[i]+"\\"+SameFileName.newfilename(filetitleprefix[i]+"\\", filetitleprefix[i]+" - "+title+".txt"));
                } else {
                    file = new File(tempfilename);
                }

                /* Extraction result (i != 13) or temp file (i == 13). */
                writelines(file, set);
            }
        } catch (PatternSyntaxException ex) {
            System.out.println("自定义正则表达式语法错误");
        } catch (FileNotFoundException ex) {
            System.out.println("输出失败");
        } catch (UnsupportedEncodingException ex) {
            System.out.println("无法识别的编码方式");
        } catch (IOException ex) {
            System.out.println("输出失败");
        }
    }

    /* Returns the page source with line breaks, script/style blocks and tags removed and the supported entities decoded. */
    private String plaintext() {
        return content.replace("\n","").replace("\r","")
                .replaceAll("<(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)[^>]*?>.*?</(s|S)(c|C)(r|R)(i|I)(p|P)(t|T)>", "")
                .replaceAll("<(s|S)(t|T)(y|Y)(l|L)(e|E)[^>]*?>.*?</(s|S)(t|T)(y|Y)(l|L)(e|E)>","")
                .replaceAll("<.*?>", "")
                .replaceAll("\\&((nbsp)|(\\#12288)|(\\#160))(\\;)?", " ")
                .replaceAll("\\&((lt)|(\\#60))(\\;)?","<")
                .replaceAll("\\&((gt)|(\\#62))(\\;)?",">")
                .replaceAll("\\&((quot)|(#34))(\\;)?","\"")
                .replaceAll("\\&((apos)|(\\#39))(\\;)?","'")
                .replaceAll("\\&copy(\\;)?","©")
                .replaceAll("\\&reg(\\;)?","®")
                .replaceAll("\\&((amp)|(#38))(\\;)?","&");
    }

    /* Adds every root-relative link of the form <a href="/..." to the set, prefixed with the scheme and host of the page URL. */
    private void addrelativelinks(Set<String> set) {
        Pattern patternrelativelink = Pattern.compile("\\<a\\shref\\=\"\\/[\\w\\-\\_\\.\\/\\?\\%\\&\\=\\:\\+\\,]*", Pattern.CASE_INSENSITIVE);
        Matcher matcherrelativelink = patternrelativelink.matcher(content);
        while (matcherrelativelink.find()) {
            String newitem = matcherrelativelink.group();
            newitem = newitem.substring(newitem.indexOf("/")); // keep the path, drop <a href="
            /*
             * The host ends at the first '/' after "://". The search starts right behind
             * "://" (it used to start two characters later, which skipped the slash for
             * one-character host names).
             */
            int schemepos = url.indexOf("://");
            int hoststart = (schemepos == -1) ? 0 : schemepos + 3;
            int slashpos = url.indexOf("/", hoststart);
            if (slashpos == -1) newitem = url+newitem;
            else newitem = url.substring(0, slashpos)+newitem;
            set.add(newitem);
        }
    }

    /* Writes the lines to the file as UTF-8, one per line with CRLF endings, replacing previous content. */
    private void writelines(File file, Set<String> lines) throws IOException {
        FileOutputStream fos = new FileOutputStream(file, false);
        try {
            Writer out = new OutputStreamWriter(fos, "UTF-8");
            for (String line : lines) {
                out.write(line+"\r\n");
            }
            out.close(); // flushes the encoder; also closes fos on the success path
        } finally {
            fos.close(); // releases the handle when a write above failed; harmless after a successful close
        }
    }

    /**
     * Extracts the page title from the first &lt;title&gt;...&lt;/title&gt; element of the source.
     *
     * Whitespace runs are collapsed to one space, the characters : &lt; &gt; ? | * / \ and " are
     * removed (the title is used in file names) and one leading space is dropped. The tag is
     * matched case-insensitively, so &lt;TITLE&gt; is recognised as well.
     *
     * @return the cleaned title, or an empty string when the page has no title element
     */
    public String gettitle() {
        String contentwithoutline = content.replace("\n", "").replace("\r", "").replaceAll("[\\s]+"," ");
        Pattern pattern = Pattern.compile("<title>[\\s\\S]*?</title>", Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(contentwithoutline);
        if (!matcher.find()) return "";

        String title = matcher.group().replaceAll("<.*?>", "").replace(":", "").replace("<","").replace(">","").replace("?","").replace("|","").replace("*","").replace("/","").replace("\\","").replace("\"", "");
        if (title.startsWith(" ")) title = title.substring(1);
        return title;
    }

}

Java爬虫程序小测试

文章目录

Java爬虫程序简介：

学习准备：

程序运行截图：

程序实现：

一、主类：Crawler（画主界面，捕捉按键）：

二、Work类（网络爬虫的工作类，启动DownloadPage线程，并保存要连接的URL的队列）：

三、DownloadPage类（给定URL等信息，连接互联网下载信息，交由Parser类提取数据，如果用户有需要则保存相应内容）：

四、辅助类：

1.SameFileName类（若某个目录下已存在某文件，更改新文件的文件名）：

2.TxtFileFilter类（文件筛选器，只用于点击"浏览"按钮弹出的选择文件的对话框）：

3.Parser类（分析网页源文件，利用正则匹配等提取出需要的信息并输出到文件）（由每一个线程进行初始化，指定输出格式）：