Java+MySQL实现网络爬虫程序（转）

释放双眼，带上耳机，听听看~！

网络爬虫，也叫网络蜘蛛，有的项目也把它称作“walker”。维基百科所给的定义是“一种系统地扫描互联网，以获取索引为目的的网络程序”。网络上有很多关于网络爬虫的开源项目，其中比较有名的是Heritrix和Apache Nutch。有时需要在网上搜集信息，如果需要搜集的是获取方法单一而人工搜集费时费力的信息，比如统计一个网站每个月发了多少篇文章、用了哪些标签，为自然语言处理项目搜集语料，或者为模式识别项目搜集图片等等，就需要爬虫程序来完成这样的任务。而且搜索引擎必不可少的组件之一也是网络爬虫。很多网络爬虫都是用Python，Java或C#实现的。我这里给出的是Java版本的爬虫程序。为了节省时间和空间，我把程序限制在只扫描本博客地址下的网页（也就是http://johnhany.net/但不包括http://johnhany.net/wp-content/下的内容），并从网址中统计出所用的所有标签。只要稍作修改，去掉代码里的限制条件就能作为扫描整个网络的程序使用。或者对输出格式稍作修改，可以作为生成博客sitemap的工具。代码也可以在这里下载：johnhany/WPCrawler。

环境需求

我的开发环境是Windows7 + Eclipse。需要XAMPP提供通过url访问MySQL数据库的端口。还要用到三个开源的Java类库： Apache HttpComponents 4.3 提供HTTP接口，用来向目标网址提交HTTP请求，以获取网页的内容； HTML Parser 2.0 用来解析网页，从DOM节点中提取网址链接； MySQL Connector/J 5.1.27 连接Java程序和MySQL，然后就可以用Java代码操作数据库。

代码

代码位于三个文件中，分别是：crawler.java，httpGet.java和parsePage.java。包名为net.johnhany.wpcrawler。

crawler.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
1package net.johnhany.wpcrawler;

2

3import java.sql.Connection;

4import java.sql.DriverManager;

5import java.sql.ResultSet;

6import java.sql.SQLException;

7import java.sql.Statement;

8

9public class crawler {

10  

11  public static void main(String args[]) throws Exception {

12      String frontpage = &quot;http://johnhany.net/&quot;;

13      Connection conn = null;

14      

15      //connect the MySQL database

16      try {

17          Class.forName(&quot;com.mysql.jdbc.Driver&quot;);

18          String dburl = &quot;jdbc:mysql://localhost:3306?useUnicode=true&amp;characterEncoding=utf8&quot;;

19          conn = DriverManager.getConnection(dburl, &quot;root&quot;, &quot;&quot;);

20          System.out.println(&quot;connection built&quot;);

21      } catch (SQLException e) {

22          e.printStackTrace();

23      } catch (ClassNotFoundException e) {

24          e.printStackTrace();

25      }

26      

27      String sql = null;

28      String url = frontpage;

29      Statement stmt = null;

30      ResultSet rs = null;

31      int count = 0;

32      

33      if(conn != null) {

34          //create database and table that will be needed

35          try {

36              sql = &quot;CREATE DATABASE IF NOT EXISTS crawler&quot;;

37              stmt = conn.createStatement();

38              stmt.executeUpdate(sql);

39              

40              sql = &quot;USE crawler&quot;;

41              stmt = conn.createStatement();

42              stmt.executeUpdate(sql);

43              

44              sql = &quot;create table if not exists record (recordID int(5) not null auto_increment, URL text not null, crawled tinyint(1) not null, primary key (recordID)) engine=InnoDB DEFAULT CHARSET=utf8&quot;;

45              stmt = conn.createStatement();

46              stmt.executeUpdate(sql);

47              

48              sql = &quot;create table if not exists tags (tagnum int(4) not null auto_increment, tagname text not null, primary key (tagnum)) engine=InnoDB DEFAULT CHARSET=utf8&quot;;

49              stmt = conn.createStatement();

50              stmt.executeUpdate(sql);

51          } catch (SQLException e) {

52              e.printStackTrace();

53          }

54          

55          //crawl every link in the database

56          while(true) {

57              //get page content of link &quot;url&quot;

58              httpGet.getByString(url,conn);

59              count++;

60              

61              //set boolean value &quot;crawled&quot; to true after crawling this page

62              sql = &quot;UPDATE record SET crawled = 1 WHERE URL = &#x27;&quot; + url + &quot;&#x27;&quot;;

63              stmt = conn.createStatement();

64              

65              if(stmt.executeUpdate(sql) &gt; 0) {

66                  //get the next page that has not been crawled yet

67                  sql = &quot;SELECT * FROM record WHERE crawled = 0&quot;;

68                  stmt = conn.createStatement();

69                  rs = stmt.executeQuery(sql);

70                  if(rs.next()) {

71                      url = rs.getString(2);

72                  }else {

73                      //stop crawling if reach the bottom of the list

74                      break;

75                  }

76

77                  //set a limit of crawling count

78                  if(count &gt; 1000 || url == null) {

79                      break;

80                  }

81              }

82          }

83          conn.close();

84          conn = null;

85          

86          System.out.println(&quot;Done.&quot;);

87          System.out.println(count);

88      }

89  }

90}

91

httpGet.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
1package net.johnhany.wpcrawler;

2

3import java.io.IOException;

4import java.sql.Connection;

5

6import org.apache.http.HttpEntity;

7import org.apache.http.HttpResponse;

8import org.apache.http.client.ClientProtocolException;

9import org.apache.http.client.ResponseHandler;

10import org.apache.http.client.methods.HttpGet;

11import org.apache.http.impl.client.CloseableHttpClient;

12import org.apache.http.impl.client.HttpClients;

13import org.apache.http.util.EntityUtils;

14

15public class httpGet {

16

17    public final static void getByString(String url, Connection conn) throws Exception {

18        CloseableHttpClient httpclient = HttpClients.createDefault();

19        

20        try {

21            HttpGet httpget = new HttpGet(url);

22            System.out.println(&quot;executing request &quot; + httpget.getURI());

23

24            ResponseHandler&lt;String&gt; responseHandler = new ResponseHandler&lt;String&gt;() {

25

26                public String handleResponse(

27                        final HttpResponse response) throws ClientProtocolException, IOException {

28                    int status = response.getStatusLine().getStatusCode();

29                    if (status &gt;= 200 &amp;&amp; status &lt; 300) {

30                        HttpEntity entity = response.getEntity();

31                        return entity != null ? EntityUtils.toString(entity) : null;

32                    } else {

33                        throw new ClientProtocolException(&quot;Unexpected response status: &quot; + status);

34                    }

35                }

36            };

37            String responseBody = httpclient.execute(httpget, responseHandler);

38            /*

39            //print the content of the page

40            System.out.println(&quot;----------------------------------------&quot;);

41            System.out.println(responseBody);

42            System.out.println(&quot;----------------------------------------&quot;);

43            */

44            parsePage.parseFromString(responseBody,conn);

45            

46        } finally {

47            httpclient.close();

48        }

49    }

50}

51

parsePage.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
1package net.johnhany.wpcrawler;

2

3import java.sql.Connection;

4import java.sql.PreparedStatement;

5import java.sql.ResultSet;

6import java.sql.SQLException;

7import java.sql.Statement;

8

9import org.htmlparser.Node;

10import org.htmlparser.Parser;

11import org.htmlparser.filters.HasAttributeFilter;

12import org.htmlparser.tags.LinkTag;

13import org.htmlparser.util.NodeList;

14import org.htmlparser.util.ParserException;

15

16import java.net.URLDecoder;

17

18public class parsePage {

19  

20  public static void parseFromString(String content, Connection conn) throws Exception {

21      Parser parser = new Parser(content);

22      HasAttributeFilter filter = new HasAttributeFilter(&quot;href&quot;);

23      

24      try {

25          NodeList list = parser.parse(filter);

26          int count = list.size();

27          

28          //process every link on this page

29          for(int i=0; i&lt;count; i++) {

30              Node node = list.elementAt(i);

31              

32              if(node instanceof LinkTag) {

33                  LinkTag link = (LinkTag) node;

34                  String nextlink = link.extractLink();

35                  String mainurl = &quot;http://johnhany.net/&quot;;

36                  String wpurl = mainurl + &quot;wp-content/&quot;;

37

38                  //only save page from &quot;http://johnhany.net&quot;

39                  if(nextlink.startsWith(mainurl)) {

40                      String sql = null;

41                      ResultSet rs = null;

42                      PreparedStatement pstmt = null;

43                      Statement stmt = null;

44                      String tag = null;

45                      

46                      //do not save any page from &quot;wp-content&quot;

47                      if(nextlink.startsWith(wpurl)) {

48                          continue;

49                      }

50                      

51                      try {

52                          //check if the link already exists in the database

53                          sql = &quot;SELECT * FROM record WHERE URL = &#x27;&quot; + nextlink + &quot;&#x27;&quot;;

54                          stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY,ResultSet.CONCUR_UPDATABLE);

55                          rs = stmt.executeQuery(sql);

56

57                          if(rs.next()) {

58                              

59                          }else {

60                              //if the link does not exist in the database, insert it

61                              sql = &quot;INSERT INTO record (URL, crawled) VALUES (&#x27;&quot; + nextlink + &quot;&#x27;,0)&quot;;

62                              pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);

63                              pstmt.execute();

64                              System.out.println(nextlink);

65                              

66                              //use substring for better comparison performance

67                              nextlink = nextlink.substring(mainurl.length());

68                              //System.out.println(nextlink);

69                              

70                              if(nextlink.startsWith(&quot;tag/&quot;)) {

71                                  tag = nextlink.substring(4, nextlink.length()-1);

72                                  //decode in UTF-8 for Chinese characters

73                                  tag = URLDecoder.decode(tag,&quot;UTF-8&quot;);

74                                  sql = &quot;INSERT INTO tags (tagname) VALUES (&#x27;&quot; + tag + &quot;&#x27;)&quot;;

75                                  pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);

76                                  //if the links are different from each other, the tags must be different

77                                  //so there is no need to check if the tag already exists

78                                  pstmt.execute();

79                              }

80                          }

81                      } catch (SQLException e) {

82                          //handle the exceptions

83                          System.out.println(&quot;SQLException: &quot; + e.getMessage());

84                          System.out.println(&quot;SQLState: &quot; + e.getSQLState());

85                          System.out.println(&quot;VendorError: &quot; + e.getErrorCode());

86                      } finally {

87                          //close and release the resources of PreparedStatement, ResultSet and Statement

88                          if(pstmt != null) {

89                              try {

90                                  pstmt.close();

91                              } catch (SQLException e2) {}

92                          }

93                          pstmt = null;

94                          

95                          if(rs != null) {

96                              try {

97                                  rs.close();

98                              } catch (SQLException e1) {}

99                          }

100                         rs = null;

101                         

102                         if(stmt != null) {

103                             try {

104                                 stmt.close();

105                             } catch (SQLException e3) {}

106                         }

107                         stmt = null;

108                     }

109                     

110                 }

111             }

112         }

113     } catch (ParserException e) {

114         e.printStackTrace();

115     }

116 }

117}

118

程序原理

所谓“互联网”，是网状结构，任意两个节点间都有可能存在路径。爬虫程序对互联网的扫描，在图论角度来讲，就是对有向图的遍历（链接是从一个网页指向另一个网页，所以是有向的）。常见的遍历方法有深度优先和广度优先两种。相关理论知识可以参考树的遍历：这里和这里。我的程序采用的是广度优先方式。

程序从
crawler.java的main()开始运行。


1
2
3
4
5
1Class.forName(&quot;com.mysql.jdbc.Driver&quot;);

2String dburl = &quot;jdbc:mysql://localhost:3306?useUnicode=true&amp;characterEncoding=utf8&quot;;

3conn = DriverManager.getConnection(dburl, &quot;root&quot;, &quot;&quot;);

4System.out.println(&quot;connection built&quot;);

5

首先，调用DriverManager连接MySQL服务。这里使用的是XAMPP的默认MySQL端口3306，端口值可以在XAMPP主界面看到：

Apache和MySQL都启动之后，在浏览器地址栏输入“http://localhost/phpmyadmin/”就可以看到数据库了。等程序运行完之后可以在这里检查一下运行是否正确。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
1sql = &quot;CREATE DATABASE IF NOT EXISTS crawler&quot;;

2stmt = conn.createStatement();

3stmt.executeUpdate(sql);

4

5sql = &quot;USE crawler&quot;;

6stmt = conn.createStatement();

7stmt.executeUpdate(sql);

8

9sql = &quot;create table if not exists record (recordID int(5) not null auto_increment, URL text not null, crawled tinyint(1) not null, primary key (recordID)) engine=InnoDB DEFAULT CHARSET=utf8&quot;;

10stmt = conn.createStatement();

11stmt.executeUpdate(sql);

12

13sql = &quot;create table if not exists tags (tagnum int(4) not null auto_increment, tagname text not null, primary key (tagnum)) engine=InnoDB DEFAULT CHARSET=utf8&quot;;

14stmt = conn.createStatement();

15stmt.executeUpdate(sql);

16

Apache和MySQL都启动之后，在浏览器地址栏输入“http://localhost/phpmyadmin/”就可以看到数据库了。等程序运行完之后可以在这里检查一下运行是否正确。

连接好数据库后，建立一个名为“crawler”的数据库，在库里建两个表，一个叫“record”，包含字段“recordID”，“URL”和“crawled”，分别记录地址编号、链接地址和地址是否被扫描过；另一个叫“tags”，包含字段“tagnum”和“tagname”，分别记录标签编号和标签名。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
1while(true) {

2   httpGet.getByString(url,conn);

3   count++;

4   

5   sql = &quot;UPDATE record SET crawled = 1 WHERE URL = &#x27;&quot; + url + &quot;&#x27;&quot;;

6   stmt = conn.createStatement();

7   

8   if(stmt.executeUpdate(sql) &gt; 0) {

9       sql = &quot;SELECT * FROM record WHERE crawled = 0&quot;;

10      stmt = conn.createStatement();

11      rs = stmt.executeQuery(sql);

12      if(rs.next()) {

13          url = rs.getString(2);

14      }else {

15          break;

16      }

17  }

18}

19

Apache和MySQL都启动之后，在浏览器地址栏输入“http://localhost/phpmyadmin/”就可以看到数据库了。等程序运行完之后可以在这里检查一下运行是否正确。

接着在一个while循环内依次处理表record内的每个地址。每次处理时，把地址url传递给httpGet.getByString()，然后在表record中把crawled改为true，表明这个地址已经处理过。然后寻找下一个crawled为false的地址，继续处理，直到处理到表尾。

这里需要注意的细节是，执行executeQuery()后，得到了一个ResultSet结构rs，rs包含SQL查询返回的所有行和一个指针，指针指向结果中第一行之前的位置，需要执行一次rs.next()才能让rs的指针指向第一个结果，同时返回true，之后每次执行rs.next()都会把指针移到下一个结果上并返回true，直至再也没有结果时，rs.next()的返回值变成了false。

还有一个细节，在执行建库建表、INSERT、UPDATE时，需要用executeUpdate()；在执行SELECT时，需要使用executeQuery()。executeQuery()总是返回一个ResultSet，executeUpdate()返回的是受该语句影响的行数（对建库建表这类不返回任何内容的语句则返回0）。

httpGet.java的getByString()方法负责向所给的网址发送请求，然后下载网页内容。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
1HttpGet httpget = new HttpGet(url);

2System.out.println(&quot;executing request &quot; + httpget.getURI());

3

4ResponseHandler&lt;String&gt; responseHandler = new ResponseHandler&lt;String&gt;() {

5   public String handleResponse(

6           final HttpResponse response) throws ClientProtocolException, IOException {

7       int status = response.getStatusLine().getStatusCode();

8       if (status &gt;= 200 &amp;&amp; status &lt; 300) {

9           HttpEntity entity = response.getEntity();

10          return entity != null ? EntityUtils.toString(entity) : null;

11      } else {

12          throw new ClientProtocolException(&quot;Unexpected response status: &quot; + status);

13      }

14  }

15};

16String responseBody = httpclient.execute(httpget, responseHandler);

17

Apache和MySQL都启动之后，在浏览器地址栏输入“http://localhost/phpmyadmin/”就可以看到数据库了。等程序运行完之后可以在这里检查一下运行是否正确。

这段代码是HTTPComponents的HTTP Client组件中给出的样例，在很多情况下可以直接使用。这部分代码获得了一个字符串responseBody，里面保存着网页中的全部字符。

接着，就需要把responseBody传递给
parsePage.java的parseFromString()方法提取链接。


1
2
3
4
5
6
7
8
9
10
11
12
1Parser parser = new Parser(content);

2HasAttributeFilter filter = new HasAttributeFilter(&quot;href&quot;);

3

4try {

5   NodeList list = parser.parse(filter);

6   int count = list.size();

7   

8   //process every link on this page

9   for(int i=0; i&lt;count; i++) {

10      Node node = list.elementAt(i);

11      if(node instanceof LinkTag) {

12

Apache和MySQL都启动之后，在浏览器地址栏输入“http://localhost/phpmyadmin/”就可以看到数据库了。等程序运行完之后可以在这里检查一下运行是否正确。

在HTML文件中，链接一般都在a标签的href属性中，所以需要创建一个属性过滤器。parser.parse(filter)返回的NodeList保存着这个HTML文件中所有带有href属性的节点，在for循环中依次检查每个节点是否为LinkTag，就可以把网页中的所有链接提取出来。

然后通过nextlink.startsWith()进一步筛选，只处理以“http://johnhany.net/”开头的链接并跳过以“http://johnhany.net/wp-content/”开头的链接。


1
2
3
4
5
6
7
8
9
10
11
12
1sql = &quot;SELECT * FROM record WHERE URL = &#x27;&quot; + nextlink + &quot;&#x27;&quot;;

2stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY,ResultSet.CONCUR_UPDATABLE);

3rs = stmt.executeQuery(sql);

4

5if(rs.next()) {

6   

7}else {

8   //if the link does not exist in the database, insert it

9   sql = &quot;INSERT INTO record (URL, crawled) VALUES (&#x27;&quot; + nextlink + &quot;&#x27;,0)&quot;;

10  pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);

11  pstmt.execute();

12


1
2
在表record中查找是否已经存在这个链接，如果存在（rs.next()==true），不做任何处理；如果不存在（rs.next()==false），在表中插入这个地址并把crawled置为false。因为recordID设为了AUTO_INCREMENT，插入时MySQL会自动分配编号；Statement.RETURN_GENERATED_KEYS的作用只是让程序在插入之后可以通过getGeneratedKeys()取回这个自动生成的编号，这里并没有用到它，省略这个参数也不影响插入。

2


1
2
3
4
5
6
7
8
9
1nextlink = nextlink.substring(mainurl.length());

2

3if(nextlink.startsWith(&quot;tag/&quot;)) {

4   tag = nextlink.substring(4, nextlink.length()-1);

5   tag = URLDecoder.decode(tag,&quot;UTF-8&quot;);

6   sql = &quot;INSERT INTO tags (tagname) VALUES (&#x27;&quot; + tag + &quot;&#x27;)&quot;;

7   pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);

8   pstmt.execute();

9