释放双眼，带上耳机，听听看~！

博客地址：http://blog.whattoc.com/2013/09/19/nodejs_api_http_2/

详解Node.js API系列 Http模块(2) CNodejs爬虫实现

简单爬虫设计


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// Minimal crawler: download the cnodejs.org home page and print its HTML.
var http = require('http');

http.get("http://cnodejs.org/", function(res) {
  var size = 0;     // total number of bytes received so far
  var chunks = [];  // Buffer chunks in arrival order

  res.on('data', function(chunk) {
    size += chunk.length;
    chunks.push(chunk);
  });

  res.on('end', function() {
    // Join the chunks first and decode once, so the text is decoded from the whole body.
    var data = Buffer.concat(chunks, size);
    console.log(data.toString());
  });
}).on('error', function(e) {
  console.log("Got error: " + e.message);
});

16

http.get(options, callback)

http://cnodejs.org/ 爬行目标地址。
res.on('data') 监听data事件。
res.on('end') 数据获取完毕事件。
Buffer.concat(chunks, size); 将多次data事件收到的Buffer拼接成一个完整的Buffer。
data.toString() 将data二进制数据转换成utf-8的字符串，如果页面是GBK的时候，请使用iconv模块进行转换，原生Node.js不支持GBK。

设计目标

制定爬虫的url规则
分析页面信息
清洗没用数据
存储有用数据

制定爬虫的url规则

观察 http://cnodejs.org/ 的url规则：http://cnodejs.org/?page=页数。根据这个规则，不难想出处理的思路：首先获取单个page页里面的各个帖子（topic）路径，再通过这些路径爬取对应的页面内容。采用迭代器模式是最方便的：next()做page的页面索引，hasNext()判断page页是否超出了有效范围，超出范围则停止索引。下面是伪代码：


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// Pseudo-code: iterator over the paginated topic lists of the site.
var Urls = function(start_url) {
  this.start_url = start_url; // base url
  this.page = 0;              // url page
  this.targetPage = '';       // topic page
};

// Returns the data of the next page, or null when hasNext() reports no page.
Urls.prototype.next = function() {
  var data;
  if (!this.hasNext()) {
    return null;
  }
  this.page += 1;
  data = request.get(this.targetPage); // get topic page (request is a placeholder HTTP helper)
  return data;
};

// Left unimplemented on purpose: the comments describe what a real version does.
Urls.prototype.hasNext = function() {
  // http://cnodejs.org/?page=[1,2,3,4]
  var url = this.start_url + this.page;
  // if get page success from url, return true, or return false
  // get topic page url
};

// main
var urls = new Urls('http://cnodejs.org/?page=');
while (urls.hasNext()) {
  console.log(urls.next());
}

26

分析页面数据

分析页面的过程，主要工作是分析页面的元素，提取出目标的内容，例如正文和评论等。这里我们需要采用第三方库cheerio，该模块采取类似jQuery方式的DOM选择器，通过DOM选择器来实现信息提取。


1
2
1npm install cheerio

2

项目地址：https://github.com/MatthewMueller/cheerio

官方demo例子


1
2
3
4
5
6
7
8
9
// Load an HTML fragment, modify it with jQuery-style calls, then serialize it again.
var cheerio = require('cheerio'),
    $ = cheerio.load('<h2 class="title">Hello world</h2>');

$('h2.title').text('Hello there!');
$('h2').addClass('welcome');

$.html();
//=> <h2 class="title welcome">Hello there!</h2>

9

提取cnodejs的topics 链接


1
2
3
4
5
6
7
8
9
10
11
12
13
14
// data is the HTML of a topic list page.
var $ = cheerio.load(data);
var topics = $('.cell .topic_wrap a');

// Print the link target of every topic on the page.
for (var i = 0; i < topics.length; i++) {
  console.log(topics[i].attribs['href']);
}

5

result：

/topic/52386d26101e574521a12ccd
/topic/5232cd39101e57452106ce5a
/topic/52390cdb101e574521b1e252
/topic/521b1dcabee8d3cb128c56dd
/topic/5238c6d2101e574521aaca13
/topic/52380b4e101e57452193617c

14

内容信息提取

提取cnodejs帖子内容和标题


1
2
3
4
5
6
// data is the HTML of a single topic page.
var $ = cheerio.load(data);
var topic = $('.inner.topic');
console.log(topic.children('h3').text()); // title
var content = topic.children('.topic_content').text();
console.log(content); // article body

6

清洗没用的数据

由于爬取的内容，可能带有html标签或者表情方面的信息，可能跟目标内容不符合，通过这个环节来过滤，这里向大家推荐一个模块 validator,该模块可以过滤xss攻击，字符串里面的空格，判断内容的属性等，详细可以到项目地址学习https://github.com/chriso/node-validator

安装


1
2
1npm install validator

2

demo例子


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// Demo of the validator module's check()/sanitize() API.
// Note: some of the check() calls below throw on purpose to show the error messages.
var check = require('validator').check,
    sanitize = require('validator').sanitize;

//Validate
check('test@email.com').len(6, 64).isEmail();        //Methods are chainable
check('abc').isInt();                                //Throws 'Invalid integer'
check('abc', 'Please enter a number').isInt();       //Throws 'Please enter a number'
check('abcdefghijklmnopzrtsuvqxyz').is(/^[a-z]+$/);

//Set a message per validator
check('foo', {
    isNumeric: 'This is not a number',
    contains: 'The value doesn\'t have a 0 in it'
}).isNumeric().contains('0');

//Referencing validator args from the message
check('foo', 'The message needs to be between %1 and %2 characters long (you passed "%0")').len(2, 6);

//Sanitize / Filter
var int = sanitize('0123').toInt();                  //123
var bool = sanitize('true').toBoolean();             //true
var str = sanitize(' \t\r hello \n').trim();         //'hello'
var str = sanitize('aaaaaaaaab').ltrim('a');         //'b'
var str = sanitize(large_input_str).xss();           //large_input_str stands for any untrusted input
var str = sanitize('&lt;a&gt;').entityDecode();      //'<a>'

22

过滤刚才爬取的内容，主要是过滤空格


1
2
3
4
var topic = $('.inner.topic');
var title = topic.children('h3').text(); // title
title = sanitize(title).trim();          // keep the trimmed value instead of discarding it

4

存储有用数据

咸鱼白菜各有所需，对于有用的数据，可以存成文本，也可以存到数据库。本次实例为了足够精简，不采用数据库存储，而是采用文本的方式，以json的格式记录数据。

一个爬虫的流程完成了，我们来重新看看实现代码

vi lib/url.js


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
var http = require('http');
var cheerio = require('cheerio');
var sanitize = require('validator').sanitize;
var async = require('async');
var fs = require('fs');

// Site root; topic links found on list pages are appended to it.
var BASE_URL = 'http://cnodejs.org';
// Namespace for the low-level fetch helper.
var scrapy = {};

/**
 * Fetch a page over HTTP and hand its raw body to the callback.
 *
 * Examples:
 *
 *     scrapy.get('http://www.baidu.com', cb);
 *     // => cb(null, data) where data holds the baidu page html
 *
 * @param {String} url e.g. http://www.baidu.com
 * @param {Function} cb called as cb(err, data); data is a Buffer, or null on error
 */
scrapy.get = function(url, cb) {
  var finished = false;

  // Report exactly once, even if more than one error/end event fires.
  function done(err, data) {
    if (finished) {
      return;
    }
    finished = true;
    cb(err, data);
  }

  http.get(url, function(res) {
    var size = 0;
    var chunks = [];

    res.on('data', function(chunk) {
      size += chunk.length;
      chunks.push(chunk);
    });

    res.on('end', function() {
      done(null, Buffer.concat(chunks, size));
    });

    // A failure while the body is streaming is reported like a request failure.
    res.on('error', function(e) {
      done(e, null);
    });
  }).on('error', function(e) {
    done(e, null);
  });
};

42

/**
 * Iterator over the paginated topic lists of the site.
 *
 * @param {String} startUrl list url prefix; the page number is appended to it
 */
var Urls = function(startUrl) {
  this.startUrl = startUrl;
  this.page = 0;       // number of the list page to fetch next
  this.homePage = '';  // topic links of the last fetched list page (assigned by hasNext)
};

48

/**
 * Fetch the topics of the current list page, then advance to the next page.
 *
 * @param {Function} cb called as cb(err, topics); topics is an array of topic
 *   page HTML strings, and is empty when there is no further page
 */
Urls.prototype.next = function(cb) {
  var self = this;

  this.hasNext(function(err, bRet) {
    if (err) {
      return cb(err);
    }
    if (!bRet) {
      // No more pages: still invoke cb so the caller is not left waiting.
      return cb(null, []);
    }

    self.homeParse(function(err, topics) {
      // Only genuine Error objects are treated as failures.
      if (err instanceof Error) {
        return cb(err);
      }
      self.page += 1;
      cb(null, topics);
    });
  });
};

63

/**
 * Fetch the current list page and store its topic links in this.homePage.
 *
 * @param {Function} cb called as cb(err, hasTopics); hasTopics is true when the
 *   page contains at least one topic link
 */
Urls.prototype.hasNext = function(cb) {
  var self = this;
  var url = this.startUrl + this.page;

  scrapy.get(url, function(err, data) {
    // On a failed request data is null, so bail out before touching it.
    if (err) {
      return cb(err, false);
    }

    var $ = cheerio.load(data.toString());
    self.homePage = $('.cell .topic_wrap a');

    return cb(null, self.homePage.length !== 0);
  });
};

80

/**
 * Download every topic page linked from the list page stored in this.homePage.
 *
 * @param {Function} cb called as cb(err, topics); topics holds the HTML of each
 *   topic page as a string, in the same order as the links on the list page
 */
Urls.prototype.homeParse = function(cb) {
  var links = [];
  var i;

  // Copy the hrefs into a plain array so the async helper iterates ordinary strings.
  for (i = 0; i < this.homePage.length; i++) {
    links.push(BASE_URL + this.homePage[i].attribs['href']);
  }

  async.map(links, function(link, done) {
    scrapy.get(link, function(err, topic) {
      if (err) {
        return done(err);
      }
      done(null, topic.toString());
    });
  }, function(err, topics) {
    if (err) {
      return cb(err, []);
    }
    cb(null, topics);
  });
};

96

/**
 * Extract the title and body text from the HTML of a topic page.
 *
 * @param {String} html topic page HTML
 * @return {Object} item with trimmed title and content strings
 */
Urls.prototype.parseTopic = function(html) {
  // Keep $ local so parsing one page does not leak a global.
  var $ = cheerio.load(html);
  var topic = $('.inner.topic');

  return {
    title: sanitize(topic.children('h3').text()).trim(),
    content: sanitize(topic.children('.topic_content').text()).trim()
  };
};

106

/**
 * Store the scraped items as JSON text.
 *
 * @param {Array} items parsed topic items
 * @param {String} [file] output path; defaults to 'result.txt'
 */
Urls.prototype.Pipeline = function(items, file) {
  var result = JSON.stringify(items);
  fs.writeFileSync(file || 'result.txt', result);
};

113

exports = module.exports = Urls;

115

vi app.js


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
var Urls = require('./lib/url.js');
var async = require('async');

var startUrl = 'http://cnodejs.org/?page=';

var urls = new Urls(startUrl);

// Crawl one list page, parse every topic on it and store the result.
urls.next(function(err, topics) {
  if (err) {
    console.log("Got error: " + err.message);
    return;
  }
  if (!topics || topics.length === 0) {
    return; // nothing was scraped, so there is nothing to store
  }

  // parseTopic is synchronous, so a plain map is enough here.
  var items = topics.map(function(topic) {
    return urls.parseTopic(topic);
  });
  urls.Pipeline(items);
});

19

完整项目地址：https://github.com/youyudehexie/simple-scrapy

{{userData.name}}已认证

详解Node.js API系列 Http模块(2) CNodejs爬虫实现

详解Node.js API系列 Http模块(2) CNodejs爬虫实现

简单爬虫设计

http.get(options, callback)

设计目标

制定爬虫的url规则

分析页面数据

清洗没用的数据

存储有用数据

使用工厂模式、策略模式实现BASE64,MD5,SHA,HMAC,DES各种加密算法

C++ 高性能服务器网络框架设计细节

{{userData.name}}已认证

详解Node.js API系列 Http模块(2) CNodejs爬虫实现

简单爬虫设计

http.get(options, callback)

设计目标

制定爬虫的url规则

分析页面数据

清洗没用的数据

存储有用数据

Related posts:

使用工厂模式、策略模式实现BASE64,MD5,SHA,HMAC,DES各种加密算法

C++ 高性能服务器网络框架设计细节

Linux的系统调用、网络连接状态、磁盘I/O；可疑行为监控/日志收集、SHELL命令执行流程

前端点滴（Node.js）（五）---- 构建 Web 应用（五）Express中间件、koa 基础与实例

C#文件操作

微服务之分布式跟踪系统（springboot+zipkin+mysql）