Today I combined a BFS crawler with the HTML body-text extractor. The functionality is still fairly limited. For the body-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http:// URLs, and I have only tested it on the campus intranet, because the connection to the outside network is not very fast.
- There is one global URL queue and one URL set: the queue makes the BFS easy to implement, and the set keeps the same page from being crawled twice. The flow and the idea behind it are both quite simple (a minimal sketch of this loop follows this list).
- It is single-threaded, so it is rather slow. Later I will look at a multithreaded version, so that fetching pages, extracting URLs, and extracting the body text can run concurrently.
- The figure below is taken from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ . Besides extracting the URLs from each page, I also extract the body text, so that Chinese word segmentation will be easier when I build an index later.
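A minimal sketch of that queue-plus-set loop, assuming hypothetical fetch() and extract_links() helpers (the real code below uses g_url_queue, g_url_set, urllib2 and BeautifulSoup instead):

# Sketch of the BFS crawl loop: the queue drives the traversal,
# the set remembers visited URLs so no page is fetched twice.
# fetch() and extract_links() are hypothetical placeholders.
import Queue

def bfs_crawl(start_url, fetch, extract_links, max_pages=100):
    frontier = Queue.Queue()
    seen = set([start_url])
    frontier.put(start_url)
    count = 0
    while not frontier.empty() and count < max_pages:
        url = frontier.get()
        html = fetch(url)              # e.g. urllib2.urlopen(url).read()
        if html is None:
            continue
        for link in extract_links(html):
            if link not in seen:       # the set is what prevents re-crawling
                seen.add(link)
                frontier.put(link)
        count += 1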
Pasting the code directly here causes problems, probably because it contains HTML tags; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl html pages, traverse the links, then extract the body text.
# A single thread is admittedly a bit slow.
# Feel free to use this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time

socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1

# Takes a BeautifulSoup object and extracts the URLs contained in it.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'

    res = html.findAll('a')  # find all <a> tags

    for x in res:
        t = unicode(x)  # x is a soup tag object; render it back to markup
        m = re.findall(re_html, t)
        if not m:  # findall returns a list; empty means no URL in this tag
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)  # remember the URL so it is not crawled again

#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)

#######################################################
# Accepts either a URL or a local file name and extracts the body text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()

#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while not g_url_queue.empty():
        l_url = g_url_queue.get()
        print l_url
        # Catch timeouts and other errors: some pages cannot be reached.
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        # Save the raw page under a numbered file name, then parse it.
        fwrite = open(str(count + 1), 'w')
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)
        count += 1
        if count >= 100:
            return

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)

if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
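One thing worth pointing out about the code above: max_deep is defined and main_fun takes a deep argument, but the depth is only checked once at the start, so the limit is never enforced per URL. A minimal sketch of how the depth could be tracked, assuming the same imports as above and hypothetical save_page() / extract_links() helpers, is to keep (url, depth) pairs in the queue:

# Sketch: enforce max_deep by queueing (url, depth) pairs.
# save_page() and extract_links() are hypothetical helpers, not part of the code above.
def crawl_with_depth(start_url, max_deep=1):
    q = Queue.Queue()
    q.put((start_url, 0))
    seen = set([start_url])
    while not q.empty():
        url, depth = q.get()
        try:
            html = urllib2.urlopen(url).read()
        except:
            continue
        save_page(url, html)                 # e.g. write the raw html to disk
        if depth >= max_deep:
            continue                          # stop expanding links past the depth limit
        for link in extract_links(html):
            if link not in seen:
                seen.add(link)
                q.put((link, depth + 1))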
Now I want a multithreaded version, so that downloading pages and parsing the HTML (extracting the body text and the URLs) can run concurrently. After some simple changes to the code above it more or less runs; the main changes are adding threading and putting lock control around access to the global queues. Since I had never written multithreaded code before, I would welcome suggestions from anyone passing by (a couple of suggestions follow the code below).
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to use this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading


queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')

g_file_queue = Queue.Queue()
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


# Downloads pages from the URL queue and saves each one to a numbered file.
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue

        count = 0
        while not g_url_queue.empty():
            print self.t_name

            # Take the lock before touching the shared URL queue.
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # Catch timeouts and other errors: some pages cannot be reached.
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()

            fwrite = open(str(count + 1), 'w')
            fwrite.write(html)
            fwrite.close()

            # Hand the saved file over to the parsing thread.
            file_lock.acquire()
            g_file_queue.put(count + 1)
            file_lock.release()

            count += 1
            if count >= 100:
                return


# Reads saved pages from the file queue, extracts the body text and new URLs.
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock

        while not g_file_queue.empty():
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()

            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)

            get_context(soup, filename)

            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')  # find all <a> tags

            for x in res:
                t = unicode(x)  # x is a soup tag object; render it back to markup
                m = re.findall(re_html, t)
                if not m:
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)  # remember the URL so it is not crawled again

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')

    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
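Two suggestions on the threaded version, since the post asks for them. First, Queue.Queue from the standard library is already thread-safe, so the explicit RLocks around put() and get() are not strictly necessary. Second, each get_page_thread keeps its own count starting from 1, so the page threads 'a', 'c' and 'd' will overwrite each other's output files. Below is a minimal sketch of a worker that relies on the queue's blocking get() and one shared, lock-protected counter; all names here are mine, not from the code above.

# Sketch of a lock-light page worker: Queue.Queue handles its own locking,
# so workers block on get() instead of polling empty() under an RLock.
# A shared counter guarded by one small lock gives every page a unique file name.
import threading
import Queue
import urllib2

url_queue = Queue.Queue()
file_queue = Queue.Queue()
counter_lock = threading.Lock()
page_counter = [0]                      # shared mutable counter

def next_page_id():
    with counter_lock:
        page_counter[0] += 1
        return page_counter[0]

def page_worker():
    while True:
        url = url_queue.get()           # blocks until a URL is available
        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            url_queue.task_done()
            continue
        page_id = next_page_id()        # unique across all worker threads
        f = open(str(page_id), 'w')
        f.write(html)
        f.close()
        file_queue.put(page_id)         # hand the saved page to the parsing thread
        url_queue.task_done()

# Usage sketch: start a few daemon workers, then seed the queue.
# for i in range(3):
#     t = threading.Thread(target=page_worker)
#     t.setDaemon(True)
#     t.start()
# url_queue.put('http://www.bupt.edu.cn/')
# url_queue.join()   # wait until every queued URL has been processed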