Today I combined a BFS crawler with the HTML body-text extractor. The functionality is still fairly limited. For the body-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http:// URLs, and I have only tested it on the campus intranet, because the connection to the outside network is not very fast.
- There is one global URL queue and one URL set: the queue makes the BFS easy to implement, and the set keeps the same page from being crawled twice. The flow and the idea behind it are both quite simple (a minimal sketch of this loop follows this list).
- It is single-threaded, so it is rather slow. Later I will look at a multithreaded version, so that fetching pages, extracting URLs, and extracting the body text can run concurrently.
- The figure below is taken from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ . Besides extracting the URLs from each page, I also extract the body text, so that Chinese word segmentation will be easier when I build an index later.
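A minimal sketch of that queue-plus-set loop, assuming hypothetical fetch() and extract_links() helpers (the real code below uses g_url_queue, g_url_set, urllib2 and BeautifulSoup instead):

# Sketch of the BFS crawl loop: the queue drives the traversal,
# the set remembers visited URLs so no page is fetched twice.
# fetch() and extract_links() are hypothetical placeholders.
import Queue

def bfs_crawl(start_url, fetch, extract_links, max_pages=100):
    frontier = Queue.Queue()
    seen = set([start_url])
    frontier.put(start_url)
    count = 0
    while not frontier.empty() and count < max_pages:
        url = frontier.get()
        html = fetch(url)              # e.g. urllib2.urlopen(url).read()
        if html is None:
            continue
        for link in extract_links(html):
            if link not in seen:       # the set is what prevents re-crawling
                seen.add(link)
                frontier.put(link)
        count += 1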
Pasting the code directly here causes problems, probably because it contains HTML tags; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl html pages, traverse the links, then extract the body text.
# A single thread is admittedly a bit slow.
# Feel free to use this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time

socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1

# Takes a BeautifulSoup object and extracts the URLs contained in it.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'

    res = html.findAll('a')  # find all <a> tags

    for x in res:
        t = unicode(x)  # x is a soup tag object; render it back to markup
        m = re.findall(re_html, t)
        if not m:  # findall returns a list; empty means no URL in this tag
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)  # remember the URL so it is not crawled again

#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)

#######################################################
# Accepts either a URL or a local file name and extracts the body text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()

#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while not g_url_queue.empty():
        l_url = g_url_queue.get()
        print l_url
        # Catch timeouts and other errors: some pages cannot be reached.
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        # Save the raw page under a numbered file name, then parse it.
        fwrite = open(str(count + 1), 'w')
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)
        count += 1
        if count >= 100:
            return

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)

if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
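One thing worth pointing out about the code above: max_deep is defined and main_fun takes a deep argument, but the depth is only checked once at the start, so the limit is never enforced per URL. A minimal sketch of how the depth could be tracked, assuming the same imports as above and hypothetical save_page() / extract_links() helpers, is to keep (url, depth) pairs in the queue:

# Sketch: enforce max_deep by queueing (url, depth) pairs.
# save_page() and extract_links() are hypothetical helpers, not part of the code above.
def crawl_with_depth(start_url, max_deep=1):
    q = Queue.Queue()
    q.put((start_url, 0))
    seen = set([start_url])
    while not q.empty():
        url, depth = q.get()
        try:
            html = urllib2.urlopen(url).read()
        except:
            continue
        save_page(url, html)                 # e.g. write the raw html to disk
        if depth >= max_deep:
            continue                          # stop expanding links past the depth limit
        for link in extract_links(html):
            if link not in seen:
                seen.add(link)
                q.put((link, depth + 1))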
Now I want a multithreaded version, so that downloading pages and parsing the HTML (extracting the body text and the URLs) can run concurrently. After some simple changes to the code above it more or less runs; the main changes are adding threading and putting lock control around access to the global queues. Since I had never written multithreaded code before, I would welcome suggestions from anyone passing by (a couple of suggestions follow the code below).
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to use this code, but please keep the following line
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading


queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)

g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')

g_file_queue = Queue.Queue()
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1


#######################################################
def strip_tags(html):
    """
    Strip HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()


# Downloads pages from the URL queue and saves each one to a numbered file.
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue

        count = 0
        while not g_url_queue.empty():
            print self.t_name

            # Take the lock before touching the shared URL queue.
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # Catch timeouts and other errors: some pages cannot be reached.
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()

            fwrite = open(str(count + 1), 'w')
            fwrite.write(html)
            fwrite.close()

            # Hand the saved file over to the parsing thread.
            file_lock.acquire()
            g_file_queue.put(count + 1)
            file_lock.release()

            count += 1
            if count >= 100:
                return


# Reads saved pages from the file queue, extracts the body text and new URLs.
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock

        while not g_file_queue.empty():
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()

            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)

            get_context(soup, filename)

            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')  # find all <a> tags

            for x in res:
                t = unicode(x)  # x is a soup tag object; render it back to markup
                m = re.findall(re_html, t)
                if not m:
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)  # remember the URL so it is not crawled again

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')

    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
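Two suggestions on the threaded version, since the post asks for them. First, Queue.Queue from the standard library is already thread-safe, so the explicit RLocks around put() and get() are not strictly necessary. Second, each get_page_thread keeps its own count starting from 1, so the page threads 'a', 'c' and 'd' will overwrite each other's output files. Below is a minimal sketch of a worker that relies on the queue's blocking get() and one shared, lock-protected counter; all names here are mine, not from the code above.

# Sketch of a lock-light page worker: Queue.Queue handles its own locking,
# so workers block on get() instead of polling empty() under an RLock.
# A shared counter guarded by one small lock gives every page a unique file name.
import threading
import Queue
import urllib2

url_queue = Queue.Queue()
file_queue = Queue.Queue()
counter_lock = threading.Lock()
page_counter = [0]                      # shared mutable counter

def next_page_id():
    with counter_lock:
        page_counter[0] += 1
        return page_counter[0]

def page_worker():
    while True:
        url = url_queue.get()           # blocks until a URL is available
        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            url_queue.task_done()
            continue
        page_id = next_page_id()        # unique across all worker threads
        f = open(str(page_id), 'w')
        f.write(html)
        f.close()
        file_queue.put(page_id)         # hand the saved page to the parsing thread
        url_queue.task_done()

# Usage sketch: start a few daemon workers, then seed the queue.
# for i in range(3):
#     t = threading.Thread(target=page_worker)
#     t.setDaemon(True)
#     t.start()
# url_queue.put('http://www.bupt.edu.cn/')
# url_queue.join()   # wait until every queued URL has been processed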