释放双眼，带上耳机，听听看~！

python爬虫Pragmatic系列III

By 白熊花田(http://blog.csdn.net/whiterbear)

说明：

在上一篇博客中，我们已经学会了从赶集网上的一家公司中提取出有关的信息，并存储到Excel中。

本次目标：

在本节中，我们将批量下载赶集首页上所有的公司页面（注意不是赶集网上所有的公司页面，这个可以留给之后的任务），然后批量地处理这些公司的有关信息，并保存到Excel中。

注意：

在上一篇博客中，我们只是匹配了赶集网上其中一家公司页面中的信息。不幸的是，其他很多公司的“联系店主”模块中的信息数量并不固定：有的是10个li，有的是9个li或者12个li，而且并不是所有公司都提供了QQ联系方式或者公司名称。所以，我对代码稍微做了处理，使其能够适应大部分的网页。

批量下载网页：

如下图：

我们将提取出该网页所包含的公司链接，并批量下载下来。这次，我们使用了一个下载类，利用下载类来封装我们所要的方法。我们先给定赶集网的首页链接，下载首页，接着我们分析出首页包含的公司链接并保存起来，最后我们将这些链接都下载到pagestroage文件夹中。

代码：


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
1#-*-coding:utf-8-*-

2import re

3import os

4from urllib import urlretrieve

5from bs4 import BeautifulSoup

6

7

class Download(object):
    """Download web pages and store them as files on disk.

    The listing ("main") page is saved as ``main.txt``; every other page is
    saved as ``file<i>.txt`` where ``i`` is the running counter kept in
    ``self.index``.  All files are written into the ``PAGE_DIR`` folder.
    """

    # Folder that receives the downloaded pages.  The spelling is kept as-is
    # because it is the on-disk name.
    PAGE_DIR = 'pagestroage'
    # File name used for the listing page.
    MAIN_FILE = 'main.txt'
    # Prefix put in front of every link found on the listing page.
    SITE_ROOT = 'http://bj.ganji.com'
    # Captures the value of the first href attribute in a piece of markup.
    HREF_REGEX = re.compile(r'href="(.*?)"')

    def __init__(self):
        # Sequence number of the next non-main page to be saved.
        self.index = 0

    def getPages(self, url, isMain=False):
        """Download every URL in ``url`` and save each page.

        Args:
            url: iterable of URL strings.  ``None`` or an empty iterable is
                reported and ignored instead of raising ``TypeError``.
            isMain: truthy when the URLs point at the listing page, which is
                stored as ``main.txt`` instead of a numbered file.
        """
        if not url:
            print('No url to download')
            return

        for u in url:
            try:
                # urlretrieve returns (local_filename, headers).
                local_path = urlretrieve(u)[0]
            except IOError:
                print('Open url error')
                continue
            self.savePages(local_path, isMain=bool(isMain))

    def savePages(self, webpage, isMain=False):
        """Copy the local file ``webpage`` into the page folder.

        Args:
            webpage: path of the file that holds the downloaded page.
            isMain: truthy to store the page as ``main.txt``; otherwise it is
                stored as ``file<index>.txt`` and ``self.index`` is advanced.
        """
        # Copy the raw bytes: no newline translation and no decoding errors,
        # whatever encoding the page uses.
        with open(webpage, 'rb') as source:
            content = source.read()

        if not os.path.isdir(self.PAGE_DIR):
            os.makedirs(self.PAGE_DIR)

        if isMain:
            name = self.MAIN_FILE
        else:
            name = 'file%s.txt' % self.index

        with open(os.path.join(self.PAGE_DIR, name), 'wb') as target:
            target.write(content)

        # Advance the counter only after the numbered file has been written.
        if not isMain:
            self.index += 1

    def getAllComUrls(self):
        """Extract the company links from the stored listing page.

        Returns:
            A list of absolute URLs (``SITE_ROOT`` + href), an empty list when
            the expected ``leftBox`` / ``list`` / ``ul`` structure is not
            found, or ``None`` when ``main.txt`` does not exist.
        """
        main_path = os.path.join(self.PAGE_DIR, self.MAIN_FILE)
        if not os.path.exists(main_path):
            print('The file is missing')
            return None

        with open(main_path, 'rb') as fobj:
            html = fobj.read()

        soup = BeautifulSoup(html)
        # Walk leftBox -> list -> ul, tolerating a missing level.
        left_box = soup.find(attrs={'class': 'leftBox'})
        listing = None
        if left_box is not None:
            listing = left_box.find(attrs={'class': 'list'})
        ul = listing.find('ul') if listing is not None else None
        if ul is None:
            print('The main page has an unexpected layout')
            return []

        urls = []
        for item in ul.find_all('li'):
            match = self.HREF_REGEX.search(str(item))
            # List items without any link are skipped.
            if match is not None:
                urls.append(self.SITE_ROOT + match.group(1))
        return urls

74    

75    

if __name__ == '__main__':
    # Start from the Ganji listing page that links to the company pages.
    url = ['http://bj.ganji.com/danbaobaoxian/o1/']
    # Create the downloader.
    download = Download()
    # Fetch the listing page first; it is stored as main.txt.
    download.getPages(url, True)
    # Parse the stored listing page and collect the company URLs.
    urls = download.getAllComUrls()
    # getAllComUrls returns None when main.txt is missing (for example
    # because the download above failed), so only continue with real links.
    if urls:
        download.getPages(urls)

87

运行结果：

我们得到了十几个包含公司网页的文本文件。如下图：

分析网页：

由上面的操作，我们已经得到了赶集网首页上所有公司的html文本。接着我们使用Analysiser类来处理这些数据。注意，Analysiser类中的方法基本上都已在前面的博客中介绍过，这里只是用类进行了封装，使其能够批量处理。

代码：


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
1#-*-coding:utf-8-*-

2import re

3from bs4 import BeautifulSoup

4import xlwt

5import os

6import sys

7reload(sys)

8sys.setdefaultencoding(&#x27;utf-8&#x27;)

9

class Analysiser(object):
    """Parse the downloaded company pages and store the data in an Excel file.

    Each page ``file<i>.txt`` in ``PAGE_DIR`` becomes row ``i + 1`` of the
    sheet; row 0 holds the bold header written by ``initExcel``.
    """

    # Folder holding the downloaded pages (spelling is the on-disk name).
    PAGE_DIR = 'pagestroage'
    # Workbook file that receives the results.
    OUTPUT_FILE = 'xinrui.xls'
    # Page numbers that are never parsed.  The original script hard-coded
    # these two because their contact module differs from the other pages.
    SKIPPED_PAGES = (7, 12)
    # Header cells, in column order.
    HEADERS = (
        u'公司名称',
        u'服务特色',
        u'服务范围',
        u'联系人',
        u'商家地址',
        u'QQ',
        u'联系电话',
        u'公司网址',
    )
    # Matches the numbered page files and captures the number.
    PAGE_FILE_REGEX = re.compile(r'file(\d+)\.txt$')
    # Matches an opening or closing HTML tag.
    TAG_REGEX = re.compile(r'</?\w+[^>]*>')
    # First run of 5-10 digits is taken as the QQ number.
    # NOTE(review): this can also hit an unrelated digit run -- confirm.
    QQ_REGEX = r'(\d{5,10})'
    # Mobile number: 1, then one of 3/5/7/8, then nine digits.  The former
    # class ``[3|5|7|8|]`` also accepted a literal '|'.
    PHONE_REGEX = r'1[3578][0-9]{9}'

    def __init__(self):
        """Create the workbook and sheet, then write the header row."""
        self.wb = xlwt.Workbook()
        self.ws = self.wb.add_sheet('CompanyInfoSheet')
        self.initExcel()

    def initExcel(self):
        """Write the header row in a bold font and save the workbook."""
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style = xlwt.XFStyle()
        style.font = font

        for column, title in enumerate(self.HEADERS):
            self.ws.write(0, column, title, style)
        self.wb.save(self.OUTPUT_FILE)

    def analysAllFiles(self):
        """Parse every stored company page and write one row per company.

        Pages listed in ``SKIPPED_PAGES`` are ignored.  A page whose markup
        does not have the expected structure is reported and skipped, so a
        single odd page no longer aborts the whole batch.
        """
        if not os.path.isdir(self.PAGE_DIR):
            print('The folder is missing')
            return

        for page_index, name in self._page_files():
            if page_index in self.SKIPPED_PAGES:
                continue

            # Hand the raw bytes to the parser and let it pick the encoding.
            with open(os.path.join(self.PAGE_DIR, name), 'rb') as page:
                html = page.read()

            try:
                record = self._extract_record(html)
            except (AttributeError, IndexError):
                # A missing element (None) or a too-short <li> list.
                print('Skip %s: unexpected page layout' % name)
                continue

            self.writeToExcel(record, page_index + 1)

    def writeToExcel(self, record, index):
        """Write ``record`` into row ``index`` and save the workbook.

        Args:
            record: dict with the keys ``companyName``, ``serviceFeature``,
                ``serviceScope`` (a list), ``contacts``, ``address``,
                ``qqNum``, ``phoneNum`` and ``companySite``.  UTF-8 byte
                strings are decoded before writing.  ``serviceProvider`` is
                not written to the sheet.
            index: row number in the sheet.

        Raises:
            KeyError: when one of the keys above is missing.
        """
        cells = (
            self._decode(record['companyName']),
            self._decode(record['serviceFeature']),
            u','.join(self._to_text(scope) for scope in record['serviceScope']),
            self._decode(record['contacts']),
            self._decode(record['address']),
            self._decode(record['qqNum']),
            self._to_text(record['phoneNum']),
            self._decode(record['companySite']),
        )
        for column, value in enumerate(cells):
            self.ws.write(index, column, value)
        self.wb.save(self.OUTPUT_FILE)

    def _page_files(self):
        """Return ``(number, file name)`` for each page file, by number."""
        found = []
        for name in os.listdir(self.PAGE_DIR):
            match = self.PAGE_FILE_REGEX.match(name)
            if match is not None:
                found.append((int(match.group(1)), name))
        return sorted(found)

    def _extract_record(self, html):
        """Build the record dict for one company page.

        Walks wrapper -> first ``d-left-box`` -> ``dzcontactus`` -> ``con``
        -> ``ul`` and reads the fields from the ``li`` items.  Raises
        ``AttributeError`` / ``IndexError`` when that structure is absent.
        """
        soup = BeautifulSoup(html)
        wrapper = soup.find(id="wrapper")
        left_box = wrapper.find_all(attrs={'class': 'd-left-box'})[0]
        contact_box = left_box.find(id="dzcontactus")
        con = contact_box.find(attrs={'class': 'con'})
        items = con.find('ul').find_all('li')

        record = {}
        record['companyName'] = self._first_content(items[1].find('h1'))
        record['serviceFeature'] = self._first_content(items[2].find('p'))
        record['serviceProvider'] = [
            link.contents[0] for link in items[3].find_all('a') if link.contents
        ]
        record['serviceScope'] = [
            link.contents[0] for link in items[4].find_all('a') if link.contents
        ]
        contacts = self._first_content(items[5].find('p'))
        record['contacts'] = self._to_text(contacts).strip()

        # The address is the <p> markup with every tag removed.
        address_tag = items[6].find('p')
        if address_tag is None:
            record['address'] = u''
        else:
            record['address'] = self.TAG_REGEX.sub(
                u'', self._to_text(address_tag))

        # QQ and phone sit somewhere between item 8 and the last item, and
        # either of them may be absent.
        rest = u''.join(self._to_text(item) for item in items[8:-1])
        record['qqNum'] = self._first_match(self.QQ_REGEX, rest)
        record['phoneNum'] = self._first_match(self.PHONE_REGEX, rest)

        # The last item carries the link to the company site.
        record['companySite'] = self._first_content(items[-1].find('a'))
        return record

    @staticmethod
    def _first_content(tag):
        """Return the first child of ``tag``, or u'' if there is none."""
        if tag is None or not tag.contents:
            return u''
        return tag.contents[0]

    @staticmethod
    def _first_match(pattern, text):
        """Return the first match of ``pattern`` in ``text``, or u''."""
        match = re.search(pattern, text)
        return match.group() if match is not None else u''

    @staticmethod
    def _decode(value):
        """Decode UTF-8 byte strings; return any other value unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _to_text(value):
        """Return ``value`` as a text (unicode) string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        # type(u'') is unicode on Python 2 and str on Python 3.
        return type(u'')(value)

174

if __name__ == '__main__':
    # Build the analyser (this creates the workbook and writes the header
    # row), then process all stored company pages.
    ana = Analysiser()
    ana.analysAllFiles()

178    

179    

180    

181

运行结果：

我们将得到包含赶集网首页上包含的所有公司的相关信息的Excel，如下图：

后感：

看到这个Excel是不是觉得很cool，终于能做点实际的事情了。

不过，我们还可以做的更好，更智能。

未完待续。

{{userData.name}}已认证

python爬虫Pragmatic系列III

python爬虫Pragmatic系列III

说明：

本次目标：

注意：

代码：

运行结果：

分析网页：

代码：

运行结果：

后感：

职场中的那些话那些事

nginx 日志分析及性能排查

{{userData.name}}已认证

python爬虫Pragmatic系列III

说明：

本次目标：

注意：

代码：

运行结果：

分析网页：

代码：

运行结果：

后感：

Related posts:

职场中的那些话那些事

nginx 日志分析及性能排查

python爬虫Pragmatic系列IV

python爬虫Pragmatic系列II

jenkins+ansible+gitlab自动化部署三剑客

百度搜索引擎工作原理分析