本文使用 WebMagic 爬取汽车之家的品牌、车系、车型、结构、价格、能源、产地、国别等信息(Java 代码)。
备注:只是根据 URL 的变化进行爬取,没有解析页面 script 中的具体数据;网站也有反爬机制,这里只是简单地爬取 HTML 标签。
爬取的网页:
需要配置pom.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 1<!--测试-->
2 <dependency>
3 <groupId>junit</groupId>
4 <artifactId>junit</artifactId>
5 <version>4.12</version>
6 </dependency>
7 <dependency>
8 <groupId>us.codecraft</groupId>
9 <artifactId>webmagic-core</artifactId>
10 <version>0.7.3</version>
11 </dependency>
12 <dependency>
13 <groupId>us.codecraft</groupId>
14 <artifactId>webmagic-extension</artifactId>
15 <version>0.7.3</version>
16 </dependency>
17 <dependency>
18 <groupId>us.codecraft</groupId>
19 <artifactId>webmagic-samples</artifactId>
20 <version>0.7.3</version>
21 </dependency>
<dependency>
<groupId>us.codecraft</groupId>
22 <artifactId>webmagic-selenium</artifactId>
23 <version>0.7.3</version>
24 </dependency>
25
代码如下:运行 @Test 测试方法,数据保存在 D:\data\ 目录下,文件名为 "AutoHome" 加上爬虫创建时的时间(HHmm)再加 ".txt",例如 D:\data\AutoHome1004.txt。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295 1import org.apache.commons.io.FileUtils;
2import org.junit.Test;
3import us.codecraft.webmagic.Page;
4import us.codecraft.webmagic.Site;
5import us.codecraft.webmagic.Spider;
6import us.codecraft.webmagic.processor.PageProcessor;
7import us.codecraft.webmagic.selector.Html;
8import us.codecraft.webmagic.selector.Selectable;
9
10import java.io.File;
11import java.text.SimpleDateFormat;
12import java.util.ArrayList;
13import java.util.Calendar;
14import java.util.List;
15
16/***Created by mo
17 *On 2018/4/23 ***10:04.
18 ******/
19public class CarAutoHome implements PageProcessor {
20 private static int myid = 0;
21 int size = 0;
22 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
23 private Site site = Site.me().setRetryTimes(5).setSleepTime(1000).setTimeOut(100000).setCharset("gbk")
24 .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");
25 Calendar now = Calendar.getInstance();
26 String time = new SimpleDateFormat("HHmm").format(now.getTime());
27
28 @Override
29 public Site getSite() {
30 return site;
31 }
32
33 @Override
34 public void process(Page page) {
35 Html html = page.getHtml();
36 myid++;
37 size++;
38
39 try {
40 String url = page.getUrl().get();
41 String label = url.replaceAll("https://www.autohome.com.cn/","").replace("/","");
42 int op = 0;
43 List<String> one_car = new ArrayList<>();
44 List<Selectable> nodes = html.xpath("//div[@class='uibox-con rank-list rank-list-pic']/dl").nodes();
45 for (Selectable item : nodes) {
46 Selectable item_2 = item;
47 List<Selectable> cars = item_2.xpath("//dd/ul/li").nodes();
48 String Car_brand = item_2.xpath("//dt/div/a/text()").get();
49 //List<String> one_car = new ArrayList<>();
50 for (Selectable model : cars) {
51 String Car_https = model.xpath("//h4/a/@href").get();
52 String Car_model = model.xpath("//h4/a/text()").get();
53 String cart = label + "mo_mo" + Car_brand + "mo_mo" + Car_model + "mo_mo" + Car_https;
54 one_car.add(cart);
55 op++;
56 System.out.println(op + "mo_mo" + cart);
57 }
58 }
59
60// File file = new File("D:\\workspace\\java\\WebAnimationTest\\src\\main\\java\\Webmagic\\AutoHome\\car_brand_0423.txt");//汽车品牌
61
62 File file = new File("D:\\data\\AutoHome"+ time+".txt");//进口,出口
63// File file = new File("D:\\workspace\\java\\WebAnimationTest\\src\\main\\java\\Webmagic\\AutoHome\\origit_data\\car_oil.txt");//进口,出口
64 FileUtils.writeLines(file, one_car, true);
65 } catch (Exception e) {
66 }
67
68 }
69
70// public static void main(String[] args) throws IOException, InterruptedException {
71 @Test//获取品牌,车系等
72 public void brand_model(){
73 try {
74 long startTime, endTime;
75 System.out.println("开始爬取...");
76 startTime = System.currentTimeMillis();
77 //"https://www.autohome.com.cn/2951/?pvareaid=105126"
78 //a00/a0/a/b/c/d/suv/mpv/s/p"https://www.autohome.com.cn/3948/#levelsource=000000000_0&pvareaid=101594"
79
80 List<String> list = new ArrayList<>();
81 list.add("a00");
82 list.add("a0");
83 list.add("a");
84 list.add("b");
85 list.add("c");
86 list.add("d");
87 list.add("suv");
88 list.add("mpv");
89 list.add("s");
90 list.add("p");
91 list.add("mb");
92 list.add("qk");
93 for(String list_car :list){
94 String https = "https://www.autohome.com.cn/"+ list_car +"/";
95 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
96 endTime = System.currentTimeMillis();
97 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
98 Thread.sleep(10000);
99 }
100
101 } catch (Exception e) { }
102 }
103
104
105 @Test//获取进口或者是出口数据
106 public void car_board(){
107 try {
108 long startTime, endTime;
109 System.out.println("开始爬取...");
110 startTime = System.currentTimeMillis();
111
112 List<String> list = new ArrayList<>();
113 list.add("0_0-0.0_0.0-0-0-0-1-0-0-0-0");
114 list.add("0_0-0.0_0.0-0-0-0-3-0-0-0-0");
115
116 for(String list_car :list){
117 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
118 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
119 endTime = System.currentTimeMillis();
120 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
121 Thread.sleep(10000);
122 }
123
124 } catch (Exception e) { }
125 }
126
127
128 @Test//获取座位号等
129 public void brand_sit(){
130 try {
131 long startTime, endTime;
132 System.out.println("开始爬取...");
133 startTime = System.currentTimeMillis();
134
135 List<String> list = new ArrayList<>();
136 list.add("0_0-0.0_0.0-0-0-0-0-0-0-5-0");
137 list.add("0_0-0.0_0.0-0-0-0-0-0-0-7-0");
138 list.add("0_0-0.0_0.0-0-0-0-0-0-0-4-0");
139 list.add("0_0-0.0_0.0-0-0-0-0-0-0-2-0");
140 list.add("0_0-0.0_0.0-0-0-0-0-0-0-6-0");
141 list.add("0_0-0.0_0.0-0-0-0-0-0-0-8-0");
142
143 for(String list_car :list){
144 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
145 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
146 endTime = System.currentTimeMillis();
147 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
148 Thread.sleep(10000);
149 }
150 } catch (Exception e) { }
151 }
152
153
154
155 @Test//结构:两箱,3相
156 public void brand_structure(){
157 try {
158 long startTime, endTime;
159 System.out.println("开始爬取...");
160 startTime = System.currentTimeMillis();
161
162 List<String> list = new ArrayList<>();
163 list.add("0_0-0.0_0.0-0-0-1-0-0-0-0-0");
164 list.add("0_0-0.0_0.0-0-0-2-0-0-0-0-0");
165 list.add("0_0-0.0_0.0-0-0-3-0-0-0-0-0");
166 list.add("0_0-0.0_0.0-0-0-4-0-0-0-0-0");
167 list.add("0_0-0.0_0.0-0-0-5-0-0-0-0-0");
168 list.add("0_0-0.0_0.0-0-0-6-0-0-0-0-0");
169 list.add("0_0-0.0_0.0-0-0-7-0-0-0-0-0");
170 list.add("0_0-0.0_0.0-0-0-8-0-0-0-0-0");
171 list.add("0_0-0.0_0.0-0-0-9-0-0-0-0-0");
172 for(String list_car :list){
173 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
174 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
175 endTime = System.currentTimeMillis();
176 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
177 Thread.sleep(10000);
178 }
179 } catch (Exception e) { }
180 }
181
182
183
184 @Test//能源:汽油
185 public void brand_engine(){
186 try {
187 long startTime, endTime;
188 System.out.println("开始爬取...");
189 startTime = System.currentTimeMillis();
190
191 List<String> list = new ArrayList<>();
192 list.add("0_0-0.0_0.0-0-0-0-0-1-0-0-0");
193 list.add("0_0-0.0_0.0-0-0-0-0-2-0-0-0");
194 list.add("0_0-0.0_0.0-0-0-0-0-3-0-0-0");
195 list.add("0_0-0.0_0.0-0-0-0-0-4-0-0-0");
196 list.add("0_0-0.0_0.0-0-0-0-0-5-0-0-0");
197
198 for(String list_car :list){
199 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
200 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
201 endTime = System.currentTimeMillis();
202 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
203 Thread.sleep(10000);
204 }
205 } catch (Exception e) { }
206 }
207
208
209 @Test//能源:汽油
210 public void brand_gas(){
211 try {
212 long startTime, endTime;
213 System.out.println("开始爬取...");
214 startTime = System.currentTimeMillis();
215
216 List<String> list = new ArrayList<>();
217 list.add("0_0-0.0_0.0-0-0-0-0-1-0-0-0");
218 list.add("0_0-0.0_0.0-0-0-0-0-2-0-0-0");
219 list.add("0_0-0.0_0.0-0-0-0-0-3-0-0-0");
220 list.add("0_0-0.0_0.0-0-0-0-0-4-0-0-0");
221 list.add("0_0-0.0_0.0-0-0-0-0-5-0-0-0");
222
223 for(String list_car :list){
224 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
225 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
226 endTime = System.currentTimeMillis();
227 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
228 Thread.sleep(10000);
229 }
230 } catch (Exception e) { }
231 }
232
233
234 @Test//汽油量,V,L
235 public void brand_oil(){
236 try {
237 long startTime, endTime;
238 System.out.println("开始爬取...");
239 startTime = System.currentTimeMillis();
240
241 List<String> list = new ArrayList<>();
242 list.add("0_0-0.0_1.0-0-0-0-0-0-0-0-0");
243 list.add("0_0-1.1_1.6-0-0-0-0-0-0-0-0");
244 list.add("0_0-1.7_2.0-0-0-0-0-0-0-0-0");
245 list.add("0_0-2.1_2.5-0-0-0-0-4-0-0-0");
246 list.add("0_0-2.6_3.0-0-0-0-0-5-0-0-0");
247 list.add("0_0-3.1_4.0-0-0-0-0-4-0-0-0");
248 list.add("0_0-4.0_0.0-0-0-0-0-0-0-0-0");
249 for(String list_car :list){
250 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
251 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
252 endTime = System.currentTimeMillis();
253 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
254 Thread.sleep(10000);
255 }
256 } catch (Exception e) { }
257 }
258
259
260 @Test//价格等
261 public void brand_price(){
262 try {
263 long startTime, endTime;
264 System.out.println("开始爬取...");
265 startTime = System.currentTimeMillis();
266
267 List<String> list = new ArrayList<>();
268 list.add("1_5-0.0_0.0-0-0-0-0-0-0-0-0");
269 list.add("5_10-0.0_0.0-0-0-0-0-0-0-0-0");
270 list.add("10_15-0.0_0.0-0-0-0-0-0-0-0-0");
271 list.add("15_20-0.0_0.0-0-0-0-0-0-0-0-0");
272 list.add("20_25-0.0_0.0-0-0-0-0-0-0-0-0");
273 list.add("25_30-0.0_0.0-0-0-0-0-0-0-0-0");
274 list.add("30_35-0.0_0.0-0-0-0-0-0-0-0-0");
275 list.add("35_50-0.0_0.0-0-0-0-0-0-0-0-0");
276 list.add("50_70-0.0_0.0-0-0-0-0-0-0-0-0");
277 list.add("70_100-0.0_0.0-0-0-0-0-0-0-0-0");
278
279 for(String list_car :list){
280 String https = "https://www.autohome.com.cn/car/"+ list_car +"/";
281 Spider.create(new CarAutoHome()).addUrl(https).thread(5).run();
282 endTime = System.currentTimeMillis();
283 System.out.println(list_car + "\t"+ (endTime-startTime) +"\t");
284 Thread.sleep(10000);
285 }
286 } catch (Exception e) { }
287 }
288
289
290
291
292
293
294}
295
希望对你有所帮助!