(一)前言
我的上一篇博客已经说明如何爬取某一个网页的动漫数据,这里重点说一下一个完整的爬虫实例。
和上一篇文章相比,多了的就是动画种类,日文名什么的。
推荐这个爬取博客的:http://www.voidcn.com/article/p-yoxynhhz-gt.html
我也是根据这个学的。
用到的工具有:IntelliJ IDEA、MySQL、WebMagic 0.7.3 等
(二)详细过程
**爬取的内容有:** 标题、动画种类、日文名、别名、播放时间、播放状态、类型、原作、监督、制作公司、官网、剧情简介(简略)、剧情简介(详细)、评分、评分人数、相关url等。
爬取图片如下:
**
**
*** ***
**推荐
**
**(1)首先在MySQL中建表dmzjAnimation**
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
-- Storage for the animation records scraped from donghua.dmzj.com.
-- Every scraped value is kept as raw text; only `id` is numeric.
CREATE TABLE `dmzjAnimation` (
  `id` int(11) unsigned NOT NULL auto_increment,
  `hahawebname` varchar(127) default NULL,      -- title
  `antag` varchar(255) default NULL,            -- animation kind (e.g. TV / movie)
  `japanname` varchar(255) default NULL,        -- Japanese name
  `allname` varchar(255) default NULL,          -- aliases
  `year` varchar(255) default NULL,             -- first-broadcast time
  `state` varchar(255) default NULL,            -- broadcast status
  `tag` varchar(643) default NULL,              -- genres, " / " separated
  `original` varchar(255) default NULL,         -- original work
  `screenwriter` varchar(255) default NULL,     -- director (the page's "监督" field)
  `company` varchar(255) default NULL,          -- production company
  `website` varchar(511) default NULL,          -- official website
  `content` varchar(2559) default NULL,         -- short synopsis
  `contentdetail` varchar(10240) default NULL,  -- full synopsis
  `goal` varchar(255) default NULL,             -- score
  `mentotal` varchar(255) default NULL,         -- number of raters
  `url` varchar(255) default NULL,              -- detail-page URL
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
21
用Navicat for MySQL查看是这样子的
(2)第二步就开始写Java代码,主要包括DmzjAnimationProcessor(核心类)、DmzjAnimationDao(Java连接MySQL类)和DmzjAnimation(数据实体类)
(2.1)DmzjAnimation,实体类,与数据表dmzjAnimation的字段一一对应
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189 1package Webmagic.donghua.dmzj.com;
2
3/***Created by mo
4 *On 2017/10/23 ***12:08.
5 ******/
6public class DmzjAnimation {
7 private int id;
8 private String hahawebname;// 标题
9 private String antag;//动画种类
10 private String japanname;//日文名
11 private String allname;//别名
12 private String year;//播放时间
13 private String state;//播放状态
14 private String tag;//类型
15 private String original;//原作
16 private String screenwriter;//监督
17 private String company;//制作公司
18 private String website;//官网
19 private String content;//剧情简介
20 private String contentdetail;//剧情简介
21 private String goal;//评分
22 private String mentotal;//评分人数
23 private String url;//相关url
24
25 @Override
26 public String toString() {
27 return "DmzjAnimation{" +
28 "id=" + id +
29 ", hahawebname='" + hahawebname + '\'' +
30 ", antag='" + antag + '\'' +
31 ", japanname='" + japanname + '\'' +
32 ", allname='" + allname + '\'' +
33 ", year='" + year + '\'' +
34 ", state='" + state + '\'' +
35 ", tag='" + tag + '\'' +
36 ", original='" + original + '\'' +
37 ", screenwriter='" + screenwriter + '\'' +
38 ", company='" + company + '\'' +
39 ", website='" + website + '\'' +
40 ", content='" + content + '\'' +
41 ", contentdetail='" + contentdetail + '\'' +
42 ", goal=" + goal +
43 ", mentotal=" + mentotal +
44 ", url='" + url + '\'' +
45 '}';
46 }
47
48
49 public int getId() {
50 return id;
51 }
52
53 public void setId(int id) {
54 this.id = id;
55 }
56
57 public String getHahawebname() {
58 return hahawebname;
59 }
60
61 public void setHahawebname(String hahawebname) {
62 this.hahawebname = hahawebname;
63 }
64
65 public String getAntag() {
66 return antag;
67 }
68
69 public void setAntag(String antag) {
70 this.antag = antag;
71 }
72
73 public String getJapanname() {
74 return japanname;
75 }
76
77 public void setJapanname(String japanname) {
78 this.japanname = japanname;
79 }
80
81 public String getAllname() {
82 return allname;
83 }
84
85 public void setAllname(String allname) {
86 this.allname = allname;
87 }
88
89 public String getYear() {
90 return year;
91 }
92
93 public void setYear(String year) {
94 this.year = year;
95 }
96
97 public String getState() {
98 return state;
99 }
100
101 public void setState(String state) {
102 this.state = state;
103 }
104
105 public String getTag() {
106 return tag;
107 }
108
109 public void setTag(String tag) {
110 this.tag = tag;
111 }
112
113 public String getOriginal() {
114 return original;
115 }
116
117 public void setOriginal(String original) {
118 this.original = original;
119 }
120
121 public String getScreenwriter() {
122 return screenwriter;
123 }
124
125 public void setScreenwriter(String screenwriter) {
126 this.screenwriter = screenwriter;
127 }
128
129 public String getCompany() {
130 return company;
131 }
132
133 public void setCompany(String company) {
134 this.company = company;
135 }
136
137 public String getWebsite() {
138 return website;
139 }
140
141 public void setWebsite(String website) {
142 this.website = website;
143 }
144
145 public String getContent() {
146 return content;
147 }
148
149 public void setContent(String content) {
150 this.content = content;
151 }
152
153 public String getContentdetail() {
154 return contentdetail;
155 }
156
157 public void setContentdetail(String contentdetail) {
158 this.contentdetail = contentdetail;
159 }
160
161 public String getGoal() {
162 return goal;
163 }
164
165 public void setGoal(String goal) {
166 this.goal = goal;
167 }
168
169 public String getMentotal() {
170 return mentotal;
171 }
172
173 public void setMentotal(String mentotal) {
174 this.mentotal = mentotal;
175 }
176
177 public String getUrl() {
178 return url;
179 }
180
181 public void setUrl(String url) {
182 this.url = url;
183 }
184
185
186
187}
188
189
(2.2)DmzjAnimationProcessor,爬虫逻辑,核心类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149 1package Webmagic.donghua.dmzj.com;
2
3import us.codecraft.webmagic.Page;
4import us.codecraft.webmagic.Site;
5import us.codecraft.webmagic.Spider;
6import us.codecraft.webmagic.processor.PageProcessor;
7import us.codecraft.webmagic.selector.Html;
8import us.codecraft.webmagic.selector.Selectable;
9
10import java.util.List;
11
12/***Created by mo
13 *On 2017/10/23 ***12:09.
14 ******/
15public class DmzjAnimationProcessor implements PageProcessor {
16 int myid = 0;
17 int size =10;
18 // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
19 private Site site = Site.me().setRetryTimes(1000).setSleepTime(1000).setCharset("utf8");
20 @Override
21 public Site getSite() {
22 return site;
23 }
24
25 @Override
26 public void process(Page page) {
27 DmzjAnimation dmzjAnimation = new DmzjAnimation();
28 Html html = page.getHtml();
29 size++;
30 myid++;
31 int id = myid;
32 dmzjAnimation.setId(id);
33 String hahawebname = html.xpath("//div[@class=\"odd_anim_title_tnew\"]/div[@class=\"tvversion\"]/a/span[@class=\"anim_title_text\"]/h1/text()").get();//得分
34 dmzjAnimation.setHahawebname(hahawebname);
35 String goal = html.xpath("//div[@class=\"anim_star\"]/ul/li[@id=\"anim_score_info\"]/span[@class=\"points_text\"]/text()").get();//得分
36 dmzjAnimation.setGoal(goal);
37 String mentotalold = html.xpath("//div[@class=\"anim_star\"]/ul/li[@id=\"score_statistics\"]/span[@id=\"score_count_span\"]/text()").get();//人数
38 String mentotal = mentotalold.replaceAll("人评分","");
39 dmzjAnimation.setMentotal(mentotal);
40 String content = html.xpath("//div[@class=\"odd_anim_title_mnew\"]/p/span[@id=\"gamedescshort\"]/text()").get();//内容
41 dmzjAnimation.setContent(content);
42 String contentdetail = html.xpath("//div[@class=\"odd_anim_title_mnew\"]/p/span[@id=\"gamedescall\"]/text()").get();//内容
43 dmzjAnimation.setContentdetail(contentdetail);
44 System.out.println("hahawebname: "+ hahawebname);
45 System.out.println("goal: "+goal);
46 System.out.println("mentotal: "+ mentotal);
47 System.out.println("content: "+ content);
48 System.out.println("contentdetail: "+ contentdetail);
49 List<Selectable> nodes = html.xpath("//div[@class=\"anim_attributenew_text\"]/ul/li").nodes();
50 for(Selectable item : nodes){
51 String tmp = item.get();
52 if(tmp.contains("动画种类")) {//动画种类 : 剧场版
53 String antag11 = tmp.replaceAll("</?[^>]+>","");
54 String antag = antag11.replaceAll("动画种类 : ","");
55 System.out.println(antag);
56 dmzjAnimation.setAntag(antag);
57 }//日文名 : 暂无
58 if(tmp.contains("日文名")) {
59 String japanname11 = tmp.replaceAll("</?[^>]+>","");
60 String japanname = japanname11.replaceAll("日文名 : ","");
61 if(japanname.contains("暂无")){japanname = null;}
62 System.out.println(japanname);
63 dmzjAnimation.setJapanname(japanname);
64 }//别名 : 大闹天宫 上下集 / The Monkey King
65 if(tmp.contains("别名")) {
66 String allname11 = tmp.replaceAll("</?[^>]+>","");
67 String allname = allname11.replaceAll("别名 : ","");
68 if(allname.contains("暂无")){allname = null;}
69 System.out.println(allname);
70 dmzjAnimation.setAllname(allname);
71 }//首播时间 : 暂无
72 if(tmp.contains("首播")) {
73 String year11 = tmp.replaceAll("</?[^>]+>","");
74 String year1111 = year11.replaceAll("首播时间 : ","");
75 String year = year1111;
76 if(year1111.contains("暂无")){year = null;}
77 System.out.println(year);
78 dmzjAnimation.setYear(year);
79 }
80 if(tmp.contains("播放状态")) {
81 String state11 = tmp.replaceAll("</?[^>]+>","");
82 String state1111 = state11.replaceAll("播放状态 : ","");
83 String state = state1111;
84 if(state.contains("暂无")){state = null;}
85 System.out.println(state);
86 dmzjAnimation.setState(state);
87 }
88 if(tmp.contains("剧情类型")) {
89 String tag11 = tmp.replaceAll("</?[^>]+>","");
90 String tag1111 = tag11.replaceAll("剧情类型 : ","");
91 String tag = tag1111.replaceAll(" "," / ");
92 System.out.println(tag);
93 dmzjAnimation.setTag(tag);
94 }//原作 : 暂无
95 if(tmp.contains("原作")) {
96 String original11 = tmp.replaceAll("</?[^>]+>","");
97 String original1111 = original11.replaceAll("原作 :","");
98 String original = original1111;
99 if(original.contains("暂无")){original = null;}
100 System.out.println(original);
101 dmzjAnimation.setOriginal(original);
102 }//监督 : 万籁鸣 / 唐澄
103 if(tmp.contains("监督")) {
104 String screenwriter11 = tmp.replaceAll("</?[^>]+>","");
105 String screenwriter1111 = screenwriter11.replaceAll("监督 :","");
106 String screenwriter = screenwriter1111;
107 if(screenwriter.contains("暂无")){screenwriter = null;}
108 dmzjAnimation.setScreenwriter(screenwriter);
109 System.out.println(screenwriter);
110 }//制作公司 : 上海美术电影制片厂
111 if(tmp.contains("制作公司")) {
112 String company11 = tmp.replaceAll("</?[^>]+>","");
113 String company1111 = company11.replaceAll("制作公司 :","");
114 company1111 = company1111 +" / "+ company1111 +"公司";
115 String company = company1111;
116 if(company.contains("暂无")){company = null;}
117 System.out.println(company);
118 dmzjAnimation.setCompany(company);
119 }//官方网站 : 暂无
120 if(tmp.contains("官方网站")) {
121 String website = tmp.replaceAll(".*?href=|target(.*)","");
122 if(website.contains("暂无")){website = null;}
123 System.out.println(website);
124 dmzjAnimation.setWebsite(website);
125 }
126 }
127 String url = "http://donghua.dmzj.com/donghua_info/"+size+".html";
128 dmzjAnimation.setUrl(url);
129 new DmzjAnimationDao().add(dmzjAnimation);
130 }
131
132
133
134 public static void main(String[] args) {
135 int username = 10;
136 DmzjAnimationProcessor my = new DmzjAnimationProcessor();
137 long startTime, endTime;
138 System.out.println("开始爬取...");
139 for(;username<=15000;username++) {
140 startTime = System.currentTimeMillis();
141 Spider.create(my).addUrl("http://donghua.dmzj.com/donghua_info/" + username + ".html").thread(5).run();
142 endTime = System.currentTimeMillis();
143 System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒");
144 }
145 }
146}
147
148
149
(2.3)DmzjAnimationDao,Java连接MySQL类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61 1package Webmagic.donghua.dmzj.com;
2
3/***Created by mo
4 *On 2017/10/23 ***12:08.
5 ******/
6import java.sql.*;
7
8 public class DmzjAnimationDao {
9
10 private Connection conn = null;
11 private Statement stmt = null;
12
13 public DmzjAnimationDao() {
14 try {
15 Class.forName("com.mysql.jdbc.Driver");
16 //spider是数据库,用户名,密码,数据格式
17 String url = "jdbc:mysql://localhost:3306/spider?user=root&password=xiemo&useUnicode=true&characterEncoding=UTF8";
18 conn = (Connection) DriverManager.getConnection(url);
19 stmt = conn.createStatement();
20 } catch (ClassNotFoundException e) {
21 e.printStackTrace();
22 } catch (SQLException e) {
23 e.printStackTrace();
24 }
25 System.out.println("连接数据库成功");
26 }
27
28 public int add(DmzjAnimation dmzjAnimation) {
29 try {
30 //dmzjAnimation是表名
31 String sql = "INSERT INTO `spider`.`dmzjAnimation` (`id`,`hahawebname`,`antag`,`japanname`, `allname`, `year`,`state`,`tag`, `original`,`screenwriter`,`company`, `website`, `content`,`contentdetail`,`goal`,`mentotal`,`url`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);";
32 PreparedStatement ps = conn.prepareStatement(sql);
33 ps.setInt(1, dmzjAnimation.getId());
34 ps.setString(2, dmzjAnimation.getHahawebname());
35 ps.setString(3, dmzjAnimation.getAntag());
36 ps.setString(4, dmzjAnimation.getJapanname());
37 ps.setString(5, dmzjAnimation.getAllname());
38 ps.setString(6, dmzjAnimation.getYear());
39 ps.setString(7, dmzjAnimation.getState());
40 ps.setString(8, dmzjAnimation.getTag());
41
42 ps.setString(9, dmzjAnimation.getOriginal());
43 ps.setString(10, dmzjAnimation.getScreenwriter());
44 ps.setString(11, dmzjAnimation.getCompany());
45 ps.setString(12, dmzjAnimation.getWebsite());
46 ps.setString(13, dmzjAnimation.getContent());
47 ps.setString(14, dmzjAnimation.getContentdetail());
48 ps.setString(15, dmzjAnimation.getGoal());
49
50 ps.setString(16, dmzjAnimation.getMentotal());
51 ps.setString(17, dmzjAnimation.getUrl());
52 return ps.executeUpdate();
53 } catch (SQLException e) {
54 e.printStackTrace();
55 }
56 return -1;
57 }
58
59}
60
61
(2.4)结果
IDEA:
**
**
MySQL:
**
**
希望对你有所帮助!