Hadoop 中利用 mapreduce 读写 mysql 数据

释放双眼，带上耳机，听听看~！

有时候我们在项目中会遇到输入结果集很大，但是输出结果很小，比如一些 pv、uv 数据，然后为了实时查询的需求，或者一些 OLAP 的需求，我们需要 mapreduce 与 mysql 进行数据的交互，而这些特性正是 hbase 或者 hive 目前亟待改进的地方。

好了言归正传，简单的说说背景、原理以及需要注意的地方：

1、为了方便 MapReduce 直接访问关系型数据库（MySQL、Oracle），Hadoop 提供了 DBInputFormat 和 DBOutputFormat 两个类：通过 DBInputFormat 把数据库表中的数据读入 MapReduce（进而可以写到 HDFS），通过 DBOutputFormat 把 MapReduce 产生的结果集写入数据库表中。

2、
由于0.20版本对DBInputFormat和DBOutputFormat支持不是很好，该例用了0.19版本来说明这两个类的用法。

至少在我的 0.20.203 中，org.apache.hadoop.mapreduce.lib 下没有见到 db 包，所以本文以老版 API（org.apache.hadoop.mapred.lib.db）为例进行说明。

3、
运行 MapReduce 时报错：java.io.IOException: com.mysql.jdbc.Driver，一般是由于程序找不到 mysql 驱动包。解决方法是让每个 tasktracker 在运行 MapReduce 程序时都可以找到该驱动包。

添加包有两种方式：

（1）在每个节点下的${HADOOP_HOME}/lib下添加该包。重启集群，一般是比较原始的方法。

（2）a)把包传到集群上： hadoop fs -put mysql-connector-java-5.1.0-bin.jar /hdfsPath/

b)在mr程序提交job前，添加语句：DistributedCache.addFileToClassPath(new Path("/hdfsPath/mysql-connector-java-5.1.0-bin.jar"), conf);

注：虽然本文用的是 0.19 的 API，但在 0.20 版本下一样可用，只是编译时会提示方法已过时（deprecated）而已。

4、测试数据：

CREATE TABLE `t` (
  `id` int DEFAULT NULL,
  `name` varchar(10) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE `t2` (
  `id` int DEFAULT NULL,
  `name` varchar(10) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

insert into t values (1,"june"),(2,"decli"),(3,"hello"),
(4,"june"),(5,"decli"),(6,"hello"),(7,"june"),
(8,"decli"),(9,"hello"),(10,"june"),
(11,"june"),(12,"decli"),(13,"hello");

5、代码：


001	import java.io.DataInput;


002	import java.io.DataOutput;


003	import java.io.IOException;


004	import java.sql.PreparedStatement;


005	import java.sql.ResultSet;


006	import java.sql.SQLException;


007	import java.util.Iterator;


008


009	import org.apache.hadoop.filecache.DistributedCache;


010	import org.apache.hadoop.fs.Path;


011	import org.apache.hadoop.io.LongWritable;


012	import org.apache.hadoop.io.Text;


013	import org.apache.hadoop.io.Writable;


014	import org.apache.hadoop.mapred.JobClient;


015	import org.apache.hadoop.mapred.JobConf;


016	import org.apache.hadoop.mapred.MapReduceBase;


017	import org.apache.hadoop.mapred.Mapper;


018	import org.apache.hadoop.mapred.OutputCollector;


019	import org.apache.hadoop.mapred.Reducer;


020	import org.apache.hadoop.mapred.Reporter;


021	import org.apache.hadoop.mapred.lib.IdentityReducer;


022	import org.apache.hadoop.mapred.lib.db.DBConfiguration;


023	import org.apache.hadoop.mapred.lib.db.DBInputFormat;


024	import org.apache.hadoop.mapred.lib.db.DBOutputFormat;


025	import org.apache.hadoop.mapred.lib.db.DBWritable;


026


027	/**


028	* Function: 测试 mr 与 mysql 的数据交互，此测试用例将一个表中的数据复制到另一张表中


029	* 实际当中，可能只需要从 mysql 读，或者写到 mysql 中。


030	* date: 2013-7-29 上午2:34:04 <br/>


031	* @author june


032	*/


033	public class Mysql2Mr {

034

// DROP TABLE IF EXISTS

1hadoop

1	1studentinfo

;


035	// CREATE TABLE studentinfo (


036	// id INTEGER NOT NULL PRIMARY KEY,


037	// name VARCHAR(32) NOT NULL);


038


039	public static class StudentinfoRecord implements Writable, DBWritable {


040	int id;


041	String name;


042


043	public StudentinfoRecord() {


044


045	}


046


047	public void readFields(DataInput in) throws IOException {


048	this.id = in.readInt();


049	this.name = Text.readString(in);


050	}


051


052	public String toString() {


053	return new String(this.id + " " + this.name);


054	}


055


056	@Override


057	public void write(PreparedStatement stmt) throws SQLException {


058	stmt.setInt(1, this.id);


059	stmt.setString(2, this.name);


060	}


061


062	@Override


063	public void readFields(ResultSet result) throws SQLException {


064	this.id = result.getInt(1);


065	this.name = result.getString(2);


066	}


067


068	@Override


069	public void write(DataOutput out) throws IOException {


070	out.writeInt(this.id);


071	Text.writeString(out, this.name);


072	}


073	}


074


075	// 记住此处是静态内部类，要不然你自己实现无参构造器，或者等着抛异常：


076	// Caused by: java.lang.NoSuchMethodException: DBInputMapper.<init>()


077	// http://stackoverflow.com/questions/7154125/custom-mapreduce-input-format-cant-find-constructor


078	// 网上脑残式的转帖，没见到一个写对的。。。


079	public static class DBInputMapper extends MapReduceBase implements


080	Mapper<LongWritable, StudentinfoRecord, LongWritable, Text> {


081	public void map(LongWritable key, StudentinfoRecord value,


082	OutputCollector<LongWritable, Text> collector, Reporter reporter) throws IOException {


083	collector.collect(new LongWritable(value.id), new Text(value.toString()));


084	}


085	}


086


087	public static class MyReducer extends MapReduceBase implements


088	Reducer<LongWritable, Text, StudentinfoRecord, Text> {


089	@Override


090	public void reduce(LongWritable key, Iterator<Text> values,


091	OutputCollector<StudentinfoRecord, Text> output, Reporter reporter) throws IOException {


092	String[] splits = values.next().toString().split(" ");


093	StudentinfoRecord r = new StudentinfoRecord();


094	r.id = Integer.parseInt(splits[0]);


095	r.name = splits[1];


096	output.collect(r, new Text(r.name));


097	}


098	}


099


100	public static void main(String[] args) throws IOException {


101	JobConf conf = new JobConf(Mysql2Mr.class);


102	DistributedCache.addFileToClassPath(new Path("/tmp/mysql-connector-java-5.0.8-bin.jar"), conf);


103


104	conf.setMapOutputKeyClass(LongWritable.class);


105	conf.setMapOutputValueClass(Text.class);


106	conf.setOutputKeyClass(LongWritable.class);


107	conf.setOutputValueClass(Text.class);


108


109	conf.setOutputFormat(DBOutputFormat.class);


110	conf.setInputFormat(DBInputFormat.class);


111	// // mysql to hdfs


112	// conf.setReducerClass(IdentityReducer.class);


113	// Path outPath = new Path("/tmp/1");


114	// FileSystem.get(conf).delete(outPath, true);


115	// FileOutputFormat.setOutputPath(conf, outPath);


116


117	DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", "jdbc:mysql://192.168.1.101:3306/test",


118	"root", "root");


119	String[] fields = { "id", "name" };


120	// 从 t 表读数据


121	DBInputFormat.setInput(conf, StudentinfoRecord.class, "t", null, "id", fields);


122	// mapreduce 将数据输出到 t2 表


123	DBOutputFormat.setOutput(conf, "t2", "id", "name");


124	// conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);


125	conf.setMapperClass(DBInputMapper.class);


126	conf.setReducerClass(MyReducer.class);


127


128	JobClient.runJob(conf);


129	}


130	}

6、结果：

执行两次后，你可以看到mysql结果：


mysql> select * from t2;
+------+-------+
| id   | name  |
+------+-------+
|    1 | june  |
|    2 | decli |
|    3 | hello |
|    4 | june  |
|    5 | decli |
|    6 | hello |
|    7 | june  |
|    8 | decli |
|    9 | hello |
|   10 | june  |
|   11 | june  |
|   12 | decli |
|   13 | hello |
|    1 | june  |
|    2 | decli |
|    3 | hello |
|    4 | june  |
|    5 | decli |
|    6 | hello |
|    7 | june  |
|    8 | decli |
|    9 | hello |
|   10 | june  |
|   11 | june  |
|   12 | decli |
|   13 | hello |
+------+-------+
26 rows in set (0.00 sec)

mysql>

7、日志：


01	13/07/29 02:33:03 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.


02	13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Creating mysql-connector-java-5.0.8-bin.jar in /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp-work–8372797484204470322 with rwxr-xr-x


03	13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Cached hdfs://192.168.1.101:9000/tmp/mysql-connector-java-5.0.8-bin.jar as /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp/mysql-connector-java-5.0.8-bin.jar


04	13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Cached hdfs://192.168.1.101:9000/tmp/mysql-connector-java-5.0.8-bin.jar as /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp/mysql-connector-java-5.0.8-bin.jar


05	13/07/29 02:33:03 INFO mapred.JobClient: Running job: job_local_0001


06	13/07/29 02:33:03 INFO mapred.MapTask: numReduceTasks: 1


07	13/07/29 02:33:03 INFO mapred.MapTask: io.sort.mb = 100


08	13/07/29 02:33:03 INFO mapred.MapTask: data buffer = 79691776/99614720


09	13/07/29 02:33:03 INFO mapred.MapTask: record buffer = 262144/327680


10	13/07/29 02:33:03 INFO mapred.MapTask: Starting flush of map output


11	13/07/29 02:33:03 INFO mapred.MapTask: Finished spill 0


12	13/07/29 02:33:03 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting


13	13/07/29 02:33:04 INFO mapred.JobClient: map 0% reduce 0%


14	13/07/29 02:33:06 INFO mapred.LocalJobRunner:


15	13/07/29 02:33:06 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.


16	13/07/29 02:33:06 INFO mapred.LocalJobRunner:


17	13/07/29 02:33:06 INFO mapred.Merger: Merging 1 sorted segments


18	13/07/29 02:33:06 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 235 bytes


19	13/07/29 02:33:06 INFO mapred.LocalJobRunner:


20	13/07/29 02:33:06 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting


21	13/07/29 02:33:07 INFO mapred.JobClient: map 100% reduce 0%


22	13/07/29 02:33:09 INFO mapred.LocalJobRunner: reduce > reduce


23	13/07/29 02:33:09 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.


24	13/07/29 02:33:09 WARN mapred.FileOutputCommitter: Output path is null in cleanup


25	13/07/29 02:33:10 INFO mapred.JobClient: map 100% reduce 100%


26	13/07/29 02:33:10 INFO mapred.JobClient: Job complete: job_local_0001


27	13/07/29 02:33:10 INFO mapred.JobClient: Counters: 18


28	13/07/29 02:33:10 INFO mapred.JobClient: File Input Format Counters


29	13/07/29 02:33:10 INFO mapred.JobClient: Bytes Read=0


30	13/07/29 02:33:10 INFO mapred.JobClient: File Output Format Counters


31	13/07/29 02:33:10 INFO mapred.JobClient: Bytes Written=0


32	13/07/29 02:33:10 INFO mapred.JobClient: FileSystemCounters


33	13/07/29 02:33:10 INFO mapred.JobClient: FILE_BYTES_READ=1211691


34	13/07/29 02:33:10 INFO mapred.JobClient: HDFS_BYTES_READ=1081704


35	13/07/29 02:33:10 INFO mapred.JobClient: FILE_BYTES_WRITTEN=2392844


36	13/07/29 02:33:10 INFO mapred.JobClient: Map-Reduce Framework


37	13/07/29 02:33:10 INFO mapred.JobClient: Map output materialized bytes=239


38	13/07/29 02:33:10 INFO mapred.JobClient: Map input records=13


39	13/07/29 02:33:10 INFO mapred.JobClient: Reduce shuffle bytes=0


40	13/07/29 02:33:10 INFO mapred.JobClient: Spilled Records=26


41	13/07/29 02:33:10 INFO mapred.JobClient: Map output bytes=207


42	13/07/29 02:33:10 INFO mapred.JobClient: Map input bytes=13


43	13/07/29 02:33:10 INFO mapred.JobClient: SPLIT_RAW_BYTES=75


44	13/07/29 02:33:10 INFO mapred.JobClient: Combine input records=0


45	13/07/29 02:33:10 INFO mapred.JobClient: Reduce input records=13


46	13/07/29 02:33:10 INFO mapred.JobClient: Reduce input groups=13


47	13/07/29 02:33:10 INFO mapred.JobClient: Combine output records=0


48	13/07/29 02:33:10 INFO mapred.JobClient: Reduce output records=13


49	13/07/29 02:33:10 INFO mapred.JobClient: Map output records=13

{{userData.name}}已认证

Hadoop 中利用 mapreduce 读写 mysql 数据

MySQL和MongoDB数据相互迁移

Ubuntu上NFS的安装配置

{{userData.name}}已认证

Related posts:

MySQL和MongoDB数据相互迁移

Ubuntu上NFS的安装配置

Nginx+Tomcat+Memcached实现负载均衡和Session共享

搭建高可用mongodb集群（四）—— 分片

Hbase与 Sqoop 的集成

CentOS7安装Docker