HBase With MR
MapReduce is a distributed computing framework that places no restrictions on the data source or the data destination; you are free to choose either, as long as the corresponding InputFormat and OutputFormat are provided. Examples can be found in the 'HBase MapReduce Examples' section of the official documentation.
When reading data from HBase, you must call TableMapReduceUtil.initTableMapperJob to configure the Mapper. When writing data to HBase, you must call TableMapReduceUtil.initTableReducerJob to configure the TableReducer (i.e., the Reducer); even when no Reducer is needed, this method must still be called, passing null for the reducer parameter.
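As a quick reference, a minimal sketch of the two calls is shown below (the table names "source_table"/"target_table" and the classes MyTableMapper/MyTableReducer are placeholders, not part of this project); the complete, runnable examples follow later in this section.
// inside a job-setup method, assuming a Job named job has already been created
TableMapReduceUtil.initTableMapperJob(
        "source_table", new Scan(), MyTableMapper.class,  // input table, scan, TableMapper subclass
        Text.class, IntWritable.class, job);              // mapper output key and value classes
TableMapReduceUtil.initTableReducerJob("target_table", MyTableReducer.class, job);
// when no Reducer is needed, still call the method and pass null for the reducer class
TableMapReduceUtil.initTableReducerJob("target_table", null, job);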
The first example implements wordcount: 1. read data from HDFS; 2. store the result in HBase.
It builds on the Maven project from the 'Java API Usage' section above.
Add the following dependencies to pom.xml:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.2.4</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-mapreduce</artifactId>
    <version>2.5.8-hadoop3</version>
</dependency>
The three Java classes are shown below.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // split each line on spaces and emit (word, 1) for every token
        String[] splits = value.toString().split(" ");
        for (String str : splits) {
            word.set(str);
            context.write(word, one);
        }
    }
}
import java.io.IOException;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
    private final byte[] cfBytes = Bytes.toBytes("cf");
    private final byte[] ctBytes = Bytes.toBytes("ct");

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, ImmutableBytesWritable, Mutation>.Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // use the word as the row key; convert the integer count to a string before writing it to HBase
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.addColumn(cfBytes, ctBytes, Bytes.toBytes(String.valueOf(sum)));
        context.write(null, put);
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WCRunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        // create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCRunner.class);
        // set the mapper class
        // (when reading from HBase, TableMapReduceUtil.initTableMapperJob() would be used instead)
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the reducer: instead of job.setReducerClass(), use initTableReducerJob,
        // which also configures the HBase output table
        TableMapReduceUtil.initTableReducerJob("wc", WCReducer.class, job);
        // the following two settings can be omitted, since initTableReducerJob already sets them
        // job.setOutputKeyClass(ImmutableBytesWritable.class);
        // job.setOutputValueClass(Put.class);
        // input data directory on HDFS
        FileInputFormat.addInputPath(job, new Path("/wc/wc"));
        job.waitForCompletion(true);
    }
}
# On node01, as the bigdata user
# Package the project above and upload the jar to node01
# Prepare the input data
cat > wc_input.txt << EOF
hello world
hello java
hello python
hello mysql
EOF
# Create the HDFS directory and upload the data
hdfs dfs -mkdir -p /wc/wc
hdfs dfs -put wc_input.txt /wc/wc/
# First create the table in HBase
hbase shell
create 'wc','cf'
quit
# Run the MR job; you may also need to append ` -libjars $(hbase mapredcp | tr ':' ',')` to the command
HADOOP_CLASSPATH=$(hbase mapredcp):$HBASE_HOME/conf hadoop jar hbase-test.jar org.example.hbase.wc.WCRunner
# Open the hbase shell and check the result
hbase shell
# The MR output has been written to HBase
scan 'wc'
ROW      COLUMN+CELL
 hello   column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=4
 java    column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 mysql   column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 python  column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 world   column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
# Exit the hbase shell
quit
The second example implements wordcount: 1. read data from HBase; 2. store the result in HBase.
import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WC2Mapper extends TableMapper<Text, IntWritable> {
    private final byte[] cfBytes = Bytes.toBytes("cf");
    private final byte[] nameBytes = Bytes.toBytes("name");
    private final IntWritable one = new IntWritable(1);
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // read the cf:name cell of each row and emit (name, 1)
        String name = Bytes.toString(value.getValue(cfBytes, nameBytes));
        context.write(new Text(name), one);
    }
}
import java.io.IOException;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WC2Reducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
    private final byte[] cfBytes = Bytes.toBytes("cf");
    private final byte[] ctBytes = Bytes.toBytes("ct");

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, ImmutableBytesWritable, Mutation>.Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // use the word as the row key; convert the integer count to a string before writing it to HBase
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.addColumn(cfBytes, ctBytes, Bytes.toBytes(String.valueOf(sum)));
        // TableOutputFormat.TableRecordWriter.write ignores the key, so null is passed here
        context.write(null, put);
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class WC2Runner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        // create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC2Runner.class);
        // set the mapper: read the input table 'wc_origin' with a full-table Scan
        TableMapReduceUtil.initTableMapperJob("wc_origin", new Scan(), WC2Mapper.class, Text.class, IntWritable.class, job);
        // set the reducer: write the result to the output table 'wc2'
        TableMapReduceUtil.initTableReducerJob("wc2", WC2Reducer.class, job);
        job.waitForCompletion(true);
    }
}
# On node01, as the bigdata user
# Package the project above and upload the jar to node01
# Open the hbase shell and prepare the tables and data
hbase shell
# Create the input table and insert data
create 'wc_origin','cf'
put 'wc_origin','1','cf:name','hello'
put 'wc_origin','2','cf:name','hello'
put 'wc_origin','3','cf:name','hello'
put 'wc_origin','4','cf:name','hello'
put 'wc_origin','5','cf:name','world'
put 'wc_origin','6','cf:name','java'
put 'wc_origin','7','cf:name','python'
put 'wc_origin','8','cf:name','mysql'
# Create the output table
create 'wc2','cf'
# Exit the hbase shell
quit
# Run the MR job
HADOOP_CLASSPATH=$(hbase mapredcp):$HBASE_HOME/conf hadoop jar hbase-test.jar org.example.hbase.wc2.WC2Runner
# Open the hbase shell and check the result
hbase shell
# View the MR output
scan 'wc2'
ROW      COLUMN+CELL
 hello   column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=4
 java    column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 mysql   column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 python  column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 world   column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
Note: both examples above fail when run in a Windows environment (the cause was not identified); they were both tested successfully on Linux.
Phoenix can be used to analyze the data in HBase with SQL.