06 HBase With MR


MapReduce (MR) is a distributed computing framework that places no restriction on where the data comes from or where the results go; any source or sink can be used, as long as the corresponding InputFormat and OutputFormat interfaces are implemented. Sample code is available in the 'HBase MapReduce Examples' section of the official documentation.

To read data from HBase, you must call TableMapReduceUtil.initTableMapperJob to set up the Mapper. To write data to HBase, you must call TableMapReduceUtil.initTableReducerJob to set up the TableReducer (i.e. the Reducer). Even when no Reducer is needed, this method still has to be called; simply pass null for the reducer argument (see the sketch below).
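For that map-only case (writing to HBase with no Reducer), the driver looks roughly like the following sketch. The class names, the table name 'demo', and the input path are placeholders used only for illustration; the concrete examples of this note follow below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MapOnlyToHBaseRunner {

    // Hypothetical mapper: turns each text line into a Put (row key = file offset)
    public static class LineToPutMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            Put put = new Put(Bytes.toBytes(key.get()));
            put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("line"), Bytes.toBytes(value.toString()));
            context.write(new ImmutableBytesWritable(put.getRow()), put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapOnlyToHBaseRunner.class);
        job.setMapperClass(LineToPutMapper.class);
        // No Reducer is needed, but initTableReducerJob must still be called;
        // pass null for the reducer class and run the job map-only, so each
        // Put emitted by the mapper goes straight to TableOutputFormat
        TableMapReduceUtil.initTableReducerJob("demo", null, job);
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path("/demo/input"));
        job.waitForCompletion(true);
    }
}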

The first example implements word count: 1. read the input data from HDFS, 2. write the result to HBase.

It builds on the Maven project from the earlier "Java API Usage" section.

Add the following dependencies to pom.xml:

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.5.8-hadoop3</version>
        </dependency>

The three Java classes are as follows:

public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split(" ");
        for (String str : splits) {
            word.set(str);
            context.write(word, one);
        }
    }
}
public class WCReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
    private byte[] cfBytes = Bytes.toBytes("cf");
    private byte[] ctBytes = Bytes.toBytes("ct");
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, ImmutableBytesWritable, Mutation>.Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        Put put = new Put(Bytes.toBytes(key.toString()));
        // Convert the count to a String before writing it to HBase (see the encoding demo after WCRunner)
        put.addColumn(cfBytes, ctBytes, Bytes.toBytes(String.valueOf(sum)));
        context.write(null, put);
    }
}
public class WCRunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        // Create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCRunner.class);
        // Set the mapper class; the input comes from HDFS,
        // so TableMapReduceUtil.initTableMapperJob() is not needed here
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the reducer: instead of job.setReducerClass(),
        // initTableReducerJob wires in the TableReducer and TableOutputFormat for table 'wc'
        TableMapReduceUtil.initTableReducerJob("wc", WCReducer.class, job);
        // The following two settings can be omitted; initTableReducerJob already sets the output types
        // job.setOutputKeyClass(ImmutableBytesWritable.class);
        // job.setOutputValueClass(Put.class);
        // Input directory on HDFS
        FileInputFormat.addInputPath(job, new Path("/wc/wc"));
        job.waitForCompletion(true);
    }
}
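WCReducer stores the count as the string "4" rather than the raw integer bytes, which keeps the value readable in the shell's scan output. The tiny standalone demo below contrasts the two encodings, assuming only the Bytes utility from hbase-client on the classpath:

import org.apache.hadoop.hbase.util.Bytes;

// Small standalone demo of the two possible encodings for the count
public class CountEncodingDemo {
    public static void main(String[] args) {
        int sum = 4;
        // What WCReducer stores: the string "4", readable in `scan` output
        byte[] asString = Bytes.toBytes(String.valueOf(sum));
        // The alternative: the raw 4-byte integer, which scan shows as \x00\x00\x00\x04
        byte[] asInt = Bytes.toBytes(sum);
        System.out.println(Bytes.toString(asString)); // 4
        System.out.println(Bytes.toInt(asInt));       // 4
    }
}

Either encoding works, as long as readers decode the value the same way it was written.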
# On node01 as the bigdata user

# Package the project above and upload the jar to node01

# Prepare the input data
cat > wc_input.txt << EOF
hello world
hello java
hello python
hello mysql
EOF

# Create the HDFS directory and upload the data
hdfs dfs -mkdir -p /wc/wc
hdfs dfs -put wc_input.txt /wc/wc/

# Create the output table in HBase first
hbase shell
create 'wc','cf'
quit

# Run the MR job; you may also need to append ` -libjars $(hbase mapredcp | tr ':' ',')` to the command
HADOOP_CLASSPATH=$(hbase mapredcp):$HBASE_HOME/conf hadoop jar hbase-test.jar org.example.hbase.wc.WCRunner

# Open the hbase shell and check the result
hbase shell
# The MR output has been written to HBase
scan 'wc'
ROW                                             COLUMN+CELL
 hello                                          column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=4
 java                                           column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 mysql                                          column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 python                                         column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 world                                          column=cf:ct, timestamp=2024-04-15T10:36:46.823, value=1
 
# Quit the hbase shell
quit

The second example implements word count: 1. read the input data from HBase, 2. write the result to HBase.

public class WC2Mapper extends TableMapper<Text, IntWritable> {
    private byte[] cfBytes = Bytes.toBytes("cf");
    private byte[] nameBytes = Bytes.toBytes("name");
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        String name = Bytes.toString(value.getValue(cfBytes, nameBytes));
        context.write(new Text(name), one);
    }
}
public class WC2Reducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {

    private byte[] cfBytes = Bytes.toBytes("cf");
    private byte[] ctBytes = Bytes.toBytes("ct");

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, ImmutableBytesWritable, Mutation>.Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        Put put = new Put(Bytes.toBytes(key.toString()));
        // Convert the count to a String before writing it to HBase
        put.addColumn(cfBytes, ctBytes, Bytes.toBytes(String.valueOf(sum)));
        // TableOutputFormat.TableRecordWriter.write does not use the key, so null is passed here
        context.write(null, put);
    }
}
public class WC2Runner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        // Create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC2Runner.class);
        // Set the mapper: initTableMapperJob reads the 'wc_origin' table with the given Scan
        TableMapReduceUtil.initTableMapperJob("wc_origin", new Scan(), WC2Mapper.class, Text.class, IntWritable.class, job);
        // Set the reducer: initTableReducerJob writes the result to the 'wc2' table
        TableMapReduceUtil.initTableReducerJob("wc2", WC2Reducer.class, job);
        job.waitForCompletion(true);
    }
}
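The new Scan() passed to initTableMapperJob above reads every column of every row. The Scan can be narrowed and tuned before handing it over; below is a sketch of a variant driver, assuming it sits in the same package as WC2Mapper and WC2Reducer (the caching value of 500 is only an illustrative choice):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

// Variant of WC2Runner with a configured Scan
public class WC2RunnerWithScan {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(true);
        conf.set("hbase.zookeeper.quorum", "node02,node03,node04");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC2RunnerWithScan.class);

        Scan scan = new Scan();
        // Only read the column the mapper actually uses
        scan.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"));
        // Fetch more rows per RPC; 500 is just an example value
        scan.setCaching(500);
        // Usual recommendation for MR scans: don't fill the region server block cache
        scan.setCacheBlocks(false);

        TableMapReduceUtil.initTableMapperJob("wc_origin", scan, WC2Mapper.class,
                Text.class, IntWritable.class, job);
        TableMapReduceUtil.initTableReducerJob("wc2", WC2Reducer.class, job);
        job.waitForCompletion(true);
    }
}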
# On node01 as the bigdata user

# Package the project above and upload the jar to node01

# Prepare the tables and data in the hbase shell
hbase shell
# Create the input table and insert some rows
create 'wc_origin','cf'
put 'wc_origin','1','cf:name','hello'
put 'wc_origin','2','cf:name','hello'
put 'wc_origin','3','cf:name','hello'
put 'wc_origin','4','cf:name','hello'
put 'wc_origin','5','cf:name','world'
put 'wc_origin','6','cf:name','java'
put 'wc_origin','7','cf:name','python'
put 'wc_origin','8','cf:name','mysql'
# Create the output table
create 'wc2','cf'
# Quit the hbase shell
quit

# Run the MR job
HADOOP_CLASSPATH=$(hbase mapredcp):$HBASE_HOME/conf hadoop jar hbase-test.jar org.example.hbase.wc2.WC2Runner

# Open the hbase shell and check the result
hbase shell
# View the MR output
scan 'wc2'
ROW                                             COLUMN+CELL
 hello                                          column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=4
 java                                           column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 mysql                                          column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 python                                         column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1
 world                                          column=cf:ct, timestamp=2024-04-15T11:44:46.304, value=1

Note: both examples above fail when run from a Windows environment (the reason is unclear); everything above was tested successfully on Linux.

Phoenix can be used to analyze the data in HBase with SQL.