Several Ways to Read and Write HBase

1. Reading and writing HBase with the plain Java API
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * @author 260502
 */
public class HBaseUtil {
    private static final Logger logger;
    static Configuration conf;
    static Connection conn = null;

    static {
        logger = LoggerFactory.getLogger(HBaseUtil.class);
        // By default the configuration is loaded from hbase-site.xml under the resources directory
        conf = HBaseConfiguration.create();
        try {
            conn = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Create an HBase table.
     *
     * @param tableName    table name
     * @param columnFamily column family name
     * @throws Exception on error
     */
    public static void createTable(String tableName, String columnFamily) throws Exception {
        // Split keys used to pre-split the new table into regions
        String key = "1,2,3,4,5,6,7,8,9";
        Admin admin = conn.getAdmin();
        if (!admin.tableExists(TableName.valueOf(tableName))) {
            // Build the column family descriptor
            ColumnFamilyDescriptor cfd = ColumnFamilyDescriptorBuilder
                .newBuilder(Bytes.toBytes(columnFamily))
                // Maximum number of versions to keep
                .setMaxVersions(3)
                // Column family compression codec
                .setCompressionType(Compression.Algorithm.SNAPPY)
                .build();
            // Build the table descriptor
            TableDescriptor td = TableDescriptorBuilder
                .newBuilder(TableName.valueOf(tableName))
                .setColumnFamily(cfd)
                .build();
            // Create the table with pre-split regions
            admin.createTable(td, Bytes.toByteArrays(key.split(",")));
        } else {
            logger.info("{}表已存在", tableName);
        }
    }
	
	/**
     * Batch-insert data into HBase.
     *
     * @param tableName table name
     * @param context   rows to insert; each inner list is [rowKey, columnFamily, columnName, value]
     * @throws Exception on error
     */
    public static void batchInsert(String tableName, List<List<String>> context) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        ArrayList<Put> puts = new ArrayList<Put>(16);
        for (List<String> valueList : context) {
            Put put = new Put(Bytes.toBytes(valueList.get(0)));
            // put.addColumn takes three arguments: column family, column qualifier, value
            put.addColumn(Bytes.toBytes(valueList.get(1)), Bytes.toBytes(valueList.get(2)), Bytes.toBytes(valueList.get(3)));
            puts.add(put);
        }
        table.put(puts);

        table.close();
    }

    /**
     * Delete a whole row by rowKey, or a single column family of that row, or a single column of a column family.
     *
     * @param tableName    table name
     * @param rowKey       row key
     * @param columnFamily column family
     * @param columnName   column qualifier
     * @throws Exception on error
     */
    public static void delete(String tableName, String rowKey, String columnFamily, String columnName) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Delete line = new Delete(Bytes.toBytes(rowKey));

        // With no further restriction the Delete removes the whole row.
        // To delete only one column family of the row, uncomment:
        // line.addFamily(Bytes.toBytes(columnFamily));
        // To delete only a single column of that family, uncomment:
        // line.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName));
        table.delete(line);

        table.close();
    }
	
	/**
     * Update a value by rowKey, column family and column qualifier.
     *
     * @param tableName    table name
     * @param rowKey       row key
     * @param columnFamily column family
     * @param columnName   column qualifier
     * @param columnValue  new value
     * @throws Exception on error
     */
    public static void update(String tableName, String rowKey, String columnFamily, String columnName, String columnValue) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Put put = new Put(Bytes.toBytes(rowKey));
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), Bytes.toBytes(columnValue));
        table.put(put);
        table.close();
    }

    /**
     * Query data by rowKey (point lookups via Get).
     *
     * @param tableName table name
     * @param rowKeys   list of rowKeys to fetch
     * @return list of cell values for the requested rows
     * @throws Exception on error
     */
    public static List<String> queryByRowKey(String tableName, List<String> rowKeys) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        List<String> queryValues = new ArrayList<String>(16);
        Get get;
        for (String rowKey : rowKeys) {
            get = new Get(Bytes.toBytes(rowKey));
            Result result = table.get(get);
            Cell[] cells = result.rawCells();
            for (Cell cell : cells) {
                queryValues.add(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()));
            }
        }
        table.close();
        return queryValues;
    }
	
	public static void scanTable(String tableName) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));

        Scan scan = new Scan();
        // No filter set: a full table scan
        ResultScanner scanner = table.getScanner(scan);

        // Row key filter: "Key1$" matches row keys ending with Key1 (like %Key1 in SQL); "^Key1" would match row keys starting with Key1 (like Key1% in SQL)
        RowFilter filter = new RowFilter(CompareOperator.EQUAL, new RegexStringComparator("Key1$"));
        scan.setFilter(filter);
        ResultScanner scanner1 = table.getScanner(scan);

        // Single column value filter; the four arguments are: column family, column qualifier, comparison operator, value
        String columnFamily = "columnFamily";
        String columnName = "columnName";
        String value = "value";
        SingleColumnValueFilter filter1 = new SingleColumnValueFilter(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), CompareOperator.EQUAL, Bytes.toBytes(value));
        scan.setFilter(filter1);
        ResultScanner scanner2 = table.getScanner(scan);

        // Column qualifier prefix filter
        ColumnPrefixFilter filter2 = new ColumnPrefixFilter(Bytes.toBytes(columnName));
        scan.setFilter(filter2);
        ResultScanner scanner3 = table.getScanner(scan);

        // Filter list: combines several filters, like multiple conditions in a SQL WHERE clause
        FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
        SingleColumnValueFilter filter3 = new SingleColumnValueFilter(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), CompareOperator.EQUAL, Bytes.toBytes(value));
        ColumnPrefixFilter filter4 = new ColumnPrefixFilter(Bytes.toBytes(columnName));
        filterList.addFilter(filter3);
        filterList.addFilter(filter4);
        scan.setFilter(filterList);
        ResultScanner scanner4 = table.getScanner(scan);
		
		for (Result result : scanner4) {
            String rowKey = Bytes.toString(result.getRow());
            Cell[] cells = result.rawCells();
            for (Cell cell : cells) {
                String rowKeyStr = Bytes.toString(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength());
                // column family
                String familyStr = Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
                // column qualifier
                String qualifierStr = Bytes.toString(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
                // cell value
                String valueStr = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
            }
        }
    }
}
2. Writing data with saveAsHadoopDataset and saveAsNewAPIHadoopDataset
import java.net.URI

import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory, Put, Result, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD.fromRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

/**
 * Several ways to operate on HBase from Spark
 *
 * @author 260502
 */
object HbaseOperator {
  /**
   * saveAsHadoopDataset writes through the old Hadoop mapred API (org.apache.hadoop.hbase.mapred.TableOutputFormat)
   * saveAsNewAPIHadoopDataset writes through the new Hadoop mapreduce API (org.apache.hadoop.hbase.mapreduce.TableOutputFormat)
   */
  def insert(spark: SparkSession, tableName: String, familyColumn: String): Unit = {
    val sc: SparkContext = spark.sparkContext
    // If not set explicitly, the configuration is loaded from hbase-site.xml under the resources directory
    val config: Configuration = HBaseConfiguration.create
	
    // Test data: an RDD of CSV-like strings (rowKey,name,age)
    val value: RDD[String] = sc.makeRDD(Array("12,jack,16", "11,Lucy,15", "15,mike,17", "13,Lily,14"))
    val data: RDD[(ImmutableBytesWritable, Put)] = value.map { line =>
      val Array(key, name, age) = line.split(",")
      val put: Put = new Put(Bytes.toBytes(key))
      put.addColumn(Bytes.toBytes(familyColumn), Bytes.toBytes("name"), Bytes.toBytes(name))
      put.addColumn(Bytes.toBytes(familyColumn), Bytes.toBytes("age"), Bytes.toBytes(age))
      (new ImmutableBytesWritable(), put)
    }
    // Initialize the job and set the output format for saveAsHadoopDataset; this TableOutputFormat is the one from the org.apache.hadoop.hbase.mapred package
    config.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val jobConf: JobConf = new JobConf(config)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    data.saveAsHadoopDataset(jobConf)

    // Initialize the job and set the output format for saveAsNewAPIHadoopDataset; this TableOutputFormat is the one from the org.apache.hadoop.hbase.mapreduce package
    import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

    config.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job: Job = Job.getInstance(config)
    // Output key type
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    // Output value type. The values actually written are Puts; many examples set Result here,
    // and it works because TableOutputFormat does not check this setting.
    job.setOutputValueClass(classOf[Result])
    // Output format
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    data.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
3. Reading HBase data with newAPIHadoopRDD
def readFromHBaseWithHBaseNewAPIScan(): Unit = {
  // Silence noisy Spark logging on the console
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  val sparkSession = SparkSession.builder().appName("SparkToHBase").master("local").getOrCreate()
  val sc = sparkSession.sparkContext

  val tableName = "test"
  val hbaseConf = HBaseConfiguration.create()
  hbaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, tableName)

  // Serialize the Scan and hand it to TableInputFormat through the configuration
  val scan = new Scan()
  scan.addFamily(Bytes.toBytes("cf1"))
  val proto = ProtobufUtil.toScan(scan)
  val scanToString = new String(Base64.getEncoder.encode(proto.toByteArray))
  hbaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN, scanToString)

  // Read the table into an RDD; TableInputFormat is the one from the org.apache.hadoop.hbase.mapreduce package
  val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

  hbaseRDD
    .map(x => x._2)
    .map { result => (result.getRow, result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("name")), result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("age"))) }
    .map(row => (new String(row._1), new String(row._2), new String(row._3)))
    .collect()
    .foreach(r => println("rowKey:" + r._1 + ", name:" + r._2 + ", age:" + r._3))
}
4. Bulk-inserting data into HBase with BulkLoad
BulkLoad works by first using MapReduce (or Spark) to generate the corresponding HFile files on HDFS and then importing those HFiles into HBase, which makes large batch inserts very efficient; a minimal sketch follows after the reference link below.
Reference blog post: <https://www.cnblogs.com/swordfall/p/10517177.html>
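To make the flow concrete, here is a minimal Spark/Scala sketch of that idea rather than a production implementation. It assumes a table named "test" with a single column family "cf1" already exists, that only a "name" column is written, and that "/tmp/hbase_bulkload" is a writable HDFS staging directory that does not exist yet; a pre-split table would additionally require the RDD to be partitioned to match the region boundaries.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession

object BulkLoadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("BulkLoadSketch").master("local").getOrCreate()
    val sc = spark.sparkContext

    val tableName = TableName.valueOf("test")  // assumed existing table
    val family = "cf1"                         // assumed column family
    val stagingDir = "/tmp/hbase_bulkload"     // assumed not-yet-existing HDFS staging directory

    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(tableName)
    val regionLocator = conn.getRegionLocator(tableName)

    // 1. Build (rowKey, KeyValue) pairs; HFiles require rows (and qualifiers within a row) in sorted order.
    //    Sorting is done on the plain string fields so that only strings, not KeyValues, get shuffled.
    val data = sc.makeRDD(Array("12,jack,16", "11,Lucy,15", "13,Lily,14"))
      .map(_.split(","))
      .sortBy(fields => fields(0))
      .map { fields =>
        val rowKey = Bytes.toBytes(fields(0))
        val kv = new KeyValue(rowKey, Bytes.toBytes(family), Bytes.toBytes("name"), Bytes.toBytes(fields(1)))
        (new ImmutableBytesWritable(rowKey), kv)
      }

    // 2. Write HFiles to the staging directory on HDFS with HFileOutputFormat2
    val job = Job.getInstance(hbaseConf)
    HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
    data.saveAsNewAPIHadoopFile(stagingDir,
      classOf[ImmutableBytesWritable], classOf[KeyValue],
      classOf[HFileOutputFormat2], job.getConfiguration)

    // 3. Hand the generated HFiles over to HBase
    val loader = new LoadIncrementalHFiles(hbaseConf)
    loader.doBulkLoad(new Path(stagingDir), conn.getAdmin, table, regionLocator)

    table.close()
    conn.close()
    spark.stop()
  }
}

Because the HFiles are moved straight into the regions, the data skips the normal write path (WAL and MemStore), which is what makes BulkLoad efficient for large batches.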
5. Reading and writing HBase through Phoenix
def readFromHBaseWithPhoenix(): Unit = {
   // Silence noisy Spark logging on the console
   Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

   val sparkSession = SparkSession.builder().appName("SparkHBaseDataFrame").master("local[4]").getOrCreate()

   // A lowercase table name must be wrapped in double quotes, otherwise Phoenix reports an error
   val dbTable = "\"test\""

   // First way to read Phoenix into a Spark DataFrame: the generic JDBC data source
   val rdf = sparkSession.read
     .format("jdbc")
     .option("driver", "org.apache.phoenix.jdbc.PhoenixDriver")
     .option("url", "jdbc:phoenix:192.168.187.201:2181")
     .option("dbtable", dbTable)
     .load()

   val rdfList = rdf.collect()
   for (i <- rdfList){
     println(i.getString(0) + " " + i.getString(1) + " " + i.getString(2))
   }
   rdf.printSchema()

   // Second way to read Phoenix into a Spark DataFrame: the phoenix-spark data source
   val df = sparkSession.read
     .format("org.apache.phoenix.spark")
     .options(Map("table" -> dbTable, "zkUrl" -> "192.168.187.201:2181"))
     .load()
   df.printSchema()
   val dfList = df.collect()
   for (i <- dfList){
      println(i.getString(0) + " " + i.getString(1) + " " + i.getString(2))
   }
   // Writing a Spark DataFrame to Phoenix (the target table must be created beforehand)
   /*df.write
     .format("org.apache.phoenix.spark")
     .mode(SaveMode.Overwrite)
     .options(Map("table" -> "PHOENIXTESTCOPY", "zkUrl" -> "192.168.187.201:2181"))
     .save()
   */
   sparkSession.stop()
}