1. Reading and writing HBase with the plain Java API
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* @author 260502
*/
public class HBaseUtil {
    private static final Logger logger;
    static Configuration conf;
    static Connection conn = null;

    static {
        logger = LoggerFactory.getLogger(HBaseUtil.class);
        // By default the configuration is loaded from hbase-site.xml on the classpath (resources)
        conf = HBaseConfiguration.create();
        try {
            conn = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Create an HBase table
     *
     * @param tableName    table name
     * @param columnFamily column family name
     * @throws Exception exception
     */
    public static void createTable(String tableName, String columnFamily) throws Exception {
        String splitKeys = "1,2,3,4,5,6,7,8,9";
        Admin admin = conn.getAdmin();
        if (!admin.tableExists(TableName.valueOf(tableName))) {
            // Build the column family descriptor
            ColumnFamilyDescriptor cfd = ColumnFamilyDescriptorBuilder
                    .newBuilder(Bytes.toBytes(columnFamily))
                    // Maximum number of versions to keep
                    .setMaxVersions(3)
                    // Compression algorithm for this column family
                    .setCompressionType(Compression.Algorithm.SNAPPY)
                    .build();
            // Build the table descriptor from the column family descriptor
            TableDescriptor td = TableDescriptorBuilder
                    .newBuilder(TableName.valueOf(tableName))
                    .setColumnFamily(cfd)
                    .build();
            // Create the table with pre-split regions
            admin.createTable(td, Bytes.toByteArrays(splitKeys.split(",")));
        } else {
            logger.info("Table {} already exists", tableName);
        }
    }
    /**
     * Batch-insert data into HBase
     *
     * @param tableName table name
     * @param context   rows to insert; each inner list is [rowKey, columnFamily, columnName, value]
     * @throws Exception exception
     */
    public static void batchInsert(String tableName, List<List<String>> context) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        List<Put> puts = new ArrayList<>(16);
        for (List<String> valueList : context) {
            Put put = new Put(Bytes.toBytes(valueList.get(0)));
            // put.addColumn takes three arguments: column family, column qualifier, value
            put.addColumn(Bytes.toBytes(valueList.get(1)), Bytes.toBytes(valueList.get(2)), Bytes.toBytes(valueList.get(3)));
            puts.add(put);
        }
        table.put(puts);
        table.close();
    }
    /**
     * Delete by rowKey: a whole row, one column family of that row,
     * or one column of one column family of that row
     *
     * @param tableName    table name
     * @param rowKey       rowKey
     * @param columnFamily column family (null to delete the whole row)
     * @param columnName   column name (null to delete the whole column family)
     * @throws Exception exception
     */
    public static void delete(String tableName, String rowKey, String columnFamily, String columnName) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Delete line = new Delete(Bytes.toBytes(rowKey));
        if (columnFamily != null && columnName != null) {
            // Restrict the delete to one column of the given column family
            line.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName));
        } else if (columnFamily != null) {
            // Restrict the delete to the given column family
            line.addFamily(Bytes.toBytes(columnFamily));
        }
        // Without addFamily/addColumn the whole row identified by rowKey is deleted
        table.delete(line);
        table.close();
    }
    /**
     * Update a value by rowKey, column family and column name
     *
     * @param tableName    table name
     * @param rowKey       rowKey
     * @param columnFamily column family
     * @param columnName   column name
     * @param columnValue  new value
     * @throws Exception exception
     */
    public static void update(String tableName, String rowKey, String columnFamily, String columnName, String columnValue) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        // An update in HBase is simply a Put on the same rowKey/family/qualifier
        Put put = new Put(Bytes.toBytes(rowKey));
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), Bytes.toBytes(columnValue));
        table.put(put);
        table.close();
    }
    /**
     * Query data by rowKey (point lookups via Get)
     *
     * @param tableName table name
     * @param rowKeys   list of rowKeys
     * @return list of cell values
     * @throws Exception exception
     */
    public static List<String> queryByRowKey(String tableName, List<String> rowKeys) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        List<String> queryValues = new ArrayList<>(16);
        Get get;
        for (String rowKey : rowKeys) {
            get = new Get(Bytes.toBytes(rowKey));
            Result result = table.get(get);
            Cell[] cells = result.rawCells();
            for (Cell cell : cells) {
                queryValues.add(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()));
            }
        }
        table.close();
        return queryValues;
    }
    public static void scanTable(String tableName) throws Exception {
        Table table = conn.getTable(TableName.valueOf(tableName));
        Scan scan = new Scan();
        // Full table scan
        ResultScanner scanner = table.getScanner(scan);
        // RowKey filter: "Key1$" matches the end of the rowKey (like %Key1 in SQL), "^Key1" matches the beginning (like Key1%)
        RowFilter filter = new RowFilter(CompareOperator.EQUAL, new RegexStringComparator("Key1$"));
        scan.setFilter(filter);
        ResultScanner scanner1 = table.getScanner(scan);
        // Single-column value filter; the four arguments are: column family, column name, compare operator, value
        String columnFamily = "columnFamily";
        String columnName = "columnName";
        String value = "value";
        SingleColumnValueFilter filter1 = new SingleColumnValueFilter(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), CompareOperator.EQUAL, Bytes.toBytes(value));
        scan.setFilter(filter1);
        ResultScanner scanner2 = table.getScanner(scan);
        // Column qualifier prefix filter
        ColumnPrefixFilter filter2 = new ColumnPrefixFilter(Bytes.toBytes(columnName));
        scan.setFilter(filter2);
        ResultScanner scanner3 = table.getScanner(scan);
        // Filter list, equivalent to combining several conditions in SQL
        FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
        SingleColumnValueFilter filter3 = new SingleColumnValueFilter(Bytes.toBytes(columnFamily), Bytes.toBytes(columnName), CompareOperator.EQUAL, Bytes.toBytes(value));
        ColumnPrefixFilter filter4 = new ColumnPrefixFilter(Bytes.toBytes(columnName));
        filterList.addFilter(filter3);
        filterList.addFilter(filter4);
        scan.setFilter(filterList);
        ResultScanner scanner4 = table.getScanner(scan);
        for (Result result : scanner4) {
            String rowKey = Bytes.toString(result.getRow());
            Cell[] cells = result.rawCells();
            for (Cell cell : cells) {
                // rowKey
                String rowKeyStr = Bytes.toString(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength());
                // Column family name
                String familyStr = Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
                // Column qualifier
                String qualifierStr = Bytes.toString(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
                // Cell value
                String valueStr = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
            }
        }
        table.close();
    }
}
2. Writing data with saveAsHadoopDataset and saveAsNewAPIHadoopDataset
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object HbaseOperator {
  def insert(spark: SparkSession, tableName: String, familyColumn: String): Unit = {
    val sc: SparkContext = spark.sparkContext
    val config: Configuration = HBaseConfiguration.create
    val value: RDD[String] = sc.makeRDD(Array("12,jack,16", "11,Lucy,15", "15,mike,17", "13,Lily,14"))
    // Turn each "rowKey,name,age" line into an (ImmutableBytesWritable, Put) pair
    val data: RDD[(ImmutableBytesWritable, Put)] = value.map { line =>
      val Array(key, name, age) = line.split(",")
      val put: Put = new Put(Bytes.toBytes(key))
      put.addColumn(Bytes.toBytes(familyColumn), Bytes.toBytes("name"), Bytes.toBytes(name))
      put.addColumn(Bytes.toBytes(familyColumn), Bytes.toBytes("age"), Bytes.toBytes(age))
      (new ImmutableBytesWritable(), put)
    }

    // Variant 1: old MapReduce API (org.apache.hadoop.hbase.mapred.TableOutputFormat) + saveAsHadoopDataset
    config.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val jobConf: JobConf = new JobConf(config)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    data.saveAsHadoopDataset(jobConf)

    // Variant 2: new MapReduce API (org.apache.hadoop.hbase.mapreduce.TableOutputFormat) + saveAsNewAPIHadoopDataset
    import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
    config.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job: Job = Job.getInstance(config)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    data.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
3. Reading HBase data with newAPIHadoopRDD
def readFromHBaseWithHBaseNewAPIScan(): Unit = {
  // Suppress unneeded log output on the console
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  val sparkSession = SparkSession.builder().appName("SparkToHBase").master("local").getOrCreate()
  val sc = sparkSession.sparkContext
  val tableName = "test"
  val hbaseConf = HBaseConfiguration.create()
  hbaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, tableName)
  // Serialize the Scan (here restricted to column family cf1) into the configuration
  val scan = new Scan()
  scan.addFamily(Bytes.toBytes("cf1"))
  val proto = ProtobufUtil.toScan(scan)
  val scanToString = new String(Base64.getEncoder.encode(proto.toByteArray))
  hbaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN, scanToString)
  // Read the data as an RDD; TableInputFormat comes from the org.apache.hadoop.hbase.mapreduce package
  val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
  hbaseRDD
    .map(x => x._2)
    .map { result => (result.getRow, result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("name")), result.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("age"))) }
    .map(row => (new String(row._1), new String(row._2), new String(row._3)))
    .collect()
    .foreach(r => println("rowKey:" + r._1 + ", name:" + r._2 + ", age:" + r._3))
}
4. Batch-inserting data into HBase with BulkLoad
BulkLoad works by first using MapReduce to generate the corresponding HFile files on HDFS, and then importing those HFiles directly into HBase, which makes large batch inserts very efficient; a Spark-based sketch of the same two-step flow is shown below.
Blog post: <https://www.cnblogs.com/swordfall/p/10517177.html>
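The following is a minimal Spark sketch of those two steps, not the exact code from the linked blog post. It assumes a target table "test" with column family "cf1" already exists, uses "/tmp/hfile_out" as a made-up scratch HDFS directory, and the object name BulkLoadSketch is invented for illustration; the exact package of LoadIncrementalHFiles varies slightly across HBase versions (it is taken from org.apache.hadoop.hbase.mapreduce here, matching the imports used earlier).
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession

object BulkLoadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("BulkLoadSketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    val conf = HBaseConfiguration.create()
    val tableName = "test"            // assumption: table already exists with column family cf1
    val hfilePath = "/tmp/hfile_out"  // assumption: scratch HDFS directory for the generated HFiles

    // Cells must reach HFileOutputFormat2 sorted by rowKey (and by qualifier within a row)
    val rows = sc.makeRDD(Seq(("11", "Lucy", "15"), ("12", "jack", "16"))).sortBy(_._1)
    val kvRDD = rows.flatMap { case (rowKey, name, age) =>
      val rk = Bytes.toBytes(rowKey)
      // Qualifiers emitted in lexicographic order: "age" before "name"
      Seq(
        (new ImmutableBytesWritable(rk), new KeyValue(rk, Bytes.toBytes("cf1"), Bytes.toBytes("age"), Bytes.toBytes(age))),
        (new ImmutableBytesWritable(rk), new KeyValue(rk, Bytes.toBytes("cf1"), Bytes.toBytes("name"), Bytes.toBytes(name)))
      )
    }

    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf(tableName))
    val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))

    // Step 1: generate HFiles on HDFS. configureIncrementalLoad copies the table's
    // compression/bloom settings and the table name into the job configuration.
    val job = Job.getInstance(conf)
    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])
    HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
    kvRDD.saveAsNewAPIHadoopFile(
      hfilePath,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      job.getConfiguration)

    // Step 2: hand the generated HFiles over to the region servers
    val loader = new LoadIncrementalHFiles(conf)
    loader.doBulkLoad(new Path(hfilePath), conn.getAdmin, table, regionLocator)

    conn.close()
    spark.stop()
  }
}
Because Spark rather than MapReduce does the partitioning here, the RDD is sorted by rowKey before the HFiles are written; for large, pre-split tables you would also want the Spark partitions to line up with the table's regions.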
5. Reading and writing HBase data through Phoenix
def readFromHBaseWithPhoenix(): Unit = {
  // Suppress unneeded log output on the console
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  val sparkSession = SparkSession.builder().appName("SparkHBaseDataFrame").master("local[4]").getOrCreate()
  // Lowercase table names must be wrapped in double quotes, otherwise Phoenix reports an error
  val dbTable = "\"test\""
  // First way for Spark to read Phoenix into a DataFrame: plain JDBC
  val rdf = sparkSession.read
    .format("jdbc")
    .option("driver", "org.apache.phoenix.jdbc.PhoenixDriver")
    .option("url", "jdbc:phoenix:192.168.187.201:2181")
    .option("dbtable", dbTable)
    .load()
  val rdfList = rdf.collect()
  for (i <- rdfList) {
    println(i.getString(0) + " " + i.getString(1) + " " + i.getString(2))
  }
  rdf.printSchema()
  // Second way for Spark to read Phoenix into a DataFrame: the phoenix-spark connector
  val df = sparkSession.read
    .format("org.apache.phoenix.spark")
    .options(Map("table" -> dbTable, "zkUrl" -> "192.168.187.201:2181"))
    .load()
  df.printSchema()
  val dfList = df.collect()
  for (i <- dfList) {
    println(i.getString(0) + " " + i.getString(1) + " " + i.getString(2))
  }
  // Writing a Spark DataFrame back to Phoenix; the target table must already exist
  /*df.write
    .format("org.apache.phoenix.spark")
    .mode(SaveMode.Overwrite)
    .options(Map("table" -> "PHOENIXTESTCOPY", "zkUrl" -> "192.168.187.201:2181"))
    .save()
  */
  sparkSession.stop()
}