Create the HBaseSource package
1. The custom data source package must provide a DefaultSource class
Spark resolves the package name passed to spark.read.format(...) and looks for a class named DefaultSource inside it, so this class name is mandatory.
/**
 * @theme The custom data source must provide a DefaultSource class
 * @author 阿左
 * @create 2022-05-01
 * */
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider}

class DefaultSource extends RelationProvider {
  // Spark calls createRelation with the key/value pairs passed via .option(...)
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    HBaseSourceRelation(sqlContext, parameters)
  }
}
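As a side note (not in the original post), Spark also lets a source register a short format name through the DataSourceRegister trait, so the caller could write spark.read.format("hbase-custom") instead of the full package name. A minimal sketch, assuming the hypothetical short name "hbase-custom" and a ServiceLoader file at src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister containing the fully qualified name of DefaultSource:

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}

// Optional variant of DefaultSource that also registers a short name ("hbase-custom" is an assumption)
class DefaultSource extends RelationProvider with DataSourceRegister {
  override def shortName(): String = "hbase-custom"
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    HBaseSourceRelation(sqlContext, parameters)
  }
}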
2. Create the custom BaseRelation
/**
 * @theme HBaseSourceRelation, a custom BaseRelation
 * @author 阿左
 * @create 2022-05-01
 * */
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}

import scala.collection.mutable.ArrayBuffer

case class HBaseSourceRelation(sqlContext: SQLContext, parameters: Map[String, String])
  extends BaseRelation with TableScan with Logging {

  val hbaseTableName: String = parameters.getOrElse("hbase.table.name", sys.error("the hbase.table.name not found"))
  val sparkTableSchema: String = parameters.getOrElse("spark.table.schema", sys.error("the spark.table.schema not found"))
  val sparkTableFields: Array[SparkTableSchema] = HBaseSourceUtils.extractSparkFields(sparkTableSchema)

  // Build the Spark schema from the "spark.table.schema" option
  override def schema: StructType = {
    val fields = sparkTableFields.map(field => {
      field.fieldType.toLowerCase match {
        case "string" => StructField(field.fieldName, StringType)
        case "int" => StructField(field.fieldName, IntegerType)
      }
    })
    new StructType(fields)
  }

  // Full table scan over the HBase table
  override def buildScan(): RDD[Row] = {
    val configuration = HBaseConfiguration.create()
    configuration.set(HConstants.ZOOKEEPER_QUORUM, "bigdata")
    configuration.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    configuration.set(TableInputFormat.INPUT_TABLE, hbaseTableName)
    val rdd: RDD[(ImmutableBytesWritable, Result)] =
      sqlContext.sparkContext.newAPIHadoopRDD(configuration, classOf[TableInputFormat],
        classOf[ImmutableBytesWritable], classOf[Result])
    rdd.map(_._2).map(result => {
      // Collect the values of one HBase row in field order
      val buffer = ArrayBuffer[Any]()
      sparkTableFields.foreach(field => {
        // Column family is hard-coded to "o" in this example
        val cf: Array[Byte] = Bytes.toBytes("o")
        val column = Bytes.toBytes(field.fieldName)
        field.fieldType.toLowerCase() match {
          case "string" =>
            buffer += Bytes.toString(result.getValue(cf, column))
          case "int" =>
            buffer += Integer.parseInt(Bytes.toString(result.getValue(cf, column)))
        }
      })
      Row.fromSeq(buffer)
    })
  }
}
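As an optional extension not covered in the original post, the relation could also mix in PrunedScan (i.e. extends BaseRelation with TableScan with PrunedScan with Logging) so that only the columns a query actually selects are materialized. A rough sketch under the same assumptions as above (same options, hard-coded column family "o"):

  // Sketch only: pruned scan that extracts just the requested columns
  override def buildScan(requiredColumns: Array[String]): RDD[Row] = {
    val configuration = HBaseConfiguration.create()
    configuration.set(HConstants.ZOOKEEPER_QUORUM, "bigdata")
    configuration.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    configuration.set(TableInputFormat.INPUT_TABLE, hbaseTableName)
    val rdd = sqlContext.sparkContext.newAPIHadoopRDD(configuration, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    // Keep only the fields Spark asked for, in the order it asked for them
    val wantedFields = requiredColumns.flatMap(name => sparkTableFields.find(_.fieldName == name))
    rdd.map(_._2).map(result => {
      val values = wantedFields.map { field =>
        val raw = Bytes.toString(result.getValue(Bytes.toBytes("o"), Bytes.toBytes(field.fieldName)))
        if (field.fieldType.toLowerCase == "int") Integer.parseInt(raw) else raw
      }
      Row.fromSeq(values)
    })
  }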
3. Utility for parsing the option parameters
/**
 * @theme Parameter parsing utils
 * @author 阿左
 * @create 2022-05-01
 * */
object HBaseSourceUtils {
  // Parse a schema string such as "(age int, name string)" into SparkTableSchema entries
  def extractSparkFields(sparkTableSchema: String): Array[SparkTableSchema] = {
    sparkTableSchema.drop(1).dropRight(1).split(",").map(x => {
      val field = x.trim.split("\\s+")
      SparkTableSchema(field(0).trim, field(1).trim)
    })
  }
}
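For illustration only (this call is not in the original post), feeding the utility the schema string used by the test class below yields one SparkTableSchema per declared field:

// Example: parse the schema string used by the test class
val fields = HBaseSourceUtils.extractSparkFields("(age int, name string, sex string)")
// fields: Array(SparkTableSchema(age,int), SparkTableSchema(name,string), SparkTableSchema(sex,string))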
4. A case class that encapsulates the schema field types
/**
 * @theme Case class that encapsulates the table schema
 * @author 阿左
 * @create 2022-05-01
 * */
package object hbaseSource {
  // One entry per field declared in the "spark.table.schema" option
  case class SparkTableSchema(fieldName: String, fieldType: String)
}
5. Test class
/**
 * @theme Test for the custom data source
 * @author 阿左
 * @create 2022-05-01
 * */
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object HbaseRelationTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    // format(...) points at the package that contains the DefaultSource class
    val df = spark.read.format("com.ruozedata.saprk3.hbaseSource")
      .option("hbase.table.name", "user")
      .option("spark.table.schema", "(age int, name string, sex string)")
      .load()
    df.printSchema()
    df.show()
    spark.stop()
  }
}
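For the spark.table.schema option above, df.printSchema() is expected to print roughly the following (every field is nullable because StructField defaults nullable to true); the rows returned by df.show() depend on what is actually stored in the user table:

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)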
HBase data:
Execution result: