Connecting Spark to an HBase External Data Source


Create the HBaseSource package


1. A custom data source package must contain a class named DefaultSource

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider}

/**
 * @theme A custom data source must expose a class named DefaultSource
 * @author 阿左
 * @create 2022-05-01
 * */
class DefaultSource extends RelationProvider {
    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
        HBaseSourceRelation(sqlContext, parameters)
    }
}
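As an optional extension: if callers should be able to pass an explicit schema through spark.read.schema(...), Spark also offers the SchemaRelationProvider trait. Below is a minimal sketch of such a DefaultSource variant; actually threading the supplied StructType into the relation is an assumption here and would require an extra constructor parameter on HBaseSourceRelation:

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

// Sketch only: also accept a user-supplied schema
class DefaultSource extends RelationProvider with SchemaRelationProvider {
    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
        HBaseSourceRelation(sqlContext, parameters)

    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
        HBaseSourceRelation(sqlContext, parameters) // the supplied `schema` would override the parsed one
}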

2. Create the custom BaseRelation

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import scala.collection.mutable.ArrayBuffer

/**
 * @theme HBaseSourceRelation, a custom BaseRelation
 * @author 阿左
 * @create 2022-05-01
 * */
case class HBaseSourceRelation(val sqlContext: SQLContext, val parameters: Map[String, String])
        extends BaseRelation with TableScan with Logging {

    val hbaseTableName: String = parameters.getOrElse("hbase.table.name", sys.error("the hbase.table.name not found"))
    val sparkTableSchema: String = parameters.getOrElse("spark.table.schema", sys.error("the spark.table.schema not found"))

    val sparkTableFields: Array[SparkTableSchema] = HBaseSourceUtils.extractSparkFields(sparkTableSchema)

    // Build the Spark schema from the parsed field descriptors
    override def schema: StructType = {
        val fields = sparkTableFields.map(field => {
            field.fieldType.toLowerCase match {
                case "string" => StructField(field.fieldName, StringType)
                case "int"    => StructField(field.fieldName, IntegerType)
                case other    => sys.error(s"unsupported field type: $other")
            }
        })
        new StructType(fields)
    }

    // Full table scan
    override def buildScan(): RDD[Row] = {
        val configuration = HBaseConfiguration.create()
        configuration.set(HConstants.ZOOKEEPER_QUORUM, "bigdata")
        configuration.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
        configuration.set(TableInputFormat.INPUT_TABLE, hbaseTableName)

        val rdd: RDD[(ImmutableBytesWritable, Result)] = sqlContext.sparkContext.newAPIHadoopRDD(
            configuration, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

        rdd.map(_._2).map(result => {
            // One Row per HBase Result; every column is read from the fixed column family "o".
            // Note: result.getValue returns null when a cell is absent, which would NPE here.
            val buffer = ArrayBuffer[Any]()
            sparkTableFields.foreach(field => {
                val cf: Array[Byte] = Bytes.toBytes("o")
                val column = Bytes.toBytes(field.fieldName)
                field.fieldType.toLowerCase() match {
                    case "string" => buffer += new String(result.getValue(cf, column))
                    case "int"    => buffer += Integer.parseInt(new String(result.getValue(cf, column)))
                }
            })
            Row.fromSeq(buffer)
        })
    }
}
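Note that TableScan always reads whole rows. If queries usually select only a few fields, Spark's PrunedScan trait (its buildScan(requiredColumns: Array[String]) receives exactly the columns the query needs) can be combined with TableInputFormat.SCAN_COLUMNS to restrict what HBase returns. A small helper sketch, assuming the fixed column family "o" used above:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

// Sketch: limit the HBase scan to the requested columns.
// SCAN_COLUMNS expects a space-separated list of "family:qualifier" pairs.
object ScanPruning {
    def pruneScanColumns(conf: Configuration, requiredColumns: Array[String]): Unit = {
        val columns = requiredColumns.map(name => s"o:$name").mkString(" ")
        conf.set(TableInputFormat.SCAN_COLUMNS, columns)
    }
}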

Utility for parsing the option parameters

/**
 * @theme Parameter parsing utils
 * @author 阿左
 * @create 2022-05-01
 * */
object HBaseSourceUtils {
    // Parses a schema string such as "(age int, name string)" into field descriptors:
    // strip the surrounding parentheses, split on commas, then split each entry on whitespace
    def extractSparkFields(sparkTableSchema: String): Array[SparkTableSchema] = {
        sparkTableSchema.drop(1).dropRight(1).split(",").map(x => {
            val field = x.trim.split("\\s+")
            SparkTableSchema(field(0).trim, field(1).trim)
        })
    }
}
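For reference, the schema string used in the test below parses into one SparkTableSchema per field:

val fields = HBaseSourceUtils.extractSparkFields("(age int, name string, sex string)")
// yields: Array(SparkTableSchema(age,int), SparkTableSchema(name,string), SparkTableSchema(sex,string))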

A case class that encapsulates the field name and type of the schema

/**
 * @theme Case class that encapsulates a table schema field
 * @author 阿左
 * @create 2022-05-01
 * */
package object hbaseSource {
    case class SparkTableSchema(fieldName: String, fieldType: String)
}

Test class

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * @theme Test for the custom data source
 * @author 阿左
 * @create 2022-05-01
 * */
object HbaseRelationTest {
    def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")

        val spark = SparkSession.builder().config(sparkConf).getOrCreate()

        // format() takes the package name; Spark appends ".DefaultSource" when resolving it
        val df = spark.read.format("com.ruozedata.saprk3.hbaseSource")
                .option("hbase.table.name", "user")
                .option("spark.table.schema", "(age int, name string, sex string)")
                .load()

        df.printSchema()
        df.show()

        spark.stop()
    }
}

HBase data:

Execution result:
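A closing note: the long package name passed to format(...) works because Spark appends ".DefaultSource" when resolving a data source class. To register a short alias instead, the DataSourceRegister trait can be mixed in. A sketch follows, with the alias "hbase-custom" chosen arbitrarily here; it also needs a META-INF/services/org.apache.spark.sql.sources.DataSourceRegister file listing the class's fully qualified name:

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}

// Sketch: register a short format alias so callers can write .format("hbase-custom")
class DefaultSource extends RelationProvider with DataSourceRegister {
    override def shortName(): String = "hbase-custom"
    override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
        HBaseSourceRelation(sqlContext, parameters)
}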