Spark SQL explained: creating a DataFrame from an RDD


A first Spark SQL program in IDEA
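The build configuration is not shown in the original; assuming an sbt project, the Spark SQL dependency would be declared roughly as follows (the version number is a placeholder, not from the original):

// build.sbt (hypothetical version)
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.5"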

import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo1_HelloWorld {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
//    the SparkContext and SQLContext can be obtained from the SparkSession:
//    val sparkContext: SparkContext = spark.sparkContext
//    val sqlContext: SQLContext = spark.sqlContext
    val df: DataFrame = spark.read.json("data/people.data")
    df.printSchema()
    df.show(10, false)
    spark.close()
  }
}
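The contents of data/people.data are not shown in the original. spark.read.json expects one JSON object per line (JSON Lines), so a hypothetical input file might look like:

{"name": "zhangsan", "age": 20}
{"name": "lisi", "age": 23}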

Creating a DataFrame from an RDD

1. First convert the txt file into an RDD, then use a case class to turn the RDD into a DataFrame.
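data/stu.txt is not shown in the original either, but from the split(",") and the field order used below, each line must carry id, name, age, city and score separated by commas. A hypothetical sample:

1,zhangsan,20,beijing,95.5
2,lisi,23,shanghai,88.0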

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

case class Stu(id:Int,name:String,age:Int,city:String,score:Double)

object CreateDataFrameFromTxt {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
//    read the data
    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")
//    split each line into fields
    val rddStu: RDD[Stu] = rdd.map(_.split(",")).map {
      arr => Stu(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }
//    Option 1: create the DataFrame with createDataFrame
//    val df: DataFrame = spark.createDataFrame(rddStu)
//    Option 2: create the DataFrame by importing the implicit conversions
    import spark.implicits._
    val df: DataFrame = rddStu.toDF()

    df.printSchema()
    df.show()
    spark.close()
  }
}
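Because the schema is inferred from the fields of the Stu case class, printSchema() should print roughly the following (exact nullability can vary by Spark version):

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- city: string (nullable = true)
 |-- score: double (nullable = false)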

2. Convert the text file into an RDD of tuples, then build the DataFrame from it.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromTuple {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")

    val stuRDDTuple: RDD[(Int, String, Int, String, Double)] = rdd.map(_.split(",")).map {
      arr => (arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(stuRDDTuple)

    df.printSchema()
    df.show()
    spark.close()
  }
}
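With tuples there are no field names to infer, so the resulting columns are simply _1 through _5. If named columns are wanted, toDF can assign them explicitly; a minimal sketch reusing the stuRDDTuple from above:

    import spark.implicits._
    // replace the default tuple column names _1 .. _5 with real names
    val dfNamed: DataFrame = stuRDDTuple.toDF("id", "name", "age", "city", "score")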

3. Creating a DataFrame from a Java bean
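The Stu2 class used below is never defined in the original. createDataFrame(rdd, beanClass) discovers columns through JavaBean getters, so Stu2 was presumably a bean with a matching constructor and a getter per field; a sketch of an equivalent definition in Scala, using @BeanProperty to generate the getters:

import scala.beans.BeanProperty

class Stu2(
  @BeanProperty val id: Int,
  @BeanProperty val name: String,
  @BeanProperty val age: Int,
  @BeanProperty val city: String,
  @BeanProperty val score: Double
)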

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromJavaBean {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")

    val rddBean: RDD[Stu2] = rdd.map(_.split(",")).map {
      arr => new Stu2(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(rddBean, classOf[Stu2])
    df.show()

    spark.close()
  }
}

4. Creating a DataFrame from a Scala bean

import scala.beans.BeanProperty

class Stu3(
  @BeanProperty val id: Int,
  @BeanProperty val name: String,
  @BeanProperty val age: Int,
  @BeanProperty val city: String,
  @BeanProperty val score: Double
)
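@BeanProperty tells the compiler to generate JavaBean-style getters (getId, getName, ...), which is exactly what createDataFrame(rddBean, classOf[Stu3]) reflects on to discover the columns.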
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromScalaBean {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")

    val rddBean: RDD[Stu3] = rdd.map(_.split(",")).map {
      arr => new Stu3(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(rddBean, classOf[Stu3])

    df.show()
    spark.close()
  }
}

5. Creating a DataFrame from Row objects

Note: the data in a DataFrame is, at bottom, still held in an RDD, and an RDD[T] always has some element type T; for the RDD inside a DataFrame, that T is the framework-defined Row type.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object CreateDataFrameFromRDDRow {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")

    val rddRow: RDD[Row] = rdd.map(_.split(",")).map {
      arr => Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    // build the schema explicitly: one field name and type per column
    val schema = new StructType()
      .add("id", DataTypes.IntegerType)
      .add("name", DataTypes.StringType)
      .add("age", DataTypes.IntegerType)
      .add("city", DataTypes.StringType)
      .add("score", DataTypes.DoubleType)

    val df: DataFrame = spark.createDataFrame(rddRow, schema)

    df.show()

    spark.close()

  }

}
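Tying back to the note at the start of this section, the conversion also runs in reverse: df.rdd recovers the underlying RDD, and its element type is exactly Row. A minimal sketch, continuing from the df built above:

    // the DataFrame's underlying RDD has element type Row
    val rows: RDD[Row] = df.rdd
    // read fields back by position and type
    rows.take(3).foreach(r => println(r.getInt(0) + " -> " + r.getString(1)))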