Writing a SparkSQL program in IDEA
import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo1_HelloWord {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // The SparkContext and SQLContext can be obtained from the SparkSession if needed
    // val sparkContext: SparkContext = spark.sparkContext
    // val sqlContext: SQLContext = spark.sqlContext

    val df: DataFrame = spark.read.json("data/people.data")
    df.printSchema()
    df.show(10, false)

    spark.close()
  }
}
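spark.read.json expects JSON Lines input, i.e. one complete JSON object per line. The actual contents of data/people.data are not shown here; a hypothetical file in that format could look like:

{"name": "Michael", "age": 30}
{"name": "Andy", "age": 19}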
Creating a DataFrame from an RDD
1. First convert the txt file into an RDD, then turn the RDD into a DataFrame via a case class.
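The contents of data/stu.txt are not shown in the original; judging from the split(",") parsing below, each line is expected to carry five comma-separated fields (id, name, age, city, score). A hypothetical example:

1,zhangsan,18,beijing,95.5
2,lisi,20,shanghai,88.0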
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

case class Stu(id: Int, name: String, age: Int, city: String, score: Double)

object CreateDataFrameFromTxt {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Read the data
    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")

    // Split each line into fields and map it to the case class
    val rddStu: RDD[Stu] = rdd.map(_.split(",")).map {
      arr => Stu(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    // Option 1: create the DataFrame with createDataFrame
    // val df: DataFrame = spark.createDataFrame(rddStu)

    // Option 2: import the implicit conversions and call toDF
    import spark.implicits._
    val df: DataFrame = rddStu.toDF()

    df.printSchema()
    df.show()
    spark.close()
  }
}
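Once the DataFrame exists, it can also be queried with SQL by registering it as a temporary view. A minimal sketch (not part of the original program; it assumes the df and spark values from the example above, and the view name stu is an arbitrary choice):

// Register the DataFrame as a temporary view and query it with SQL
df.createOrReplaceTempView("stu")
val byCity: DataFrame = spark.sql("SELECT city, avg(score) AS avg_score FROM stu GROUP BY city")
byCity.show()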
2. Convert the text file lines into tuples and build the DataFrame from the tuple RDD.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromTuple {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")
    val stuRDDTuple: RDD[(Int, String, Int, String, Double)] = rdd.map(_.split(",")).map {
      arr => (arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(stuRDDTuple)
    df.printSchema()
    df.show()
    spark.close()
  }
}
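When the DataFrame is built from an RDD of tuples, the columns are named _1 through _5 by default. A small sketch of one way to get meaningful names instead, assuming the stuRDDTuple and spark values from the example above:

import spark.implicits._
// Same data, but with explicit column names instead of the default _1 ... _5
val dfNamed: DataFrame = stuRDDTuple.toDF("id", "name", "age", "city", "score")
dfNamed.printSchema()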
3. Create a DataFrame from a Java bean.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromJavaBean {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")
    val rddBean: RDD[Stu2] = rdd.map(_.split(",")).map {
      arr => new Stu2(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(rddBean, classOf[Stu2])
    df.show()
    spark.close()
  }
}
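The Stu2 class itself is not shown here. The createDataFrame(rdd, beanClass) overload reads each column value through JavaBean getters, so Stu2 is presumably a plain Java class with a five-argument constructor and getId/getName/getAge/getCity/getScore accessors.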
4. Create a DataFrame from a Scala bean.
import scala.beans.BeanProperty

class Stu3(
  @BeanProperty val id: Int,
  @BeanProperty val name: String,
  @BeanProperty val age: Int,
  @BeanProperty val city: String,
  @BeanProperty val score: Double
)
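The @BeanProperty annotation makes the Scala compiler generate JavaBean-style getters (getId, getName, and so on), which is what the createDataFrame(rdd, beanClass) call below relies on when it infers the columns.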
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromScalaBean {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")
    val rddBean: RDD[Stu3] = rdd.map(_.split(",")).map {
      arr => new Stu3(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val df: DataFrame = spark.createDataFrame(rddBean, classOf[Stu3])
    df.show()
    spark.close()
  }
}
5. Create a DataFrame from Row objects.
Note: the data inside a DataFrame is still ultimately held in an RDD, and an RDD[T] always has some element type T; for the RDD wrapped by a DataFrame, that element type T is the framework-defined Row type.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object CreateDataFrameFromRDDRow {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()

    val rdd: RDD[String] = spark.sparkContext.textFile("data/stu.txt")
    val rddRow: RDD[Row] = rdd.map(_.split(",")).map {
      arr => Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }

    val schema = new StructType()
      .add("id", DataTypes.IntegerType)
      .add("name", DataTypes.StringType)
      .add("age", DataTypes.IntegerType)
      .add("city", DataTypes.StringType)
      .add("score", DataTypes.DoubleType)

    val df: DataFrame = spark.createDataFrame(rddRow, schema)
    df.show()
    spark.close()
  }
}
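Consistent with the note above, the RDD[Row] wrapped by a DataFrame can always be recovered through its rdd member. A minimal sketch, assuming the df from the example above:

// Pull the underlying data back out as an RDD[Row]
val rowRDD: RDD[Row] = df.rdd
rowRDD.take(3).foreach(println)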