Spark官网对Spark Streaming的介绍
sparkStreaming的特点:
1、 使用简单
2、 容错性好
3、 和spark能够无缝衔接
数据流的处理:
核心计算思想
SparkStreaming数据抽象
什么是DStream?
SparkStreaming案例,基本使用
yum install -y nc
package com.spark.day3.stream
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* @author yogurt
* @Date 2022/11/19 - 12:24 - 2022
*
*/
object WordCount1 {
  /**
   * Basic Spark Streaming word count.
   *
   * Reads lines from a socket ("hadoop10":9999), splits them into words,
   * and prints per-word counts for each 5-second micro-batch.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("stream").setMaster("local[*]")
    val sparkContext = new SparkContext(sparkConf)
    sparkContext.setLogLevel("error")
    // One micro-batch every 5 seconds.
    val ssc = new StreamingContext(sparkContext, Seconds(5))
    val lines = ssc.socketTextStream("hadoop10", 9999)
    val wordCounts = lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey((a, b) => a + b)
    wordCounts.print()
    // Start the computation and block until it is terminated.
    ssc.start()
    ssc.awaitTermination()
  }
}
从socket输入数据
控制台输出:
package com.spark.day3.stream
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* @author yogurt
* @Date 2022/11/19 - 15:17 - 2022
*
*/
object WordCount2 {
  /**
   * Stateful Spark Streaming word count.
   *
   * Reads lines from a socket ("hadoop10":9999) in 5-second batches and keeps
   * a running total per word across batches via `updateStateByKey`.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("stream").setMaster("local[*]")
    val context = new SparkContext(conf)
    context.setLogLevel("error")
    val sc = new StreamingContext(context, Seconds(5))
    // updateStateByKey requires a checkpoint directory to persist state across batches.
    sc.checkpoint("./ckp")
    val line = sc.socketTextStream("hadoop10", 9999)
    // Merge this batch's counts for a key with its accumulated history.
    // currentValues: counts for the key in the current batch (may be empty);
    // historyValue: running total from previous batches (None for a new key).
    val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
      (currentValues: Seq[Int], historyValue: Option[Int]) => {
        if (currentValues.nonEmpty) {
          Some(currentValues.sum + historyValue.getOrElse(0))
        } else {
          // No new occurrences this batch: keep the existing state unchanged.
          historyValue
        }
      }
    val wordCounts = line.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc)
    wordCounts.print()
    sc.start()
    sc.awaitTermination()
  }
}
package com.spark.day3.stream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @author yogurt
* @Date 2022/11/19 - 12:24 - 2022
*
*/
object WordCount3 {
  /**
   * Windowed Spark Streaming word count.
   *
   * Reads lines from a socket ("hadoop10":9999) and, every 5 seconds
   * (slide interval), counts the words seen over the last 10 seconds
   * (window length).
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("stream").setMaster("local[*]")
    val sparkCtx = new SparkContext(conf)
    sparkCtx.setLogLevel("error")
    val streamingCtx = new StreamingContext(sparkCtx, Seconds(5))
    val lines = streamingCtx.socketTextStream("hadoop10", 9999)
    // reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration):
    //   windowDuration — how far back in time each computation looks;
    //   slideDuration  — how often the windowed computation runs.
    // Both must be multiples of the batch interval (5s here).
    val windowedCounts = lines
      .flatMap(_.split(" "))
      .map(w => (w, 1))
      .reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(10), Seconds(5))
    windowedCounts.print()
    // Start the computation and block until it is terminated.
    streamingCtx.start()
    streamingCtx.awaitTermination()
  }
}