6. API
Flink SQL cannot apply window aggregations to CDC data (see section 8), so the demo below uses the DataStream API instead.
```scala
package com.bugboy.analysis

import java.sql.Date
import java.text.SimpleDateFormat
import java.util.Properties

import org.apache.flink.formats.avro.registry.confluent.debezium.DebeziumAvroDeserializationSchema
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api.{DataTypes, TableSchema}
import org.apache.flink.table.data.RowData
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo
import org.apache.flink.table.types.logical.RowType
import org.apache.flink.types.RowKind
import org.apache.flink.util.Collector
import org.apache.kafka.clients.consumer.ConsumerConfig

/**
 * Running total of the order amount, computed from a Debezium CDC changelog.
 */
object TotalCost {
  val TOPIC: String = "ods.userAnalysis.order"
  val SCHEMA_REGIST_URL: String = "http://你猜:8081"
  val BOOTSTRAP_SERVERS: String = "打码:9092,"
  val GROUP_ID: String = "flink-analysis"

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.getConfig.setAutoWatermarkInterval(200L)
    env.getCheckpointConfig.setCheckpointInterval(1000L)

    // Physical schema of the order records carried in the Debezium-Avro topic.
    val schema = TableSchema.builder()
      .field("id", DataTypes.INT())
      .field("timestamps", DataTypes.BIGINT())
      .field("orderInformationId", DataTypes.STRING())
      .field("userId", DataTypes.STRING())
      .field("categoryId", DataTypes.INT())
      .field("productId", DataTypes.INT())
      .field("price", DataTypes.DECIMAL(10, 2))
      .field("productCount", DataTypes.INT())
      .field("priceSum", DataTypes.DECIMAL(10, 2))
      .field("shipAddress", DataTypes.STRING())
      .field("receiverAddress", DataTypes.STRING())
      .build()

    val props = new Properties()
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS)
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, GROUP_ID)

    // Deserialize the Debezium-Avro payload into RowData; the change kind ends up in RowKind.
    val rowType = schema.toRowDataType.getLogicalType.asInstanceOf[RowType]
    val deserialization = new DebeziumAvroDeserializationSchema(
      rowType, InternalTypeInfo.of(rowType), SCHEMA_REGIST_URL)

    val kafkaConsumerSource = new FlinkKafkaConsumer[RowData](TOPIC, deserialization, props)
      .setStartFromEarliest()
      .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[RowData] {
        val maxOutOfOrderness = 1000L // tolerate 1 s of out-of-order events
        var currentMaxTime: Long = 0L

        override def extractTimestamp(t: RowData, l: Long): Long = {
          val currentTime = t.getLong(1) // field 1 is "timestamps"
          currentMaxTime = scala.math.max(currentMaxTime, currentTime)
          currentTime
        }

        override def getCurrentWatermark: Watermark =
          new Watermark(currentMaxTime - maxOutOfOrderness)
      })

    env.addSource(kafkaConsumerSource)
      // size == slide, so this behaves like a 1-minute tumbling window
      .windowAll(SlidingEventTimeWindows.of(Time.minutes(1), Time.minutes(1)))
      .process(new ProcessAllWindowFunction[RowData, (String, BigDecimal), TimeWindow]() {
        // Running total, kept across windows so each output is the cumulative amount.
        var sum: BigDecimal = BigDecimal(0)
        val FORMAT: String = "yyyy-MM-dd HH:mm"

        def formatTime(time: Long): String =
          new SimpleDateFormat(FORMAT).format(new Date(time))

        override def process(context: Context, elements: Iterable[RowData],
                             out: Collector[(String, BigDecimal)]): Unit = {
          elements.foreach(row => {
            val priceSum = row.getDecimal(8, 10, 2).toBigDecimal // field 8 is "priceSum"
            // Apply changelog semantics: additions add, retractions subtract.
            row.getRowKind match {
              case RowKind.INSERT => sum = sum + priceSum
              case RowKind.UPDATE_BEFORE => sum = sum - priceSum
              case RowKind.UPDATE_AFTER => sum = sum + priceSum
              case RowKind.DELETE => sum = sum - priceSum
            }
          })
          out.collect((formatTime(context.window.getEnd), sum))
        }
      }).print()

    env.execute()
  }
}
```
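Note how the RowKind handling realizes changelog semantics: if, say, an order's priceSum is updated from 100.00 to 120.00, the CDC stream carries an UPDATE_BEFORE with 100.00 and an UPDATE_AFTER with 120.00, so the running sum first subtracts the old value and then adds the new one, keeping the total consistent with the source database.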
7. Technical Discussion
7.1 Late Data
Late data can be seen as a special kind of out-of-order data, because it arrives only after its window has already closed. There are generally three ways to handle it:
- Re-activate the already-closed window and recompute, correcting the earlier result.
- Collect the late data separately for dedicated processing.
- Treat the late data as erroneous messages and discard them.
Flink by default takes the third approach and discards late data as erroneous. The first two approaches require the sideOutput and allowedLateness mechanisms.

The sideOutput mechanism routes late events into a separate stream branch. This branch is a by-product of the window computation, which users can retrieve for special handling.

The allowedLateness mechanism lets users set a maximum allowed lateness. Flink keeps the state of a window after it closes until this lateness is exceeded; late events arriving within that period are not dropped and, by default, trigger a recomputation of the window.

Concretely, the allowed lateness is set on the windowed stream via allowedLateness(lateness: Time), late data is captured via sideOutputLateData(outputTag: OutputTag[T]), and the captured late data is then read from the resulting stream via DataStream.getSideOutput(tag: OutputTag[X]), as the sketch below shows.
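Here is a minimal sketch of both mechanisms working together; the input data, the field layout, and the tag name `late-orders` are made up for illustration:

```scala
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time

object LateDataDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Hypothetical input: (userId, amount, eventTimeMillis).
    val orders: DataStream[(String, Double, Long)] = env
      .fromElements(("u1", 10.0, 1000L), ("u1", 20.0, 61000L), ("u1", 5.0, 500L))
      .assignAscendingTimestamps(_._3)

    // Tag under which events that exceed the allowed lateness are collected.
    val lateTag = new OutputTag[(String, Double, Long)]("late-orders")

    val result = orders
      .keyBy(_._1)
      .window(TumblingEventTimeWindows.of(Time.minutes(1)))
      .allowedLateness(Time.minutes(5)) // keep window state 5 extra minutes; late events re-fire the window
      .sideOutputLateData(lateTag)      // anything later than that goes to the side output
      .sum(1)

    result.print()                        // on-time results plus late-triggered updates
    result.getSideOutput(lateTag).print() // events too late even for the allowed lateness

    env.execute("late data demo")
  }
}
```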
By default, the window aggregation fires exactly once, when the watermark passes the window end. This means that with a one-day window, for example, you only see a result after the full day has elapsed. To observe intermediate results in real time, you need to configure the emit strategy:

- table.exec.emit.early-fire.enabled: whether to emit results before the watermark reaches the window end; default false.
- table.exec.emit.early-fire.delay: the emit interval before the window ends, in milliseconds; 0 means no interval, a positive value is the interval, a negative value is illegal; no default value.
- table.exec.emit.late-fire.enabled: whether to emit updated results after the watermark has passed the window end; default false.
- table.exec.emit.late-fire.delay: the emit interval after the window ends.
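Below is a minimal sketch of enabling both strategies on the TableConfig. Note these are experimental, largely undocumented options, and the "10 s" duration strings are assumptions about the accepted format:

```scala
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object EmitStrategyDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tEnv = StreamTableEnvironment.create(env)

    val conf = tEnv.getConfig.getConfiguration
    // Emit a partial result every 10 s while the window is still open.
    conf.setString("table.exec.emit.early-fire.enabled", "true")
    conf.setString("table.exec.emit.early-fire.delay", "10 s")
    // After the window closes, re-emit an updated result for late records.
    conf.setString("table.exec.emit.late-fire.enabled", "true")
    conf.setString("table.exec.emit.late-fire.delay", "10 s")
    // ... register the source table and run the windowed query as usual ...
  }
}
```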
7.2 Dynamic Tables
7.3 Temporal Tables
7.4 Joins
7.5 Query Configuration
7.6 Streaming Aggregation
8. Bugs in Flink SQL
- Flink SQL currently does not support window aggregations over CDC data. The query and planner error below illustrate the problem.
```sql
select sum(priceSum) from ods_order group by TUMBLE(ts, INTERVAL '1' DAY)
```
The planner fails with:

```
Exception in thread "main" org.apache.flink.table.api.TableException: GroupWindowAggregate doesn't support consuming update and delete changes which is produced by node TableSourceScan(table=[[default_catalog, default_database, ods_order, watermark=[-(TO_TIMESTAMP(FROM_UNIXTIME(/($1, 1000))), 3000:INTERVAL SECOND)]]], fields=[id, timestamps, orderInformationId, userId, categoryId, productId, price, productCount, priceSum, shipAddress, receiverAddress])
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.createNewNode(FlinkChangelogModeInferenceProgram.scala:380)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:298)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
    at scala.collection.immutable.Range.foreach(Range.scala:158)
    at scala.collection.TraversableLike.map(TraversableLike.scala:285)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
    at scala.collection.AbstractTraversable.map(Traversable.scala:108)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
    at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:275)
    ...
```
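The DataStream job in section 6 is the workaround used in this article: it consumes the changelog directly and applies the retraction arithmetic by hand, so the windowed total stays correct under updates and deletes.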