Building a Real-Time Data Warehouse with FlinkSql, Part 3



5. API

CDC data cannot be windowed in FlinkSql (see the error in Section 7), so the demo below uses the DataStream API instead.

package com.bugboy.analysis

import java.sql.Date
import java.text.SimpleDateFormat
import java.util.Properties

import org.apache.flink.formats.avro.registry.confluent.debezium.DebeziumAvroDeserializationSchema
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api.{DataTypes, TableSchema}
import org.apache.flink.table.data.RowData
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo
import org.apache.flink.table.types.logical.RowType
import org.apache.flink.types.RowKind
import org.apache.flink.util.Collector
import org.apache.kafka.clients.consumer.ConsumerConfig

/**
 * 消费总金额
 */
object TotalCost {
  val TOPIC: String = "ods.userAnalysis.order"
  val SCHEMA_REGIST_URL: String = "http://<masked>:8081" // schema registry host, redacted in the original
  val BOOTSTRAP_SERVERS: String = "<masked>:9092"        // Kafka brokers, redacted in the original
  val GROUP_ID: String = "flink-analysis"

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
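    // Event-time processing: emit watermarks every 200 ms and checkpoint every second.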
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.getConfig.setAutoWatermarkInterval(200L)
    env.getCheckpointConfig.setCheckpointInterval(1000L)
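    // Physical schema of the Debezium Avro order records in the ODS topic.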
    val schema = TableSchema.builder()
      .field("id", DataTypes.INT)
      .field("timestamps", DataTypes.BIGINT())
      .field("orderInformationId", DataTypes.STRING())
      .field("userId", DataTypes.STRING())
      .field("categoryId", DataTypes.INT())
      .field("productId", DataTypes.INT())
      .field("price", DataTypes.DECIMAL(10, 2))
      .field("productCount", DataTypes.INT())
      .field("priceSum", DataTypes.DECIMAL(10, 2))
      .field("shipAddress", DataTypes.STRING())
      .field("receiverAddress", DataTypes.STRING())
      .build()
    val props = new Properties()
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS)
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, GROUP_ID)

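    // Deserializer that decodes Debezium Avro changelog records into Flink's internal RowData.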
    val rowType = schema.toRowDataType.getLogicalType.asInstanceOf[RowType]
    val deserialization: DebeziumAvroDeserializationSchema = new DebeziumAvroDeserializationSchema(rowType, InternalTypeInfo.of(rowType), SCHEMA_REGIST_URL)
    val kafkaConsumerSource = new FlinkKafkaConsumer[RowData](TOPIC, deserialization, props)
      .setStartFromEarliest()
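      // Periodic watermark assigner: track the max event time seen, allowing 1 s of out-of-orderness.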
      .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[RowData] {
        val maxOutOfOrderness = 1000L
        var currentMaxTime: Long = 0L

        override def extractTimestamp(t: RowData, l: Long): Long = {
          val currentTime = t.getLong(1)
          currentMaxTime = scala.math.max(currentMaxTime, currentTime)
          currentTime
        }

        override def getCurrentWatermark: Watermark = {
          new Watermark(currentMaxTime - maxOutOfOrderness)
        }
      })
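    // One-minute event-time windows (slide == size, i.e. effectively tumbling) over the whole stream.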
    env.addSource(kafkaConsumerSource)
      .windowAll(SlidingEventTimeWindows.of(Time.minutes(1), Time.minutes(1)))
      .process(new ProcessAllWindowFunction[RowData, (String, BigDecimal), TimeWindow]() {
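        // Running total kept across windows: the field outlives any single window, so each output is the cumulative total so far.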
        var sum: BigDecimal = BigDecimal(0)
        val FORMAT: String = "yyyy-MM-dd HH:mm"

        def formatTime(time: Long): String = {
          new SimpleDateFormat(FORMAT).format(new Date(time))
        }

        override def process(context: Context, elements: Iterable[RowData], out: Collector[(String, BigDecimal)]): Unit = {
          elements.foreach(row => {
            val priceSum = row.getDecimal(8, 10, 2).toBigDecimal
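            // Fold the changelog: inserts and update-afters add, update-befores and deletes subtract.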
            row.getRowKind match {
              case RowKind.INSERT => sum = sum + priceSum
              case RowKind.UPDATE_BEFORE => sum = sum - priceSum
              case RowKind.UPDATE_AFTER => sum = sum + priceSum
              case RowKind.DELETE => sum = sum - priceSum
            }
          })
          val windowEnd = context.window.getEnd
          out.collect((formatTime(windowEnd), sum))
        }
      }).print()
    env.execute()
  }
}

6. Technical Discussion

6.1 Late Data

Late data can be seen as a special kind of out-of-order data: it arrives only after its window has already closed. There are generally three ways to handle it:

  1. Reactivate the closed window and recompute, correcting the earlier result.
  2. Collect the late records and process them separately.
  3. Treat late records as erroneous and discard them.

By default, Flink takes the third approach and discards late records as erroneous. To use the first two approaches, you need the side-output (sideOutput) mechanism and the allowedLateness mechanism.

The side-output mechanism routes late events into a separate stream branch, produced as a by-product of the window computation, so that they can be retrieved and handled specially.

The allowedLateness mechanism lets you configure a maximum allowed lateness. Flink keeps a window's state after the window closes until that lateness has elapsed; late events arriving within this period are not dropped and, by default, trigger a recomputation of the window.

Concretely: set the allowed lateness on the windowed stream with allowedLateness(lateness: Time), capture late records with sideOutputLateData(outputTag: OutputTag[T]), and retrieve them afterwards with DataStream.getSideOutput(tag: OutputTag[X]).
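A minimal, self-contained sketch tying the three calls together (the element values, window sizes, and the "late-orders" tag name are illustrative assumptions, not from the original):

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

object LateDataDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // (event time, amount) pairs; in practice this would be the order stream from the demo above.
    val orders: DataStream[(Long, Double)] = env
      .fromElements((1000L, 1.0), (2000L, 2.0), (61000L, 3.0))
      .assignAscendingTimestamps(_._1)

    val lateTag = new OutputTag[(Long, Double)]("late-orders")

    val sums = orders
      .windowAll(TumblingEventTimeWindows.of(Time.minutes(1)))
      .allowedLateness(Time.minutes(5))  // keep window state 5 extra minutes; late events re-fire the window
      .sideOutputLateData(lateTag)       // events later than that go to the side output instead of being dropped
      .process(new ProcessAllWindowFunction[(Long, Double), Double, TimeWindow] {
        override def process(context: Context, elements: Iterable[(Long, Double)], out: Collector[Double]): Unit =
          out.collect(elements.map(_._2).sum)
      })

    sums.print()                         // per-window sums, re-emitted whenever a late event updates a window
    sums.getSideOutput(lateTag).print()  // records too late even for allowedLateness
    env.execute("late-data-demo")
  }
}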

By default, the aggregation fires only once, when the watermark passes the end of the window. This means that with a one-day window you only see a result after a full day. If you need to see intermediate results in real time, you have to define a trigger.
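On the DataStream side this means attaching a trigger to the window; a minimal sketch using the built-in ContinuousEventTimeTrigger (the one-day window and one-minute firing interval are illustrative choices):

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.ContinuousEventTimeTrigger
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

val env = StreamExecutionEnvironment.getExecutionEnvironment
val amounts: DataStream[(Long, Double)] = env
  .fromElements((1000L, 1.0), (2000L, 2.0))
  .assignAscendingTimestamps(_._1)

// A one-day window that emits an updated partial result every minute of
// event time, instead of firing only once when the watermark passes its end.
amounts
  .windowAll(TumblingEventTimeWindows.of(Time.days(1)))
  .trigger(ContinuousEventTimeTrigger.of[TimeWindow](Time.minutes(1)))
  .reduce((a, b) => (a._1.max(b._1), a._2 + b._2)) // keep the latest timestamp, sum the amounts
  .print()
env.execute()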

On the SQL/Table side, the Blink planner offers emit strategies for the same purpose. Early firing (before the watermark reaches the window end):

  • table.exec.emit.early-fire.enabled: whether early firing is enabled; default false.
  • table.exec.emit.early-fire.delay: emit interval before the window ends, in milliseconds. = 0 fires on every record, > 0 fires at that interval, < 0 is illegal. No default value.

Late firing (after the watermark reaches the window end):

  • table.exec.emit.late-fire.enabled: whether late firing is enabled; default false.
  • table.exec.emit.late-fire.delay: emit interval for late records, with the same semantics.
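A minimal sketch of enabling these experimental options on a StreamTableEnvironment (the 60-second intervals are an illustrative choice):

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

val env = StreamExecutionEnvironment.getExecutionEnvironment
val tEnv = StreamTableEnvironment.create(env)
val conf = tEnv.getConfig.getConfiguration

// Emit partial results every 60 s before the watermark reaches the window end ...
conf.setString("table.exec.emit.early-fire.enabled", "true")
conf.setString("table.exec.emit.early-fire.delay", "60000")  // milliseconds
// ... and updated results every 60 s for late records arriving after it.
conf.setString("table.exec.emit.late-fire.enabled", "true")
conf.setString("table.exec.emit.late-fire.delay", "60000")   // milliseconds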

6.2 Dynamic Tables

See the official documentation.

6.3 Temporal Tables

See the official documentation.

6.4 Joins

See the official documentation.

6.5 Query Configuration

See the official documentation.

6.6 Streaming Aggregation

See the official documentation.

7. Known FlinkSql Bugs

  • FlinkSql currently does not support window aggregations over CDC data, for example:

-- sql 
select sum(priceSum) from ods_order group by TUMBLE(ts, INTERVAL '1' DAY)

-- error

Exception in thread "main" org.apache.flink.table.api.TableException: GroupWindowAggregate doesn't support consuming update and delete changes which is produced by node TableSourceScan(table=[[default_catalog, default_database, ods_order, watermark=[-(TO_TIMESTAMP(FROM_UNIXTIME(/($1, 1000))), 3000:INTERVAL SECOND)]]], fields=[id, timestamps, orderInformationId, userId, categoryId, productId, price, productCount, priceSum, shipAddress, receiverAddress])
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.createNewNode(FlinkChangelogModeInferenceProgram.scala:380)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:298)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:275)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:275)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:325)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChildren(FlinkChangelogModeInferenceProgram.scala:318)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visit(FlinkChangelogModeInferenceProgram.scala:200)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.visitChild(FlinkChangelogModeInferenceProgram.scala:337)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1(FlinkChangelogModeInferenceProgram.scala:326)
	at org.apache.flink.table.planner.plan.optimize.program.FlinkChangelogModeInferenceProgram$SatisfyModifyKindSetTraitVisitor.$anonfun$visitChildren$1$adapted(FlinkChangelogModeInferenceProgram.scala:325)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:285)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:285)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:278)