Flink自定义数据源

877 阅读1分钟

下面通过一个小例子来看看如何自定义数据源

  1. 定义一个样例类SensorReading,封装温度数据
/**
  * A single temperature measurement reported by a sensor.
  *
  * @param id          identifier of the sensor
  * @param timestamp   timestamp of the reading
  * @param temperature measured temperature value
  */
case class SensorReading(id: String, timestamp: Long, temperature: Double)
  2. 继承RichParallelSourceFunction类,实现run和cancel方法,并在run方法中循环产生无限数据流
import java.util.Calendar

import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

import scala.util.Random
/**
  * Parallel source that emits an unbounded stream of simulated [[SensorReading]] events.
  *
  * The type parameter `SensorReading` is the type of the events produced by this source.
  * Every instance simulates ten sensors named `sensor_1` .. `sensor_10`; on each loop
  * iteration the temperature of every sensor drifts by a small Gaussian step and one
  * reading per sensor is emitted, all sharing the same timestamp.
  */
class SensorSource extends RichParallelSourceFunction[SensorReading] {
  // Whether the source should keep producing data. According to the Flink SourceFunction
  // documentation, cancel() is invoked from a different thread than run(), so the flag is
  // @volatile to guarantee that the emit loop observes the update and terminates.
  @volatile var running: Boolean = true

  /**
    * Emits readings until the source is cancelled.
    *
    * @param ctx source context used to emit the generated readings
    */
  override def run(ctx: SourceContext[SensorReading]): Unit = {
    val rand = new Random

    // Initial (sensor id, temperature) pairs; the start temperature is Gaussian noise scaled by 20.
    var curFTemp = (1 to 10).map(i => ("sensor_" + i, rand.nextGaussian() * 20))

    // Produce an unbounded stream of data.
    while (running) {
      // Random walk: shift every temperature by Gaussian noise scaled by 0.5.
      curFTemp = curFTemp.map { case (id, temp) => (id, temp + rand.nextGaussian() * 0.5) }

      // Current wall-clock time in milliseconds, shared by all readings of this round.
      val curTime = Calendar.getInstance.getTimeInMillis

      // Emit one reading per sensor through the source context.
      curFTemp.foreach { case (id, temp) => ctx.collect(SensorReading(id, curTime, temp)) }

      // Wait 1000 ms before emitting the next batch of ten readings.
      Thread.sleep(1000)
    }
  }

  /** Stops the emit loop in `run` when the Flink job is cancelled. */
  override def cancel(): Unit = running = false
}
  3. 定义一个测试类,并测试
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.streaming.api.scala._

/**
  * Demo job: reads from [[SensorSource]] and extracts the sensor id of every reading
  * using three equivalent styles of `map` (lambda, named class, anonymous class).
  */
object MapExample {

  /** Builds the pipeline with parallelism 1, prints the three mapped streams and runs the job. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Attach the custom source.
    val readings: DataStream[SensorReading] = env.addSource(new SensorSource)

    // Variant 1: plain Scala lambda.
    val idsViaLambda: DataStream[String] = readings.map(reading => reading.id)

    // Variant 2: named MapFunction implementation.
    val idsViaClass: DataStream[String] = readings.map(new MyMapFunction)

    // Variant 3: anonymous MapFunction implementation.
    val idsViaAnonymous: DataStream[String] = readings.map(
      new MapFunction[SensorReading, String] {
        override def map(value: SensorReading): String = value.id
      }
    )

    // Register a print sink on each stream, in the same order as they were defined.
    Seq(idsViaLambda, idsViaClass, idsViaAnonymous).foreach(_.print())

    env.execute()
  }

  /** Maps a [[SensorReading]] (input type) to its sensor id as a `String` (output type). */
  class MyMapFunction extends MapFunction[SensorReading, String] {
    override def map(value: SensorReading): String = value.id
  }
}