5. Flink Stream Processing API


5.1 Environment


5.1.1 getExecutionEnvironment

Creates an execution environment representing the context of the current program. If the program is invoked standalone, this method returns a local execution environment; if it is invoked from a command-line client for submission to a cluster, it returns that cluster's execution environment. In other words, getExecutionEnvironment decides which environment to return based on how the program is run, which makes it the most common way to create an execution environment.

ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // batch environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // streaming environment

If no parallelism is set, the value configured in flink-conf.yaml is used; the default is 1.
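For reference, the cluster-wide default lives in flink-conf.yaml under the parallelism.default key (the value below is the shipped default):

# flink-conf.yaml: used when neither the job nor the submission sets a parallelism
parallelism.default: 1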


5.1.2 createLocalEnvironment

Returns a local execution environment; the default parallelism must be specified at call time.

LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1);

5.1.3 createRemoteEnvironment

Returns a cluster execution environment and submits the Jar to a remote server. The JobManager's IP and port must be specified at call time, along with the Jar package to run on the cluster.

// JobManager host, port, and the jar package to run

StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("job-manage-hostName", 6123, "YourPath//XXXX.jar");

5.2 Source

5.2.1 Reading from a collection

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:28 PM
 */
public class SourceTestCollection {

    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set parallelism to 1 so the whole job runs in a single thread
        env.setParallelism(1);

        // Source: read data from a Java Collection
        DataStream<SensorReading> dataStream = env.fromCollection(
                Arrays.asList(
                        new SensorReading("sensor_1", 1547718199L, 35.8),
                        new SensorReading("sensor_6", 1547718201L, 15.4),
                        new SensorReading("sensor_7", 1547718202L, 6.7),
                        new SensorReading("sensor_10", 1547718205L, 38.1)
                )
        );

        DataStream<Integer> intStream = env.fromElements(1,2,3,4,5,6,7,8,9);

        // Print the output
        dataStream.print("SENSOR");
        intStream.print("INT");

        // Execute the job
        env.execute("JobName");

    }
}

The SensorReading entity:

package daily.flink.api;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:29 PM
 */
@Data               // generates the getters and the toString used by the examples below
@AllArgsConstructor
@NoArgsConstructor  // Flink POJO types require a public no-arg constructor
public class SensorReading {
    /**
     * sensor id
     */
    private String id;

    /**
     * timestamp
     */
    private Long time;

    /**
     * temperature
     */
    private Double temperature;
}


5.2.2 Reading from a file

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:32 PM
 */
public class SourceTestFile {

    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Run the whole job in a single thread
        env.setParallelism(1);

        // Read data from a text file
        DataStream<String> dataStream = env.readTextFile("xxx//sensor.txt");

        dataStream.print();

        env.execute();
    }
}

File contents:

sensor_1, 1547718199, 35.8
sensor_6, 1547718201, 15.4
sensor_7, 1547718202, 6.7
sensor_10, 1547718205, 38.1

5.2.3 Reading from Kafka

1. Add the pom dependency:

<!-- kafka -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
    <version>${flink.version}</version>
</dependency>

2. Start ZooKeeper

$ bin/zookeeper-server-start.sh config/zookeeper.properties

3. Start the Kafka server

$ bin/kafka-server-start.sh config/server.properties

4. Start a Kafka console producer

$ bin/kafka-console-producer.sh --broker-list localhost:9092  --topic sensor

5. Write the Java code

package daily.flink.api;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Properties;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:39 PM
 */
public class SourceTestKafka {

    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set parallelism to 1
        env.setParallelism(1);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092");
        // The parameters below are optional
        properties.setProperty("group.id", "consumer-group");
        properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("auto.offset.reset", "latest");

        // Add the external data source to Flink
        DataStream<String> dataStream = env.addSource(new FlinkKafkaConsumer<String>("sensor", new SimpleStringSchema(),properties));

        // Print the output
        dataStream.print();

        env.execute();
    }
}

6. Run the Java program, then type into the Kafka producer console:

$ bin/kafka-console-producer.sh --broker-list localhost:9092  --topic sensor
>sensor_1,1547718199,35.8
>sensor_6,1547718201,15.4
>

7. Java output:

sensor_1,1547718199,35.8
sensor_6,1547718201,15.4

5.2.4 Custom source

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.HashMap;
import java.util.Random;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:41 PM
 */
public class SourceTestUDF {

    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<SensorReading> dataStream = env.addSource(new MySensorSource());

        dataStream.print();

        env.execute();
    }

    // A custom SourceFunction implementation
    public static class MySensorSource implements SourceFunction<SensorReading> {

        // Flag that controls data generation
        private volatile boolean running = true;


        @Override
        public void run(SourceContext<SensorReading> ctx) throws Exception {
            // Define a random number generator
            Random random = new Random();

            // Set initial temperatures for 10 sensors
            HashMap<String, Double> sensorTempMap = new HashMap<>();
            for (int i = 0; i < 10; ++i) {
                sensorTempMap.put("sensor_" + (i + 1), 60 + random.nextGaussian() * 20); // Gaussian (normal) distribution
            }

            while (running) {
                for (String sensorId : sensorTempMap.keySet()) {
                    // Random fluctuation around the current temperature
                    Double newTemp = sensorTempMap.get(sensorId) + random.nextGaussian();
                    sensorTempMap.put(sensorId, newTemp);
                    ctx.collect(new SensorReading(sensorId,System.currentTimeMillis(),newTemp));
                }
                // Control the output rate
                Thread.sleep(2000L);
            }
        }

        @Override
        public void cancel() {
            this.running = false;
        }
    }
}

5.3 Transform

5.3.1 Basic transformation operators (map/flatMap/filter)

map, flatMap, and filter are collectively called the basic (simple) transformation operators.

  • map

    Applies the given function to each value in the stream, one to one, producing a stream with the same number of elements.

  • flatMap

    "Flat" means flatten. It also applies the given function to each value one to one, but each result is itself a sub-stream; once those sub-streams are flattened into a single stream, the element count will usually differ from the input's.

  • filter

    Filtering: elements for which the predicate returns true pass through; those returning false are dropped.

package daily.flink.api;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Transform operators
 *
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:43 PM
 */
public class TransformTest {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.setParallelism(1);

        // Read data from a text file
        DataStream<String> dataStream = env.readTextFile("/Users/wangdanyang1/IdeaProjects/wdy/src/main/java/daily/flink/api/sensor.txt");

        // 1. map
        DataStream<Integer> mapStream = dataStream.map(new MapFunction<String, Integer>() {
            @Override
            public Integer map(String value) throws Exception {
                return value.length();
            }
        });
        mapStream.print("map");

        // 2. flatMap
        DataStream<String> flatMapStream = dataStream.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String s, Collector<String> collector) throws Exception {
                String[] fields = s.split(",");
                for(String field:fields){
                    collector.collect(field);
                }
            }
        });
        flatMapStream.print("flatMap");

        // 3. filter: keep records starting with "sensor_1"
        DataStream<String> filterStream = dataStream.filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String value) throws Exception {
                return value.startsWith("sensor_1");
            }
        });
        filterStream.print("filter");

        env.execute();
    }
}

5.3.2 Aggregation operators

DataStream has no aggregation methods such as reduce or sum, because in Flink's design data must be grouped before it can be aggregated: first call keyBy to get a KeyedStream, then call its reduce, sum, and other aggregation methods (group first, aggregate second).

The common aggregation operators are:

  • keyBy
  • rolling aggregation operators
  • reduce

keyBy


DataStream -> KeyedStream: logically splits a stream into disjoint partitions, each containing the elements with the same key. Implemented internally via hashing.

1. keyBy repartitions the stream;

2. Different keys may end up in the same partition, because the assignment is hash-based (a simplified sketch follows);
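A minimal sketch of the idea (an illustration only, not Flink's real implementation: Flink actually murmur-hashes the key into key groups before assigning them to subtasks, but the consequence is the same, equal keys always land in the same partition and distinct keys can collide):

public class KeyByHashDemo {

    // Simplified stand-in for keyBy's hash partitioning
    static int partition(String key, int parallelism) {
        return Math.abs(key.hashCode()) % parallelism;
    }

    public static void main(String[] args) {
        // With 4 partitions, two distinct keys may map to the same partition
        System.out.println(partition("sensor_1", 4));
        System.out.println(partition("sensor_6", 4));
    }
}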

Rolling Aggregation

These operators aggregate each sub-stream of a KeyedStream.

  • sum()
  • min()
  • max()
  • minBy()
  • maxBy()

Practice code:

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Rolling aggregation operators
 *
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:45 PM
 */
public class TransformAggregation {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.setParallelism(1);

        // Read data from a text file
        DataStream<String> dataStream = env.readTextFile("/Users/zhangjiuliang/IdeaProjects/FlinkLean/src/main/resources/sensor.txt");

        // Traditional (anonymous class) style
//        DataStream<SensorReading> sensorStream = dataStream.map(new MapFunction<String, SensorReading>() {
//            @Override
//            public SensorReading map(String value) throws Exception {
//                String[] fields = value.split(",");
//                return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
//            }
//        });


        // Lambda style
        DataStream<SensorReading> sensorStream = dataStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });

        // Group first, then aggregate
        // Grouping
        // Several equivalent ways to write it
        //KeyedStream<SensorReading, Tuple> keyedStream2 = sensorStream.keyBy("id");
        //KeyedStream<SensorReading, String> keyedStream1 = sensorStream.keyBy(data -> data.getId());
        KeyedStream<SensorReading, String> keyedStream = sensorStream.keyBy(SensorReading::getId);

        // Rolling aggregation. The difference between max and maxBy: maxBy also updates all other fields to the latest record's values, while max updates only the compared field and leaves the rest unchanged
        //DataStream<SensorReading> resultStreamMax = keyedStream.max("temperature");
        DataStream<SensorReading> resultStream = keyedStream.maxBy("temperature");

        resultStream.print("result");

        env.execute();
    }
}

The sensor.txt file contents:

sensor_1,1547718199,35.8
sensor_6,1547718201,15.4
sensor_7,1547718202,6.7
sensor_10,1547718205,38.1
sensor_1,1547718207,36.3
sensor_1,1547718209,32.8
sensor_1,1547718212,37.1

Output:

Because the rolling update emits the running maximum each time, 36.3 appears twice below.

result> SensorReading(id=sensor_1, time=1547718199, temperature=35.8)
result> SensorReading(id=sensor_6, time=1547718201, temperature=15.4)
result> SensorReading(id=sensor_7, time=1547718202, temperature=6.7)
result> SensorReading(id=sensor_10, time=1547718205, temperature=38.1)
result> SensorReading(id=sensor_1, time=1547718207, temperature=36.3)
result> SensorReading(id=sensor_1, time=1547718207, temperature=36.3)
result> SensorReading(id=sensor_1, time=1547718212, temperature=37.1)

Process finished with exit code 0
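To make the max/maxBy difference in the comment above concrete: swapping maxBy for max in the same job (a sketch reusing the keyedStream defined above) rolls forward only the compared field, so the other fields keep their values from the first record of each key.

// max("temperature") updates only temperature; time stays frozen at the key's first
// record, so sensor_1's 36.3 lines above would print with time=1547718199
// instead of time=1547718207
DataStream<SensorReading> resultStreamMax = keyedStream.max("temperature");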

If you hit this error:

Cannot reference field by field expression on GenericType<com.zjl.bean.SensorReading>

1. Make all fields of the entity class public (not actually required; mine are private and it works)
2. The field used in keyBy must not be boolean
3. Add a no-arg constructor (not needed on Flink 1.10)

reduce

reduce is essentially recursive: it keeps folding values until a single result remains. Consider this formula:

reduce([p1,p2,p3,p4], fn) = reduce([fn(p1,p2), fn(p3,p4)], fn)

Reduce suits more general aggregation scenarios. In Java you implement the ReduceFunction functional interface.

In Flink, ReduceFunction is often used to merge two values into a new one; that new value can in turn be merged with the next incoming value in the following reduce call. value1 is the result of the previous reduce; value2 is the newly arrived value.
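Spelled out with the ReduceFunction interface instead of a lambda, the max-temperature-with-latest-timestamp logic used below looks like this (a sketch; the full runnable job follows, and ReduceFunction is org.apache.flink.api.common.functions.ReduceFunction):

// value1 is the previously reduced result for this key, value2 the newly arrived record
ReduceFunction<SensorReading> maxTempLatestTs = new ReduceFunction<SensorReading>() {
    @Override
    public SensorReading reduce(SensorReading value1, SensorReading value2) throws Exception {
        return new SensorReading(
                value1.getId(),
                value2.getTime(),                // always take the newest timestamp
                Math.max(value1.getTemperature(), value2.getTemperature()));
    }
};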

Practice code:

Building on the earlier Rolling Aggregation example, the requirement changes: for each group, emit the sensor record with the highest temperature seen so far, but always carry the latest timestamp.

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Rolling aggregation operator: reduce
 *
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:47 PM
 */
public class TransformReduce {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.setParallelism(1);

        // Read data from a text file
        DataStream<String> dataStream = env.readTextFile("/Users/zhangjiuliang/IdeaProjects/FlinkLean/src/main/resources/sensor.txt");



        // Lambda style
        DataStream<SensorReading> sensorStream = dataStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });
        // Group first, then aggregate
        // Grouping
        KeyedStream<SensorReading, String> keyedStream = sensorStream.keyBy(SensorReading::getId);

        // reduce: custom reduction that keeps the max temperature but updates the timestamp to the latest
        DataStream<SensorReading> resultStream = keyedStream.reduce(
                (curSensor,newSensor)->new SensorReading(curSensor.getId(),newSensor.getTime(), Math.max(curSensor.getTemperature(), newSensor.getTemperature()))
        );

        resultStream.print("result");

        env.execute();

    }
}

sensor.txt contents as above.

Output:

Unlike the "Rolling Aggregation" section, the second-to-last record now carries the latest timestamp seen at comparison time.

result> SensorReading(id=sensor_1, time=1547718199, temperature=35.8)
result> SensorReading(id=sensor_6, time=1547718201, temperature=15.4)
result> SensorReading(id=sensor_7, time=1547718202, temperature=6.7)
result> SensorReading(id=sensor_10, time=1547718205, temperature=38.1)
result> SensorReading(id=sensor_1, time=1547718207, temperature=36.3)
result> SensorReading(id=sensor_1, time=1547718209, temperature=36.3)
result> SensorReading(id=sensor_1, time=1547718212, temperature=37.1)

Process finished with exit code 0

5.3.3 Multi-stream operators

Multi-stream operators generally include:

  • Split and Select (removed in newer versions)
  • Connect and CoMap
  • Union

Split and Select

Note: newer Flink versions no longer have the Split and Select APIs (at least Flink 1.12.1 doesn't!)

Split


DataStream -> SplitStream: splits a DataStream into a SplitStream according to some characteristic;

Although a SplitStream looks like two streams, it is actually one special stream;

Select


SplitStream -> DataStream: obtains one or more DataStreams from a SplitStream;

Combining split & select, a DataStream can be fanned out into multiple DataStreams.

Practice code:

The current Flink version no longer has these two operations, so the tutorial's code is pasted as-is.

package daily.flink.api;

// Legacy imports: SplitStream and OutputSelector were removed in newer Flink versions
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Collections;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:50 PM
 */
public class SplitAndSelect {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read data from a file
        DataStream<String> inputStream = env.readTextFile("D:\\Projects\\BigData\\FlinkTutorial\\src\\main\\resources\\sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        } );

        // 1. Split into two streams at the 30-degree temperature threshold
        SplitStream<SensorReading> splitStream = dataStream.split(new OutputSelector<SensorReading>() {
            @Override
            public Iterable<String> select(SensorReading value) {
                return (value.getTemperature() > 30) ? Collections.singletonList("high") : Collections.singletonList("low");
            }
        });

        DataStream<SensorReading> highTempStream = splitStream.select("high");
        DataStream<SensorReading> lowTempStream = splitStream.select("low");
        DataStream<SensorReading> allTempStream = splitStream.select("high", "low");

        highTempStream.print("high");
        lowTempStream.print("low");
        allTempStream.print("all");

        env.execute();
    }
}

Output:

high> SensorReading{id='sensor_1', timestamp=1547718199, temperature=35.8}
all > SensorReading{id='sensor_1', timestamp=1547718199, temperature=35.8}
low > SensorReading{id='sensor_6', timestamp=1547718201, temperature=15.4}
all > SensorReading{id='sensor_6', timestamp=1547718201, temperature=15.4}
...

Connect and CoMap

Connect


DataStream, DataStream -> ConnectedStreams: connects two streams while preserving their types. After Connect, the two streams are merely placed inside one stream; internally each keeps its own data and form unchanged, and the two remain independent.

CoMap


ConnectedStreams -> DataStream: operates on a ConnectedStreams like map and flatMap, applying a separate map or flatMap to each of the contained streams;

Practice code:

Although DataStream in Flink 1.12.1 still has connect and map methods, the tutorial builds on the earlier split/select code, so the tutorial's code is attached as-is:

package daily.flink.api;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
// Legacy imports: SplitStream and OutputSelector were removed in newer Flink versions
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

import java.util.Collections;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 5:51 PM
 */
public class TransformTest4_MultipleStreams {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read data from a file
        DataStream<String> inputStream = env.readTextFile("D:\\Projects\\BigData\\FlinkTutorial\\src\\main\\resources\\sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        } );

        // 1. Split into two streams at the 30-degree temperature threshold
        SplitStream<SensorReading> splitStream = dataStream.split(new OutputSelector<SensorReading>() {
            @Override
            public Iterable<String> select(SensorReading value) {
                return (value.getTemperature() > 30) ? Collections.singletonList("high") : Collections.singletonList("low");
            }
        });

        DataStream<SensorReading> highTempStream = splitStream.select("high");
        DataStream<SensorReading> lowTempStream = splitStream.select("low");
        DataStream<SensorReading> allTempStream = splitStream.select("high", "low");

        // highTempStream.print("high");
        // lowTempStream.print("low");
        // allTempStream.print("all");

        // 2. Merge with connect: map the high-temp stream to tuples, connect it with the low-temp stream, then emit status info
        DataStream<Tuple2<String, Double>> warningStream = highTempStream.map(new MapFunction<SensorReading, Tuple2<String, Double>>() {
            @Override
            public Tuple2<String, Double> map(SensorReading value) throws Exception {
                return new Tuple2<>(value.getId(), value.getTemperature());
            }
        });

        ConnectedStreams<Tuple2<String, Double>, SensorReading> connectedStreams = warningStream.connect(lowTempStream);

        DataStream<Object> resultStream = connectedStreams.map(new CoMapFunction<Tuple2<String, Double>, SensorReading, Object>() {
            @Override
            public Object map1(Tuple2<String, Double> value) throws Exception {
                return new Tuple3<>(value.f0, value.f1, "high temp warning");
            }

            @Override
            public Object map2(SensorReading value) throws Exception {
                return new Tuple2<>(value.getId(), "normal");
            }
        });

        resultStream.print();

        env.execute();
    }
}

Output:

(sensor_1,35.8,high temp warning)
(sensor_6,normal)
(sensor_10,38.1,high temp warning)
(sensor_7,normal)
(sensor_1,36.3,high temp warning)
(sensor_1,32.8,high temp warning)
(sensor_1,37.1,high temp warning)

Union


DataStream -> DataStream: unions two or more DataStreams, producing a new DataStream containing all elements of the inputs.

Question: how does this differ from Connect?

  1. Connect can merge only two streams, and their data types may differ;
  2. Union can merge many streams, but their data types must be identical;
  3. Union joins multiple streams, e.g.:

// warningStream.union(lowTempStream) won't compile: warningStream is a DataStream<Tuple2<String, Double>> while lowTempStream is a DataStream<SensorReading>
highTempStream.union(lowTempStream, allTempStream);

5.3.4 Operator transformations

Common Flink Transformation operators

In Storm, we usually express how data flows by the hierarchy of Bolts, which together form a topology.

In Flink, Transformation operators turn one or more DataStreams into a new DataStream, and multiple transformations can be combined into complex dataflow topologies. As the figure below illustrates, DataStreams are converted, filtered, and aggregated into other streams by different Transformations to meet the business requirements.

[Figure: DataStreams flowing through transformations into new streams]

5.4 Supported data types

A Flink streaming application processes streams of events represented as data objects. Inside Flink these objects must be serialized and deserialized, whether to ship them across the network or to read them from state backends, checkpoints, and savepoints. To do this efficiently, Flink needs to know exactly which data types the application handles. Flink uses the concept of type information to represent data types and generates type-specific serializers, deserializers, and comparators for each one.

Flink also has a type extraction system that analyzes the input and return types of functions to obtain type information automatically, and with it the serializers and deserializers. In some cases, however, such as lambda functions or generic types, the type information must be provided explicitly for the application to work at all or to improve its performance.
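For example, a lambda that emits a generic Tuple2 loses its type parameters to erasure, so the type information is supplied explicitly via returns() (a sketch: lines is assumed to be an existing DataStream<String>, and Types comes from org.apache.flink.api.common.typeinfo.Types):

// Java erases Tuple2<String, Integer> to Tuple2, so hand Flink the full type
DataStream<Tuple2<String, Integer>> counts = lines
        .map(line -> Tuple2.of(line, 1))
        .returns(Types.TUPLE(Types.STRING, Types.INT));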

Flink supports all common data types in Java and Scala. The most widely used types are the following.

5.4.1 Basic data types

Flink supports all Java and Scala basic data types: Int, Double, Long, String, …

DataStream<Integer> numberStream = env.fromElements(1, 2, 3, 4);
numberStream.map(data -> data * 2);

5.4.2 Java and Scala tuples (Tuples)

Unlike Scala, Java has no built-in tuple type; Java tuples are provided by Flink's packages, from Tuple0 to Tuple25 (the number is the arity; Tuple2 is a 2-tuple).

DataStream<Tuple2<String, Integer>> personStream = env.fromElements(
        new Tuple2<>("Adam", 17),
        new Tuple2<>("Sarah", 23));
personStream.filter(p -> p.f1 > 18);

5.4.3 Scala case classes

case class Person(name: String, age: Int)

val numbers: DataStream[Person] = env.fromElements(
  Person("张三", 12),
  Person("李四", 23))

5.4.4 Java simple objects (POJOs)

Flink requires Java POJOs to provide a no-arg constructor.

  • All fields must be public (or private with getters and setters)

public class Person {
    public String name;
    public int age;
    public Person() {}
    public Person( String name , int age) {
        this.name = name;
        this.age = age;
    }

    public static void main(String[] args) {
        DataStream<Person> persons = env.fromElements(
                new Person("Alex", 42),
                new Person("Wendy", 23)
        );
    }
}

5.4.5 Others (Arrays, Lists, Maps, Enums, etc.)

Flink also supports some special-purpose types in Java and Scala, such as Java's ArrayList, HashMap, Enum, and so on.
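A small sketch (assuming an existing env as in the earlier examples; SensorStatus is a hypothetical enum):

// Lists work directly as element types; Flink falls back to a generic serializer
DataStream<List<String>> listStream = env.fromElements(
        Arrays.asList("sensor_1", "sensor_6"),
        Arrays.asList("sensor_7"));

// Enums are supported as element types as well
DataStream<SensorStatus> statusStream = env.fromElements(SensorStatus.OK, SensorStatus.WARN);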

5.5 Implementing UDFs: finer-grained control of the stream

5.5.1 Function classes

Flink exposes interfaces for all UDFs (implemented as interfaces or abstract classes), e.g. MapFunction, FilterFunction, ProcessFunction, and so on.

The example below implements the FilterFunction interface.

Using a custom Filter:

DataStream<String> flinkTweets = tweets.filter(new FlinkFilter());
public static class FlinkFilter implements FilterFunction<String> {
    @Override public boolean filter(String value) throws Exception {
        return value.contains("flink");
    }
}

The function can also be implemented as an anonymous class:

DataStream<String> flinkTweets = tweets.filter(
        new FilterFunction<String>() {
            @Override public boolean filter(String value) throws Exception {
                return value.contains("flink");
            }
        }
);

The string "flink" we filter on can also be passed in as a parameter.

DataStream<String> tweets = env.readTextFile("INPUT_FILE");
DataStream<String> flinkTweets = tweets.filter(new KeyWordFilter("flink"));
public static class KeyWordFilter implements FilterFunction<String> {
    private String keyWord;

    KeyWordFilter(String keyWord) {
        this.keyWord = keyWord;
    }

    @Override public boolean filter(String value) throws Exception {
        return value.contains(this.keyWord);
    }
}

5.5.2 Lambda functions

DataStream<String> tweets = env.readTextFile("INPUT_FILE");
DataStream<String> flinkTweets = tweets.filter( tweet -> tweet.contains("flink") );

5.5.3 Rich functions

"Rich functions" are function-class interfaces provided by the DataStream API; every Flink function class has a Rich version.

They differ from regular functions in that they can access the runtime context and have lifecycle methods, so they can implement more complex functionality.

  • RichMapFunction
  • RichFlatMapFunction
  • RichFilterFunction

Rich functions have a lifecycle. The typical lifecycle methods are:

  • open() is the rich function's initialization method, called before an operator such as map or filter is invoked.
  • close() is the last method called in the lifecycle; it does cleanup work.
  • getRuntimeContext() provides information from the function's RuntimeContext, such as the function's parallelism, the task name, and state.

public static class MyMapFunction extends RichMapFunction<SensorReading, Tuple2<Integer, String>> {

    @Override public Tuple2<Integer, String> map(SensorReading value) throws Exception {
        return new Tuple2<>(getRuntimeContext().getIndexOfThisSubtask(), value.getId());
    }

    @Override public void open(Configuration parameters) throws Exception {
        System.out.println("my map open"); // 以下可以做一些初始化工作,例如建立一个和HDFS的连接
    }

    @Override public void close() throws Exception {
        System.out.println("my map close"); // 以下做一些清理工作,例如断开和HDFS的连接
    }
}

Practice code:

package daily.flink.api;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * "Rich functions" are function-class interfaces provided by the DataStream API;
 * every Flink function class has a Rich version (RichMapFunction,
 * RichFlatMapFunction, RichFilterFunction, ...). Unlike regular functions they can
 * access the runtime context and have lifecycle methods, enabling more complex
 * functionality: open() initializes the function (called before operators such as
 * map or filter run), close() is the final cleanup call, and getRuntimeContext()
 * exposes information such as parallelism, task name, and state.
 *
 * @author wangdanyang1
 * @date Created in 2022/11/14 6:04 PM
 */
public class TransformRichFunction {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        DataStream<String> inputStream = env.readTextFile("/tmp/Flink_Tutorial/src/main/resources/sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });

        DataStream<Tuple2<String, Integer>> resultStream = dataStream.map( new MyMapper() );

        resultStream.print();

        env.execute();
    }

    // A plain Function cannot access context information; it can only process the current record, with no interaction with other data
    public static class MyMapper0 implements MapFunction<SensorReading, Tuple2<String, Integer>> {
        @Override
        public Tuple2<String, Integer> map(SensorReading value) throws Exception {
            return new Tuple2<>(value.getId(), value.getId().length());
        }
    }

    // Custom rich function class (RichMapFunction is an abstract class)
    public static class MyMapper extends RichMapFunction<SensorReading, Tuple2<String, Integer>> {
        @Override
        public Tuple2<String, Integer> map(SensorReading value) throws Exception {
//            A RichFunction can access state:
//            getRuntimeContext().getState();
            return new Tuple2<>(value.getId(), getRuntimeContext().getIndexOfThisSubtask());
        }

        /**
         * Since env's parallelism is set to 4, four slots run this custom RichFunction, printing open and close four times each
         *
         * @param parameters
         * @throws Exception
         */
        @Override
        public void open(Configuration parameters) throws Exception {
            // Initialization work, typically defining state or opening a database connection
            System.out.println("open");
        }

        @Override
        public void close() throws Exception {
            // Teardown work, typically closing connections and clearing state
            System.out.println("close");
        }
    }


}

5.6 Repartitioning

Repartitioning: the DataStream class contains many classes with **Partitioner** in their names.

The partitionCustom(...) method is for custom repartitioning (a minimal sketch follows the list below).

1. shuffle (unlike batch processing, which collects a batch before shuffling, here each record is shuffled to a partition as soon as it arrives)

2. keyBy (hash, then modulo)

3. global (sends everything to the first partition; used only in a few special cases)
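A minimal sketch of partitionCustom, assuming a DataStream<SensorReading> dataStream as in the practice code below (Partitioner is org.apache.flink.api.common.functions.Partitioner):

// Custom repartitioning: choose the target channel yourself based on the key
DataStream<SensorReading> customStream = dataStream.partitionCustom(
        new Partitioner<String>() {
            @Override
            public int partition(String key, int numPartitions) {
                return Math.abs(key.hashCode()) % numPartitions;
            }
        },
        SensorReading::getId);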

Practice code:

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Repartitioning: the DataStream class contains many Partitioner-related classes.
 * The partitionCustom(...) method is for custom repartitioning.
 *
 * @author wangdanyang1
 * @date Created in 2022/11/14 6:05 PM
 */
public class TransformPartition {

    public static void main(String[] args) throws Exception{

        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set parallelism = 4
        env.setParallelism(4);

        // Read data from a file
        DataStream<String> inputStream = env.readTextFile("/tmp/Flink_Tutorial/src/main/resources/sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });

        // With multiple parallelism, SingleOutputStreamOperator defaults to rebalance, i.e. round-robin distribution
        dataStream.print("input");

        // 1. shuffle (records are shuffled to partitions as they arrive, not collected into a batch first as in batch processing)
        DataStream<String> shuffleStream = inputStream.shuffle();
        shuffleStream.print("shuffle");

        // 2. keyBy (hash, then modulo)
        dataStream.keyBy(SensorReading::getId).print("keyBy");

        // 3. global (send everything to the first partition; rarely used)
        dataStream.global().print("global");

        env.execute();
    }
}

5.7 Sink

Flink has no spark-style foreach method that lets users iterate freely over records. All output to external systems goes through a Sink, and a job's final output is typically completed like this:

stream.addSink(new MySink(xxxx));

The framework ships with sinks for some systems; for anything else, users implement their own sink.
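A custom sink is a class implementing SinkFunction, or extending RichSinkFunction when lifecycle hooks are needed. A minimal sketch that just logs each record (the class name MySink echoes the snippet above; the body is illustrative, not from the tutorial):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

// Minimal custom sink: open/close bracket the connection lifecycle,
// invoke() is called once per record
public class MySink extends RichSinkFunction<SensorReading> {
    @Override
    public void open(Configuration parameters) throws Exception {
        // e.g. open a database connection here
    }

    @Override
    public void invoke(SensorReading value, Context context) throws Exception {
        System.out.println("sinking: " + value);
    }

    @Override
    public void close() throws Exception {
        // release the connection
    }
}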


5.7.1 Kafka

1. Pom dependency (the same flink-connector-kafka dependency as in section 5.2.3)

...

2. Practice code

package daily.flink.api;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

import java.util.Properties;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 6:07 PM
 */
public class SinkTestKafka {
    public static void main(String[] args) throws Exception{
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Set parallelism to 1
        env.setParallelism(1);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092");
        properties.setProperty("group.id", "consumer-group");
        properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("auto.offset.reset", "latest");

        // Read data from Kafka
        DataStream<String> inputStream = env.addSource( new FlinkKafkaConsumer<String>("sensor", new SimpleStringSchema(), properties));

        // Parse the records read from Kafka and serialize them back to strings
        DataStream<String> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2])).toString();
        });

        // Write the data to Kafka
        dataStream.addSink( new FlinkKafkaProducer<String>("localhost:9092", "sinktest", new SimpleStringSchema()));

        env.execute();
    }
}

3. Start ZooKeeper

$ bin/zookeeper-server-start.sh config/zookeeper.properties

4. Start the Kafka server

$ bin/kafka-server-start.sh config/server.properties

5. Open a Kafka producer console

$ bin/kafka-console-producer.sh --broker-list localhost:9092  --topic sensor

6. Open a Kafka consumer console

$ bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic sinktest

7. Run the Flink program, enter data in the Kafka producer console, and check the output in the consumer console

Input (Kafka producer console):

>sensor_1,1547718199,35.8
>sensor_6,1547718201,15.4

8. Output (Kafka consumer console):

SensorReading{id='sensor_1', timestamp=1547718199, temperature=35.8}
SensorReading{id='sensor_6', timestamp=1547718201, temperature=15.4}

Here Flink effectively acts as a pipeline.

5.7.2 Redis

Here Redis serves as the sink target.

To find Flink connectors, the simplest way is to search for the keyword flink-connector-.

1. Pom dependency

Add this to the pom.xml used above:

<!-- https://mvnrepository.com/artifact/org.apache.bahir/flink-connector-redis -->
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>

2. Java code

package daily.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.redis.RedisSink;
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 6:11 PM
 */
public class SinkTestRedis {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read data from a file
        DataStream<String> inputStream = env.readTextFile("/tmp/Flink_Tutorial/src/main/resources/sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });

        // Define the Jedis connection config (mine connects to Redis in Docker)
        FlinkJedisPoolConfig config = new FlinkJedisPoolConfig.Builder()
                .setHost("localhost")
                .setPort(6379)
                .setPassword("123456")
                .setDatabase(0)
                .build();

        dataStream.addSink(new RedisSink<>(config, new MyRedisMapper()));

        env.execute();
    }

    /**
     *  Custom RedisMapper
     */
    public static class MyRedisMapper implements RedisMapper<SensorReading> {

        /**
         * Define the command that saves data to Redis: store as a hash, i.e. hset sensor_temp id temperature
         */
        @Override
        public RedisCommandDescription getCommandDescription() {
            return new RedisCommandDescription(RedisCommand.HSET, "sensor_temp");
        }

        /**
         * Extract the key from a stream record
         * @param data
         * @return
         */
        @Override
        public String getKeyFromData(SensorReading data) {
            return data.getId();
        }

        /**
         * Extract the value from a stream record
         * @param data
         * @return
         */
        @Override
        public String getValueFromData(SensorReading data) {
            return data.getTemperature().toString();
        }
    }
}

3. Start the Redis server (mine runs in Docker)

4. Run the Flink program

5. Inspect the data in Redis

Since newer data overwrites older data, Redis ends up holding the latest value for each sensor.

localhost:0>hgetall sensor_temp
1) "sensor_1"
2) "37.1"
3) "sensor_6"
4) "15.4"
5) "sensor_7"
6) "6.7"
7) "sensor_10"
8) "38.1"

5.7.3 Elasticsearch

1. Pom dependency

<!-- ElasticSearch7 -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-elasticsearch7_2.12</artifactId>
    <version>1.12.1</version>
</dependency>

2. Java code

package daily.flink.api;

import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch7.ElasticsearchSink;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * @author wangdanyang1
 * @date Created in 2022/11/14 6:13 PM
 */
public class SinkTestEs {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read data from a file
        DataStream<String> inputStream = env.readTextFile("/tmp/Flink_Tutorial/src/main/resources/sensor.txt");

        // Convert to SensorReading
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });

        // Define the ES connection config
        List<HttpHost> httpHosts = new ArrayList<>();
        httpHosts.add(new HttpHost("localhost", 9200));

        dataStream.addSink( new ElasticsearchSink.Builder<SensorReading>(httpHosts, new MyEsSinkFunction()).build());

        env.execute();
    }

    // Custom ES write logic
    public static class MyEsSinkFunction implements ElasticsearchSinkFunction<SensorReading> {
        @Override
        public void process(SensorReading element, RuntimeContext ctx, RequestIndexer indexer) {
            // Build the document source to write
            HashMap<String, String> dataSource = new HashMap<>();
            dataSource.put("id", element.getId());
            dataSource.put("temp", element.getTemperature().toString());
            dataSource.put("ts", element.getTime().toString());

            // Build the index request, the write command sent to ES (ES7 uses the single type _doc; specifying a type is no longer allowed)
            IndexRequest indexRequest = Requests.indexRequest()
                    .index("sensor")
                    .source(dataSource);

            // Send the request via the indexer
            indexer.add(indexRequest);
        }
    }
}

3. Start Elasticsearch (mine runs in Docker)

4. Run the Flink program and check whether new documents appear in Elasticsearch:

$ curl "localhost:9200/sensor/_search?pretty"
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 7,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "jciyWXcBiXrGJa12kSQt",
        "_score" : 1.0,
        "_source" : {
          "temp" : "35.8",
          "id" : "sensor_1",
          "ts" : "1547718199"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "jsiyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "15.4",
          "id" : "sensor_6",
          "ts" : "1547718201"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "j8iyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "6.7",
          "id" : "sensor_7",
          "ts" : "1547718202"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "kMiyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "38.1",
          "id" : "sensor_10",
          "ts" : "1547718205"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "kciyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "36.3",
          "id" : "sensor_1",
          "ts" : "1547718207"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "ksiyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "32.8",
          "id" : "sensor_1",
          "ts" : "1547718209"
        }
      },
      {
        "_index" : "sensor",
        "_type" : "_doc",
        "_id" : "k8iyWXcBiXrGJa12kSQu",
        "_score" : 1.0,
        "_source" : {
          "temp" : "37.1",
          "id" : "sensor_1",
          "ts" : "1547718212"
        }
      }
    ]
  }
}