Big Data Development: Flink DataStream API (Part 46)


1. Flink Core APIs

Flink offers four levels of abstraction, listed here from the highest level down to the lowest:

  1. SQL
  2. Table API
  3. DataStream/DataSet API
  4. Stateful Stream Processing

2. DataStream API

A DataStream program consists of three parts:

  • DataSource
  • Transformation
  • DataSink

2.1 DataSource

The DataSource is the program's input. Flink provides many built-in DataSources and also supports custom DataSources.

Built-in data sources: socket-based and Collection-based.

Third-party data sources: read through Connectors.

Source connectors bundled with Flink: Kafka, Kinesis Streams, RabbitMQ, NiFi, Twitter Streaming API, Google PubSub

Source connectors available through Apache Bahir: ActiveMQ, Netty

DataSource fault-tolerance guarantees:

  • Socket: at most once
  • Collection: exactly once
  • Kafka: exactly once (requires Kafka 0.10 or later)

Example of a Collection-based source:
package com.strivelearn.flink.api;

import java.util.Arrays;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author strivelearn
 * @version StreamCollectionSourceJava.java, 2023-01-15
 */
public class StreamCollectionSourceJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // build a stream from an in-memory collection; handy for local testing
        DataStreamSource<Integer> integerDataStreamSource = executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5, 6));
        // print to stdout; parallelism 1 keeps the output in a single task
        integerDataStreamSource.print().setParallelism(1);
        executionEnvironment.execute("StreamCollectionSourceJava");
    }
}
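The guarantees table above covers the Kafka source. As a supplement not in the original post, here is a minimal sketch of reading from Kafka with the KafkaSource builder API (available since roughly Flink 1.14, assuming the flink-connector-kafka dependency is on the classpath); the broker address, topic, and group id are placeholders:

package com.strivelearn.flink.api;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * A minimal Kafka source sketch; broker, topic, and group id are placeholders.
 */
public class StreamKafkaSourceJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("localhost:9092")              // placeholder broker address
                .setTopics("test-topic")                            // placeholder topic
                .setGroupId("flink-demo")                           // placeholder consumer group
                .setStartingOffsets(OffsetsInitializer.earliest())  // read from the beginning of the topic
                .setValueOnlyDeserializer(new SimpleStringSchema()) // treat each record value as a UTF-8 string
                .build();
        executionEnvironment.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka-source")
                .print().setParallelism(1);
        executionEnvironment.execute("StreamKafkaSourceJava");
    }
}

Note that the exactly-once property of this source comes from Flink checkpointing the consumed offsets, not from Kafka's consumer auto-commit.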
2.2 Transformation

  • map: takes one element, returns one element
  • flatMap: takes one element, can return zero or more elements
  • filter: keeps only the elements that satisfy a condition
  • keyBy: groups by key; records with the same key go to the same partition
  • reduce: aggregates the current element with the previous aggregation result
  • aggregations: sum(), min(), max(), and so on
  • union: merges multiple streams; all streams must have the same data type
  • connect: connects exactly two streams, whose data types may differ
  • split: splits one stream into several by rule. Note: split can only be applied once; the resulting streams cannot be split again (removed in Flink 1.12)

Side outputs are recommended instead of split; see the SideOutputDataStream example later in this section. A supplementary keyBy/reduce sketch follows.
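As a supplement not in the original post, a minimal sketch combining keyBy and reduce from the table above: it groups numbers by parity and keeps a running sum per group (the class name is my own):

package com.strivelearn.flink.api;

import java.util.Arrays;

import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * A minimal keyBy + reduce sketch: running sum per parity group.
 */
public class StreamKeyByReduceJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5, 6))
                .keyBy(new KeySelector<Integer, Integer>() {
                    @Override
                    public Integer getKey(Integer value) throws Exception {
                        return value % 2; // 0 = even, 1 = odd
                    }
                })
                .reduce(new ReduceFunction<Integer>() {
                    @Override
                    public Integer reduce(Integer value1, Integer value2) throws Exception {
                        return value1 + value2; // running sum within each parity group
                    }
                })
                .print().setParallelism(1);
        executionEnvironment.execute("StreamKeyByReduceJava");
    }
}

Because reduce emits a result for every incoming element, the output shows each group's sum as it accumulates rather than only the final totals.

union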
package com.strivelearn.flink.api;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

/**
 * @author strivelearn
 * @version StreamUnionJava.java, 2023-01-15
 */
public class StreamUnionJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Integer> text1 = executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5));
        DataStreamSource<Integer> text2 = executionEnvironment.fromCollection(Arrays.asList(6, 7, 8, 9, 10));
        // merge the two streams; both must have the same element type
        DataStream<Integer> union = text1.union(text2);
        // print the merged stream
        union.print().setParallelism(1);
        executionEnvironment.execute("StreamUnionJava");
    }
}
connect
package com.strivelearn.flink.api;

import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

/**
 * @author strivelearn
 * @version StreamConnectJava.java, 2023-01-15
 */
public class StreamConnectJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> text1 = executionEnvironment.fromElements("user:tom,age:18");
        DataStreamSource<String> text2 = executionEnvironment.fromElements("user:jack_age:18");
        // connect the two streams; their element types may differ
        ConnectedStreams<String, String> connect = text1.connect(text2);
        SingleOutputStreamOperator<String> map = connect.map(new CoMapFunction<String, String, String>() {
            // handle elements from the first stream
            @Override
            public String map1(String value) throws Exception {
                return value.replace(",", "-");
            }

            // handle elements from the second stream
            @Override
            public String map2(String value) throws Exception {
                return value.replace("_", "-");
            }
        });
        // print the resulting stream
        map.print().setParallelism(1);
        executionEnvironment.execute("StreamConnectJava");
    }
}

Output:

user:jack-age:18
user:tom-age:18

SideOutputDataStream
package com.strivelearn.flink.api;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SideOutputDataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.util.Arrays;

/**
 * @author strivelearn
 * @version StreamSideOutputJava.java, 2023-01-15
 */
public class StreamSideOutputJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Integer> outDataStreamSource = executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));

        // tag for even numbers (anonymous subclass so the generic type is retained)
        OutputTag<Integer> evenOutput = new OutputTag<Integer>("even") {
        };
        // tag for odd numbers
        OutputTag<Integer> oddOutput = new OutputTag<Integer>("odd") {
        };
        SingleOutputStreamOperator<Integer> process = outDataStreamSource.process(new ProcessFunction<Integer, Integer>() {
            @Override
            public void processElement(Integer value, ProcessFunction<Integer, Integer>.Context ctx, Collector<Integer> out) throws Exception {
                if (value % 2 == 0) {
                    ctx.output(evenOutput, value);
                } else {
                    ctx.output(oddOutput, value);
                }
            }
        });

        // get the even-number stream
        SideOutputDataStream<Integer> evenOutputStream = process.getSideOutput(evenOutput);
        // get the odd-number stream
        SideOutputDataStream<Integer> oddOutputStream = process.getSideOutput(oddOutput);

        // split the even-number stream a second time (something split could not do)
        // numbers less than 5
        OutputTag<Integer> lowOutput = new OutputTag<Integer>("low") {
        };
        // numbers 5 or greater
        OutputTag<Integer> highOutput = new OutputTag<Integer>("high") {
        };

        SingleOutputStreamOperator<Integer> process1 = evenOutputStream.process(new ProcessFunction<Integer, Integer>() {
            @Override
            public void processElement(Integer value, ProcessFunction<Integer, Integer>.Context ctx, Collector<Integer> out) throws Exception {
                if (value < 5) {
                    ctx.output(lowOutput, value);
                } else {
                    ctx.output(highOutput, value);
                }
            }
        });

        SideOutputDataStream<Integer> sideOutput = process1.getSideOutput(lowOutput);
        sideOutput.print().setParallelism(1);
        executionEnvironment.execute("StreamSideOutputJava");
    }
}

Output:

4
2


partition
package com.strivelearn.flink.api;

import org.apache.flink.api.common.functions.Partitioner;

/**
 * Custom partitioning rule: partition numbers by parity.
 * @author strivelearn
 * @version MyPartitionerJava.java, 2023-01-15
 */
public class MyPartitionerJava implements Partitioner<Integer> {
    @Override
    public int partition(Integer key, int numPartitions) {
        System.out.println("number of partitions: " + numPartitions);
        if (key % 2 == 0) {
            return 0; // even numbers go to partition 0
        } else {
            return 1; // odd numbers go to partition 1
        }
    }
}
package com.strivelearn.flink.api;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

/**
 * @author xys
 * @version StreamPartitionOpJava.java, 2023-01-15
 */
public class StreamPartitionOpJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Integer> outDataStreamSource = executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));
        // shuffle(outDataStreamSource);
        myPartition(outDataStreamSource);
        executionEnvironment.execute("StreamPartitionOpJava");
    }

    // use the custom partitioning rule defined in MyPartitionerJava
    private static void myPartition(DataStreamSource<Integer> outDataStreamSource) {
        outDataStreamSource.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer integer) throws Exception {
                return integer;
            }
        }).setParallelism(2).partitionCustom(new MyPartitionerJava(), new KeySelector<Integer, Integer>() {
            @Override
            public Integer getKey(Integer integer) throws Exception {
                return integer;
            }
        }).print().setParallelism(4);
    }

    // use the shuffle partitioning rule
    private static void shuffle(DataStreamSource<Integer> outDataStreamSource) {
        outDataStreamSource.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer integer) throws Exception {
                return integer;
            }
        }).setParallelism(2).shuffle().print().setParallelism(4);
    }

    // use the rebalance partitioning rule
    private static void rebalance(DataStreamSource<Integer> outDataStreamSource) {
        outDataStreamSource.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer integer) throws Exception {
                return integer;
            }
        }).setParallelism(2).rebalance().print().setParallelism(4);
    }

    // use the rescale partitioning rule
    private static void rescale(DataStreamSource<Integer> outDataStreamSource) {
        outDataStreamSource.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer integer) throws Exception {
                return integer;
            }
        }).setParallelism(2).rescale().print().setParallelism(4);
    }

    // use the broadcast partitioning rule
    private static void broadcast(DataStreamSource<Integer> outDataStreamSource) {
        outDataStreamSource.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer integer) throws Exception {
                return integer;
            }
        }).setParallelism(2).broadcast().print().setParallelism(4);
    }
}
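For reference, the built-in strategies exercised above differ as follows: shuffle sends each record to a randomly chosen downstream subtask; rebalance distributes records round-robin across all downstream subtasks; rescale also distributes round-robin but only within a local group of downstream subtasks, avoiding a full redistribution across the network; broadcast sends every record to every downstream subtask.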
2.3 DataSink

  1. writeAsText(): writes elements line by line as strings, obtained by calling each element's toString() method (a short sketch follows the tables below)
  2. print(): prints each element's toString() value to standard output
  3. Connectors: a batch of connectors is provided for writing to third-party destinations
Sink connectors bundled with Flink: Kafka, Cassandra, Kinesis Streams, Elasticsearch, HDFS, RabbitMQ, NiFi, JDBC

Sink connectors available through Apache Bahir: ActiveMQ, Flume, Redis, Akka

DataSink fault-tolerance guarantees:

  • Redis: at least once
  • Kafka: at least once with Kafka 0.9/0.10; exactly once with Kafka 0.11 and later
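The Kafka exactly-once sink guarantee is not automatic: it relies on Flink checkpointing combined with Kafka transactions (a two-phase commit). As a note beyond the original tables, checkpointing is enabled on the environment, for example:

executionEnvironment.enableCheckpointing(5000); // checkpoint every 5 seconds; the interval here is an arbitrary choice

Without checkpointing enabled, the sink cannot provide these guarantees.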

Official documentation: nightlies.apache.org/flink/flink…
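As a supplement not in the original post, a minimal sketch of the built-in writeAsText() sink from item 1 above; the output path is a placeholder, and writeAsText is deprecated in newer Flink releases in favor of the FileSink connector:

package com.strivelearn.flink.api;

import java.util.Arrays;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * A minimal writeAsText sketch; the output path below is a placeholder.
 */
public class StreamTextSinkJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // each element is written as one line, using its toString() value
        executionEnvironment.fromCollection(Arrays.asList(1, 2, 3, 4, 5))
                .writeAsText("/tmp/flink-text-sink") // placeholder path
                .setParallelism(1); // with parallelism 1 the sink writes a single file instead of a directory
        executionEnvironment.execute("StreamTextSinkJava");
    }
}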

DataSink Redis
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.12</artifactId>
    <version>1.1.0</version>
</dependency>
package com.strivelearn.flink.api;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.redis.RedisSink;
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;

/**
 * Receive data over a socket and store it in a Redis list.
 * @author strivelearn
 * @version StreamRedisSinkJava.java, 2023-01-15
 */
public class StreamRedisSinkJava {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // connect to the socket to read the input data
        DataStreamSource<String> inputDataSource = executionEnvironment.socketTextStream("192.168.234.100", 9001);
        // assemble the data: each line becomes a (key, value) tuple for the Redis list "l_java"
        SingleOutputStreamOperator<Tuple2<String, String>> listData = inputDataSource.map(new MapFunction<String, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> map(String s) throws Exception {
                return new Tuple2<>("l_java", s);
            }
        });
        // configure the Redis sink
        FlinkJedisPoolConfig redisConf = new FlinkJedisPoolConfig.Builder().setHost("192.168.234.100").setPort(6379).build();
        RedisSink<Tuple2<String, String>> tuple2RedisSink = new RedisSink<>(redisConf, new MyRedisMapper());
        listData.addSink(tuple2RedisSink);
        executionEnvironment.execute("StreamRedisSinkJava");
    }

    static class MyRedisMapper implements RedisMapper<Tuple2<String, String>> {

        // specify the Redis command to run
        @Override
        public RedisCommandDescription getCommandDescription() {
            return new RedisCommandDescription(RedisCommand.LPUSH);
        }

        // extract the key
        @Override
        public String getKeyFromData(Tuple2<String, String> stringStringTuple2) {
            return stringStringTuple2.f0;
        }

        // extract the value
        @Override
        public String getValueFromData(Tuple2<String, String> stringStringTuple2) {
            return stringStringTuple2.f1;
        }
    }
}
  1. Run nc -l 9001 to start the socket listener, then type input data.

    nc is short for netcat, a powerful networking tool often called the Swiss Army knife of networking. On Linux the actual command is ncat; nc is a symlink to ncat. Its main uses are:

    1. Listening on any TCP/UDP port: nc can act as a server listening on a given port over TCP or UDP
    2. Transferring files between machines
    3. Port scanning: nc can act as a client initiating TCP or UDP connections
    4. Network speed testing between machines
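Once the Flink job is running, each line typed into the nc session is LPUSHed onto the Redis list l_java. As a quick check (my suggestion, not part of the original post), run LRANGE l_java 0 -1 in redis-cli to see the stored values.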
