Flink stream splitting: dividing one data source into multiple streams with filter, split/select (deprecated), or side outputs


Methods for splitting a real-time data stream

1 Using filter
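
The simplest approach is to apply one filter() per target stream. Below is a minimal sketch, assuming the same Tuple2<String, Integer> order data and imports as the split/select example in the next section; note that every filter() call processes the full stream, so N splits mean N passes over the data.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// Same kind of {product name, quantity} orders as in the examples below.
DataStream<Tuple2<String, Integer>> orders = env.fromElements(
        Tuple2.of("a苹果", 3), Tuple2.of("d葡萄", 5), Tuple2.of("b梨", 2));

// Each split is a separate filter, i.e. a separate pass over every element.
DataStream<Tuple2<String, Integer>> preStream = orders
        .filter(v -> v.f0.contains("a") || v.f0.contains("b") || v.f0.contains("c"));
DataStream<Tuple2<String, Integer>> postStream = orders
        .filter(v -> !(v.f0.contains("a") || v.f0.contains("b") || v.f0.contains("c")));

preStream.print("pre");
postStream.print("post");
env.execute("filter-split");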

2 Using split / select (deprecated)

package splitStream;
 
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
 
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
 
/**
 * @Author you guess
 * @Date 2021/1/17 12:40
 * @Version 1.0
 * @Desc
 */
public class SplitDataStreamTest {
    private static final Logger LOG = LoggerFactory.getLogger(SplitDataStreamTest.class);
    private static final String[] TYPE = {"a苹果", "b梨", "c西瓜", "d葡萄", "e火龙果"};
 
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 
        // Add a custom source that emits one order {product name, quantity} per second
        DataStreamSource<Tuple2<String, Integer>> orderSource = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {
            private volatile boolean isRunning = true;
            private final Random random = new Random();
 
            @Override
            public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
                while (isRunning) {
                    TimeUnit.SECONDS.sleep(1);
                    Tuple2<String, Integer> tuple2 = Tuple2.of(TYPE[random.nextInt(TYPE.length)], random.nextInt(10));
                    System.out.println(new Date() + ",提交元素:" + tuple2);
                    ctx.collect(tuple2);
                }
            }
 
            @Override
            public void cancel() {
                isRunning = false;
            }
 
        }, "order-info");
 
        SplitStream<Tuple2<String, Integer>> splitStream = orderSource.split(new OutputSelector<Tuple2<String, Integer>>() {
            @Override
            public Iterable<String> select(Tuple2<String, Integer> value) { // tag each element; select() is later used to pick the elements whose tag matches
                List<String> list = new ArrayList<>();
                if (value.f0.contains("a") || value.f0.contains("b") || value.f0.contains("c")) {
                    list.add("pre");
                } else {
                    list.add("post");
                }
                return list;
            }
        });
 
        DataStream<Tuple2<String, Integer>> preSplitStream = splitStream.select("pre");
        preSplitStream.print();
 
        env.execute("Flink Streaming Java API Skeleton");
    }
}

Output:

Sun Jan 17 12:48:29 CST 2021,提交元素:(e火龙果,4)

Sun Jan 17 12:48:30 CST 2021,提交元素:(b梨,2)

5> (b梨,2)

Sun Jan 17 12:48:31 CST 2021,提交元素:(e火龙果,5)

Sun Jan 17 12:48:32 CST 2021,提交元素:(e火龙果,5)

Sun Jan 17 12:48:33 CST 2021,提交元素:(a苹果,5)

6> (a苹果,5)

Sun Jan 17 12:48:34 CST 2021,提交元素:(d葡萄,3)

Sun Jan 17 12:48:35 CST 2021,提交元素:(a苹果,3)

7> (a苹果,3)

Sun Jan 17 12:48:36 CST 2021,提交元素:(b梨,3)

8> (b梨,3)

Sun Jan 17 12:48:37 CST 2021,提交元素:(e火龙果,2)

Sun Jan 17 12:48:38 CST 2021,提交元素:(c西瓜,7)

9> (c西瓜,7)
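
The example above only consumes the records tagged "pre". Because the OutputSelector returns an Iterable, an element can carry several tags, and SplitStream.select() takes a varargs list of tag names, so it can pull out one tag or merge several. A small sketch continuing the example above:

        DataStream<Tuple2<String, Integer>> postSplitStream = splitStream.select("post");
        // select() accepts several tag names and merges the matching records:
        DataStream<Tuple2<String, Integer>> allStream = splitStream.select("pre", "post");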

3 Using side outputs

package splitStream;
 
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
 
import java.util.Date;
import java.util.Random;
import java.util.concurrent.TimeUnit;
 
/**
 * @Author you guess
 * @Date 2021/1/17 12:57
 * @Version 1.0
 * @Desc
 */
public class SideOutputsDataStreamTest {
    private static final Logger LOG = LoggerFactory.getLogger(SideOutputsDataStreamTest.class);
    private static final String[] TYPE = {"a苹果", "b梨", "c西瓜", "d葡萄", "e火龙果"};
 
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 
        // Add a custom source that emits one order {product name, quantity} per second
        DataStreamSource<Tuple2<String, Integer>> orderSource = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {
            private volatile boolean isRunning = true;
            private final Random random = new Random();
 
            @Override
            public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
                while (isRunning) {
                    TimeUnit.SECONDS.sleep(1);
                    Tuple2<String, Integer> tuple2 = Tuple2.of(TYPE[random.nextInt(TYPE.length)], random.nextInt(10));
                    System.out.println(new Date() + ",提交元素:" + tuple2);
                    ctx.collect(tuple2);
                }
            }
 
            @Override
            public void cancel() {
                isRunning = false;
            }
 
        }, "order-info");
 
 
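        // The trailing {} makes each OutputTag an anonymous subclass so Flink can
        // capture the Tuple2<String, Integer> type information despite erasure.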
        final OutputTag<Tuple2<String, Integer>> preTag = new OutputTag<Tuple2<String, Integer>>("pre") {
        };
        final OutputTag<Tuple2<String, Integer>> postTag = new OutputTag<Tuple2<String, Integer>>("post") {
        };
 
 
        SingleOutputStreamOperator<Object> singleOutputStreamOperator = orderSource.process(new ProcessFunction<Tuple2<String, Integer>, Object>() {
            @Override
            public void processElement(Tuple2<String, Integer> value, Context ctx, Collector<Object> out) throws Exception {
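                // Every record is routed to a side output; nothing is emitted to the
                // main output through the Collector in this example.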
                if (value.f0.contains("a") || value.f0.contains("b") || value.f0.contains("c")) {
                    ctx.output(preTag, value);
                } else {
                    ctx.output(postTag, value);
                }
            }
        });
 
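        // Pull the "pre"-tagged records back out as a regular DataStream;
        // the "post" records are available via getSideOutput(postTag).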
        DataStream<Tuple2<String, Integer>> preSplitStream = singleOutputStreamOperator.getSideOutput(preTag);
        preSplitStream.print();
 
        env.execute("Flink Streaming Java API Skeleton");
    }
}

Output:

Sun Jan 17 13:08:37 CST 2021,提交元素:(c西瓜,4)

9> (c西瓜,4)

Sun Jan 17 13:08:38 CST 2021,提交元素:(c西瓜,4)

10> (c西瓜,4)

Sun Jan 17 13:08:39 CST 2021,提交元素:(b梨,5)

11> (b梨,5)

Sun Jan 17 13:08:40 CST 2021,提交元素:(d葡萄,7)

Sun Jan 17 13:08:41 CST 2021,提交元素:(a苹果,5)

1> (a苹果,5)

Sun Jan 17 13:08:42 CST 2021,提交元素:(d葡萄,9)

Sun Jan 17 13:08:43 CST 2021,提交元素:(b梨,5)

3> (b梨,5)

Side outputs are more flexible and more efficient than split/select.
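
Part of that flexibility is that a side output may carry a different element type than the main stream, which split/select could not do. A hypothetical sketch (the "audit" tag and message are made up for illustration):

        // Hypothetical: a String-typed side output next to the Tuple2 main stream.
        final OutputTag<String> auditTag = new OutputTag<String>("audit") {};
        // inside processElement: ctx.output(auditTag, "low quantity: " + value.f0);
        DataStream<String> auditStream = singleOutputStreamOperator.getSideOutput(auditTag);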

ci.apache.org/projects/fl… 

blog.csdn.net/qq_3751857… 

Environment: Flink 1.9.2, JDK 1.8

end