实时数据分流的方法:
1 使用 filter
2 使用 split / select(已废弃,不推荐在新代码中使用)
package splitStream;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
/**
* @Author you guess
* @Date 2021/1/17 12:40
* @Version 1.0
* @Desc
*/
/**
 * Demonstrates splitting a DataStream with the (deprecated) split/select API.
 *
 * <p>A custom source emits one order tuple {product name, quantity} per second.
 * The OutputSelector tags each element "pre" or "post" based on the product
 * name; {@code select("pre")} then picks only the tagged sub-stream. Prefer
 * side outputs over split/select in new code.
 */
public class SplitDataStreamTest {

    // BUG FIX: was LoggerFactory.getLogger(MinMinByMaxMaxBy.MinMinByMaxMaxByTest.class),
    // a copy-paste from another class; the logger must be named after this class.
    private static final Logger LOG = LoggerFactory.getLogger(SplitDataStreamTest.class);

    private static final String[] TYPE = {"a苹果", "b梨", "c西瓜", "d葡萄", "e火龙果"};

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Custom source: emits one order {product name, quantity} per second.
        DataStreamSource<Tuple2<String, Integer>> orderSource = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {
            // volatile so cancel() (called from another thread) is visible to run().
            private volatile boolean isRunning = true;
            private final Random random = new Random();

            @Override
            public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
                while (isRunning) {
                    TimeUnit.SECONDS.sleep(1);
                    Tuple2<String, Integer> tuple2 = Tuple2.of(TYPE[random.nextInt(TYPE.length)], random.nextInt(10));
                    System.out.println(new Date() + ",提交元素:" + tuple2);
                    ctx.collect(tuple2);
                }
            }

            @Override
            public void cancel() {
                isRunning = false;
            }
        }, "order-info");

        // split/select (deprecated): the selector tags every element; select(tag)
        // later picks the elements carrying a matching tag.
        SplitStream<Tuple2<String, Integer>> splitStream = orderSource.split(new OutputSelector<Tuple2<String, Integer>>() {
            @Override
            public Iterable<String> select(Tuple2<String, Integer> value) {
                // Products whose name contains "a", "b" or "c" go to "pre";
                // everything else goes to "post".
                List<String> list = new ArrayList<>();
                if (value.f0.contains("a") || value.f0.contains("b") || value.f0.contains("c")) {
                    list.add("pre");
                } else {
                    list.add("post");
                }
                return list;
            }
        });

        // Keep only the elements tagged "pre" and print them.
        DataStream<Tuple2<String, Integer>> preSplitStream = splitStream.select("pre");
        preSplitStream.print();

        env.execute("Flink Streaming Java API Skeleton");
    }
}
输出:
Sun Jan 17 12:48:29 CST 2021,提交元素:(e火龙果,4)
Sun Jan 17 12:48:30 CST 2021,提交元素:(b梨,2)
5> (b梨,2)
Sun Jan 17 12:48:31 CST 2021,提交元素:(e火龙果,5)
Sun Jan 17 12:48:32 CST 2021,提交元素:(e火龙果,5)
Sun Jan 17 12:48:33 CST 2021,提交元素:(a苹果,5)
6> (a苹果,5)
Sun Jan 17 12:48:34 CST 2021,提交元素:(d葡萄,3)
Sun Jan 17 12:48:35 CST 2021,提交元素:(a苹果,3)
7> (a苹果,3)
Sun Jan 17 12:48:36 CST 2021,提交元素:(b梨,3)
8> (b梨,3)
Sun Jan 17 12:48:37 CST 2021,提交元素:(e火龙果,2)
Sun Jan 17 12:48:38 CST 2021,提交元素:(c西瓜,7)
9> (c西瓜,7)
3 side outputs旁路输出
package splitStream;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;
import java.util.Random;
import java.util.concurrent.TimeUnit;
/**
* @Author you guess
* @Date 2021/1/17 12:57
* @Version 1.0
* @Desc
*/
public class SideOutputsDataStreamTest {
private static final Logger LOG = LoggerFactory.getLogger(SideOutputsDataStreamTest.class);
private static final String[] TYPE = {"a苹果", "b梨", "c西瓜", "d葡萄", "e火龙果"};
public static void main(String[] args) throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//添加自定义数据源,每秒发出一笔订单信息{商品名称,商品数量}
DataStreamSource<Tuple2<String, Integer>> orderSource = env.addSource(new SourceFunction<Tuple2<String, Integer>>() {
private volatile boolean isRunning = true;
private final Random random = new Random();
@Override
public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
while (isRunning) {
TimeUnit.SECONDS.sleep(1);
Tuple2<String, Integer> tuple2 = Tuple2.of(TYPE[random.nextInt(TYPE.length)], random.nextInt(10));
System.out.println(new Date() + ",提交元素:" + tuple2);
ctx.collect(tuple2);
}
}
@Override
public void cancel() {
isRunning = false;
}
}, "order-info");
final OutputTag<Tuple2<String, Integer>> preTag = new OutputTag<Tuple2<String, Integer>>("pre") {
};
final OutputTag<Tuple2<String, Integer>> postTag = new OutputTag<Tuple2<String, Integer>>("post") {
};
SingleOutputStreamOperator<Object> singleOutputStreamOperator = orderSource.process(new ProcessFunction<Tuple2<String, Integer>, Object>() {
@Override
public void processElement(Tuple2<String, Integer> value, Context ctx, Collector<Object> out) throws Exception {
if (value.f0.contains("a") || value.f0.contains("b") || value.f0.contains("c")) {
ctx.output(preTag, value);
} else {
ctx.output(postTag, value);
}
}
});
DataStream<Tuple2<String, Integer>> preSplitStream = singleOutputStreamOperator.getSideOutput(preTag);
preSplitStream.print();
env.execute("Flink Streaming Java API Skeleton");
}
}
输出:
Sun Jan 17 13:08:37 CST 2021,提交元素:(c西瓜,4)
9> (c西瓜,4)
Sun Jan 17 13:08:38 CST 2021,提交元素:(c西瓜,4)
10> (c西瓜,4)
Sun Jan 17 13:08:39 CST 2021,提交元素:(b梨,5)
11> (b梨,5)
Sun Jan 17 13:08:40 CST 2021,提交元素:(d葡萄,7)
Sun Jan 17 13:08:41 CST 2021,提交元素:(a苹果,5)
1> (a苹果,5)
Sun Jan 17 13:08:42 CST 2021,提交元素:(d葡萄,9)
Sun Jan 17 13:08:43 CST 2021,提交元素:(b梨,5)
3> (b梨,5)
Side outputs are more flexible and more efficient than split/select.
flink 1.9.2 jdk1.8
end