1. DataStream API Join
DataStream 中的 Join 即 Window Join,通过把两个流的数据分配到同一个窗口,来把两个流的数据 join 到一起,分配到同一个窗口且 key 相同的元素做笛卡尔积作为输出结果。所以跟 Flink 中的 Window 一样,Window Join 也支持 Tumbling Window Join、Sliding Window Join、Session Window Join,除此以外还支持 Interval Join。
1.1 Window Join
以 Tumbling Window Join 为例
示例代码如下
import cn.hutool.core.date.DateUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.junit.Test;
/**
 * Demo of a tumbling event-time Window Join fed from a local socket.
 *
 * <p>Each input line is expected to hold at least three space-separated fields,
 * {@code <timestamp> <tag> <key>}:
 * <ul>
 *   <li>field 0 is the event time, parsed with {@code DateUtil.parse}
 *       (the sample input uses {@code yyyyMMddHHmmss});</li>
 *   <li>field 1 routes the line: a tag starting with {@code A} goes to the first stream,
 *       a tag starting with {@code B} goes to the second stream;</li>
 *   <li>field 2 is the join key.</li>
 * </ul>
 *
 * <p>Lines with fewer than three fields (for example a blank line typed into {@code nc})
 * are dropped before any field is accessed, so they cannot fail the job with an
 * {@link ArrayIndexOutOfBoundsException}.
 */
@Slf4j
public class SocketDemo {

    /** Position of the event-time field in a split input line. */
    private static final int TIMESTAMP_FIELD = 0;

    /** Position of the tag field that decides which stream a line belongs to. */
    private static final int TAG_FIELD = 1;

    /** Position of the join-key field. */
    private static final int KEY_FIELD = 2;

    /** Minimum number of fields a line must have to be processed. */
    private static final int FIELD_COUNT = 3;

    /**
     * Joins the "A" lines with the "B" lines that share the same key and fall into the same
     * 5-second tumbling event-time window, printing {@code first->second} for every pair.
     *
     * <p>Start the input side first with {@code nc -lk 12345}.
     *
     * @throws Exception if the Flink job fails
     */
    @Test
    public void windowJoinTest() throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Since Flink 1.12 the default TimeCharacteristic is EventTime, so
        // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) is not needed.
        DataStreamSource<String> dataStreamSource = env.socketTextStream("localhost", 12345);

        // The watermark emission interval defaults to 200 ms and normally needs no change
        // (env.getConfig().setAutoWatermarkInterval(200) would set it explicitly).
        // Timestamps are treated as monotonously increasing to keep the demo simple.
        SingleOutputStreamOperator<String> watermarkStream = dataStreamSource
                // Drop malformed lines first: every later operator indexes into the split line.
                .filter(SocketDemo::hasAllFields)
                .uid("well_formed_filter")
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<String>forMonotonousTimestamps()
                                .withTimestampAssigner((element, recordTimestamp) ->
                                        DateUtil.parse(field(element, TIMESTAMP_FIELD)).getTime()))
                .uid("source");

        // Split into two streams to prepare for the join below.
        SingleOutputStreamOperator<String> firstStream =
                watermarkStream.filter(tagStartsWith("A")).uid("first_stream");
        SingleOutputStreamOperator<String> secondStream =
                watermarkStream.filter(tagStartsWith("B")).uid("second_stream");

        firstStream.join(secondStream)
                .where(new JoinKeySelector())
                .equalTo(new JoinKeySelector())
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .apply(new ArrowJoin())
                .print();

        env.execute("demo");
    }

    /**
     * Tells whether a line has enough space-separated fields to be processed.
     *
     * @param line raw input line, may be {@code null} or empty
     * @return {@code true} if the line splits into at least {@link #FIELD_COUNT} fields
     */
    private static boolean hasAllFields(String line) {
        String[] fields = StringUtils.split(line, StringUtils.SPACE);
        return fields != null && fields.length >= FIELD_COUNT;
    }

    /**
     * Returns one space-separated field of a line that already passed {@link #hasAllFields}.
     *
     * @param line  well-formed input line
     * @param index zero-based field position
     * @return the field at {@code index}
     */
    private static String field(String line, int index) {
        return StringUtils.split(line, StringUtils.SPACE)[index];
    }

    /**
     * Builds a filter that keeps the lines whose tag field starts with the given prefix.
     *
     * @param prefix required start of the tag field
     * @return a serializable filter that does not capture the enclosing test instance
     */
    private static FilterFunction<String> tagStartsWith(String prefix) {
        return line -> StringUtils.startsWith(field(line, TAG_FIELD), prefix);
    }

    /** Extracts the join key (field 2) from a line; shared by both sides of the join. */
    private static final class JoinKeySelector implements KeySelector<String, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public String getKey(String value) {
            return field(value, KEY_FIELD);
        }
    }

    /** Renders a joined pair as {@code first->second}. */
    private static final class ArrowJoin implements JoinFunction<String, String, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public String join(String first, String second) {
            return first + "->" + second;
        }
    }
}
输入与输出
1.2 Interval Join
Interval Join 是通过构造类似窗口的方式来把数据 join 到一起,目前只支持 Event Time
构造窗口方式: orangeElem.ts + lowerBound <= greenElem.ts <= orangeElem.ts + upperBound
示例代码如下
import cn.hutool.core.date.DateUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import org.junit.Test;
/**
 * Demo of an event-time Interval Join fed from a local socket.
 *
 * <p>Each input line is expected to hold at least three space-separated fields,
 * {@code <timestamp> <tag> <key>}:
 * <ul>
 *   <li>field 0 is the event time, parsed with {@code DateUtil.parse}
 *       (the sample input uses {@code yyyyMMddHHmmss});</li>
 *   <li>field 1 routes the line: a tag starting with {@code A} goes to the first (left) stream,
 *       a tag starting with {@code B} goes to the second (right) stream;</li>
 *   <li>field 2 is the join key.</li>
 * </ul>
 *
 * <p>Lines with fewer than three fields (for example a blank line typed into {@code nc})
 * are dropped before any field is accessed, so they cannot fail the job with an
 * {@link ArrayIndexOutOfBoundsException}.
 */
@Slf4j
public class SocketDemo {

    /** Position of the event-time field in a split input line. */
    private static final int TIMESTAMP_FIELD = 0;

    /** Position of the tag field that decides which stream a line belongs to. */
    private static final int TAG_FIELD = 1;

    /** Position of the join-key field. */
    private static final int KEY_FIELD = 2;

    /** Minimum number of fields a line must have to be processed. */
    private static final int FIELD_COUNT = 3;

    /**
     * Joins every "A" line with the "B" lines of the same key whose timestamp lies within
     * {@code [left.ts - 2s, left.ts + 1s]}, printing {@code left->right} for every pair.
     *
     * <p>Start the input side first with {@code nc -lk 12345}.
     *
     * @throws Exception if the Flink job fails
     */
    @Test
    public void intervalJoinTest() throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Since Flink 1.12 the default TimeCharacteristic is EventTime, so
        // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) is not needed.
        DataStreamSource<String> dataStreamSource = env.socketTextStream("localhost", 12345);

        // The watermark emission interval defaults to 200 ms and normally needs no change
        // (env.getConfig().setAutoWatermarkInterval(200) would set it explicitly).
        // Timestamps are treated as monotonously increasing to keep the demo simple.
        SingleOutputStreamOperator<String> watermarkStream = dataStreamSource
                // Drop malformed lines first: every later operator indexes into the split line.
                .filter(SocketDemo::hasAllFields)
                .uid("well_formed_filter")
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<String>forMonotonousTimestamps()
                                .withTimestampAssigner((element, recordTimestamp) ->
                                        DateUtil.parse(field(element, TIMESTAMP_FIELD)).getTime()))
                .uid("source");

        // Split into two streams to prepare for the join below.
        SingleOutputStreamOperator<String> firstStream =
                watermarkStream.filter(tagStartsWith("A")).uid("first_stream");
        SingleOutputStreamOperator<String> secondStream =
                watermarkStream.filter(tagStartsWith("B")).uid("second_stream");

        firstStream.keyBy(new JoinKeySelector())
                .intervalJoin(secondStream.keyBy(new JoinKeySelector()))
                // Lower bound -2s, upper bound +1s, both relative to the left element's timestamp.
                .between(Time.seconds(-2), Time.seconds(1))
                .process(new ArrowProcessJoin())
                .uid("interval_join")
                .print();

        env.execute("demo");
    }

    /**
     * Tells whether a line has enough space-separated fields to be processed.
     *
     * @param line raw input line, may be {@code null} or empty
     * @return {@code true} if the line splits into at least {@link #FIELD_COUNT} fields
     */
    private static boolean hasAllFields(String line) {
        String[] fields = StringUtils.split(line, StringUtils.SPACE);
        return fields != null && fields.length >= FIELD_COUNT;
    }

    /**
     * Returns one space-separated field of a line that already passed {@link #hasAllFields}.
     *
     * @param line  well-formed input line
     * @param index zero-based field position
     * @return the field at {@code index}
     */
    private static String field(String line, int index) {
        return StringUtils.split(line, StringUtils.SPACE)[index];
    }

    /**
     * Builds a filter that keeps the lines whose tag field starts with the given prefix.
     *
     * @param prefix required start of the tag field
     * @return a serializable filter that does not capture the enclosing test instance
     */
    private static FilterFunction<String> tagStartsWith(String prefix) {
        return line -> StringUtils.startsWith(field(line, TAG_FIELD), prefix);
    }

    /** Extracts the join key (field 2) from a line; shared by both sides of the join. */
    private static final class JoinKeySelector implements KeySelector<String, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public String getKey(String value) {
            return field(value, KEY_FIELD);
        }
    }

    /** Emits every matched pair as {@code left->right}. */
    private static final class ArrowProcessJoin extends ProcessJoinFunction<String, String, String> {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(String left, String right, Context ctx, Collector<String> out) {
            out.collect(left + "->" + right);
        }
    }
}
输入输出