The code
First, we need to add the Maven dependencies:
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<flink.version>1.13.2</flink.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.67</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.2</version>
</dependency>
<!-- needed by AsyncHandler below (RandomUtils); version is a reasonable recent pick -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
</dependencies>
These dependencies pull in the core packages a basic Flink job needs.
Next, the main Flink job:
package org.idea.flink.api;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.idea.flink.api.constants.KafkaProperties;
import org.idea.flink.api.model.UserClickMsg;
import org.idea.flink.api.model.UserMsgFlatMapFunction;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
/**
* @author idea
* @create 2024/3/16 17:46
* @description Consume Kafka data sources
*/
public class KafkaSourceMain {
private static StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
public static void main(String[] args) throws Exception {
System.out.println("开启flink程序接收kafka数据源");
env.enableCheckpointing(60000);
Properties properties = new Properties();
//Point this at your own Kafka cluster; it is where the external data stream comes in
properties.setProperty("bootstrap.servers", KafkaProperties.KAFKA_BROKER_ADDRESS);
properties.setProperty("group.id", "test_group_id");
//Assuming several topics feed us messages, we can use flatMap to normalize the message format and then union the streams together
FlinkKafkaConsumer<String> userCenterSource = new FlinkKafkaConsumer<>(KafkaProperties.USER_SERVICE_TOPIC, new SimpleStringSchema(), properties);
DataStream<UserClickMsg> userDataStream = env.addSource(userCenterSource).name("user messages").flatMap(new UserMsgFlatMapFunction());
FlinkKafkaConsumer<String> orderCenterSource = new FlinkKafkaConsumer<>(KafkaProperties.ORDER_SERVICE_TOPIC, new SimpleStringSchema(), properties);
DataStream<UserClickMsg> orderDataStream = env.addSource(orderCenterSource).name("order messages").flatMap(new UserMsgFlatMapFunction());
//union returns a new stream rather than modifying its receiver, so capture the result
DataStream<UserClickMsg> unionDataStream = userDataStream.union(orderDataStream);
unionDataStream.print();
//Key the stream and forward it to a dedicated topic (e.g. for ELK-style logging); real dedup logic would go in the process function below
SingleOutputStreamOperator<UserClickMsg> singleOutputStreamOperator = unionDataStream.
keyBy(userClickMsg -> userClickMsg.getUserId() + ":" + userClickMsg.getGoodId() + ":" + userClickMsg.getAction() + ":" + userClickMsg.getPlatform())
.process(new KeyedProcessFunction<String, UserClickMsg, UserClickMsg>() {
@Override
public void processElement(UserClickMsg value, KeyedProcessFunction<String, UserClickMsg, UserClickMsg>.Context ctx, Collector<UserClickMsg> out) throws Exception {
//Pass-through for now; actual dedup would consult keyed state here (see the sketch after this class)
out.collect(value);
}
}).name("旁路上报kafka消息");
//Side output: report every message after the keyed step
singleOutputStreamOperator.flatMap(new FlatMapFunction<UserClickMsg, String>() {
@Override
public void flatMap(UserClickMsg userClickMsg, Collector<String> collector) throws Exception {
collector.collect(JSON.toJSONString(userClickMsg));
}
}).addSink(getSinkOutProducer()).name("report deduped messages to Kafka");
//Async processing
DataStream<String> asyncStream = AsyncDataStream.unorderedWait(singleOutputStreamOperator, new AsyncHandler(), 5, TimeUnit.SECONDS);
asyncStream.addSink(getAsyncProducer()).name("async processing results");
asyncStream.print();
env.execute("test-flink-consumer");
}
//Build a Kafka producer for emitting messages
public static FlinkKafkaProducer<String> getAsyncProducer() {
Properties rptDataSinkProp = new Properties();
rptDataSinkProp.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaProperties.KAFKA_BROKER_ADDRESS);
rptDataSinkProp.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 1000 * 700 + "");
rptDataSinkProp.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1");
rptDataSinkProp.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true");
return new FlinkKafkaProducer<String>( "async_handle_result", new SimpleStringSchema(),rptDataSinkProp);
}
//Build a Kafka producer for emitting messages
public static FlinkKafkaProducer<String> getSinkOutProducer() {
Properties rptDataSinkProp = new Properties();
rptDataSinkProp.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaProperties.KAFKA_BROKER_ADDRESS);
rptDataSinkProp.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 1000 * 700 + "");
rptDataSinkProp.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1");
rptDataSinkProp.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true");
return new FlinkKafkaProducer<String>( "sink_out_topic", new SimpleStringSchema(),rptDataSinkProp);
}
}
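Note that the KeyedProcessFunction above only forwards elements; it does not actually drop duplicates yet. For reference, here is a minimal sketch of what real dedup could look like using Flink's keyed ValueState (the class name and the one-day state TTL are illustrative assumptions, not part of the original job):
import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import org.idea.flink.api.model.UserClickMsg;

public class DedupProcessFunction extends KeyedProcessFunction<String, UserClickMsg, UserClickMsg> {
    private transient ValueState<Boolean> seen;

    @Override
    public void open(Configuration parameters) {
        ValueStateDescriptor<Boolean> descriptor = new ValueStateDescriptor<>("seen", Boolean.class);
        //Expire the flag after a day so keyed state does not grow without bound (assumed policy)
        descriptor.enableTimeToLive(StateTtlConfig.newBuilder(Time.days(1)).build());
        seen = getRuntimeContext().getState(descriptor);
    }

    @Override
    public void processElement(UserClickMsg value, Context ctx, Collector<UserClickMsg> out) throws Exception {
        //Only emit the first element seen for the current key
        if (seen.value() == null) {
            seen.update(true);
            out.collect(value);
        }
    }
}
Swapping this in for the anonymous KeyedProcessFunction in KafkaSourceMain would give true first-occurrence semantics per key.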
The UserClickMsg message object (it lives in org.idea.flink.api.model, matching the import in KafkaSourceMain):
package org.idea.flink.api.model;
/**
* @author idea
* @create 2024/3/16 19:54
* @description User click message object
*/
public class UserClickMsg {
private String platform;
//user id
private Long userId;
//click type: 1 = favorite, 2 = unfavorite, 3 = view, 4 = block
private int action;
//goods id
private Long goodId;
public String getPlatform() {
return platform;
}
public void setPlatform(String platform) {
this.platform = platform;
}
public Long getUserId() {
return userId;
}
public void setUserId(Long userId) {
this.userId = userId;
}
public int getAction() {
return action;
}
public void setAction(int action) {
this.action = action;
}
public Long getGoodId() {
return goodId;
}
public void setGoodId(Long goodId) {
this.goodId = goodId;
}
@Override
public String toString() {
return "UserClickMsg{" +
"platform='" + platform + ''' +
", userId=" + userId +
", action=" + action +
", goodId=" + goodId +
'}';
}
}
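Incidentally, since lombok is already on the classpath, the same class can be written far more compactly; a sketch, assuming you are comfortable with lombok's annotation processing:
package org.idea.flink.api.model;

import lombok.Data;

//@Data generates the getters, setters, toString, equals and hashCode shown above
@Data
public class UserClickMsg {
    private String platform;
    //user id
    private Long userId;
    //click type: 1 = favorite, 2 = unfavorite, 3 = view, 4 = block
    private int action;
    //goods id
    private Long goodId;
}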
The FlatMapFunction used for format conversion:
package org.idea.flink.api.model;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author idea
* @create 2024/3/16 19:55
* @description User message conversion function
*/
public class UserMsgFlatMapFunction implements FlatMapFunction<String, UserClickMsg> {
private static final Logger LOGGER = LoggerFactory.getLogger(UserMsgFlatMapFunction.class);
@Override
public void flatMap(String msg, Collector<UserClickMsg> collector) throws Exception {
try {
UserClickMsg userMsg = JSON.parseObject(msg, UserClickMsg.class);
System.out.println(userMsg);
collector.collect(userMsg);
} catch (Exception e) {
LOGGER.error("msg content error: {}", msg, e);
}
}
}
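For reference, the flatMap above expects each Kafka record to be a JSON string whose fields match UserClickMsg. A quick sketch of what such a payload looks like (the field values are made up for illustration):
//Build a sample message and print the JSON that UserMsgFlatMapFunction can parse
UserClickMsg sample = new UserClickMsg();
sample.setPlatform("ios");
sample.setUserId(1001L);
sample.setAction(3); //3 = view
sample.setGoodId(2002L);
System.out.println(JSON.toJSONString(sample));
//prints something like: {"action":3,"goodId":2002,"platform":"ios","userId":1001}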
The async handler:
package org.idea.flink.api;
import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.RandomUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.idea.flink.api.model.UserClickMsg;
import java.util.Collections;
import java.util.concurrent.*;
/**
* @author idea
* @create 2024/3/17 08:35
* @description Async processing handler
*/
public class AsyncHandler extends RichAsyncFunction<UserClickMsg, String> {
private transient ThreadPoolExecutor threadPool;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
threadPool = new ThreadPoolExecutor(10, 100, 3000, TimeUnit.MILLISECONDS,
new ArrayBlockingQueue<>(1000), new ThreadFactory() {
@Override
public Thread newThread(Runnable r) {
Thread thread = new Thread(r);
thread.setName("async-handle-" + RandomUtils.nextInt(0,100));
return thread;
}
});
}
@Override
public void close() throws Exception {
super.close();
threadPool.shutdown();
}
@Override
public void asyncInvoke(UserClickMsg input, ResultFuture<String> resultFuture) throws Exception {
CompletableFuture.runAsync(() -> {
System.out.println("async processing: " + Thread.currentThread().getName() + "|" + JSON.toJSONString(input));
//Custom business logic goes here, e.g. Elasticsearch or MongoDB lookups
resultFuture.complete(Collections.singleton("success"));
}, threadPool);
}
}
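One caveat with RichAsyncFunction: if the future is not completed within the timeout passed to unorderedWait (5 seconds above), the job fails by default. If you would rather degrade gracefully, you can override timeout inside AsyncHandler; a minimal sketch:
@Override
public void timeout(UserClickMsg input, ResultFuture<String> resultFuture) throws Exception {
    //Emit a fallback result instead of failing the whole job on timeout
    resultFuture.complete(Collections.singleton("timeout"));
}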
This code makes use of flatMap, union, addSink, and AsyncDataStream.unorderedWait. Let me briefly explain what each of them does.
- flatMap
You can think of it like flatMap in a JDK 8 stream pipeline: each input element of type A is transformed into zero or more output elements of type B.
- union
Merges several input streams into one, which keeps the downstream code simpler; all of the streams must share the same element type. Note that union returns a new stream rather than modifying the one it is called on (see the sketch after this list).
- addSink
Adds an output sink; we usually sink Flink's results somewhere downstream, e.g. deliver them to another Kafka topic.
- AsyncDataStream.unorderedWait
Async processing; we typically push heavier I/O-bound work, such as Elasticsearch or MongoDB lookups, into an async function so it does not block the stream.
Running it
Send Kafka messages to the user_service_topic and order_service_topic topics, then subscribe to the async_handle_result and sink_out_topic topics; there you will see, respectively, the results of Flink's async processing and the format-converted copies of every incoming message.
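If you want to drive a quick end-to-end test from code, something like the following works with the plain Kafka client (the localhost broker address and the payload values are assumptions for illustration):
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class SendTestMsg {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092"); //assumed local broker
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            //A JSON payload matching the UserClickMsg fields
            String json = "{\"platform\":\"ios\",\"userId\":1001,\"action\":3,\"goodId\":2002}";
            producer.send(new ProducerRecord<>("user_service_topic", json));
        }
    }
}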