Hands-on Notes: Stream Union and Async Processing in Flink

The code

First we need to add the Maven dependencies:

<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.13.2</flink.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.67</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.2</version>
    </dependency>
</dependencies>

These dependencies pull in the basic packages a Flink job needs. Note that the _2.12 suffix on several artifacts refers to the Scala version of the Flink build, so it should match your cluster.

Then we can start writing the main Flink code:

package org.idea.flink.api;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.idea.flink.api.constants.KafkaProperties;
import org.idea.flink.api.model.UserClickMsg;
import org.idea.flink.api.model.UserMsgFlatMapFunction;

import java.util.Properties;
import java.util.concurrent.TimeUnit;

/**
 * @author idea
 * @create 2024/3/16 17:46
 * @description Consumes Kafka data sources
 */
public class KafkaSourceMain {

    private static StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    public static void main(String[] args) throws Exception {
        System.out.println("开启flink程序接收kafka数据源");
        env.enableCheckpointing(60000);
        Properties properties = new Properties();
        //your own Kafka address goes here, used to receive the external data streams
        properties.setProperty("bootstrap.servers", KafkaProperties.KAFKA_BROKER_ADDRESS);
        properties.setProperty("group.id", "test_group_id");
        //suppose several topics feed messages in; we can use flatMap to normalize the message format, then union to merge the streams
        FlinkKafkaConsumer<String> userCenterSource = new FlinkKafkaConsumer<>(KafkaProperties.USER_SERVICE_TOPIC, new SimpleStringSchema(), properties);
        DataStream<UserClickMsg> userDataStream = env.addSource(userCenterSource).name("user messages").flatMap(new UserMsgFlatMapFunction());
        FlinkKafkaConsumer<String> orderCenterSource = new FlinkKafkaConsumer<>(KafkaProperties.ORDER_SERVICE_TOPIC, new SimpleStringSchema(), properties);
        DataStream<UserClickMsg> orderDataStream = env.addSource(orderCenterSource).name("order center").flatMap(new UserMsgFlatMapFunction());

        //union returns a new stream containing the elements of both inputs
        DataStream<UserClickMsg> unionDataStream = userDataStream.union(orderDataStream);
        unionDataStream.print();
        //group identical clicks onto one key for a simple dedup, then report to a designated topic (e.g. for ELK logging); the pass-through below is where the dedup state would live, see the sketch after this listing
        SingleOutputStreamOperator<UserClickMsg> singleOutputStreamOperator = unionDataStream
                .keyBy(userClickMsg -> userClickMsg.getUserId() + ":" + userClickMsg.getGoodId() + ":" + userClickMsg.getAction() + ":" + userClickMsg.getPlatform())
                .process(new KeyedProcessFunction<String, UserClickMsg, UserClickMsg>() {
                    @Override
                    public void processElement(UserClickMsg value, KeyedProcessFunction<String, UserClickMsg, UserClickMsg>.Context ctx, Collector<UserClickMsg> out) throws Exception {
                        out.collect(value);
                    }
                }).name("side report of kafka messages");
        //side-channel all messages and report them
        singleOutputStreamOperator.flatMap(new FlatMapFunction<UserClickMsg, String>() {
            @Override
            public void flatMap(UserClickMsg userClickMsg, Collector<String> collector) throws Exception {
                collector.collect(JSON.toJSONString(userClickMsg));
            }
        }).addSink(getSinkOutProducer()).name("去重后上报kafka消息");
        //async processing
        DataStream<String> asyncStream = AsyncDataStream.unorderedWait(singleOutputStreamOperator, new AsyncHandler(), 5, TimeUnit.SECONDS);
        asyncStream.addSink(getAsyncProducer()).name("async processing results");
        asyncStream.print();
        env.execute("test-flink-consumer");
    }

    //build a kafka producer for emitting messages
    public static FlinkKafkaProducer<String> getAsyncProducer() {
        Properties rptDataSinkProp = new Properties();
        rptDataSinkProp.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaProperties.KAFKA_BROKER_ADDRESS);
        rptDataSinkProp.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 1000 * 700 + "");
        rptDataSinkProp.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1");
        rptDataSinkProp.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true");
        return new FlinkKafkaProducer<>("async_handle_result", new SimpleStringSchema(), rptDataSinkProp);
    }

    //build a kafka producer for emitting messages
    public static FlinkKafkaProducer<String> getSinkOutProducer() {
        Properties rptDataSinkProp = new Properties();
        rptDataSinkProp.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, KafkaProperties.KAFKA_BROKER_ADDRESS);
        rptDataSinkProp.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 1000 * 700 + "");
        rptDataSinkProp.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "1");
        rptDataSinkProp.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true");
        return new FlinkKafkaProducer<>("sink_out_topic", new SimpleStringSchema(), rptDataSinkProp);
    }
}
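The KeyedProcessFunction above only forwards each element; if you want the actual dedup that the comment describes, keyed state is the usual tool. A minimal sketch, assuming first-occurrence-wins semantics (the class name and state name here are my own, not from the original project):

package org.idea.flink.api;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import org.idea.flink.api.model.UserClickMsg;

public class DedupProcessFunction extends KeyedProcessFunction<String, UserClickMsg, UserClickMsg> {

    //one flag per key, stored in the checkpointed state backend
    private transient ValueState<Boolean> seen;

    @Override
    public void open(Configuration parameters) {
        seen = getRuntimeContext().getState(new ValueStateDescriptor<>("seen-click", Boolean.class));
    }

    @Override
    public void processElement(UserClickMsg value, Context ctx, Collector<UserClickMsg> out) throws Exception {
        //emit only the first message observed for each key
        if (seen.value() == null) {
            seen.update(true);
            out.collect(value);
        }
    }
}

In production you would also configure state TTL on the descriptor so the seen-set does not grow without bound.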

The UserClickMsg message object:

package org.idea.flink.api.model;

/**
 * @author idea
 * @create 2024/3/16 19:54
 * @description User click message
 */
public class UserClickMsg {

    private String platform;

    //user id
    private Long userId;

    //click type: 1 = favorite, 2 = unfavorite, 3 = view, 4 = block
    private int action;

    //good (item) id
    private Long goodId;

    public String getPlatform() {
        return platform;
    }

    public void setPlatform(String platform) {
        this.platform = platform;
    }

    public Long getUserId() {
        return userId;
    }

    public void setUserId(Long userId) {
        this.userId = userId;
    }

    public int getAction() {
        return action;
    }

    public void setAction(int action) {
        this.action = action;
    }

    public Long getGoodId() {
        return goodId;
    }

    public void setGoodId(Long goodId) {
        this.goodId = goodId;
    }

    @Override
    public String toString() {
        return "UserClickMsg{" +
                "platform='" + platform + ''' +
                ", userId=" + userId +
                ", action=" + action +
                ", goodId=" + goodId +
                '}';
    }
}
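The KafkaProperties constants class imported by KafkaSourceMain is not shown in the original; here is a minimal sketch so the code compiles (the broker address is a placeholder, and the topic names follow the ones used at the end of this article):

package org.idea.flink.api.constants;

//Kafka connection constants referenced by KafkaSourceMain
public class KafkaProperties {

    //placeholder address, point this at your own cluster
    public static final String KAFKA_BROKER_ADDRESS = "127.0.0.1:9092";

    public static final String USER_SERVICE_TOPIC = "user_service_topic";

    public static final String ORDER_SERVICE_TOPIC = "order_service_topic";
}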

The FlatMapFunction used for format conversion:

package org.idea.flink.api.model;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author idea
 * @create 2024/3/16 19:55
 * @description User message conversion function
 */
public class UserMsgFlatMapFunction implements FlatMapFunction<String, UserClickMsg> {

    private static final Logger LOGGER = LoggerFactory.getLogger(UserMsgFlatMapFunction.class);

    @Override
    public void flatMap(String msg, Collector<UserClickMsg> collector) throws Exception {
        try {
            UserClickMsg userMsg = JSON.parseObject(msg, UserClickMsg.class);
            System.out.println(userMsg);
            collector.collect(userMsg);
        } catch (Exception e) {
            LOGGER.error("msg content error:{}", msg);
        }
    }
}

Async processing:

package org.idea.flink.api;

import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.RandomUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.idea.flink.api.model.UserClickMsg;

import java.util.Collections;
import java.util.concurrent.*;

/**
 * @author idea
 * @create 2024/3/17 08:35
 * @description Async processing handler
 */
public class AsyncHandler extends RichAsyncFunction<UserClickMsg, String> {

    private transient ThreadPoolExecutor threadPool;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        threadPool = new ThreadPoolExecutor(10, 100, 3000, TimeUnit.MILLISECONDS,
                new ArrayBlockingQueue<>(1000), new ThreadFactory() {
            @Override
            public Thread newThread(Runnable r) {
                Thread thread = new Thread(r);
                thread.setName("async-handle-" + RandomUtils.nextInt(0,100));
                return thread;
            }
        });
    }

    @Override
    public void close() throws Exception {
        super.close();
        threadPool.shutdown();
    }

    @Override
    public void asyncInvoke(UserClickMsg input, ResultFuture<String> resultFuture) throws Exception {
        CompletableFuture.runAsync(() -> {
            System.out.println("async handling: " + Thread.currentThread().getName() + "|" + JSON.toJSONString(input));
            //custom business logic goes here, e.g. ES or MongoDB lookups
            resultFuture.complete(Collections.singleton("success"));
        }, threadPool).exceptionally(ex -> {
            //surface failures instead of letting the element hang until the async timeout
            resultFuture.completeExceptionally(ex);
            return null;
        });
    }
}
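One detail worth knowing: if resultFuture is never completed within the 5-second timeout passed to unorderedWait, Flink fails the job by default. The AsyncFunction interface lets you override timeout() to degrade gracefully instead; a minimal sketch that could be added to AsyncHandler (the fallback value is an arbitrary choice of mine):

@Override
public void timeout(UserClickMsg input, ResultFuture<String> resultFuture) throws Exception {
    //complete with a fallback value instead of failing the whole job
    resultFuture.complete(Collections.singleton("timeout"));
}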

This code uses the flatMap, union, addSink, and AsyncDataStream.unorderedWait operations. A brief explanation of what each one does:

  • flatMap

Similar in spirit to the flattening operations in JDK 8 streams: it maps each input element of format A to zero or more output elements of format B, which also makes it a convenient place to parse and filter in a single step.

  • union

Merges multiple input streams into one stream, which keeps the downstream code simpler; all of the unioned streams must share the same element type.

  • addSink

Adds an output sink; we usually sink Flink's results somewhere downstream, for example by delivering them to the next Kafka topic.

  • AsyncDataStream.unorderedWait

Async processing; we typically move slower business operations here, such as ES or MongoDB lookups, so that they do not block the rest of the stream (see the sketch below).
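Both unorderedWait and its sibling orderedWait also accept an optional capacity argument that caps how many async requests may be in flight at once (backpressure kicks in beyond that). A sketch with this job's types, assuming an arbitrary capacity of 100:

//unordered: results are emitted as soon as they complete, at most 100 in flight
DataStream<String> unordered = AsyncDataStream.unorderedWait(
        singleOutputStreamOperator, new AsyncHandler(), 5, TimeUnit.SECONDS, 100);

//ordered: results are emitted in arrival order, at the cost of extra latency
DataStream<String> ordered = AsyncDataStream.orderedWait(
        singleOutputStreamOperator, new AsyncHandler(), 5, TimeUnit.SECONDS);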

Run results

Deliver Kafka messages to the user_service_topic and order_service_topic topics, then watch the async_handle_result and sink_out_topic topics: the former shows Flink's async processing results, the latter shows every incoming message after format conversion.
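For a quick test, any payload matching the UserClickMsg fields will do, for example {"platform":"ios","userId":1001,"action":3,"goodId":2002} (values made up), sent with your usual Kafka producer tooling.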