Async Dimension-Join Cache: JoinDim


Preface

If you're not familiar with caches, start with the earlier post on caches.

If you're not familiar with Aerospike, start with the earlier post 初识Aerospike (A First Look at Aerospike).

1. Using Caffeine to cache dimension data

Here Aerospike serves as the storage for the dimension (dim) layer; anyone familiar with data warehousing will know the dim layer well.

/**
 * Uses the builder pattern to construct a CacheLoader for the target database
 * @author except7g
 * @date 2025/6/27
 */
public class CacheLoadBuilder {

    private static class AerospikeCacheLoader implements CacheLoader<String, Map<String,String>> {
        private AerospikeClient client;
        private String namespace;
        private String setName;
        private List<String> binNames;
        private String keyType;

        public AerospikeCacheLoader() {
        }
        // Wrap the string key into an Aerospike Key of the configured type
        private Key getKey(String key) {
            Key asKey;
            if ("String".equals(keyType)) {
                asKey = new Key(namespace, setName, key);
            } else if ("int".equals(keyType)) {
                asKey = new Key(namespace, setName, Integer.parseInt(key));
            } else if ("long".equals(keyType)) {
                asKey = new Key(namespace, setName, Long.parseLong(key));
            } else if ("byte[]".equals(keyType)) {
                asKey = new Key(namespace, setName, key.getBytes());
            } else {
                throw new IllegalArgumentException("Unsupported key type: " + keyType);
            }
            return asKey;
        }
        
        // Synchronously load dim-layer data into the cache
        @Override
        public @Nullable Map<String, String> load(@NonNull String key) throws Exception {
            Map<String, String> resMap = new HashMap<>();
            try {
                Key asKey = getKey(key);
                Record record = client.get(null, asKey);
                // At this point the dimension record for this key has been fetched from Aerospike
                if (record != null) {
                    binNames.forEach(binName -> {
                        if (record.getValue(binName) != null) {
                            resMap.put(binName, String.valueOf(record.getValue(binName)));
                        } else {
                            System.out.println("No dimension data in Aerospike for " + namespace + ":" + setName + ":" + key);
                        }
                    });
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return resMap;
        }
        
        // Asynchronously load dim-layer data into the cache
        @Override
        public @NonNull CompletableFuture<Map<String, String>> asyncLoad(@NonNull String key, @NonNull Executor executor) {
            return CompletableFuture.supplyAsync(() -> {
                try {
                    return load(key);
                } catch (Exception e) {
                    throw new RuntimeException("Async load from Aerospike failed", e);
                }
            }, executor);
        }
    }

    // Builder pattern
    public static class Builder{
        private String host;
        private Integer port;
        private String namespace;
        private String tableName;
        private List<String> binNames;
        private String type;
        private String keyType;

        public Builder(){

        }

        public Builder setHost(String host) {
            this.host = host;
            return this;
        }

        public Builder setPort(Integer port) {
            this.port = port;
            return this;
        }

        public Builder setNamespace(String namespace) {
            this.namespace = namespace;
            return this;
        }

        public Builder setTableName(String tableName) {
            this.tableName = tableName;
            return this;
        }

        public Builder setBinNames(List<String> binNames) {
            this.binNames = binNames;
            return this;
        }


        public Builder setType(String type) {
            this.type = type;
            return this;
        }

        public Builder setKeyType(String keyType) {
            this.keyType = keyType;
            return this;
        }

        // Build the synchronous loader
        public CacheLoader<String, Map<String, String>> build() {
            if ("as".equals(type)) {
                AerospikeCacheLoader asLoader = new AerospikeCacheLoader();
                AerospikeClient asClient = new AerospikeClient(host, port);
                asLoader.client = asClient;
                asLoader.namespace = namespace;
                asLoader.setName = tableName;
                asLoader.binNames = binNames;
                asLoader.keyType = keyType;
                return asLoader;
            } else {
                System.out.println("Loaders for other database types are not implemented yet");
            }
            return null;
        }

        // Build the asynchronous loader
        public AsyncCacheLoader<String, Map<String, String>> buildAsync() {
            if ("as".equals(type)) {
                AerospikeCacheLoader asLoader = new AerospikeCacheLoader();
                AerospikeClient asClient = new AerospikeClient(host, port);
                asLoader.client = asClient;
                asLoader.namespace = namespace;
                asLoader.setName = tableName;
                asLoader.binNames = binNames;
                asLoader.keyType = keyType;
                return asLoader::asyncLoad;
            } else {
                System.out.println("还未更新其他类型的数据库缓存器");
            }
            return null;
        }
    }
}
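
Before wiring the loader into Flink, it can be smoke-tested on its own. Below is a minimal sketch (not from the original post) that builds the synchronous loader and hands it to a plain Caffeine LoadingCache; the host, namespace, set, and bin names are placeholder assumptions borrowed from the test config in section 3:

// Hypothetical standalone usage of CacheLoadBuilder; all connection values are assumptions.
CacheLoader<String, Map<String, String>> loader = new CacheLoadBuilder.Builder()
        .setType("as")
        .setHost("127.0.0.1")
        .setPort(3000)
        .setNamespace("mediav")
        .setTableName("test_dim_sh")
        .setBinNames(Arrays.asList("name", "age"))
        .setKeyType("int")
        .build();

// Synchronous Caffeine cache with the same 2s write expiry used in section 2.
LoadingCache<String, Map<String, String>> dimCache = Caffeine.newBuilder()
        .expireAfterWrite(2, TimeUnit.SECONDS)
        .build(loader);

Map<String, String> dim = dimCache.get("1"); // a miss triggers load(...) against Aerospike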

2. Building the custom async dimension-join function

This is built inside Flink, so it extends RichAsyncFunction.

/**
 * Async-IO dimension join (MethodHandle version)
 * 1. A CacheLoader auto-loads dimension data into the cache. Consistency caveat: dimension
 *    changes within the 2s TTL are not picked up by the cache in time.
 * 2. CompletableFuture orchestrates the async join.
 * 3. Important: JOIN_FIELD_CACHE holds the fields to be filled in, so it stores setters;
 *    KEY_FIELD_CACHE holds the fields the lookup key is read from, so it stores getters.
 * @author expect7g
 * @date 2025/6/27
 */
public class AsyncJoinDimFunction2<T> extends RichAsyncFunction<T, T> {
    private static final Logger LOGGER = Logger.getLogger(AsyncJoinDimFunction2.class.getName());
    private @NonNull AsyncLoadingCache<String, Map<String, String>> asyncCache;
    // Note: the two maps below are plain local caches, not Caffeine caches
    // 1. MethodHandle cache for the dimension fields to join (setters)
    private Map<String, MethodHandle> JOIN_FIELD_CACHE = new ConcurrentHashMap<>();
    // 2. MethodHandle cache for the key fields; a lookup key may be composed of several fields
    private Map<String, MethodHandle> KEY_FIELD_CACHE = new ConcurrentHashMap<>();
    private String defaultValue;
    private FlowConfig.JoinDim joinDim;

    public AsyncJoinDimFunction2(FlowConfig.JoinDim joinDim) {
        this.joinDim = joinDim;
        this.defaultValue = joinDim.getDefaultValue();
    }

    // Initialize the cache
    @Override
    public void open(Configuration parameters) throws Exception {
        if(asyncCache != null){
            asyncCache.synchronous().invalidateAll();
            asyncCache.synchronous().cleanUp();
        }
        // Create the async cache; entries expire 2s after write
        AsyncCacheLoader<String, Map<String, String>> asyncCacheLoader = new CacheLoadBuilder.Builder()
                .setHost(joinDim.getHost())
                .setPort(Integer.valueOf(joinDim.getPort()))
                .setNamespace(joinDim.getNamespace())
                .setTableName(joinDim.getTableName())
                .setBinNames(joinDim.getDimFields())
                .setType(joinDim.getType())
                .setKeyType(joinDim.getKeyType())
                .buildAsync();
        asyncCache = Caffeine.newBuilder()
                .expireAfterWrite(2, TimeUnit.SECONDS)
                .buildAsync(asyncCacheLoader);
    }

    // Async dimension join
    @Override
    public void asyncInvoke(T data, ResultFuture<T> resultFuture) throws Exception {
        // Cache the MethodHandles for this record's key fields and dimension fields locally, to speed up repeated access.
        fillCache(data);
        String key = extractKey(data);
        // Async orchestration
        asyncCache.get(key) // on a cache miss this triggers the loader's asyncLoad, returning a CompletableFuture<Map<String, String>>
                .thenApplyAsync(dimMap -> {
                    // If the dim layer has data for this key, join with it
                    if (dimMap != null && !dimMap.isEmpty()) {
                        return joinDim(data, dimMap, false); // perform the dimension join
                    } else { // otherwise fall back to the default value
                        return joinDim(data, Collections.emptyMap(), true);
                    }
                })
                .thenAccept(res -> { // hand the joined record back to Flink
                    if (res != null) {
                        resultFuture.complete(Collections.singletonList(res));
                    } else {
                        resultFuture.completeExceptionally(new Exception("Async dimension join failed"));
                    }
                })
                // Without this, a failure inside asyncLoad would leave resultFuture hanging until Flink's async timeout fires
                .exceptionally(ex -> {
                    resultFuture.completeExceptionally(ex);
                    return null;
                });
    }

    // Release resources
    @Override
    public void close() throws Exception {
        if(asyncCache != null){
            asyncCache.synchronous().invalidateAll();
            asyncCache.synchronous().cleanUp();
        }
        if (JOIN_FIELD_CACHE != null) {
            JOIN_FIELD_CACHE.clear();
        }
        if (KEY_FIELD_CACHE != null) {
            KEY_FIELD_CACHE.clear();
        }
    }

    // Extract the key and populate the field caches for the join
    private void fillCache(T data) {
        // The crux of the reflection: read the key value from the incoming record, then write the dimension values into it
        // reading: get
        // writing: set
        fillFieldCache(joinDim.getKeyFields(), data, KEY_FIELD_CACHE, "get");
        fillFieldCache(joinDim.getDimFields(), data, JOIN_FIELD_CACHE, "set");
    }

    // Populate the given field cache for this record type
    private void fillFieldCache(List<String> fieldNameList, T data, Map<String, MethodHandle> fieldCache, String type) {
        for (String fieldName : fieldNameList) {
            getMethodHandle(data, fieldCache, type, fieldName);
        }
    }

    // Look up (or lazily create) the getter/setter MethodHandle for a field
    private MethodHandle getMethodHandle(T data, Map<String, MethodHandle> fieldCache, String type, String fieldName) {
        MethodHandle methodHandle = fieldCache.get(fieldName);
        // Double-checked locking for thread safety; ConcurrentHashMap.computeIfAbsent would be a simpler alternative
        if (methodHandle == null) {
            synchronized (fieldCache) {
                methodHandle = fieldCache.get(fieldName);
                if (methodHandle == null) {
                    try {
                        Field field = data.getClass().getDeclaredField(fieldName);
                        field.setAccessible(true);
                        MethodHandles.Lookup lookup = MethodHandles.lookup();
                        // "get": read the field's value from the record
                        if ("get".equals(type)) {
                            methodHandle = lookup.unreflectGetter(field);
                        } else if ("set".equals(type)) { // "set": write the dimension value into the field
                            methodHandle = lookup.unreflectSetter(field);
                        }
                        // Note: up to here the MethodHandle only describes the behavior; nothing has run yet.
                        // The reflection executes at methodHandle.invoke(), e.g. joinMH.invoke(data, value) in joinDim below:
                        // a getter handle reads the field, a setter handle writes it.
                        fieldCache.put(fieldName, methodHandle);
                    } catch (Exception e) {
                        LOGGER.log(Level.SEVERE, "Failed to get MethodHandle for field " + fieldName, e);
                    }
                }
            }
        }
        return methodHandle;
    }

    // Perform the dimension join; the dimension values for this key are already in dimMap, we just fill in the fields
    private T joinDim(T data, Map<String, String> dimMap, boolean isDefault) {
        if (!isDefault) { // fill with dimension values
            for (Map.Entry<String, String> dim : dimMap.entrySet()) {
                try {
                    MethodHandle joinMH = getMethodHandle(data, JOIN_FIELD_CACHE, "set", dim.getKey());
                    // In the handle's parameterType array, [0] is the declaring class and [1] is the field's type
                    Class<?> fieldType = joinMH.type().parameterType(1);
                    Object value = convertValue(dim.getValue(), fieldType);
                    joinMH.invoke(data, value);
                } catch (Throwable e) {
                    LOGGER.log(Level.WARNING, "Join failed: " + dim.getKey(), e);
                }
            }
        } else { // fall back to the default value
            for (Map.Entry<String, MethodHandle> joinEntry : JOIN_FIELD_CACHE.entrySet()) {
                try {
                    MethodHandle joinDe = getMethodHandle(data, JOIN_FIELD_CACHE, "set", joinEntry.getKey());
                    Class<?> fieldType = joinDe.type().parameterType(1);
                    Object value = convertValue(defaultValue, fieldType);
                    joinDe.invoke(data, value);
                } catch (Throwable e) {
                    LOGGER.log(Level.WARNING, "Default-value join failed: " + joinEntry.getKey(), e);
                }
            }
        }
        return data;
    }

    // Convert the string dimension value to the target field type
    private Object convertValue(String value, Class<?> targetType) {
        if (targetType == String.class) {
            return value;
        } else if (targetType == Integer.class || targetType == int.class) {
            if ("".equals(value)) {
                return 0;
            }
            return Integer.parseInt(value);
        } else if (targetType == Long.class || targetType == long.class) {
            if ("".equals(value)) {
                return 0L;
            }
            return Long.parseLong(value);
        } else if (targetType == Boolean.class || targetType == boolean.class) {
            if ("".equals(value)) {
                return false;
            }
            return Boolean.parseBoolean(value);
        } else if (targetType == Double.class || targetType == double.class) {
            if ("".equals(value)) {
                return 0D;
            }
            return Double.parseDouble(value);
        } else {
            throw new IllegalArgumentException("Unsupported type: " + targetType);
        }
    }

    // Extract the key value from the record; composite keys are handled here as well
    private String extractKey(T data) throws Exception {
        StringBuilder resKey = new StringBuilder();
        List<String> keyFields = joinDim.getKeyFields();
        for (int i = 0; i < keyFields.size(); i++) {
            MethodHandle keyMH = getMethodHandle(data, KEY_FIELD_CACHE, "get", keyFields.get(i));
            try {
                // Read the key field's value
                Object value = keyMH.invoke(data);
                resKey.append(value.toString());
                // The separator really belongs to the developer: databases compose multi-field keys
                // differently; joining with "_" is just one common Aerospike convention.
                if (i != keyFields.size() - 1) {
                    resKey.append("_");
                }
            } catch (Throwable e) {
                LOGGER.log(Level.SEVERE, "Failed to extract key value", e);
            }
        }
        return resKey.toString();
    }

    // TODO: if the code that writes to Aerospike is also under our control, call the method below at write
    // time to manually refresh the cache whenever a dimension row changes, fixing the join-consistency issue
    public void updateCache(String key, Map<String, String> newData) {
        asyncCache.put(key, CompletableFuture.completedFuture(newData));
    }
}
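
The TODO above can be sketched out: whoever writes the dimension row to Aerospike refreshes the cache in the same breath, closing most of the 2s staleness window. This is a hedged sketch, not code from the post; writeDim, its parameters, and having a handle to the function instance are all assumptions. In a distributed Flink job each parallel instance holds its own cache, so reaching them all would need a broadcast stream or similar channel.

// Hypothetical write-path hook; not part of the original code.
public void writeDim(AerospikeClient client, AsyncJoinDimFunction2<?> joinFn,
                     String namespace, String set, String key,
                     Map<String, String> dimRow) {
    // 1. Write the new dimension row to Aerospike.
    Bin[] bins = dimRow.entrySet().stream()
            .map(e -> new Bin(e.getKey(), e.getValue()))
            .toArray(Bin[]::new);
    client.put(null, new Key(namespace, set, key), bins);
    // 2. Refresh this instance's cache so readers see the change before the 2s TTL expires.
    joinFn.updateCache(key, dimRow);
}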

3. Test case

The yaml file is the config: it tells Flink which database to use for the dimension join, what the join key is, and which dimension fields to fill in.

runEnv:
  checkPoint:
    enable: false # enable checkpointing
    intervalMillis: 60000 # minimum interval between checkpoints
    checkpointTimeout: 60000 # checkpoint timeout

joinDims:
  - name: with-join-1
    type: as
    host: 127.0.0.1
    port: 3000
    namespace: mediav
    tableName: test2_dim_sh
    keyFields: [report_type,report_id]
    keyType: String
    dimFields: [repeat_name]
    defaultValue: "0"
  - name: with-join-2
    type: as
    host: 127.0.0.1
    port: 3000
    namespace: mediav
    tableName: test_dim_sh
    keyFields: [id]
    keyType: int
    dimFields: [name,age]
    defaultValue: "0"
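
The FlowConfig POJO that SnakeYAML binds this file to is never shown in the post. Below is a minimal sketch inferred from the yaml keys and the getters called in the code above; every field name and type here is an assumption, and Lombok's @Data is used to supply the getters/setters:

import java.util.List;
import lombok.Data;

// Inferred config POJO; treat every field as an assumption read off the yaml.
@Data
public class FlowConfig {
    private RuntimeEnvironment runEnv;
    private List<JoinDim> joinDims;

    @Data
    public static class RuntimeEnvironment {
        private CheckPoint checkPoint;
    }

    @Data
    public static class CheckPoint {
        private boolean enable;
        private long intervalMillis;
        private long checkpointTimeout;
    }

    @Data
    public static class JoinDim {
        private String name;
        private String type;
        private String host;
        private String port; // kept as String: open() parses it with Integer.valueOf
        private String namespace;
        private String tableName;
        private List<String> keyFields;
        private String keyType;
        private List<String> dimFields;
        private String defaultValue;
    }
}

The driver below loads this config and chains the async joins:
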
public class dimTest {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setString("rest.port","8081");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);

        // Create the Yaml parser
        Yaml yaml = new Yaml();
        // Load the YAML file and bind it to a FlowConfig object
        FileInputStream inputStream = new FileInputStream("src/main/java/com/flink/dw/flow/joinDim/test2.yaml");
        FlowConfig flowConfig = yaml.loadAs(inputStream, FlowConfig.class);
        /* Mock the Source and Sink ourselves for now */
        FlowConfig.RuntimeEnvironment runEnv = flowConfig.getRunEnv();
        if (runEnv.getCheckPoint() != null) {
            env.enableCheckpointing(runEnv.getCheckPoint().getIntervalMillis());
            env.getCheckpointConfig().setCheckpointTimeout(runEnv.getCheckPoint().getCheckpointTimeout());
        }

        DataStreamSource<MyData> ds = env.fromElements(
                new MyData("log", "1","11", "1", "16", "1", "740","777",1,2,3,2),
                new MyData("log","0", "11", "1", "4", "0", "333","0",1,2,3,1),
                new MyData("log", "2","11", "1", "4", "0", "222","888",1,2,3,3),
                new MyData("text", "4","11", "1", "48", "0", "703","777",1,2,3,4),
                new MyData("log", "1","11", "1", "16", "1", "740","777",1,2,3,2),
                new MyData("log","0", "11", "1", "4", "0", "333","0",1,2,3,1),
                new MyData("log", "2","11", "1", "4", "0", "222","888",1,2,3,3),
                new MyData("text", "4","11", "1", "48", "0", "703","777",1,2,3,4),
                new MyData("log", "1","11", "1", "16", "1", "740","777",1,2,3,2),
                new MyData("log","0", "11", "1", "4", "0", "333","0",1,2,3,1),
                new MyData("log", "2","11", "1", "4", "0", "222","888",1,2,3,3),
                new MyData("text", "4","11", "1", "48", "0", "703","777",1,2,3,4)
        );
/*        DataStreamSource<String> ds = env.socketTextStream("localhost", 7777);
        SingleOutputStreamOperator<MyData> mapDs = ds.map(new MapFunction<String, MyData>() {
            @Override
            public MyData map(String s) throws Exception {
                String[] split = s.split(",");
                return new MyData(split[0], split[1], split[2], split[3], split[4], split[5], split[6], split[7], Integer.parseInt(split[8]), Integer.parseInt(split[9]), Integer.parseInt(split[10]), Integer.parseInt(split[11]));
            }
        });*/
        SingleOutputStreamOperator<MyData> finalDs = ds;
        int i = 0;
        // Joins may be chained: e.g. fetch one field from table a, then join further fields from table b (nesting supported)
        for (FlowConfig.JoinDim joinDim : flowConfig.getJoinDims()) {
            // Flink's async IO drives the dimension join
            finalDs = AsyncDataStream.orderedWait(
                    finalDs,
                    new AsyncJoinDimFunction2<>(joinDim),
                    1000,
                    TimeUnit.MILLISECONDS,
                    100
            );
            finalDs.print(++i + ":");
        }
        finalDs.print("res");
        env.execute();
    }
}
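
One knob worth noting: orderedWait preserves the input order at the cost of head-of-line blocking. When downstream code does not care about order, Flink's unorderedWait (same parameters) usually yields better throughput; a drop-in variant of the call above:

// Drop-in alternative when output order doesn't matter.
finalDs = AsyncDataStream.unorderedWait(
        finalDs,
        new AsyncJoinDimFunction2<>(joinDim),
        1000,               // timeout
        TimeUnit.MILLISECONDS,
        100                 // max in-flight requests
);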