大数据组件- 1. 数据湖仓,流批一体架构

阅读时长:约 1 分钟

docs.starrocks.io/zh/docs/data_source/catalog/iceberg_catalog/ (原链接被截断,此处按 StarRocks 官方文档路径补全,请核对)

-- StarRocks Iceberg external catalog backed by a Gravitino REST catalog.
-- FIX: the catalog name must be an unquoted identifier (or backtick-quoted);
-- a single-quoted string literal ('iceberg') is a syntax error in
-- StarRocks CREATE EXTERNAL CATALOG.
CREATE EXTERNAL CATALOG iceberg
PROPERTIES
(
  "type"="iceberg",
  "iceberg.catalog.type"="rest",
  "iceberg.catalog.uri"="http://iceberg-gravitino.cyan.com/iceberg/",
  -- SECURITY(review): plaintext credentials committed to source; rotate them
  -- and load from a secret store before sharing this file.
  "aws.s3.access_key"="rustfsadmin",
  "aws.s3.secret_key"="rustfsadmin",
  "aws.s3.endpoint"="http://rustfs.cyan.com",
  -- Path-style access is required by S3-compatible stores such as RustFS.
  "aws.s3.enable_path_style_access"="true",
  "client.factory"="com.starrocks.connector.iceberg.IcebergAwsClientFactory"
);


-- JDBC external catalog exposing a remote MySQL instance to StarRocks.
CREATE EXTERNAL CATALOG mysql
PROPERTIES
(
    "type"="jdbc",
    "jdbc_uri"="jdbc:mysql://10.0.0.2:3306",
    -- SECURITY(review): plaintext root credentials are committed here;
    -- rotate this password and source it from a secret manager instead.
    "user"="root",
    "password"="627459859@qq.com",
    -- The driver jar is downloaded from Maven Central on first use.
    "driver_url"="https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar",
    "driver_class"="com.mysql.cj.jdbc.Driver"
);
-- =============================================
-- Step 1: global Flink SQL Client settings
-- (fixes "checkpoint never fires" / "streaming mode not applied")
-- =============================================

-- FIX: the Flink SQL Client expects SET statements in the form
-- SET 'key' = 'value'; — keys containing dots/dashes must be quoted
-- string literals (the unquoted form fails to parse on recent versions).

-- 1. Force streaming execution mode (the SQL Client may default to batch,
--    in which case checkpointing never triggers).
SET 'execution.runtime-mode' = 'streaming';

-- 2. Enable checkpointing (the Iceberg sink only commits data on checkpoints).
SET 'execution.checkpointing.interval' = '30s';                    -- checkpoint every 30s
SET 'execution.checkpointing.tolerable-failed-checkpoints' = '3';  -- tolerate 3 failures



-- =============================================
-- Step 2: Kafka CDC source table (Debezium JSON envelope)
-- =============================================
-- FIX: the previous definition declared a physical column `op STRING`.
-- The debezium-json format deserializes only the row payload (the
-- before/after images) into physical columns and maps the envelope's op
-- code to the changelog row kind internally, so an `op` column would
-- always be NULL. It has been removed (no statement in this file read it).
CREATE TABLE kafka_cdc_goods_category (
    id BIGINT,
    name STRING,
    brand STRING,
    price DECIMAL(10, 2),
    created_at STRING,
    updated_at STRING,
    deleted_at STRING
) WITH (
    'connector' = 'kafka',
    'topic' = 'goods_category_copy_goods_goods_category_copy.goods.goods_category_copy',
    'properties.bootstrap.servers' = '10.0.0.2:9092',
    -- Dedicated consumer group so offsets do not collide with the Java job.
    'properties.group.id' = 'flink-cdc-iceberg-sql-client',
    'scan.startup.mode' = 'earliest-offset',
    'format' = 'debezium-json',
    -- The topic carries the full Debezium envelope including the schema part.
    'debezium-json.schema-include' = 'true',
    -- NOTE(review): this silently drops malformed records; consider disabling
    -- it while debugging so parse failures surface instead of vanishing.
    'debezium-json.ignore-parse-errors' = 'true',
    -- Matches Debezium's ISO-8601 timestamp serialization.
    'debezium-json.timestamp-format.standard' = 'ISO-8601'
);

-- =============================================
-- Step 3: Flink Iceberg REST catalog (same endpoint as the Java job)
-- =============================================
CREATE CATALOG iceberg_dw WITH (
    'type' = 'iceberg',
    'catalog-type' = 'rest',
    'uri' = 'http://iceberg-gravitino.cyan.com/iceberg',
    -- S3 settings for the RustFS (S3-compatible) object store;
    -- path-style access is required because RustFS does not serve
    -- virtual-hosted-style bucket URLs.
    's3.endpoint' = 'http://rustfs.cyan.com',
    's3.region' = 'us-east-1',
    's3.path-style-access' = 'true',
    -- SECURITY(review): plaintext credentials in source; move to a secret store.
    's3.access-key-id' = 'rustfsadmin',
    's3.secret-access-key' = 'rustfsadmin'
);

-- Make sure the target database exists before creating tables in it.
CREATE DATABASE IF NOT EXISTS iceberg_dw.ods;

-- =============================================
-- Step 4: Iceberg target table (format v2, upsert-enabled)
-- =============================================
-- Created through the iceberg_dw catalog, so no 'connector' option is needed:
-- the catalog itself determines that this is an Iceberg table.
CREATE TABLE IF NOT EXISTS iceberg_dw.ods.ods_goods_category_copy (
    id BIGINT,
    name STRING,
    brand STRING,
    price DECIMAL(10, 2),
    created_at STRING,
    updated_at STRING,
    deleted_at STRING,
    -- Primary key drives UPSERT semantics on writes.
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    -- Iceberg format v2 is required for row-level upserts.
    'format-version' = '2',
    'write.format.default' = 'parquet',
    'write.upsert.enabled' = 'true'
);

-- =============================================
-- Step 5: submit the streaming INSERT job
-- (the job keeps running on the cluster after the SQL Client exits)
-- =============================================
INSERT INTO iceberg_dw.ods.ods_goods_category_copy
SELECT
    id,
    name,
    brand,
    price,
    created_at,
    updated_at,
    deleted_at
FROM kafka_cdc_goods_category;