Building a Real-Time Data Warehouse with Flink SQL
A real-time data warehouse primarily addresses the poor data freshness of a traditional warehouse; typical uses are real-time OLAP analytics, real-time dashboards, and real-time monitoring and alerting. Although the architecture and technology choices of a real-time warehouse differ from those of an offline warehouse, the basic methodology of warehouse construction is the same. This article walks through a demo that builds a real-time data warehouse from scratch with Flink SQL, covering the whole flow of data collection, storage, computation, and visualization.
1. Case Overview
This article uses an e-commerce business as the example to demonstrate the data flow of a real-time warehouse. The goal is to illustrate the construction process, so no complex computations are involved; to keep the case reproducible and complete, detailed steps are given for each stage. For ease of demonstration, all operations are performed in the Flink SQL CLI.
1.1 Metrics
- Top-N regions by spending power (a query sketch follows this list)
- Top-N shopping preferences by education level
- Top-N hot-selling products
- Total spending amount
- Top-N "civilized cities"
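Each Top-N metric maps naturally onto Flink SQL's ROW_NUMBER() Top-N pattern. Below is a minimal sketch for the first metric, assuming a hypothetical DWD-layer wide table dwd_order_user (sketched at the end of this section); the table and column names are illustrative, not part of the original setup.
-- Hypothetical Top-N: total spend per province, keeping the top 10
select province, totalSpend
from (
  select province, totalSpend,
         row_number() over (order by totalSpend desc) as rn
  from (
    select province, sum(priceSum) as totalSpend
    from dwd_order_user
    group by province
  )
)
where rn <= 10;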
2. Architecture Design
2.1 Architecture Overview
The detailed architecture is shown in the figure: MySQL binlogs are first parsed via CDC and the change data is written into Kafka as the ODS layer. Flink SQL then cleans and joins the raw data and writes the resulting detail-level wide tables back to Kafka. Dimension-table data is synced through Flink into a local MySQL instance. Finally, Flink SQL joins the wide tables with the dimension tables, writes the aggregated results into Elasticsearch, and the results are visualized in Kibana.
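As a concrete illustration of the last step, here is a minimal sketch of an Elasticsearch sink using Flink's elasticsearch-7 connector; the aggregate table ads_region_top, the index name, and the host are hypothetical placeholders.
-- Hypothetical sink table backed by Elasticsearch 7
create table es_region_top(
  province string,
  totalSpend decimal(10,2),
  primary key (province) not enforced -- upsert by province
)
with(
  'connector' = 'elasticsearch-7',
  'hosts' = 'http://<masked>:9200',
  'index' = 'region_top'
);

insert into es_region_top
select province, totalSpend from ads_region_top;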
2.2 Layered Architecture
3. Business Data
3.1 Table Relationships
3.2 Table Definitions
-- ----------------------------
-- Table structure for action_type
-- ----------------------------
DROP TABLE IF EXISTS `action_type`;
CREATE TABLE `action_type` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `type` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for categorys
-- ----------------------------
DROP TABLE IF EXISTS `categorys`;
CREATE TABLE `categorys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `category` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for commoditys
-- ----------------------------
DROP TABLE IF EXISTS `commoditys`;
CREATE TABLE `commoditys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `category` int(10) DEFAULT NULL,
  `commodity` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for evaluate
-- ----------------------------
DROP TABLE IF EXISTS `evaluate`;
CREATE TABLE `evaluate` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `userId` varchar(50) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `described` int(10) DEFAULT NULL,
  `comment` varchar(255) DEFAULT NULL,
  `logisticsAttitude` int(10) DEFAULT NULL,
  `serviceAttitude` int(10) DEFAULT NULL,
  `merchantId` varchar(50) DEFAULT NULL,
  `orderInformationId` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for operation_log
-- ----------------------------
DROP TABLE IF EXISTS `operation_log`;
CREATE TABLE `operation_log` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `actionTime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `userId` varchar(50) DEFAULT NULL,
  `sessionId` varchar(50) DEFAULT NULL,
  `pageId` int(10) DEFAULT NULL,
  `actionType` int(10) DEFAULT NULL,
  `keyWord` varchar(255) DEFAULT NULL,
  `categoryId` int(10) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `advertising` int(10) DEFAULT NULL,
  `advertisingId` int(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for order
-- ----------------------------
DROP TABLE IF EXISTS `order`;
CREATE TABLE `order` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `timestamps` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `orderInformationId` varchar(50) DEFAULT NULL,
  `userId` varchar(50) DEFAULT NULL,
  `categoryId` int(10) DEFAULT NULL,
  `productId` int(10) DEFAULT NULL,
  `price` decimal(10,2) DEFAULT NULL,
  `productCount` int(10) DEFAULT NULL,
  `priceSum` decimal(10,2) DEFAULT NULL,
  `shipAddress` varchar(50) DEFAULT NULL,
  `receiverAddress` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for user_profile
-- ----------------------------
DROP TABLE IF EXISTS `user_profile`;
CREATE TABLE `user_profile` (
  `userId` varchar(50) NOT NULL,
  `province` varchar(50) DEFAULT NULL,
  `city` varchar(50) DEFAULT NULL,
  `age` int(10) DEFAULT NULL,
  `education` varchar(50) DEFAULT NULL,
  `jobType` varchar(50) DEFAULT NULL,
  `marriage` varchar(50) DEFAULT NULL,
  `sex` varchar(50) DEFAULT NULL,
  `interest` varchar(50) DEFAULT NULL,
  PRIMARY KEY (`userId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
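To smoke-test the pipeline end to end, a few sample rows can be inserted; the values below are made up purely for illustration.
-- Hypothetical sample data (all values are invented)
INSERT INTO `user_profile` VALUES ('u001', 'Guangdong', 'Shenzhen', 28, 'Bachelor', 'Engineer', 'Single', 'Male', 'Digital');
INSERT INTO `categorys` (`category`) VALUES ('Electronics');
INSERT INTO `commoditys` (`category`, `commodity`) VALUES (1, 'Phone');
INSERT INTO `order` (`timestamps`, `orderInformationId`, `userId`, `categoryId`, `productId`,
                     `price`, `productCount`, `priceSum`, `shipAddress`, `receiverAddress`)
VALUES (NOW(), 'oi001', 'u001', 1, 1, 1999.00, 2, 3998.00, 'Hangzhou', 'Shenzhen');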
4. Data Processing Flow
4.1 ODS-Layer Data Ingestion
- Kafka Connect task configuration:
{
  "name": "userAnalysis",
  "config": {
    "connector.class": "io.debezium.connector.mysql.MySqlConnector",
    "database.history.connector.id": "userAnalysis",
    "database.history.connector.class": "org.apache.kafka.connect.source.SourceConnector",
    "database.hostname": "<masked>",
    "database.port": 3306,
    "database.user": "<masked>",
    "database.password": "<masked>",
    "database.server.name": "ods",
    "database.jdbc.driver": "com.mysql.jdbc.Driver",
    "database.history.kafka.bootstrap.servers": "<masked>:9092",
    "database.history.kafka.topic": "userAnalysis",
    "database.whitelist": "userAnalysis",
    "tasks.max": "1",
    "key.converter": "org.apache.kafka.connect.storage.StringConverter",
    "value.converter": "io.confluent.connect.avro.AvroConverter",
    "value.converter.schema.registry.url": "http://localhost:8081"
  }
}
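This JSON is submitted through the Kafka Connect REST interface (typically a POST to http://<connect-host>:8083/connectors). Because database.server.name is "ods" and the userAnalysis database is whitelisted, Debezium publishes each table's changelog to a topic named ods.userAnalysis.<table>, which is exactly what the Flink source tables below consume.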
- The database tables captured by CDC after applying the above configuration:
- Loading the ODS-layer data with Flink SQL:
-- Users table
create table ods_users(
  userId string primary key not enforced comment 'user id', -- NOT ENFORCED is required: Flink cannot enforce primary keys
  province string comment 'province',
  city string comment 'city',
  age int comment 'age',
  education string comment 'education',
  jobType string comment 'job type',
  marriage string comment 'marital status',
  sex string comment 'gender',
  interest string comment 'interests'
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.user_profile',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset'
);
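With the table registered, a quick sanity check in the SQL CLI confirms that the changelog is being consumed:
-- should start printing user_profile rows as change events arrive
select * from ods_users;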
-- Orders table
create table if not exists ods_order(
  id int primary key not enforced comment 'order id',
  timestamps bigint comment 'order creation time (epoch milliseconds)',
  orderInformationId string comment 'order information id',
  userId string comment 'user id',
  categoryId int comment 'category id',
  productId int comment 'product id',
  price decimal(10,2) comment 'unit price',
  productCount int comment 'purchase quantity',
  priceSum decimal(10,2) comment 'order total',
  shipAddress string comment 'merchant address',
  receiverAddress string comment 'delivery address'
  -- To use event time, add a comma above and uncomment:
  --ts AS TO_TIMESTAMP(FROM_UNIXTIME(timestamps/1000)),
  --WATERMARK FOR ts AS ts - INTERVAL '3' SECOND
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.order',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset' -- or 'latest-offset'
);
-- Action types
create table if not exists ods_action_type(
  id int primary key not enforced comment 'action type id',
  type string comment 'action type'
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.action_type',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset'
);
-- Product categories
create table if not exists ods_category(
  id int primary key not enforced comment 'category id',
  category string comment 'category name'
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.categorys',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset'
);
-- Commodities table
create table if not exists ods_commodity(
  id int primary key not enforced comment 'commodity id',
  category int comment 'category id',
  commodity string comment 'commodity name'
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.commoditys',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset'
);
-- Evaluations (reviews) table
create table if not exists ods_evaluate(
  id int primary key not enforced comment 'id',
  userId string comment 'user id',
  productId int comment 'product id',
  described int comment 'description rating',
  `comment` string comment 'review text',
  logisticsAttitude int comment 'logistics rating',
  serviceAttitude int comment 'service rating',
  merchantId string comment 'merchant id',
  orderInformationId string comment 'order id'
  -- ts AS PROCTIME()
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.evaluate',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset' -- or 'latest-offset'
);
-- Operation log
create table if not exists ods_operation_log(
  id int primary key not enforced comment 'id',
  actionTime bigint comment 'action time (epoch milliseconds)',
  userId string comment 'user id',
  sessionId string comment 'session id',
  pageId int comment 'page id',
  actionType int comment 'action type id',
  keyWord string comment 'search keyword',
  categoryId int comment 'category id',
  productId int comment 'product id',
  advertising int comment 'ad impressions',
  advertisingId int comment 'ad id'
)
with(
  'connector' = 'kafka',
  'format' = 'debezium-avro-confluent', -- or 'avro-confluent'
  'debezium-avro-confluent.schema-registry.url' = 'http://<masked>:8081', -- 'avro-confluent.schema-registry.url' with avro-confluent
  'topic' = 'ods.userAnalysis.operation_log',
  'properties.bootstrap.servers' = '<masked>:9092',
  'properties.group.id' = 'flink-analysis',
  'scan.startup.mode' = 'earliest-offset'
);
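With the ODS tables in place, the next step in the architecture above is to build the DWD-layer wide table. The following is a hypothetical sketch (the table name, topic, and column selection are illustrative, not the original setup): it joins the order changelog with the user dimension and writes the result back to Kafka through the upsert-kafka connector, which requires a primary key and explicit key/value formats.
-- Hypothetical DWD wide table: orders enriched with user attributes
create table dwd_order_user(
  id int,
  userId string,
  province string,
  city string,
  education string,
  productId int,
  priceSum decimal(10,2),
  primary key (id) not enforced -- upsert key for the changelog
)
with(
  'connector' = 'upsert-kafka',
  'topic' = 'dwd.userAnalysis.order_user',
  'properties.bootstrap.servers' = '<masked>:9092',
  'key.format' = 'json',
  'value.format' = 'json'
);

insert into dwd_order_user
select o.id, o.userId, u.province, u.city, u.education, o.productId, o.priceSum
from ods_order o
join ods_users u on o.userId = u.userId;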