Efficient Data Management: Inserts and Updates in the Data Lake with PySpark and Apache Hudi
Efficient data management has become essential for modern data-intensive workloads. This article walks through an approach that combines PySpark with Apache Hudi, using insert and update operations to keep the data in a data lake consistent and accurate.
Core Overview
The code in this article shows how a PySpark application integrates Apache Hudi for data inserts and updates. By defining a DwdProdInst class, it can:
- Use SQL to extract and filter new and existing data from the source.
- Split the data into records that need to be inserted and records that need to be updated.
- Rely on Apache Hudi's features to guarantee incremental updates and ACID transactional consistency.
Overall Workflow
- Data extraction and filtering: select the qualifying data from the source database and, depending on whether each record already exists in the target table, build the corresponding insert and update datasets.
- Merge: use Spark SQL to combine the insert and update results into a single temporary view.
- Write: use Hudi's upsert operation to write the merged result to the specified Hudi table path, which efficiently manages large datasets whose goal is incremental updates (a minimal sketch of this write step follows below).
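As a preview of the write step described above, here is a minimal PySpark sketch of a Hudi upsert. The table name, record key, and precombine field mirror the full example later in this article; the sample DataFrame and the /tmp/... path are illustrative assumptions only.

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("hudi_upsert_sketch") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

# Illustrative input; in the real job this DataFrame comes from the merged Spark SQL view.
df = spark.createDataFrame(
    [("1001", "13800000000", "2024-03-07 10:00:00")],
    ["prod_inst_id", "acc_num", "lst_upd_tmp"])

hudi_options = {
    'hoodie.table.name': 'dwd_prod_inst_01',
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.recordkey.field': 'prod_inst_id',   # primary key
    'hoodie.datasource.write.precombine.field': 'lst_upd_tmp',   # newest record wins on key collisions
    'hoodie.datasource.write.operation': 'upsert',
}

# mode("append") plus operation=upsert: existing keys are updated, new keys are inserted.
df.write.format("hudi").options(**hudi_options).mode("append").save("/tmp/hudi/dwd_prod_inst_01")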
Key Advantages
- Efficient updates: Hudi's Copy-On-Write mode performs logical updates and deduplication of the data.
- Strong extensibility: the approach adapts easily to different source tables and target paths, with the operation logic adjusted through custom SQL statements.
- Performance optimization: Spark's cache() method and sensible partition settings improve overall compute efficiency.
This solution offers an effective way to process data and suits increasingly complex data-management needs: with flexible SQL configuration and a lean processing flow, it delivers efficient inserts and updates in the data lake.
GBASE Stored Procedure
Converting the GBASE stored procedure to PySpark
Stored procedure sp_dwd_prod_inst_inc
- Overall goal: synchronize data from the CUST.PROD_INST table into GBASE in three steps (a PySpark equivalent of the deduplication step is sketched after this overview):
  - Deduplicate: deduplicate dws_cust.dws_prod_inst by the primary key prod_inst_id, keeping the row with the latest update_date.
  - Update: update the rows that already exist in the target table DWD_DB.DWD_PROD_INST with the data from TMPDB.TMP_DWS_PROD_INST_01.
  - Insert: insert the rows from TMPDB.TMP_DWS_PROD_INST_01 that do not yet exist in DWD_DB.DWD_PROD_INST.
- Error handling: an EXIT handler catches exceptions and records the error details in the etl.log_sp_exec table.
- Logging: the start time of each step, the number of rows processed, any errors, and the end of the whole run are recorded.
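The Spark SQL job later in this article does not repeat this deduplication step. If it were needed on the Spark side, a minimal PySpark sketch could look like the following; the source view name dws_prod_inst_src is a placeholder, not a table from the original code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dedup_sketch").enableHiveSupport().getOrCreate()

# Keep only the newest row per prod_inst_id, mirroring STEP1 of the stored procedure.
deduped = spark.sql("""
    SELECT t.*
    FROM (
        SELECT a.*,
               ROW_NUMBER() OVER (PARTITION BY a.prod_inst_id
                                  ORDER BY a.update_date DESC) AS rn
        FROM dws_prod_inst_src a   -- placeholder source table
    ) t
    WHERE t.rn = 1
""").drop("rn")

deduped.createOrReplaceTempView("tmp_dws_prod_inst_01")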
Spark SQL with Hudi
- Overall goal:
  - Use Hudi to process data from the db_dws_crm.dws_cus_prod_inst_flinksql_rt table and update or insert it into the db_dwd.dwd_prod_inst_02 table, filtering records to the selected time window (2024-03-06 to 2024-04-06).
- Steps:
  - Insert: create the TMP_INSERT_RECORD_sparksql_j2 table to hold new records that do not yet exist in the target table db_dwd.dwd_prod_inst_02.
  - Update: use the TMP_UPDATE_RECORD_sparksql_j2 table to hold records that already exist in the target table.
  - Union: combine the insert and update sets into the TMP_UNION_RECORD_sparksql_j2 table.
  - Insert the combined data: write the combined result into the target table db_dwd.dwd_prod_inst_02.
- Hudi usage:
  - The cow (Copy-on-Write) table type is used; it suits batch processing and provides ACID transactions and historical version management.
Key Differences
- Database:
  - The stored procedure uses MySQL-style procedure syntax against GBASE tables, which suits traditional relational database processing.
  - The Spark SQL code targets big data technology, in particular Hudi's incremental updates and large-scale processing on Apache Spark.
- Logging and error handling:
  - The stored procedure logs detailed progress and errors, while the Spark SQL version shows no such handling (a sketch of adding comparable logging on the Spark side follows this section).
In their respective database environments, these two pieces of code can be used for batch processing, data synchronization, and cleansing, each optimized around the strengths of GBASE and Hudi.
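The stored procedure's EXIT handler and step-by-step logging have no counterpart in the Spark code in this article. A minimal sketch of how comparable run logging could be wrapped around the PySpark job is shown below; the table etl_log.spark_job_log and its columns are hypothetical, introduced only for illustration.

import traceback
from datetime import datetime
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("job_with_logging").enableHiveSupport().getOrCreate()

def log_step(step, status, message=""):
    # etl_log.spark_job_log is a hypothetical Hive table used only in this sketch.
    row = Row(job="dwd_prod_inst_inc", step=step, status=status,
              message=message[:500], log_time=datetime.now().isoformat())
    spark.createDataFrame([row]).write.mode("append").saveAsTable("etl_log.spark_job_log")

try:
    log_step("start", "ok")
    # generate_insert / generate_update / union / upsert would run here
    log_step("end", "ok")
except Exception:
    # Rough analogue of the stored procedure's EXIT handler writing to etl.log_sp_exec.
    log_step("error", "failed", traceback.format_exc())
    raise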
Full Stored Procedure
CREATE DEFINER="e_exec_sp"@"%" PROCEDURE "sp_dwd_prod_inst_inc"(INOUT V_FLAG INT)
BEGIN
/*过程名称:etl.sp_dwd_prod_inst_inc
功能:从CRM复制库库同步prod_inst表数据到GBASE
目标表:DWD_DB.DWD_PROD_INST
源表:CUST.PROD_INST
过程表:DWS_CUST.DWS_PROD_INST
返回:1 成功
0 失败
日志信息:select * from etl.log_sp_exec
where sp_name='etl.sp_dwd_prod_inst_inc'
order by start_date desc;
创建人员:tydic
创建时间:2018-12-28
关键指标:无
修改人员:tydic
修改时间:2019-4-22
关键指标:新增STD_ADDR_ID字段
修改人员:tydic
修改时间:2019-5-15
修改内容:新增ICCID字段
修改人员:tydic
修改时间:2020-2-24
修改内容:新增asset_source_cd,version字段
修改人员:tydic
修改时间:2021-12-03
修改内容:新增BRANCH_REGION_ID 字段
程序目录:
STEP1:按接口层表主键去重
STEP2:更新目标表中已存在的数据
STEP3:插入目标表中不存在的数据
*/
/*日志变量申明*/
/*错误日志代码*/
DECLARE SQLCODE INT;
/*错误日志信息*/
DECLARE SQLERRM TEXT;
/*处理步骤名称*/
DECLARE V_EXEC_POINT VARCHAR(800);
/*存储过程名称*/
DECLARE V_SP_NAME VARCHAR(400);
/*数据日期*/
DECLARE V_PS_DATE DATETIME;
/*步骤开始时间*/
DECLARE V_SPSTEP_DT DATETIME;
/*记录数统计*/
DECLARE V_LOAD_ROWS DECIMAL(38);
/*错误日志获取并记录*/
DECLARE EXIT handler FOR sqlexception
BEGIN
GET diagnostics CONDITION 1 SQLCODE = gbase_errno,SQLERRM=message_text;
INSERT INTO etl.log_sp_exec (sp_name,start_date,end_date,ps_date,exec_point,mysql_error,err_code)
VALUES
(V_SP_NAME,V_SPSTEP_DT,NOW(),V_PS_DATE,V_EXEC_POINT,SQLERRM,SQLCODE);
COMMIT;
END;
/*变量赋值*/
/*程序运行是否成功: 成功:1、失败:0*/
SET V_FLAG = 0;
SET V_PS_DATE = curdate();
SET gbase_fast_update=1;
SET V_SP_NAME = 'etl.sp_dwd_prod_inst_inc';
SET V_SPSTEP_DT = NOW();
SET V_EXEC_POINT = '程序开始';
INSERT INTO etl.log_sp_exec (sp_name,start_date,end_date,ps_date,exec_point,upd_rows)
VALUES (V_SP_NAME,V_SPSTEP_DT,NOW(),V_PS_DATE,V_EXEC_POINT,V_LOAD_ROWS);
COMMIT;
SET V_SPSTEP_DT = NOW();
set V_EXEC_POINT = 'STEP1:按接口层表主键去重';
drop table if exists tmpdb.tmp_dws_prod_inst_01;
create table tmpdb.tmp_dws_prod_inst_01 distributed by('prod_inst_id') nolock as
select t.*
from (select a.*,row_number() over(partition by a.prod_inst_id order by a.update_date desc) rn
from dws_cust.dws_prod_inst a) t
where t.rn='1';
SELECT COUNT(*) INTO v_load_rows FROM tmpdb.tmp_dws_prod_inst_01;
INSERT INTO etl.log_sp_exec (sp_name,start_date,end_date,ps_date,ins_rows,exec_point)
VALUES (V_SP_NAME,V_SPSTEP_DT,NOW(),V_PS_DATE,V_LOAD_ROWS,V_EXEC_POINT);
COMMIT;
SET V_SPSTEP_DT = NOW();
SET V_EXEC_POINT = 'STEP2:更新目标表中已存在的数据';
UPDATE DWD_DB.DWD_PROD_INST A,TMPDB.TMP_DWS_PROD_INST_01 B
SET
A.ACC_NUM=B.ACC_NUM,
A.ACC_PROD_INST_ID=B.ACC_PROD_INST_ID,
A.ACCOUNT=B.ACCOUNT,
A.ACT_DATE=B.ACT_DATE,
A.ADDRESS_DESC=B.ADDRESS_DESC,
A.ADDRESS_ID=B.ADDRESS_ID,
A.BEGIN_RENT_DATE=B.BEGIN_RENT_DATE,
A.BUSI_MOD_DATE=B.BUSI_MOD_DATE,
A.CREATE_DATE=B.CREATE_DATE,
A.CREATE_ORG_ID=B.CREATE_ORG_ID,
A.CREATE_STAFF=B.CREATE_STAFF,
A.EXCH_ID=B.EXCH_ID,
A.EXT_PROD_INST_ID=B.EXT_PROD_INST_ID,
A.FIRST_FINISH_DATE=B.FIRST_FINISH_DATE,
A.LAN_ID=B.LAN_ID,
A.LAST_ORDER_ITEM_ID=B.LAST_ORDER_ITEM_ID,
A.OWNER_CUST_ID=B.OWNER_CUST_ID,
A.PAYMENT_MODE_CD=B.PAYMENT_MODE_CD,
A.POINT_OWNER_ID=B.POINT_OWNER_ID,
A.PROD_ID=B.PROD_ID,
A.PROD_INST_PWD=B.PROD_INST_PWD,
A.PROD_USE_TYPE=B.PROD_USE_TYPE,
A.REGION_ID=B.REGION_ID,
A.REMARK=B.REMARK,
A.STATUS_CD=B.STATUS_CD,
A.STATUS_DATE=B.STATUS_DATE,
A.STOP_RENT_DATE=B.STOP_RENT_DATE,
A.UPDATE_DATE=B.UPDATE_DATE,
A.UPDATE_STAFF=B.UPDATE_STAFF,
A.USE_CUST_ID=B.USE_CUST_ID,
A.SIEBEL_ROW_ID=B.SIEBEL_ROW_ID,
A.INTEGRATION_ID=B.INTEGRATION_ID,
A.X_TRACK_NUM=B.X_TRACK_NUM,
A.ACCT_ID=B.ACCT_ID,
A.ACCT_CD=B.ACCT_CD,
A.PAR_REGION_ID=B.PAR_REGION_ID,
A.GRP_PROD_NBR=B.GRP_PROD_NBR,
A.ACCT_BILLING_TYPE=B.ACCT_BILLING_TYPE,
A.NEWCRM_FLAG=B.NEWCRM_FLAG,
A.STD_ADDR_ID=B.STD_ADDR_ID,
A.ICCID=B.ICCID,
A.ASSET_SOURCE_CD=B.ASSET_SOURCE_CD,
A.VERSION=B.VERSION,
A.BRANCH_REGION_ID=B.BRANCH_REGION_ID,
A.LST_UPD_TMP=NOW(),
A.LST_UPD_BY='ETL_USER',
A.BUSI_KEY=B.PROD_INST_ID,
A.SRC_SYS='CRM'
WHERE A.PROD_INST_ID=B.PROD_INST_ID;
SET V_LOAD_ROWS = row_count();
INSERT INTO etl.log_sp_exec (sp_name,start_date,end_date,ps_date,upd_rows,exec_point)
VALUES (V_SP_NAME,V_SPSTEP_DT,NOW(),V_PS_DATE,V_LOAD_ROWS,V_EXEC_POINT);
COMMIT;
SET V_SPSTEP_DT = NOW();
SET V_EXEC_POINT = 'STEP3:插入目标表中不存在的数据';
INSERT INTO DWD_DB.DWD_PROD_INST
(ACC_NUM,ACC_PROD_INST_ID,ACCOUNT,ACT_DATE,ADDRESS_DESC,ADDRESS_ID,BEGIN_RENT_DATE,BUSI_MOD_DATE,CREATE_DATE,CREATE_ORG_ID,CREATE_STAFF,EXCH_ID,EXT_PROD_INST_ID,FIRST_FINISH_DATE,LAN_ID,LAST_ORDER_ITEM_ID,OWNER_CUST_ID,PAYMENT_MODE_CD,POINT_OWNER_ID,PROD_ID,PROD_INST_ID,PROD_INST_PWD,PROD_USE_TYPE,REGION_ID,REMARK,STATUS_CD,STATUS_DATE,STOP_RENT_DATE,UPDATE_DATE,UPDATE_STAFF,USE_CUST_ID,SIEBEL_ROW_ID,INTEGRATION_ID,X_TRACK_NUM,ACCT_ID,ACCT_CD,PAR_REGION_ID,GRP_PROD_NBR,ACCT_BILLING_TYPE,NEWCRM_FLAG,STD_ADDR_ID,ICCID,ASSET_SOURCE_CD,VERSION,BRANCH_REGION_ID,CREATE_TMP,CREATE_BY,LST_UPD_TMP,LST_UPD_BY,BUSI_KEY,SRC_SYS)
SELECT ACC_NUM,ACC_PROD_INST_ID,ACCOUNT,ACT_DATE,ADDRESS_DESC,ADDRESS_ID,BEGIN_RENT_DATE,BUSI_MOD_DATE,CREATE_DATE,CREATE_ORG_ID,CREATE_STAFF,EXCH_ID,EXT_PROD_INST_ID,FIRST_FINISH_DATE,LAN_ID,LAST_ORDER_ITEM_ID,OWNER_CUST_ID,PAYMENT_MODE_CD,POINT_OWNER_ID,PROD_ID,PROD_INST_ID,PROD_INST_PWD,PROD_USE_TYPE,REGION_ID,REMARK,STATUS_CD,STATUS_DATE,STOP_RENT_DATE,UPDATE_DATE,UPDATE_STAFF,USE_CUST_ID,SIEBEL_ROW_ID,INTEGRATION_ID,X_TRACK_NUM,ACCT_ID,ACCT_CD,PAR_REGION_ID,GRP_PROD_NBR,ACCT_BILLING_TYPE,NEWCRM_FLAG,STD_ADDR_ID,ICCID,ASSET_SOURCE_CD,VERSION,BRANCH_REGION_ID,NOW() CREATE_TMP,'ETL_USER' CREATE_BY,NOW() LST_UPD_TMP,'ETL_USER' LST_UPD_BY,PROD_INST_ID BUSI_KEY,'CRM' SRC_SYS
FROM TMPDB.TMP_DWS_PROD_INST_01 A
WHERE NOT EXISTS (SELECT 8 FROM DWD_DB.DWD_PROD_INST B WHERE A.PROD_INST_ID=B.PROD_INST_ID);
SET V_LOAD_ROWS = row_count();
INSERT INTO etl.log_sp_exec (sp_name,start_date,end_date,ps_date,ins_rows,exec_point)
VALUES (V_SP_NAME,V_SPSTEP_DT,NOW(),V_PS_DATE,V_LOAD_ROWS,V_EXEC_POINT);
COMMIT;
SET v_spstep_dt=now();
SET v_exec_point='程序结束';
insert into etl.log_sp_exec (sp_name,start_date,end_date,ps_date,exec_point)
values (v_sp_name,v_spstep_dt,now(),V_PS_DATE,v_exec_point);
commit;
SET v_flag=1;
end
Spark SQL
Working with Hudi in Spark SQL
Insert Handling
- Creating and inserting new records:
  - The table db_tmp.TMP_INSERT_RECORD_sparksql_j2 is created with Hudi and stores the new records, extracted from another table, that are to be inserted. A LEFT JOIN identifies the prod_inst_id values that do not exist in db_dwd.dwd_prod_inst_02, so the new data does not duplicate existing data.
- Data source:
  - Data is read from db_dws_crm.dws_cus_prod_inst_flinksql_rt t1.
  - Filter conditions: t1.etl_time lies between '2024-03-06 00:00:00' and '2024-04-06 00:00:00', and t2.prod_inst_id is null, i.e. the record is not yet in the target table.
Update Handling
- Creating the set of existing records to update:
  - The table db_tmp.TMP_UPDATE_RECORD_sparksql_j2 is created with Hudi and stores the records, extracted from another table, that are to be updated. An INNER JOIN ensures only prod_inst_id values that already exist in db_dwd.dwd_prod_inst_02 are selected for update.
- Data source:
  - Data is read from db_dws_crm.dws_cus_prod_inst_flinksql_rt t1.
  - The same filter applies: t1.etl_time lies between '2024-03-06 00:00:00' and '2024-04-06 00:00:00'.
Union Handling
- Combining the insert and update tables:
  - The table db_tmp.TMP_UNION_RECORD_sparksql_j2 holds the merged set of newly inserted and to-be-updated records.
  - UNION ALL combines the data of the TMP_INSERT_RECORD_sparksql_j2 and TMP_UPDATE_RECORD_sparksql_j2 tables.
Final Insert
- Inserting the processed data into the target table:
  - The combined data is inserted into the target table db_dwd.dwd_prod_inst_02.
  - A CAST forces every column to string, because the target table's columns are expected to be of string type.
Benefits of Using Hudi
- Hudi features: ACID transactions and incremental processing make routine data updates efficient to manage.
- Copy-on-Write (COW): suits batch jobs, reduces write conflicts, and keeps the data consistent.
This code uses Hudi to manage the update and insert operations in the data lake; it handles changes across large datasets well and makes batch and streaming workloads easier to run.
drop table
IF EXISTS db_tmp.TMP_INSERT_RECORD_sparksql_j2;
create table
IF NOT EXISTS db_tmp.TMP_INSERT_RECORD_sparksql_j2 using hudi TBLPROPERTIES (type = 'cow', primaryKey = 'prod_inst_id') as
SELECT
cast(t1.prod_inst_id as string),
t1.acc_num as acc_num,
cast(t1.acc_prod_inst_id as string),
t1.account,
t1.act_date,
t1.address_desc,
t1.address_id,
t1.begin_rent_date,
t1.busi_mod_date,
t1.create_date,
cast(t1.create_org_id as string),
cast(t1.create_staff as string),
cast(t1.exch_id as string),
t1.ext_prod_inst_id,
t1.first_finish_date,
cast(t1.lan_id as string),
cast(t1.last_order_item_id as string),
cast(t1.owner_cust_id as string),
t1.payment_mode_cd,
cast(t1.point_owner_id as string),
cast(t1.prod_id as string),
t1.prod_inst_pwd,
t1.prod_use_type,
cast(t1.region_id as string),
t1.remark,
t1.status_cd,
t1.status_date,
t1.stop_rent_date,
t1.update_date,
cast(t1.update_staff as string),
cast(t1.use_cust_id as string),
t1.siebel_row_id,
t1.integration_id,
cast(t1.acct_id as string),
t1.acct_cd,
t1.x_track_num,
t1.grp_prod_nbr,
cast(t1.par_region_id as string),
t1.iccid,
t1.acct_billing_type,
t1.asset_source_cd,
t1.newcrm_flag,
cast(t1.version as string),
cast(t1.std_addr_id as string),
cast(t1.branch_region_id as string),
DATE_FORMAT (now (), 'yyyy-MM-dd HH:mm:ss') as create_tmp,
DATE_FORMAT (now (), 'yyyy-MM-dd HH:mm:ss') as lst_upd_tmp,
'CRM-CUS' as src_sys,
'ETL' as create_by,
'ETL' as lst_upd_by,
t1.prod_inst_id as busi_key
FROM
db_dws_crm.dws_cus_prod_inst_flinksql_rt t1
LEFT JOIN db_dwd.dwd_prod_inst_02 t2 ON t1.prod_inst_id = t2.prod_inst_id
WHERE
t1.etl_time BETWEEN '2024-03-06 00:00:00' AND '2024-04-06 00:00:00'
AND t2.prod_inst_id is null;
drop table
IF EXISTS db_tmp.TMP_UPDATE_RECORD_sparksql_j2;
create table
IF NOT EXISTS db_tmp.TMP_UPDATE_RECORD_sparksql_j2 using hudi TBLPROPERTIES (type = 'cow', primaryKey = 'prod_inst_id') as
SELECT
cast(t1.prod_inst_id as string),
t1.acc_num,
cast(t1.acc_prod_inst_id as string),
t1.account,
t1.act_date,
t1.address_desc,
t1.address_id,
t1.begin_rent_date,
t1.busi_mod_date,
t1.create_date,
cast(t1.create_org_id as string),
cast(t1.create_staff as string),
cast(t1.exch_id as string),
t1.ext_prod_inst_id,
t1.first_finish_date,
cast(t1.lan_id as string),
cast(t1.last_order_item_id as string),
cast(t1.owner_cust_id as string),
t1.payment_mode_cd,
cast(t1.point_owner_id as string),
cast(t1.prod_id as string),
t1.prod_inst_pwd,
t1.prod_use_type,
cast(t1.region_id as string),
t1.remark,
t1.status_cd,
t1.status_date,
t1.stop_rent_date,
t1.update_date,
cast(t1.update_staff as string),
cast(t1.use_cust_id as string),
t1.siebel_row_id,
t1.integration_id,
cast(t1.acct_id as string),
t1.acct_cd,
t1.x_track_num,
t1.grp_prod_nbr,
cast(t1.par_region_id as string),
t1.iccid,
t1.acct_billing_type,
t1.asset_source_cd,
t1.newcrm_flag,
cast(t1.version as string),
cast(t1.std_addr_id as string),
cast(t1.branch_region_id as string),
t2.create_tmp as create_tmp,
DATE_FORMAT (now (), 'yyyy-MM-dd HH:mm:ss') as lst_upd_tmp,
'CRM-CUS' as src_sys,
'ETL' as create_by,
'ETL' as lst_upd_by,
t1.prod_inst_id as busi_key
FROM
db_dws_crm.dws_cus_prod_inst_flinksql_rt t1
INNER JOIN db_dwd.dwd_prod_inst_02 t2 on t1.prod_inst_id = t2.prod_inst_id
WHERE
t1.etl_time BETWEEN '2024-03-06 00:00:00' AND '2024-04-06 00:00:00';
drop table
IF EXISTS db_tmp.TMP_UNION_RECORD_sparksql_j2;
create table
IF NOT EXISTS db_tmp.TMP_UNION_RECORD_sparksql_j2 using hudi TBLPROPERTIES (type = 'cow', primaryKey = 'prod_inst_id') as (
SELECT
*
from
db_tmp.TMP_INSERT_RECORD_sparksql_j2
union all
SELECT
*
from
db_tmp.TMP_UPDATE_RECORD_sparksql_j2
);
insert into
db_dwd.dwd_prod_inst_02
select
cast(acc_num as string),
cast(acc_prod_inst_id as string),
cast(account as string),
cast(act_date as string),
cast(address_desc as string),
cast(address_id as string),
cast(begin_rent_date as string),
cast(busi_mod_date as string),
cast(create_date as string),
cast(create_org_id as string),
cast(create_staff as string),
cast(exch_id as string),
cast(ext_prod_inst_id as string),
cast(first_finish_date as string),
cast(lan_id as string),
cast(last_order_item_id as string),
cast(owner_cust_id as string),
cast(payment_mode_cd as string),
cast(point_owner_id as string),
cast(prod_id as string),
cast(prod_inst_id as string),
cast(prod_inst_pwd as string),
cast(prod_use_type as string),
cast(region_id as string),
cast(remark as string),
cast(status_cd as string),
cast(status_date as string),
cast(stop_rent_date as string),
cast(update_date as string),
cast(update_staff as string),
cast(use_cust_id as string),
cast(siebel_row_id as string),
cast(integration_id as string),
cast(x_track_num as string),
cast(acct_id as string),
cast(acct_cd as string),
cast(par_region_id as string),
cast(grp_prod_nbr as string),
cast(acct_billing_type as string),
cast(newcrm_flag as string),
cast(std_addr_id as string),
cast(iccid as string),
cast(create_tmp as string),
cast(create_by as string),
cast(lst_upd_tmp as string),
cast(lst_upd_by as string),
cast(busi_key as string),
cast(src_sys as string),
cast(asset_source_cd as string),
cast(version as string),
cast(branch_region_id as string)
from
db_tmp.TMP_UNION_RECORD_sparksql_j2;
PySpark
Working with Hudi in PySpark
The DwdProdInst Class
- Constructor (__init__):
  - Initializes the Spark session, table name, and base path.
  - Configures the Hudi options, including the table name, table type, the record key field prod_inst_id, and the precombine (ordering) field lst_upd_tmp.
- generate_insert method:
  - Creates a temporary view TMP_INSERT_RECORD.
  - Selects the records in db_dws_crm.dws_cus_prod_inst_flinksql_rt that do not exist in db_dwd.dwd_prod_inst_01.
  - The time-filter placeholders are replaced with the dataStartTime and dataEndTime values set at initialization.
- generate_update method:
  - Creates a temporary view TMP_UPDATE_RECORD.
  - Selects the records in db_dws_crm.dws_cus_prod_inst_flinksql_rt that already exist in db_dwd.dwd_prod_inst_01.
  - Caches the resulting dataset to make subsequent steps more efficient.
- union method:
  - Merges the insert and update temporary views into a new temporary view TMP_UNION_RECORD, using UNION ALL to combine the two record sets.
- upsert method:
  - Reads from TMP_UNION_RECORD, casting every column to string.
  - Writes the data to the configured base path with Hudi.
  - The write uses append mode; because the Hudi operation is upsert, records are logically updated or inserted even though the write itself is an append.
Main Program
- Create a Spark session.
- Instantiate the DwdProdInst class.
- Run the insert generation, update generation, union, and write steps.
Key Points
- Data processing:
  - Data is first extracted from the source table and split into an insert set and an update set based on whether it already exists in the target table.
  - The generated datasets are held in temporary views so the next step can use them.
- Apache Hudi:
  - Hudi guarantees the ACID properties of the data, which is especially useful in a data lake that needs frequent updates.
  - In Copy-On-Write mode, updates overwrite the affected data so that only the latest version is persisted and exposed.
- Performance optimization:
  - cache() avoids repeated recomputation (see the sketch after this list).
  - spark.sql.shuffle.partitions is configured to optimize shuffle behaviour.
- Extensibility:
  - The class methods are cleanly separated, so the selected data range or the target storage path can be adjusted easily.
  - Updating the business logic only requires changing the relevant SQL statements.
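As referenced in the performance item above, a minimal sketch of the two levers, caching and shuffle-partition tuning, is shown here. The partition count of 40 matches the main program below; the explicit unpersist() call is an optional addition that the original code does not include.

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("tuning_sketch") \
    .config("spark.sql.shuffle.partitions", "40") \
    .getOrCreate()

df = spark.range(0, 1000000).withColumnRenamed("id", "prod_inst_id")

df.cache()      # materialize once, reuse across the actions below
df.count()      # first action populates the cache
df.groupBy((df.prod_inst_id % 10).alias("bucket")).count().show()
df.unpersist()  # optional: release the cached data when it is no longer needed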
This code is a typical big-data use case, showing how to manage and process data efficiently with PySpark and Hudi.
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
class DwdProdInst:
    def __init__(self, spark, tableName, basePath):
        self.spark = spark
        self.basePath = basePath
        self.tableName = tableName
        self.hudi_options = {
            'hoodie.table.name': tableName,
            'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
            'hoodie.datasource.write.recordkey.field': 'prod_inst_id',
            'hoodie.datasource.write.precombine.field': 'lst_upd_tmp',
            'hoodie.datasource.write.operation': "upsert",
            'hoodie.upsert.shuffle.parallelism': 2,
            'hoodie.insert.shuffle.parallelism': 2
        }
        self.dataStartTime = '2024-03-06 00:00:00'
        self.dataEndTime = '2024-04-04 00:00:00'  # todo
    def generate_insert(self):
        sql01 = """
SELECT
cast (t1.prod_inst_id as string),
t1.acc_num as acc_num,
cast (t1.acc_prod_inst_id as string),
t1.account,
t1.act_date,
t1.address_desc,
t1.address_id,
t1.begin_rent_date,
t1.busi_mod_date,
t1.create_date,
cast(t1.create_org_id as string),
cast (t1.create_staff as string),
cast (t1.exch_id as string),
t1.ext_prod_inst_id,
t1.first_finish_date,
cast(t1.lan_id as string),
cast(t1.last_order_item_id as string),
cast(t1.owner_cust_id as string),
t1.payment_mode_cd,
cast(t1.point_owner_id as string),
cast(t1.prod_id as string),
t1.prod_inst_pwd,
t1.prod_use_type,
cast(t1.region_id as string),
t1.remark,
t1.status_cd,
t1.status_date,
t1.stop_rent_date,
t1.update_date,
cast(t1.update_staff as string),
cast(t1.use_cust_id as string),
t1.siebel_row_id,
t1.integration_id,
cast(t1.acct_id as string),
t1.acct_cd,
t1.x_track_num,
t1.grp_prod_nbr,
cast(t1.par_region_id as string),
t1.iccid,
t1.acct_billing_type,
t1.asset_source_cd,
t1.newcrm_flag,
cast(t1.version as string),
cast(t1.std_addr_id as string),
cast(t1.branch_region_id as string),
DATE_FORMAT(now(), 'yyyy-MM-dd HH:mm:ss') as create_tmp,
DATE_FORMAT(now(), 'yyyy-MM-dd HH:mm:ss') as lst_upd_tmp,
'CRM-CUS' as src_sys,
'ETL' as create_by,
'ETL' as lst_upd_by,
t1.prod_inst_id as busi_key
FROM db_dws_crm.dws_cus_prod_inst_flinksql_rt t1
LEFT JOIN db_dwd.dwd_prod_inst_01 t2
ON t1.prod_inst_id = t2.prod_inst_id
WHERE t1.etl_time BETWEEN 'DATA_START_TIME' AND 'DATA_END_TIME'
AND t2.prod_inst_id is null
"""
        sql01 = sql01.replace('DATA_START_TIME', self.dataStartTime).replace('DATA_END_TIME', self.dataEndTime)
        self.spark.sql(sql01).createOrReplaceTempView("TMP_INSERT_RECORD")
    def generate_update(self):
        sql02 = """
SELECT
cast (t1.prod_inst_id as string),
t1.acc_num,
cast (t1.acc_prod_inst_id as string),
t1.account,
t1.act_date,
t1.address_desc,
t1.address_id,
t1.begin_rent_date,
t1.busi_mod_date,
t1.create_date,
cast(t1.create_org_id as string),
cast (t1.create_staff as string),
cast (t1.exch_id as string),
t1.ext_prod_inst_id,
t1.first_finish_date,
cast(t1.lan_id as string),
cast(t1.last_order_item_id as string),
cast(t1.owner_cust_id as string),
t1.payment_mode_cd,
cast(t1.point_owner_id as string),
cast(t1.prod_id as string),
t1.prod_inst_pwd,
t1.prod_use_type,
cast(t1.region_id as string),
t1.remark,
t1.status_cd,
t1.status_date,
t1.stop_rent_date,
t1.update_date,
cast(t1.update_staff as string),
cast(t1.use_cust_id as string),
t1.siebel_row_id,
t1.integration_id,
cast(t1.acct_id as string),
t1.acct_cd,
t1.x_track_num,
t1.grp_prod_nbr,
cast(t1.par_region_id as string),
t1.iccid,
t1.acct_billing_type,
t1.asset_source_cd,
t1.newcrm_flag,
cast(t1.version as string),
cast(t1.std_addr_id as string),
cast(t1.branch_region_id as string),
t2.create_tmp as create_tmp,
DATE_FORMAT(now(), 'yyyy-MM-dd HH:mm:ss') as lst_upd_tmp,
'CRM-CUS' as src_sys,
'ETL' as create_by,
'ETL' as lst_upd_by,
t1.prod_inst_id as busi_key
FROM db_dws_crm.dws_cus_prod_inst_flinksql_rt t1
INNER JOIN db_dwd.dwd_prod_inst_01 t2 on t1.prod_inst_id = t2.prod_inst_id
WHERE t1.etl_time BETWEEN 'DATA_START_TIME' AND 'DATA_END_TIME'"""
        sql02 = sql02.replace('DATA_START_TIME', self.dataStartTime).replace('DATA_END_TIME', self.dataEndTime)
        df02 = self.spark.sql(sql02)
        df02.createOrReplaceTempView("TMP_UPDATE_RECORD")
        df02.cache()
    def union(self):
        sql03 = """
SELECT * from TMP_INSERT_RECORD
union all
SELECT * from TMP_UPDATE_RECORD
"""
        df03 = self.spark.sql(sql03)
        df03.createOrReplaceTempView("TMP_UNION_RECORD")
        df03.cache()
    def upsert(self):
        self.spark.sql("""
select
cast(acc_num as string),
cast(acc_prod_inst_id as string),
cast(account as string),
cast(act_date as string),
cast(address_desc as string),
cast(address_id as string),
cast(begin_rent_date as string),
cast(busi_mod_date as string),
cast(create_date as string),
cast(create_org_id as string),
cast(create_staff as string),
cast(exch_id as string),
cast(ext_prod_inst_id as string),
cast(first_finish_date as string),
cast(lan_id as string),
cast(last_order_item_id as string),
cast(owner_cust_id as string),
cast(payment_mode_cd as string),
cast(point_owner_id as string),
cast(prod_id as string),
cast(prod_inst_id as string),
cast(prod_inst_pwd as string),
cast(prod_use_type as string),
cast(region_id as string),
cast(remark as string),
cast(status_cd as string),
cast(status_date as string),
cast(stop_rent_date as string),
cast(update_date as string),
cast(update_staff as string),
cast(use_cust_id as string),
cast(siebel_row_id as string),
cast(integration_id as string),
cast(x_track_num as string),
cast(acct_id as string),
cast(acct_cd as string),
cast(par_region_id as string),
cast(grp_prod_nbr as string),
cast(acct_billing_type as string),
cast(newcrm_flag as string),
cast(std_addr_id as string),
cast(iccid as string),
cast(create_tmp as string),
cast(create_by as string),
cast(lst_upd_tmp as string),
cast(lst_upd_by as string),
cast(busi_key as string),
cast(src_sys as string),
cast(asset_source_cd as string),
cast(version as string),
cast(branch_region_id as string) from TMP_UNION_RECORD""") \
            .write.format("hudi"). \
            options(**self.hudi_options). \
            mode("append"). \
            save(self.basePath)

if __name__ == "__main__":
    spark = SparkSession.builder \
        .appName("pytispark_handle_data") \
        .enableHiveSupport() \
        .config("spark.sql.shuffle.partitions", "40") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .getOrCreate()
    dwdProdInst = DwdProdInst(spark, "dwd_prod_inst_01", "/domain/ns1/prd/hudi/shct/eda/db_dwd/dwd_prod_inst_01")
    dwdProdInst.generate_insert()
    dwdProdInst.generate_update()
    dwdProdInst.union()
    dwdProdInst.upsert()