一、变更内容
变更背景: 逻辑复制异常导致数据丢失,需重建表后全量同步数据
变更环境:PRD
影响范围:xx
变更类型:低风险
涉及系统等级:xx
变更负责人:xx
变更复核人:xx
变更内容:修复xx hotdata主库上的逻辑复制槽异常
变更验收标准:
二、变更文档
前置工作
-
hotdata(172.25.181.157)上的发布信息
# 发布信息
duola-trans=# SELECT pubname, pubowner, puballtables, pubinsert,pubupdate,pubdelete FROM pg_publication where pubname='duola_pub_t_b2b_order_refund1_16';
pubname | pubowner | puballtables | pubinsert | pubupdate | pubdelete
----------------------------------+----------+--------------+-----------+-----------+-----------
duola_pub_t_b2b_order_refund1_16 | 10 | f | t | t | t
(1 row)
# 发布对应的表
duola-trans=# select * from pg_publication_tables where pubname ='duola_pub_t_b2b_order_refund1_16';
pubname | schemaname | tablename
----------------------------------+------------+----------------------
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund1
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund2
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund3
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund4
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund5
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund6
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund7
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund8
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund9
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund10
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund11
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund12
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund13
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund14
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund15
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund16
(16 rows)
2. ### hotdata上(172.25.181.157)的复制槽信息
# 出问题的复制槽名
duola-trans=# SELECT * from pg_replication_slots where slot_name='slot_t_b2b_order_refund1_16';
slot_name | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
-----------------------------+----------+-----------+--------+-------------+-----------+--------+------------+------+--------------+---------------+---------------------
slot_t_b2b_order_refund1_16 | pgoutput | logical | 16385 | duola-trans | f | f | | | 836959443 | 30E2/49CEB528 | 30E2/476FD9D0
(1 row)
3. ### 中间库(172.25.189.18)上订阅信息
# 重新创建订阅 duola_sub_t_b2b_order_refund1_16
CREATE SUBSCRIPTION duola_sub_t_b2b_order_refund1_16 CONNECTION 'host=172.25.181.157 port=1921 dbname=duola-trans user=replica password=PostGRESQL.replica' PUBLICATION duola_pub_t_b2b_order_refund1_16 WITH(create_slot = true,slot_name= 'slot_t_b2b_order_refund1_16' ,copy_data=true);
# duola_sub_t_b2b_order_refund1_16的信息如下
select * from pg_subscription where subname ='duola_sub_t_b2b_order_refund1_16';
-[ RECORD 1 ]-------+-------------------------------------------------------------------------------------------
oid | 29183
subdbid | 16422
subskiplsn | 0/0
subname | duola_sub_t_b2b_order_refund1_16
subowner | 10
subenabled | t
subbinary | f
substream | f
subtwophasestate | d
subdisableonerr | f
subpasswordrequired | t
subrunasowner | f
subconninfo | host=172.25.181.157 port=1921 dbname=duola-trans user=replica password=PostGRESQL.replica
subslotname | slot_t_b2b_order_refund1_16
subsynccommit | off
subpublications | {duola_pub_t_b2b_order_refund1_16}
suborigin | any
# 订阅名为duola_sub_t_b2b_order_refund1_16的复制状态
duola-trans=# SELECT * FROM pg_stat_subscription where subname='duola_sub_t_b2b_order_refund1_16';
-[ RECORD 1 ]---------+---------------------------------
subid | 29183
subname | duola_sub_t_b2b_order_refund1_16
pid |
leader_pid |
relid |
received_lsn |
last_msg_send_time |
last_msg_receipt_time |
latest_end_lsn |
latest_end_time |
详细操作WBS
-
关闭中间库的订阅任务
#
DROP SUBSCRIPTION IF EXISTS duola_sub_t_b2b_order_refund1_16;
2. ### hotdata库上删除复制槽
# hotdata中的复制槽 slot_t_b2b_order_refund1_16 自动删除
3. ### 在hotdata库上确定16张待同步的表
# 发布对应的表
duola-trans=# select * from pg_publication_tables where pubname ='duola_pub_t_b2b_order_refund1_16';
pubname | schemaname | tablename
----------------------------------+------------+----------------------
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund1
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund2
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund3
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund4
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund5
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund6
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund7
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund8
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund9
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund10
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund11
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund12
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund13
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund14
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund15
duola_pub_t_b2b_order_refund1_16 | public | t_b2b_order_refund16
(16 rows)
4. ### 检查中间库的16张表的数据量
# 涉及到16张表的数据量
duola-trans=# select count(1) from t_b2b_order_refund1;
-[ RECORD 1 ]
count | 82098
duola-trans=# select count(1) from t_b2b_order_refund2;
-[ RECORD 1 ]
count | 82113
duola-trans=# select count(1) from t_b2b_order_refund3;
-[ RECORD 1 ]
count | 82631
duola-trans=# select count(1) from t_b2b_order_refund4;
-[ RECORD 1 ]
count | 82953
duola-trans=# select count(1) from t_b2b_order_refund5;
-[ RECORD 1 ]
count | 82818
duola-trans=# select count(1) from t_b2b_order_refund6;
-[ RECORD 1 ]
count | 82614
duola-trans=# select count(1) from t_b2b_order_refund7;
-[ RECORD 1 ]
count | 83329
duola-trans=# select count(1) from t_b2b_order_refund8;
-[ RECORD 1 ]
count | 82911
duola-trans=# select count(1) from t_b2b_order_refund9;
-[ RECORD 1 ]
count | 83076
duola-trans=# select count(1) from t_b2b_order_refund10;
-[ RECORD 1 ]
count | 82728
duola-trans=# select count(1) from t_b2b_order_refund11;
-[ RECORD 1 ]
count | 82976
duola-trans=# select count(1) from t_b2b_order_refund12;
-[ RECORD 1 ]
count | 82275
duola-trans=# select count(1) from t_b2b_order_refund13;
-[ RECORD 1 ]
count | 82353
duola-trans=# select count(1) from t_b2b_order_refund14;
-[ RECORD 1 ]
count | 81996
duola-trans=# select count(1) from t_b2b_order_refund15;
-[ RECORD 1 ]
count | 81931
duola-trans=# select count(1) from t_b2b_order_refund16;
-[ RECORD 1 ]
count | 81900
-
datahot库上导出16张表的表结构
# 仅备份表结构
pg_dump -U postgres -p 1921 -d duola-trans -s -t t_b2b_order_refund1 -t t_b2b_order_refund2 -t t_b2b_order_refund3 -t t_b2b_order_refund4 -t t_b2b_order_refund5 -t t_b2b_order_refund6 -t t_b2b_order_refund7 -t t_b2b_order_refund8 -t t_b2b_order_refund9 -t t_b2b_order_refund10 -t t_b2b_order_refund11 -t t_b2b_order_refund12 -t t_b2b_order_refund13 -t t_b2b_order_refund14 -t t_b2b_order_refund15 -t t_b2b_order_refund16 -f 0701bak.sql
6. ### 备份中间库的16张表
#
alter table t_b2b_order_refund1 rename to t_b2b_order_refund1_bak0701;
alter table t_b2b_order_refund2 rename to t_b2b_order_refund2_bak0701;
alter table t_b2b_order_refund3 rename to t_b2b_order_refund3_bak0701;
alter table t_b2b_order_refund4 rename to t_b2b_order_refund4_bak0701;
alter table t_b2b_order_refund5 rename to t_b2b_order_refund5_bak0701;
alter table t_b2b_order_refund6 rename to t_b2b_order_refund6_bak0701;
alter table t_b2b_order_refund7 rename to t_b2b_order_refund7_bak0701;
alter table t_b2b_order_refund8 rename to t_b2b_order_refund8_bak0701;
alter table t_b2b_order_refund9 rename to t_b2b_order_refund9_bak0701;
alter table t_b2b_order_refund10 rename to t_b2b_order_refund10_bak0701;
alter table t_b2b_order_refund11 rename to t_b2b_order_refund11_bak0701;
alter table t_b2b_order_refund12 rename to t_b2b_order_refund12_bak0701;
alter table t_b2b_order_refund13 rename to t_b2b_order_refund13_bak0701;
alter table t_b2b_order_refund14 rename to t_b2b_order_refund14_bak0701;
alter table t_b2b_order_refund15 rename to t_b2b_order_refund15_bak0701;
alter table t_b2b_order_refund16 rename to t_b2b_order_refund16_bak0701;
-
中间库中导入16张表的结构
# 订阅端导入数据
psql -U postgres -p 1921 -d duola-trans -f 0701bak.sql
8. ### 中间库重建订阅duola_sub_t_b2b_order_refund1_16
# 中间库重建订阅
# 重新创建订阅duola_sub_t_b2b_order_refund1_16
CREATE SUBSCRIPTION duola_sub_t_b2b_order_refund1_16 CONNECTION 'host=172.25.181.157 port=1921 dbname=duola-trans user=replica password=PostGRESQL.replica' PUBLICATION duola_pub_t_b2b_order_refund1_16 WITH(create_slot = true,slot_name= ' slot_t_b2b_order_refund1_16 ' ,copy_data=true);
9. ### 等待数据全量同步
-
重建中间库对应的复制槽
# 创建好订阅后会自动在 hotdata中创建复制槽
11. ### 通知下游重新消费
等待数据同步完成后,通知赵清亮的开发们
#
select count(1) from t_b2b_order_refund1;
select count(1) from t_b2b_order_refund2;
select count(1) from t_b2b_order_refund3;
select count(1) from t_b2b_order_refund4;
select count(1) from t_b2b_order_refund5;
select count(1) from t_b2b_order_refund6;
select count(1) from t_b2b_order_refund7;
select count(1) from t_b2b_order_refund8;
select count(1) from t_b2b_order_refund9;
select count(1) from t_b2b_order_refund10;
select count(1) from t_b2b_order_refund11;
select count(1) from t_b2b_order_refund12;
select count(1) from t_b2b_order_refund13;
select count(1) from t_b2b_order_refund14;
select count(1) from t_b2b_order_refund15;
select count(1) from t_b2b_order_refund16;
12. ### 验证
# 查看订阅 duola_sub_t_b2b_order_refund1_16
select * from pg_subscription where subname='duola_sub_t_b2b_order_refund1_16';
# 订阅端的订阅状态
SELECT subname,received_lsn,latest_end_time,latest_end_lsn FROM pg_stat_subscription where subname ='duola_sub_t_b2b_order_refund1_16';
# 订阅端的数据量检查
# 发布端的复制槽检查
-
完成
预案手册
灰度方案
回退方案
- 删除中间库上的订阅
duola_sub_t_b2b_order_refund1_16
三、变更记录
变更执行记录
四、此次变更的原因
-
6.27上午10:20左右,开发自己修改了hotdata PG主库上t_b2b_order_refund8中的字段main_order_no导致逻辑复制下游的PG中间库写入失败,进一步导致wal日志归档失败,引发磁盘空间以每天约5%的速度上涨
-
6.30凌晨出现大批量的退单(100W),由于PG的特性,大批量的增删改动作会导致wal暴增
-
wal归档的失败+大量的增删改动作 ,一起导致磁盘空间的瓶颈(90%-99.6%)
导致wal归档失败的复制槽名:slot_t_b2c_dyl_order_coupon_detail1_16
查看中间库的PG日志,如下:
2025-06-27 10:26:12.587 CST,"postgres","postgres",20105,"[local]",685e0144.4e89,3,"idle",2025-06-27 10:26:12 CST,,0,LOG,00000,"disconnection: session time: 0:00:00.073 user=p
ostgres database=postgres host=[local]",,,,,,,,"log_disconnections, postgres.c:5155","","client backend",,0
2025-06-27 10:26:20.722 CST,,,22498,,68504a1a.57e2,2,,2025-06-17 00:45:14 CST,9/0,0,ERROR,22001,"value too long for type character varying(64)",,,,,"processing remote data for replication origin ""pg_29183"" during message type ""INSERT"" for replication target relation ""public.t_b2b_order_refund8"" column ""main_order_no"" in transaction 836960851, finished at 30E2/4A854690",,,"varchar_input, varchar.c:477","","logical replication worker",,0
2025-06-27 10:26:20.752 CST,,,20135,,685e014c.4ea7,1,,2025-06-27 10:26:20 CST,9/192304,0,LOG,00000,"logical replication apply worker for subscription ""duola_sub_t_b2b_order_refund1_16"" has started",,,,,,,,"InitializeApplyWorker, worker.c:4516","","logical replication worker",,0
2025-06-27 10:26:20.754 CST,,,14766,,67161cf6.39ae,715306,,2024-10-21 17:20:54 CST,,0,LOG,00000,"background worker ""logical replication worker"" (PID 22498) exited with exit code 1",,,,,,,,"LogChildExit, postmaster.c:3667","","postmaster",,0
2025-06-27 10:26:20.800 CST,,,20135,,685e014c.4ea7,2,,2025-06-27 10:26:20 CST,9/0,0,ERROR,22001,"value too long for type character varying(64)",,,,,"processing remote data for replication origin ""pg_29183"" during message type ""INSERT"" for replication target relation ""public.t_b2b_order_refund8"" column ""main_order_no"" in transaction 836960851, finished at 30E2/4A854690",,,"varchar_input, varchar.c:477","","logical replication worker",,0
2025-06-27 10:26:20.803 CST,,,14766,,67161cf6.39ae,715307,,2024-10-21 17:20:54 CST,,0,LOG,00000,"background worker ""logical replication worker"" (PID 20135) exited with exit code 1",,,,,,,,"LogChildExit, postmaster.c:3667","","postmaster",,0
2025-06-27 10:26:25.734 CST,,,20142,,685e0151.4eae,1,,2025-06-27 10:26:25 CST,9/192309,0,LOG,00000,"logical replication apply worker for subscription ""duola_sub_t_b2b_order_refund1_16"" has started",,,,,,,,"InitializeApplyWorker, worker.c:4516","","logical replication worker",,0
2025-06-27 10:26:25.780 CST,,,20142,,685e0151.4eae,2,,2025-06-27 10:26:25 CST,9/0,0,ERROR,22001,"value too long for type character varying(64)",,,,,"processing remote data for replication origin ""pg_29183"" during message type ""INSERT"" for replication target relation ""public.t_b2b_order_refund8"" column ""main_order_no"" in transaction 836960851, finished at 30E2/4A854690",,,"varchar_input, varchar.c:477","","logical replication worker",,0
2025-06-27 10:26:25.782 CST,,,14766,,67161cf6.39ae,715308,,2024-10-21 17:20:54 CST,,0,LOG,00000,"background worker ""logical replication worker"" (PID 20142) exited with exit code 1",,,,,,,,"LogChildExit, postmaster.c:3667","","postmaster",,0
表t_b2b_order_refund8的字段的数据格式character varying(64), 上游的是character varying(128), 导致下游的数据写入(insert)失败