Suggestions for improving the logic and performance are welcome.
A new partition is created for every 10 million rows.
Partition name format: ${table_name}_{Num}, e.g. t_data_collection_0, t_data_collection_1
Data placement:
Primary keys 0–10,000,000 are stored in t_data_collection_0
Primary keys 10,000,000–20,000,000 are stored in t_data_collection_1
Partitioning approach:
Use PostgreSQL triggers, LISTEN/NOTIFY, and stored functions to create the next table partition ahead of time.
Overview:
Insert a row --> the trigger checks whether a new partition is needed --> if so --> notify the backend service --> the backend service reads the table name from the notification --> it calls the partitioning stored function to create the partition
Detailed steps
- First, when creating the table, specify the primary-key ID as the partition key (see www.postgres.cn/docs/14/ddl… and the bootstrap sketch after this list):
create table t_data_collection (
    data_collection_id SERIAL8 not null,
    data_value TEXT null,
    collection_time TIMESTAMP WITH TIME ZONE not null,
    constraint PK_T_DATA_COLLECTION primary key (data_collection_id)
) PARTITION BY RANGE(data_collection_id);
- Add a trigger (data_collection_child_table_create) to the table that should auto-partition, firing on every insert
- The trigger calls a stored function (create_data_collection_partition_notice) to decide whether a new partition is needed
  - Check: the new row's ID > (partition count - 1) × 10,000,000
    - i.e. rows have started landing in the last partition (for example, with 2 partitions the check fires once an ID exceeds 10,000,000, and t_data_collection_2 covering 20,000,000–30,000,000 is created)
  - Write the current partition count (childNum) to a channel named after the table (t_data_collection)
- On startup, the backend listens on the channel (t_data_collection)
- When a message arrives, it calls the stored function create_table_partition(table_name)
  - table_name is the channel name, i.e. the name of the table to partition
- create_table_partition then creates the partition
  - Query the partition count: childNum
  - Build the partition name: t_data_collection_{childNum}
  - Create the partition covering the corresponding primary-key ID range
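One caveat: a range-partitioned table rejects inserts until a partition covering the key exists, so the very first partition must be created by hand before any data arrives. A minimal bootstrap sketch following the naming and range convention above:

-- Bootstrap: create the first partition manually; later partitions
-- (t_data_collection_1, t_data_collection_2, ...) come from the trigger/notify flow
CREATE TABLE t_data_collection_0 PARTITION OF t_data_collection
    FOR VALUES FROM (0) TO (10000000);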
data_collection_child_table_create
Row-insert trigger on the table
CREATE TRIGGER data_collection_child_table_create
AFTER INSERT ON public.t_data_collection
FOR EACH ROW
EXECUTE FUNCTION create_data_collection_partition_notice();
create_data_collection_partition_notice
Checks whether a new partition is needed; if so, sends a message on the channel to notify the backend to create it
-- DROP FUNCTION public.create_data_collection_partition_notice();
CREATE OR REPLACE FUNCTION public.create_data_collection_partition_notice()
 RETURNS trigger
 LANGUAGE plpgsql
AS $function$
DECLARE
    childNum int;
BEGIN
    -- Count the existing partitions of t_data_collection
    SELECT count(child.relname)
    FROM pg_inherits
    JOIN pg_class parent ON pg_inherits.inhparent = parent.oid
    JOIN pg_class child ON pg_inherits.inhrelid = child.oid
    JOIN pg_namespace nmsp_parent ON nmsp_parent.oid = parent.relnamespace
    JOIN pg_namespace nmsp_child ON nmsp_child.oid = child.relnamespace
    WHERE parent.relname = 't_data_collection'
    INTO childNum;
    -- Notify when the new row lands in the last partition and the next partition
    -- does not exist yet. Note two fixes to the original: the column is
    -- data_collection_id (not collection_id), and the trigger is created without
    -- arguments, so TG_TABLE_NAME is used instead of TG_ARGV[0].
    IF NEW.data_collection_id > (childNum - 1) * 10000000
       AND NOT EXISTS (SELECT FROM pg_tables WHERE tablename = FORMAT('%s_%s', TG_TABLE_NAME, childNum))
    THEN
        EXECUTE format('NOTIFY t_data_collection, ''%s''', childNum);
    END IF;
    RETURN NULL;
END;
$function$
;
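To sanity-check the trigger and the notification path (assuming the bootstrap partition above exists), LISTEN on the channel in a psql session and insert a row; psql prints any pending notification after the statement completes:

-- Subscribe to the channel the trigger notifies on
LISTEN t_data_collection;
-- With a single partition (childNum = 1), any ID > 0 passes the check, so this
-- should produce: Asynchronous notification "t_data_collection" with payload "1"
INSERT INTO t_data_collection (data_value, collection_time) VALUES ('test', now());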
create_table_partition(table_name)
Creates the next partition for the given table
-- DROP FUNCTION public.create_table_partition(varchar);
CREATE OR REPLACE FUNCTION public.create_table_partition(table_name character varying)
 RETURNS text
 LANGUAGE plpgsql
AS $function$
DECLARE
    childNum int;
BEGIN
    -- Count the existing partitions of the given table
    SELECT count(child.relname)
    FROM pg_inherits
    JOIN pg_class parent ON pg_inherits.inhparent = parent.oid
    JOIN pg_class child ON pg_inherits.inhrelid = child.oid
    JOIN pg_namespace nmsp_parent ON nmsp_parent.oid = parent.relnamespace
    JOIN pg_namespace nmsp_child ON nmsp_child.oid = child.relnamespace
    WHERE parent.relname = table_name
    INTO childNum;
    -- Create the next partition, covering the next 10-million-ID range, if it does not exist yet
    IF NOT EXISTS (SELECT FROM pg_tables WHERE tablename = FORMAT('%s_%s', table_name, childNum))
    THEN
        EXECUTE format('CREATE TABLE %s_%s PARTITION OF %s FOR VALUES FROM (%L) TO (%L)',
                       table_name, childNum, table_name, childNum * 10000000, (childNum + 1) * 10000000);
    END IF;
    RETURN '1';
END;
$function$
;
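The function can also be run by hand, which is handy for pre-creating partitions or verifying the setup; the catalog query below lists the partitions that currently exist:

-- Create the next partition manually
SELECT create_table_partition('t_data_collection');
-- List the current partitions of the parent table
SELECT inhrelid::regclass AS partition_name
FROM pg_inherits
WHERE inhparent = 't_data_collection'::regclass;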
Backend listener
Listens for messages on the channel and calls the partition-creation stored function
package com.advantech.ems.config.datasource;
import lombok.extern.slf4j.Slf4j;
import org.postgresql.PGNotification;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * @author zhuye
 * @version 1.0
 * @description: Listens on a PG channel and triggers partition creation when notified
 * @date 2023/11/8 10:38
 */
@Slf4j
public class PGNoticeListener extends Thread {
    private final Connection conn;
    private final org.postgresql.PGConnection pgConn;
    // Channel to listen on; doubles as the name of the table to partition
    private final String channel;
    public PGNoticeListener(Connection conn, String channel) throws SQLException {
        this.conn = conn;
        this.pgConn = conn.unwrap(org.postgresql.PGConnection.class);
        this.channel = channel;
        try (Statement stmt = conn.createStatement()) {
            stmt.execute("LISTEN ".concat(channel));
        }
    }
    public void run() {
        while (true) {
            try {
                // getNotifications() is non-blocking; poll, then sleep briefly to avoid spinning the CPU
                PGNotification[] notifications = pgConn.getNotifications();
                if (notifications != null) {
                    for (PGNotification notification : notifications) {
                        // The payload is ignored; the channel name is the table to partition
                        try (Statement statement = conn.createStatement()) {
                            statement.execute(String.format("SELECT create_table_partition('%s')", channel));
                        }
                        log.warn("Received PG notice, channel: {}, notification: {}, payload: {}", channel, notification.getName(), notification.getParameter());
                    }
                }
                Thread.sleep(500);
            } catch (SQLException e) {
                log.error("PGNoticeListener error: {}, channel&table: {}", e.getMessage(), channel);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}
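Because the listener derives the table name from the channel name and ignores the payload, it can be exercised without inserting any data: a manual NOTIFY from psql should make it call create_table_partition. A minimal test (the payload value here is arbitrary):

-- Fire a notification by hand; the backend listener should log it and
-- invoke create_table_partition('t_data_collection')
NOTIFY t_data_collection, '1';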
Channel check timer
Periodically checks that the channel listener is still running
package com.advantech.ems.config.schedule.timer;
import cn.hutool.core.bean.BeanUtil;
import com.advantech.ems.config.datasource.PGNoticeListener;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.sql.DataSource;
import java.sql.SQLException;
/**
 * @author zhuye
 * @version 1.0
 * @description: Periodically checks that the PG channel listener is alive and restarts it if not
 * @date 2023/11/8 10:44
 */
@Component
@Slf4j
@EnableAsync
public class PGListenTimer {
    @Autowired
    private DataSource dataSource;
    private PGNoticeListener dataCollectionListener;
    private PGNoticeListener pageDataValueListener;
    /**
     * Periodically check that the PGListener is healthy
     * initialDelay: delay before the first run after the container starts
     * fixedDelay: interval between runs (10 seconds)
     */
    @Scheduled(initialDelay = 2000, fixedDelay = 10000)
    public void dataCollectionListener() {
        // Recreate and restart the listener thread if it has never been created or has died
        if (BeanUtil.isEmpty(dataCollectionListener) || !dataCollectionListener.isAlive()) {
            try {
                dataCollectionListener = new PGNoticeListener(dataSource.getConnection(), "t_data_collection");
                dataCollectionListener.start();
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
Update 2025/02/12
As pointed out in the comments, firing the trigger on every single insert is indeed quite wasteful.
Below, we use TimescaleDB instead for automatic partitioning and cleanup.
TimescaleDB
For general TimescaleDB installation steps, search the web; here is a container-based installation:
Build the PostgreSQL image
Create a Dockerfile:
# Use the official PostgreSQL 16 image as the base image
FROM postgres:16
# Set environment variables
ENV TIMESCALEDB_VERSION=2.17.2
# Install build dependencies
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
git \
postgresql-server-dev-16 \
libssl-dev \
libkrb5-dev \
libicu-dev \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
# Download and compile TimescaleDB
RUN git clone --branch ${TIMESCALEDB_VERSION} https://github.com/timescale/timescaledb.git /timescaledb && \
cd /timescaledb && \
./bootstrap -DREGRESS_CHECKS=OFF && \
make -C build && \
make -C build install && \
cd / && \
rm -rf /timescaledb
# Initialize TimescaleDB
RUN echo "shared_preload_libraries = 'timescaledb'" >> /usr/share/postgresql/postgresql.conf.sample
ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["postgres"]
Command to build the arm64 image (a proxy is required):
I use clash as the proxy tool; its default port is 7890.
Getting the proxy to work is left to the reader.
My application runs on arm; for an x86 environment, simply drop --platform linux/arm64 when building on an x86 machine.
https_proxy=http://your_proxy_ip:7890 http_proxy=http://your_proxy_ip:7890 docker build --no-cache --platform linux/arm64 -t postgresql_arm64:latest .
Save the arm64 image:
docker save postgresql_arm64 | gzip > postgresql_arm64.tar.gz
Add policies
-- Install the extension
CREATE EXTENSION IF NOT EXISTS timescaledb;
-- Convert the table into a TimescaleDB hypertable
SELECT create_hypertable(
    't_collection_data_value',                  -- table name
    'collection_time',                          -- time partitioning column
    chunk_time_interval => INTERVAL '1 hour',   -- interval covered by each chunk (here, one chunk per hour)
    if_not_exists => TRUE
);
-- Enable compression, ordered by collection time and segmented by config ID
ALTER TABLE t_collection_data_value SET (
    timescaledb.compress,
    timescaledb.compress_orderby = 'collection_time',
    timescaledb.compress_segmentby = 'collection_data_config_id'
);
-- Compress chunks older than 1 day
SELECT add_compression_policy('t_collection_data_value', INTERVAL '1 day');
-- Drop chunks older than 180 days
SELECT add_retention_policy('t_collection_data_value', INTERVAL '180 days');
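To confirm the hypertable conversion and watch chunks appear (and get compressed), the standard TimescaleDB catalog views can be queried, for example:

-- List the chunks backing the hypertable
SELECT show_chunks('t_collection_data_value');
-- Inspect chunk metadata, including compression state
SELECT chunk_name, range_start, range_end, is_compressed
FROM timescaledb_information.chunks
WHERE hypertable_name = 't_collection_data_value';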
Modify policies
-- Remove the compression policy
SELECT remove_compression_policy('t_collection_data_value');
-- Remove the retention policy
SELECT remove_retention_policy('t_collection_data_value');
-- Add the new compression policy
SELECT add_compression_policy('t_collection_data_value', INTERVAL '1 day');
-- Add the new retention policy
SELECT add_retention_policy('t_collection_data_value', INTERVAL '30 days');
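Note that a new policy only affects future background runs; to compress existing data immediately, chunks can be compressed by hand with the standard compress_chunk function (already-compressed chunks raise an error, so this sketch assumes none have been compressed yet):

-- Compress all chunks older than 1 day right now, without waiting for the background job
SELECT compress_chunk(c)
FROM show_chunks('t_collection_data_value', older_than => INTERVAL '1 day') AS c;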
Query policies
-- Query the compression policy
SELECT
hypertable_name,
schedule_interval,
config ->> 'compress_after' AS compress_after
FROM
timescaledb_information.jobs
WHERE
hypertable_name = 't_collection_data_value'
AND proc_name = 'policy_compression';
-- Query the retention policy
SELECT
hypertable_name,
schedule_interval,
config ->> 'drop_after' AS drop_after
FROM
timescaledb_information.jobs
WHERE
hypertable_name = 't_collection_data_value'
AND proc_name = 'policy_retention';
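Beyond the configured intervals, it is worth checking whether the background jobs actually run; the timescaledb_information.job_stats view reports their history and schedule:

-- Check run history and the next scheduled run of the policy jobs
SELECT job_id, hypertable_name, last_run_status, next_start, total_runs, total_failures
FROM timescaledb_information.job_stats
WHERE hypertable_name = 't_collection_data_value';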
The above partitions by time; partitioning by ID range or by size is also possible.
The following suggestions came from GPT and have not been verified by the author; try them at your own risk.
Partition by size
CREATE TABLE metrics (
    time TIMESTAMPTZ NOT NULL,
    id BIGINT NOT NULL,
    value DOUBLE PRECISION NOT NULL
);
-- Convert the base table into a hypertable
SELECT create_hypertable('metrics', 'time', chunk_target_size => '1GB', partitioning_column => 'id');
-- Enable compression
ALTER TABLE metrics SET (
timescaledb.compress,
timescaledb.compress_orderby = 'time DESC',
timescaledb.compress_segmentby = 'id'
);
-- Compression policy
SELECT add_compression_policy('metrics', INTERVAL '7 days');
Partition by ID range
-- Create the table
CREATE TABLE metrics (
time TIMESTAMPTZ NOT NULL,
id BIGINT NOT NULL,
value DOUBLE PRECISION NOT NULL
);
-- Convert the base table into a hypertable
SELECT create_hypertable(
'metrics',
'time',
partitioning_column => 'id',
chunk_time_interval => 10000000
);
-- Enable compression
ALTER TABLE metrics SET (
timescaledb.compress,
timescaledb.compress_orderby = 'time DESC',
timescaledb.compress_segmentby = 'id'
);
-- Compression policy
SELECT add_compression_policy('metrics', INTERVAL '7 days');
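Since the two sketches above are unverified, one way to see what create_hypertable actually did is to inspect the dimension metadata via the standard timescaledb_information.dimensions view:

-- Show how the metrics hypertable is dimensioned (time/integer intervals, hash partitions)
SELECT hypertable_name, column_name, column_type, time_interval, integer_interval, num_partitions
FROM timescaledb_information.dimensions
WHERE hypertable_name = 'metrics';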