我将深入解析 PostgreSQL 内核中的核心算法,涵盖其独特的架构设计和关键技术实现。PostgreSQL 的算法设计体现了学术严谨性和工程实用性的结合。
一、存储结构与数据组织算法
1. 堆表(Heap)存储算法
页面布局与元组结构
// src/include/storage/bufpage.h
// Page header: fixed-size metadata at the start of every heap/index page.
typedef struct PageHeaderData {
PageXLogRecPtr pd_lsn; // LSN of the last WAL record that modified this page
uint16 pd_checksum; // page checksum (0 when checksums are disabled)
uint16 pd_flags; // page flag bits
LocationIndex pd_lower; // byte offset where free space begins (end of line-pointer array)
LocationIndex pd_upper; // byte offset where free space ends (start of tuple data)
LocationIndex pd_special; // byte offset of the special space (index-AM private area)
uint16 pd_pagesize_version; // page size OR'ed with the page layout version
TransactionId pd_prune_xid; // oldest XID that might make pruning this page worthwhile
ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; // line-pointer array, grows toward pd_upper
} PageHeaderData;
// Line pointer: a 4-byte slot in the page's item array pointing at a tuple
// stored later in the same page.  The indirection lets tuples move during
// page compaction without changing their externally visible TIDs.
typedef struct ItemIdData {
unsigned lp_off:15, // byte offset of the tuple within the page
lp_flags:2, // state: LP_UNUSED / LP_NORMAL / LP_REDIRECT / LP_DEAD
lp_len:15; // tuple length in bytes
} ItemIdData;
// Heap tuple header: per-version metadata preceding the user data.
typedef struct HeapTupleHeaderData {
union {
HeapTupleFields t_heap; // xmin/xmax/cid fields for on-disk tuples
DatumTupleFields t_datum; // alternative layout for in-memory composite datums
} t_choice;
ItemPointerData t_ctid; // TID of this tuple, or of its newer version (block + offset)
uint16 t_infomask2; // attribute count plus flag bits
uint16 t_infomask; // tuple status flags (xmin/xmax commit hints etc.)
uint8 t_hoff; // header length, including the NULL bitmap
bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; // NULL bitmap, one bit per attribute
} HeapTupleHeaderData;
页面空间管理算法
// src/backend/storage/page/bufpage.c
// Initialize a freshly allocated page: zero it and set up the header so that
// everything between the header and the special space is free.
// pageSize must be the buffer page size; specialSize is the index-AM area.
void PageInit(Page page, Size pageSize, Size specialSize) {
    PageHeader phdr = (PageHeader) page;

    // Zero the whole page first.  Since LP_UNUSED == 0, this also makes any
    // line pointer that later comes into existence start out unused.
    memset(page, 0, pageSize);

    phdr->pd_lower = SizeOfPageHeaderData;        // line-pointer array is empty
    phdr->pd_upper = pageSize - specialSize;      // free space runs to the special area
    phdr->pd_special = pageSize - specialSize;
    phdr->pd_pagesize_version = pageSize | PG_PAGE_LAYOUT_VERSION;
    // NOTE(review): the previous version looped over
    // (pd_upper - pd_lower) / sizeof(ItemIdData) "line pointers" right after
    // setting pd_lower to the header size -- but at that point the
    // line-pointer array is empty, so the loop scribbled flags across the
    // page's free space.  Zeroing the page is both correct and cheaper.
}
// Insert an item (tuple body) into a page and return its offset number, or
// InvalidOffsetNumber if it does not fit.  Simplified variant of the real
// routine: the offsetNumber/flags hints are ignored and the first unused
// line pointer (or a newly extended one) is always chosen.
OffsetNumber
PageAddItemExtended(Page page, Item item, Size size,
                    OffsetNumber offsetNumber, int flags) {
    PageHeader phdr = (PageHeader) page;
    // Number of existing line pointers: pd_lower grows past the fixed header
    // as the array grows, so the count is derived from it.
    OffsetNumber maxoff = (OffsetNumber)
        ((phdr->pd_lower - SizeOfPageHeaderData) / sizeof(ItemIdData));
    ItemId lp;
    OffsetNumber off;
    bool need_new_lp = true;
    Offset lower;
    Offset upper;

    // Look for a recyclable (LP_UNUSED) line pointer.  BUG FIX: the original
    // compared offset numbers against the byte offset pd_lower, walking far
    // past the end of the line-pointer array.
    for (off = FirstOffsetNumber; off <= maxoff; off++) {
        lp = PageGetItemId(phdr, off);
        if (lp->lp_flags == LP_UNUSED) {
            need_new_lp = false;
            break;
        }
    }

    // Free-space check: only charge for an extra ItemIdData when we really
    // have to extend the array (the original always charged for it).
    lower = phdr->pd_lower + (need_new_lp ? sizeof(ItemIdData) : 0);
    upper = phdr->pd_upper;
    if (lower > upper || (Size) (upper - lower) < size)
        return InvalidOffsetNumber;   // not enough room

    if (need_new_lp) {
        off = maxoff + 1;             // slot just past the current array
        phdr->pd_lower += sizeof(ItemIdData);
    }

    // Tuple bodies are allocated downward from pd_upper.
    upper -= size;
    phdr->pd_upper = upper;

    // Fill in the chosen line pointer and copy in the tuple body.
    lp = PageGetItemId(phdr, off);
    lp->lp_off = upper;
    lp->lp_flags = LP_NORMAL;
    lp->lp_len = size;
    memcpy((char *) page + upper, item, size);

    return off;
}
2. TOAST(The Oversized-Attribute Storage Technique)算法
大字段存储算法
// src/include/access/heaptoast.h
// TOAST storage strategies (per-column storage settings).
typedef enum ToastStrategy {
TOAST_STRATEGY_PLAIN = 0, // no compression, no out-of-line storage
TOAST_STRATEGY_EXTENDED = 1, // allow both compression and out-of-line storage
TOAST_STRATEGY_EXTERNAL = 2, // allow out-of-line storage but no compression
TOAST_STRATEGY_MAIN = 3 // allow compression; avoid out-of-line storage if possible
} ToastStrategy;
// TOAST pointer: the stub stored inside the heap tuple that locates the
// actual value in the relation's TOAST table.
typedef struct varatt_external {
int32 va_rawsize; // original (uncompressed) data size
int32 va_extsize; // size of the externally stored data
Oid va_valueid; // unique ID of the value within the TOAST table
Oid va_toastrelid; // OID of the TOAST table holding the value
} varatt_external;
// TOAST compression / storage decision.
// Decide whether a varlena value needs TOASTing: compress it when allowed,
// and push it out of line if it is still too large for an in-heap tuple.
// Returns either the (possibly compressed) in-line Datum or a TOAST pointer.
Datum toast_save_datum(Relation rel, Datum value,
                       struct varlena *oldexternal) {
    struct varlena *new_value = (struct varlena *) value;
    int32 rawsize = VARSIZE_ANY_EXHDR(new_value);
    int32 newsize;
    bool need_free = false;   // true once 'value' points at a palloc'd compressed copy

    // 1. Small enough to stay in the heap tuple untouched.
    if (rawsize <= TOAST_TUPLE_THRESHOLD)
        return value;

    // 2. Try compression when the relation has a TOAST table and the value
    //    exceeds the compression threshold.
    if (rel->rd_toastoid != InvalidOid &&
        rawsize > TOAST_COMPRESS_THRESHOLD) {
        struct varlena *tmp = toast_compress_datum(value);
        if (tmp != NULL) {
            newsize = VARSIZE(tmp) - VARHDRSZ;
            if (newsize < rawsize) {
                // Compression helped: switch to the compressed copy.
                value = PointerGetDatum(tmp);
                rawsize = newsize;
                need_free = true;
            }
        }
    }

    // 3. If compression shrank it below the threshold, keep it in line.
    //    BUG FIX: the original pfree'd the compressed copy here and then
    //    returned the now-dangling Datum (use-after-free); the copy being
    //    returned must stay allocated.
    if (rawsize <= TOAST_TUPLE_THRESHOLD)
        return value;

    // 4. Still too large: store it out of line in the TOAST table.
    return toast_insert_or_update(rel, value, oldexternal,
                                  rel->rd_toastoid, need_free);
}
// TOAST compression (LZ4 or pglz).
// Compress a varlena datum with the configured method.  Returns a palloc'd
// compressed copy, or NULL if compression failed or did not shrink the data.
static struct varlena *
toast_compress_datum(Datum value) {
    struct varlena *tmp = NULL;
    int32 rawsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));

    if (ToastCompressionId == TOAST_LZ4_COMPRESSION_ID) {
        // BUG FIX: the buffer must hold the varlena header plus the payload;
        // the original palloc'd only 'rawsize' bytes and then wrote header +
        // data into it, overflowing the allocation.
        tmp = palloc(VARHDRSZ + rawsize);
        int compressed_size = LZ4_compress_default(
            VARDATA_ANY(DatumGetPointer(value)),
            VARDATA(tmp), rawsize, rawsize - 1);
        // LZ4 returns 0 on failure; also require a real size win.
        if (compressed_size > 0 && compressed_size < rawsize) {
            SET_VARSIZE_COMPRESSED(tmp, compressed_size + VARHDRSZ);
            return tmp;
        }
    } else if (ToastCompressionId == TOAST_PGLZ_COMPRESSION_ID) {
        tmp = palloc(VARHDRSZ + rawsize);
        int32 compressed_size = pglz_compress(
            VARDATA_ANY(DatumGetPointer(value)), rawsize,
            VARDATA(tmp), NULL);
        // BUG FIX: pglz_compress returns -1 when it cannot compress; the
        // original's bare 'compressed_size < rawsize' treated -1 as success.
        if (compressed_size >= 0 && compressed_size < rawsize) {
            SET_VARSIZE_COMPRESSED(tmp, compressed_size + VARHDRSZ);
            return tmp;
        }
    }

    // Compression failed or did not help: discard the scratch buffer.
    if (tmp)
        pfree(tmp);
    return NULL;
}
二、MVCC(多版本并发控制)算法
1. 快照(Snapshot)算法
// src/include/utils/snapshot.h
// Snapshot: the set of transactions considered committed at one instant.
typedef struct SnapshotData {
SnapshotSatisfiesFunc satisfies; // visibility test; NOTE(review): code below switches on this field as if it were an enum (SnapshotType in real PG) -- reconcile
TransactionId xmin; // every XID < xmin is finished (committed or aborted)
TransactionId xmax; // every XID >= xmax is invisible to this snapshot
TransactionId *xip; // XIDs that were in progress at snapshot time
uint32 xcnt; // number of entries in xip
TransactionId *subxip; // in-progress subtransaction XIDs
uint32 subxcnt; // number of entries in subxip
uint32 takenDuringRecovery; // nonzero if taken during hot-standby recovery
bool copied; // true if this is a palloc'd copy (safe to free)
CommandId curcid; // visibility horizon within our own transaction
} SnapshotData;
// Snapshot acquisition.
// Return the snapshot to use for the current statement: under READ COMMITTED
// every statement gets a fresh snapshot, while under REPEATABLE READ /
// SERIALIZABLE the transaction's first snapshot is reused throughout.
Snapshot GetTransactionSnapshot(void) {
if (!FirstSnapshotSet) {
// First snapshot request in this transaction.
if (IsolationUsesXactSnapshot()) {
// REPEATABLE READ or SERIALIZABLE: register a transaction-lifetime snapshot.
return GetSerializableTransactionSnapshot();
} else {
// READ COMMITTED: build a fresh statement snapshot.
return GetSnapshotData(&CurrentSnapshotData);
}
}
// NOTE(review): nothing shown here sets FirstSnapshotSet -- presumably done
// inside the callees; verify against the full source.
if (IsolationUsesXactSnapshot())
return CurrentSnapshot;
else
return GetSnapshotData(&CurrentSnapshotData);
}
// Build an MVCC snapshot from the shared ProcArray: the xmin/xmax bounds
// plus the list of transaction IDs in progress at snapshot time.
Snapshot GetSnapshotData(Snapshot snapshot) {
    ProcArrayStruct *arrayP = procArray;
    TransactionId xmin;
    TransactionId xmax;
    int count = 0;

    // Oldest XID still running anywhere: lower visibility bound.
    xmin = TransactionXmin = GetOldestXmin(NULL, false);
    // Next XID to be assigned: everything >= xmax is invisible.
    xmax = ShmemVariableCache->nextXid;

    // Walk every live backend and collect its in-progress XID.
    for (int index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC *proc = &allProcs[pgprocno];
        volatile PGXACT *pgxact = &allPgXact[pgprocno];

        // Skip dead/unused process slots.
        if (proc->pid == 0)
            continue;

        TransactionId xid = pgxact->xid;
        if (TransactionIdIsNormal(xid) &&
            NormalTransactionIdPrecedes(xid, xmax)) {
            // BUG FIX: XIDs are circular, so the raw '<' comparison the
            // original used for the xmin update is wrong near wraparound;
            // use the wraparound-aware comparison throughout.
            if (NormalTransactionIdPrecedes(xid, xmin))
                xmin = xid;
            snapshot->xip[count++] = xid;
        }
    }

    snapshot->xmin = xmin;
    snapshot->xmax = xmax;
    snapshot->xcnt = count;
    return snapshot;
}
2. 可见性判断算法
// src/backend/access/heap/heapam_visibility.c (historically tqual.c; the
// original comment pointed at combocid.c, which holds combo-CID logic only)
// MVCC visibility: is this heap tuple visible to the given snapshot?
bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                            Buffer buffer) {
    HeapTupleHeader tuple = htup->t_data;
    TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
    TransactionId xmax = HeapTupleHeaderGetXmax(tuple);

    // Case 1: inserted by our own transaction -- visible unless we also
    // deleted it ourselves.
    if (TransactionIdIsCurrentTransactionId(xmin)) {
        if (tuple->t_infomask & HEAP_XMAX_INVALID)
            return true;            // never deleted
        if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
            return true;            // only locked, not deleted
        if (!TransactionIdIsCurrentTransactionId(xmax))
            return true;            // xmax is not ours: no committed delete yet
        return false;               // deleted by our own transaction
    }

    // Case 2: inserted by someone else.  BUG FIX: the original consulted
    // only the commit log and ignored the snapshot, so a transaction that
    // committed *after* our snapshot was taken would wrongly be visible,
    // breaking REPEATABLE READ.  A tuple is visible only if its inserter
    // committed and is not in our snapshot's in-progress set.
    if (XidInMVCCSnapshot(xmin, snapshot))
        return false;               // inserter in progress as of snapshot time
    if (!TransactionIdDidCommit(xmin))
        return false;               // inserter aborted

    // Inserter committed before our snapshot; now examine the deleter.
    if (tuple->t_infomask & HEAP_XMAX_INVALID)
        return true;                // never deleted
    if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        return true;                // only locked, not deleted
    if (TransactionIdIsCurrentTransactionId(xmax))
        return false;               // we deleted it ourselves
    if (XidInMVCCSnapshot(xmax, snapshot))
        return true;                // deleter still in progress for us
    if (TransactionIdDidCommit(xmax))
        return false;               // delete committed before our snapshot
    return true;                    // deleter aborted
}
// Dispatch the visibility test according to the snapshot's type.
bool HeapTupleSatisfiesSnapshot(HeapTuple htup, Snapshot snapshot,
Buffer buffer) {
// NOTE(review): 'satisfies' was declared above as a function pointer
// (SnapshotSatisfiesFunc) but is switched on here like an enum; in real
// PostgreSQL (v12+) this field is 'snapshot_type' of type SnapshotType.
switch (snapshot->satisfies) {
case SNAPSHOT_MVCC:
return HeapTupleSatisfiesMVCC(htup, snapshot, buffer);
case SNAPSHOT_SELF:
return HeapTupleSatisfiesSelf(htup, snapshot, buffer);
case SNAPSHOT_ANY:
return HeapTupleSatisfiesAny(htup, snapshot, buffer);
case SNAPSHOT_TOAST:
return HeapTupleSatisfiesToast(htup, snapshot, buffer);
case SNAPSHOT_DIRTY:
return HeapTupleSatisfiesDirty(htup, snapshot, buffer);
case SNAPSHOT_HISTORIC_MVCC:
return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer);
default:
elog(ERROR, "unrecognized snapshot type: %d",
snapshot->satisfies);
return false;
}
}
三、查询优化算法
1. 动态规划连接顺序优化算法
// src/backend/optimizer/path/joinpath.c
// System R-style dynamic-programming join search: best paths for joins of
// 'lev' relations are built from best paths for smaller subsets.
static void standard_join_search(PlannerInfo *root, int levels_needed) {
List **joinrels;
int lev;
// DP table: joinrels[k] holds RelOptInfos joining exactly k base relations.
joinrels = (List **) palloc0((levels_needed + 1) * sizeof(List *));
// Start every level empty.
for (lev = 1; lev <= levels_needed; lev++) {
joinrels[lev] = NIL;
}
joinrels[1] = initial_rels(root);
// Main DP loop: build level 'lev' from all splits k + (lev - k).
for (lev = 2; lev <= levels_needed; lev++) {
ListCell *lc;
// Consider every split lev = k + (lev - k).
for (int k = 1; k < lev; k++) {
ListCell *lcr;
ListCell *lcs;
// All candidate outer sub-joins of size k ...
foreach(lcr, joinrels[k]) {
RelOptInfo *outer_rel = (RelOptInfo *) lfirst(lcr);
// ... paired with every inner sub-join of size lev - k.
foreach(lcs, joinrels[lev - k]) {
RelOptInfo *inner_rel = (RelOptInfo *) lfirst(lcs);
// Only disjoint relid sets may be joined.
if (!bms_overlap(outer_rel->relids, inner_rel->relids)) {
// Try each physical join strategy; only the cheapest
// paths survive in the joinrel's path list.
// NOTE(review): nothing here appends new joinrels to
// joinrels[lev] -- presumably done inside the try_*
// helpers in this simplified rendering; verify.
try_hashjoin_path(root, outer_rel, inner_rel,
JOIN_INNER, NIL);
try_mergejoin_path(root, outer_rel, inner_rel,
JOIN_INNER, NIL);
try_nestloop_path(root, outer_rel, inner_rel,
JOIN_INNER, NIL);
}
}
}
}
// Deduplicate and prune dominated join relations at this level.
joinrels[lev] = remove_useless_joinrels(joinrels[lev]);
}
}
2. 遗传算法优化连接顺序
// src/backend/optimizer/geqo/geqo_main.c
// Genetic-algorithm join-order search, used when the relation count exceeds
// geqo_threshold and exhaustive dynamic programming would be too expensive.
RelOptInfo *geqo(PlannerInfo *root, int number_of_rels,
List *initial_rels) {
GeqoPrivateData private;
int generation;
Chromosome *momma;
Chromosome *daddy;
Chromosome *kid;
Pool *pool;
int pool_size, number_generations;
// 1. Derive GA parameters (pool size, generation count) from problem size.
geqo_params(&private, root, number_of_rels);
pool_size = private.pool_size;
number_generations = private.generations;
// 2. Create an initial population of random join orders.
pool = alloc_pool(pool_size);
random_init_pool(root, pool, initial_rels);
// 3. Rank the initial population by estimated plan cost (fitness).
sort_pool(root, pool);
// 4. Evolution loop: one offspring per generation (steady-state GA).
for (generation = 0; generation < number_generations; generation++) {
// Select two parents, biased toward fitter individuals.
momma = selection(root, pool, SELECTION_BIAS);
daddy = selection(root, pool, SELECTION_BIAS);
// Recombine the parents' join orders.
kid = crossover(root, momma, daddy, private.cx_method);
// Occasionally mutate the offspring.
if (geqo_rand() < private.mutation_rate)
mutation(root, kid, private.mut_method);
// Fitness = cost of the cheapest plan for this join order.
kid->worth = evaluate_join_order(root, kid->string, initial_rels);
// Insert the offspring, displacing the worst individual.
spread_chromo(root, kid, pool);
// Keep the pool sorted by fitness.
sort_pool(root, pool);
}
// 5. Build and return the plan for the fittest join order found.
return gimme_tree(root, pool->data[0].string, initial_rels);
}
// Order Crossover (OX): the child inherits the segment [left, right] from
// momma verbatim and the remaining genes from daddy in daddy's relative
// order.  Returns a newly allocated offspring chromosome.
Chromosome *order_crossover(PlannerInfo *root,
                            Chromosome *momma,
                            Chromosome *daddy) {
    Chromosome *kid = makeNode(Chromosome);
    int number_of_rels = momma->string_length;
    int left, right, temp;
    Gene *mom = momma->string;
    Gene *dad = daddy->string;
    Gene *child;
    bool *used;

    // BUG FIX: makeNode() does not allocate the gene array, so the original
    // 'child = kid->string' captured a NULL pointer and the copies below
    // dereferenced it.  Allocate the child's gene string explicitly.
    kid->string = (Gene *) palloc(number_of_rels * sizeof(Gene));
    child = kid->string;

    // used[g] marks gene g (a 1-based relation index) as already placed.
    used = (bool *) palloc0((number_of_rels + 1) * sizeof(bool));

    // Pick a random crossover segment [left, right].
    left = geqo_randint(number_of_rels - 1, 0);
    right = geqo_randint(number_of_rels - 1, 0);
    if (left > right) {
        temp = left;
        left = right;
        right = temp;
    }

    // 1. Copy momma's segment verbatim, marking those genes used.
    for (int i = left; i <= right; i++) {
        child[i] = mom[i];
        used[mom[i]] = true;
    }

    // 2. Fill the remaining slots with daddy's genes, preserving daddy's
    //    order and skipping over the inherited segment.
    int pos = 0;
    for (int i = 0; i < number_of_rels; i++) {
        if (pos == left)
            pos = right + 1;        // jump over the inherited segment
        if (!used[dad[i]]) {
            child[pos] = dad[i];
            pos++;
        }
    }

    pfree(used);
    kid->string_length = number_of_rels;
    return kid;
}
四、执行引擎算法
1. 火山模型(Volcano)执行算法
// src/backend/executor/execProcnode.c
// Volcano-model iterator: pull one tuple from a plan-state node.
TupleTableSlot *ExecProcNode(PlanState *node) {
if (node->chgParam != NULL) // a parameter changed since the last call: rescan
ExecReScan(node);
// Dispatch to the node-type-specific function installed at init time.
return node->ExecProcNode(node);
}
// Sequential-scan executor node: return the next heap tuple, or NULL at end.
static TupleTableSlot *
ExecSeqScan(SeqScanState *node) {
TableScanDesc scandesc = node->ss.ss_currentScanDesc;
EState *estate = node->ss.ps.state;
ScanDirection direction = estate->es_direction;
HeapTuple tuple;
// Advance the underlying heap scan in the requested direction.
tuple = heap_getnext(scandesc, direction);
if (tuple == NULL) {
// End of relation: clear the slot and signal EOF upward with NULL.
ExecClearTuple(node->ss.ss_ScanTupleSlot);
return NULL;
}
// Store the tuple in the scan slot; passing the buffer lets the slot keep
// the page pinned while the tuple is in use.
ExecStoreBufferHeapTuple(tuple, node->ss.ss_ScanTupleSlot,
scandesc->rs_cbuf);
return node->ss.ss_ScanTupleSlot;
}
// Hash-join executor (simplified probe phase): for each outer tuple, probe
// the prebuilt hash table and return the first join result found.
static TupleTableSlot *
ExecHashJoin(HashJoinState *hjstate) {
    ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
    List *joinqual = hjstate->js.joinqual;
    List *otherqual = hjstate->js.ps.qual;
    TupleTableSlot *outerTupleSlot;

    for (;;) {
        // 1. Pull the next outer tuple.
        outerTupleSlot = ExecProcNode(outerPlanState(hjstate));
        if (TupIsNull(outerTupleSlot)) {
            return NULL;            // outer side exhausted
        }
        // 2. Make the outer tuple visible to qual evaluation.
        econtext->ecxt_outertuple = outerTupleSlot;
        // 3. Compute the outer tuple's hash value.
        if (ExecHashGetHashValue(hjstate, econtext,
                                 hjstate->hj_OuterHashKeys,
                                 false, /* outer tuple */
                                 hjstate->hj_hashvalue,
                                 hjstate->hj_CurHashValue)) {
            int hashvalue = hjstate->hj_CurHashValue;
            // 4. Locate the bucket.  BUG FIX: the original indexed with
            //    'hashvalue & nbuckets', which maps nearly every hash to
            //    bucket 0 (or nbuckets); reduce modulo the bucket count.
            //    (Real PG masks with nbuckets - 1, nbuckets being a power
            //    of two.)
            HashJoinTuple hashTuple = hjstate->hj_CurTuple =
                hjstate->hj_HashTable->buckets[hashvalue % hjstate->hj_HashTable->nbuckets];
            // 5. Walk the bucket's overflow chain.
            while (hashTuple != NULL) {
                TupleTableSlot *innerTupleSlot =
                    ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
                                          hjstate->hj_HashTupleSlot, false);
                econtext->ecxt_innertuple = innerTupleSlot;
                // 6. Hash-equality clauses ...
                if (joinqual == NIL || ExecQual(joinqual, econtext)) {
                    // 7. ... plus any residual quals.
                    if (otherqual == NIL || ExecQual(otherqual, econtext)) {
                        // Match: project and return the joined row.
                        // NOTE(review): returning without saving the chain
                        // position drops additional matches for this outer
                        // tuple; the real executor resumes from hj_CurTuple
                        // on the next call.
                        return ExecProject(hjstate->js.ps.ps_ProjInfo);
                    }
                }
                hashTuple = hashTuple->next;
                hjstate->hj_CurTuple = hashTuple;
            }
        }
        // No match for this outer tuple: loop for the next one.
        hjstate->hj_CurTuple = NULL;
    }
}
2. 并行查询执行算法
// src/backend/executor/nodeGather.c
// Gather node: the parallel-query coordinator.  Collects tuples produced by
// parallel workers (and optionally by the leader itself) one at a time.
static TupleTableSlot *
ExecGather(GatherState *node) {
TupleTableSlot *slot;
EState *estate = node->ps.state;
// Optionally let the leader run the subplan while workers are busy.
if (node->need_to_scan_locally) {
// Leader participation: execute the outer plan locally.
slot = ExecProcNode(outerPlanState(node));
if (!TupIsNull(slot))
return slot;
node->need_to_scan_locally = false;
}
// Drain the workers' shared tuple queues.
while (true) {
int nworkers = node->nworkers_launched;
int i;
// Round-robin poll of every launched worker.
for (i = 0; i < nworkers; i++) {
ParallelWorkerInfo *w = &node->pei->parallel_workers[i];
if (w->tqueue) {
// Non-blocking read from the worker's shared-memory queue.
slot = TupleQueueReaderNext(w->tqueue, true, &w->done);
if (slot != NULL)
return slot;
if (w->done) {
// Worker finished: tear down its queue reader.
DestroyTupleQueueReader(w->tqueue);
w->tqueue = NULL;
}
}
}
// Stop once every worker has reported completion.
bool all_done = true;
for (i = 0; i < nworkers; i++) {
if (!node->pei->parallel_workers[i].done) {
all_done = false;
break;
}
}
if (all_done)
break;
// Sleep until a worker signals more data (10 ms timeout).
// NOTE(review): 'es_plannedstmt->queryLatch' is not a field of the real
// PlannedStmt; the real executor waits on MyLatch -- verify.
WaitLatch(&node->ps.state->es_plannedstmt->queryLatch,
WL_LATCH_SET | WL_TIMEOUT, 10);
ResetLatch(&node->ps.state->es_plannedstmt->queryLatch);
}
return NULL;
}
// Parallel sequential scan: each worker repeatedly claims the next unscanned
// block from shared state and scans it, so the relation is partitioned
// dynamically among the cooperating processes.
static TupleTableSlot *
ExecParallelSeqScan(ParallelSeqScanState *node) {
    ParallelTableScanDesc pscan = node->ss.ss_currentScanDesc;
    HeapTuple tuple;

    // One-time setup of the shared parallel-scan state.
    if (!node->initialized) {
        table_parallelscan_initialize(node->ss.ss_currentRelation, pscan);
        node->initialized = true;
    }

    // Loop over blocks claimed from the shared scan state.  BUG FIX: the
    // original tail-recursed into itself for every tuple-less block; C does
    // not guarantee tail-call elimination, so a long run of empty blocks
    // could exhaust the stack.  A loop is equivalent and safe.
    for (;;) {
        // Claim the next block from the shared progress counter.
        BlockNumber blockno = table_parallelscan_nextpage(pscan);
        if (blockno == InvalidBlockNumber) {
            // No blocks left anywhere: this worker's scan is done.
            ExecClearTuple(node->ss.ss_ScanTupleSlot);
            return NULL;
        }

        // Read the claimed block.
        Buffer buffer = ReadBufferExtended(node->ss.ss_currentRelation,
                                           MAIN_FORKNUM, blockno,
                                           RBM_NORMAL, NULL);
        // Scan for a returnable tuple within the block.
        tuple = heapam_scan_getnexttuple(&node->ss, ForwardScanDirection);
        if (tuple != NULL) {
            // Store the tuple and hand it upward.
            ExecStoreBufferHeapTuple(tuple, node->ss.ss_ScanTupleSlot, buffer);
            ReleaseBuffer(buffer);
            return node->ss.ss_ScanTupleSlot;
        }
        ReleaseBuffer(buffer);      // empty block: try the next one
    }
}
五、WAL(预写日志)算法
1. LSN(Log Sequence Number)管理
// src/include/access/xlogdefs.h
typedef uint64 XLogRecPtr;
// WAL record header.
typedef struct XLogRecord {
uint32 xl_tot_len; // total record length, header included
TransactionId xl_xid; // XID of the transaction that emitted the record
XLogRecPtr xl_prev; // LSN of the previous record in the WAL stream
uint8 xl_info; // resource-manager-specific flag bits
RmgrId xl_rmid; // resource manager that owns this record
pg_crc32c xl_crc; // CRC over the whole record
XLogRecData *xl_blocks[FLEXIBLE_ARRAY_MEMBER]; // NOTE(review): the real XLogRecord has no such member; block references follow the header on disk -- verify
} XLogRecord;
// Insert a WAL record: reserve space, copy the record into the WAL buffers,
// and return the end LSN of the inserted record.
XLogRecPtr
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) {
XLogRecPtr StartPos;
XLogRecPtr EndPos;
// Critical section: any error before END_CRIT_SECTION is promoted to
// PANIC, since partially inserted WAL would be unrecoverable.
START_CRIT_SECTION();
do {
// Claim the current insert position.
StartPos = GetXLogInsertRecPtr();
// Copy the record payload into the in-memory WAL buffers.
CopyXLogRecordToWALBuffer(rmid, info, StartPos, rdata);
// Advance the insert pointer past this record.
EndPos = StartPos + rdata->len;
AdvanceXLogInsertRecPtr(EndPos);
} while (LWLockConditionalAcquire(WALBufMappingLock, LW_EXCLUSIVE) == 0);
// NOTE(review): retrying the copy whenever a conditional lock acquire
// fails is not how the real XLogInsert works (it reserves space atomically
// under WALInsertLocks); treat this as illustrative pseudo-code.
// Kick off a background flush if enough WAL has accumulated.
if (XLogCheckpointNeeded(EndPos)) {
XLogBackgroundFlush();
}
LWLockRelease(WALBufMappingLock);
END_CRIT_SECTION();
return EndPos;
}
2. WAL写入与刷盘算法
// src/backend/access/transam/xlog.c
// WAL flush: ensure WAL is durably on disk up to (at least) the given LSN
// before returning; called at commit time to guarantee durability.
void XLogFlush(XLogRecPtr record) {
XLogRecPtr WriteRqstPtr = record;
XLogwrtRqst WriteRqst;
static XLogRecPtr lastflush = 0;
// 1. Publish our write/flush request, unless already satisfied.
SpinLockAcquire(&XLogCtl->info_lck);
if (WriteRqstPtr <= XLogCtl->LogwrtResult.Write) {
// Already written out: nothing to do.
SpinLockRelease(&XLogCtl->info_lck);
return;
}
XLogCtl->LogwrtRqst.Write = WriteRqstPtr;
XLogCtl->LogwrtRqst.Flush = WriteRqstPtr;
// 2. Wake the WAL writer so it services the request.
// NOTE(review): CreateEvent is a Win32 API; real PostgreSQL uses latches
// on all platforms -- treat this branch as illustrative.
if (XLogCtl->LogwrtResult.Write < WriteRqstPtr) {
XLogCtl->Write.waiting = true;
XLogCtl->Write.event = CreateEvent(NULL, FALSE, FALSE, NULL);
SpinLockRelease(&XLogCtl->info_lck);
SetLatch(&XLogCtl->Write.latch);
} else {
SpinLockRelease(&XLogCtl->info_lck);
}
// 3. Block until the flush pointer passes our request.
if (lastflush < WriteRqstPtr) {
int wakeEvents = WL_LATCH_SET | WL_TIMEOUT;
long timeout = 10000; // wait timeout in milliseconds (10 s)
while (XLogCtl->LogwrtResult.Flush < WriteRqstPtr) {
WaitLatch(&MyProc->procLatch, wakeEvents, timeout);
ResetLatch(&MyProc->procLatch);
}
lastflush = WriteRqstPtr;
}
}
// 组提交优化
void XLogGroupCommit(void) {
static XLogRecPtr last_group_commit = 0;
static int group_count = 0;
XLogRecPtr current_recptr = GetXLogInsertRecPtr();
// 1. 检查是否满足组提交条件
if (current_recptr - last_group_commit < GROUP_COMMIT_BYTES &&
group_count < MAX_GROUP_COMMIT_COUNT) {
group_count++;
return; // 等待更多事务
}
// 2. 执行组提交
XLogFlush(current_recptr);
// 3. 更新统计
last_group_commit = current_recptr;
group_count = 0;
}
六、VACUUM算法
1. 懒惰VACUUM算法
// src/backend/commands/vacuumlazy.c
// Lazy (concurrent) VACUUM of one heap relation: collect dead tuples, clean
// the indexes, reclaim heap space, and refresh statistics -- all without an
// exclusive table lock.
void lazy_vacuum_rel(Relation onerel, VacuumParams *params) {
LVRelStats *vacrelstats;
BlockNumber relblocks;
bool scanned_all_unused = false;
// 1. Set up per-relation working state.
// NOTE(review): 'relblocks' is computed but never used below.
vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
relblocks = RelationGetNumberOfBlocks(onerel);
// 2. First heap pass: remember the TIDs of dead tuples.
lazy_scan_heap(onerel, params, vacrelstats, &scanned_all_unused);
// 3. Remove index entries pointing at the dead tuples.
lazy_vacuum_indexes(onerel, params, vacrelstats);
// 4. Second heap pass: mark the dead line pointers reusable.
if (vacrelstats->num_dead_tuples > 0) {
lazy_vacuum_heap(onerel, vacrelstats);
}
// 5. Truncate wholly-empty pages at the end of the relation.
if (scanned_all_unused) {
lazy_truncate_heap(onerel, vacrelstats);
}
// 6. Refresh the free-space map and pg_class statistics.
lazy_update_fsm(onerel, vacrelstats);
vacuum_update_relstats(onerel, vacrelstats);
}
// First VACUUM pass: scan every heap page, freeze sufficiently old tuples,
// and record the TIDs of dead tuples for the later cleanup passes.
static void lazy_scan_heap(Relation onerel, VacuumParams *params,
LVRelStats *vacrelstats, bool *scanned_all_unused) {
BlockNumber nblocks = vacrelstats->rel_pages;
Buffer buf;
Page page;
// Dead-TID array sized for the worst case of every slot on every page.
// NOTE(review): this allocation grows linearly with relation size and can
// be enormous; the real code caps it via maintenance_work_mem and runs
// multiple index-cleanup rounds instead -- verify before reuse.
vacrelstats->dead_tuples = (ItemPointer) palloc(
MAX_TUPLES_PER_PAGE * nblocks * sizeof(ItemPointerData));
vacrelstats->num_dead_tuples = 0;
// Visit every block in physical order.
for (BlockNumber blkno = 0; blkno < nblocks; blkno++) {
// Read and exclusively lock the page.
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
// Nothing to do on brand-new or empty pages.
if (PageIsNew(page) || PageIsEmpty(page)) {
UnlockReleaseBuffer(buf);
continue;
}
// Examine every line pointer on the page.
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
for (OffsetNumber offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum)) {
ItemId itemid = PageGetItemId(page, offnum);
if (ItemIdIsUsed(itemid)) {
HeapTupleData tuple;
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple.t_len = ItemIdGetLength(itemid);
ItemPointerSet(&(tuple.t_self), blkno, offnum);
// Freeze tuples whose xmin is old enough, so XID wraparound
// cannot make them appear to be from the future.
if (heap_tuple_needs_freeze(&tuple,
GetCurrentTransactionIdIfAny())) {
// Rewrite xmin as FrozenTransactionId.
heap_freeze_tuple(&tuple, GetCurrentTransactionIdIfAny());
} else if (heap_tuple_is_dead(&tuple,
GetActiveSnapshot())) {
// Dead to every transaction: remember its TID for cleanup.
vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = tuple.t_self;
vacrelstats->num_dead_tuples++;
}
}
}
UnlockReleaseBuffer(buf);
}
}
七、HOT(Heap-Only Tuple)优化算法
// src/backend/access/heap/README.HOT
// HOT update decision: an UPDATE may be done as a HOT (Heap-Only Tuple)
// update only if the new version lands on the same page, no indexed column
// changed value, and the page has room for the new version.
bool HeapTupleSatisfiesHOT(HeapTupleHeader oldtuple,
                           HeapTupleHeader newtuple,
                           Buffer oldbuf) {
    // 1. New version must land on the same page as the old one.
    if (BufferGetBlockNumber(oldbuf) !=
        ItemPointerGetBlockNumber(&newtuple->t_ctid))
        return false;

    // 2. No indexed column may have changed value.
    Relation rel = RelationIdGetRelation(oldtuple->t_tableOid);
    TupleDesc tupdesc = RelationGetDescr(rel);
    bool hot_ok = true;

    for (int i = 0; hot_ok && i < tupdesc->natts; i++) {
        Form_pg_attribute attr = TupleDescAttr(tupdesc, i);
        if (attr->attnum < 0)
            continue;               // skip system columns
        // Only columns covered by some index matter for HOT eligibility.
        if (relation_has_index_on_att(rel, attr->attnum)) {
            Datum oldval, newval;
            bool oldisnull, newisnull;
            oldval = heap_getattr(oldtuple, attr->attnum, tupdesc, &oldisnull);
            newval = heap_getattr(newtuple, attr->attnum, tupdesc, &newisnull);
            // Any change in an indexed column forbids HOT.
            if (oldisnull != newisnull ||
                (!oldisnull && !newisnull &&
                 !datumIsEqual(oldval, newval, attr->attbyval, attr->attlen)))
                hot_ok = false;
        }
    }

    // BUG FIX: the original returned from inside the loop without releasing
    // the relcache reference obtained above, leaking it on every call.
    RelationClose(rel);
    if (!hot_ok)
        return false;

    // 3. The page must have room for the new version.
    Size newsize = MAXALIGN(newtuple->t_len);
    Page page = BufferGetPage(oldbuf);
    if (PageGetFreeSpace(page) < newsize)
        return false;

    return true;
}
// HOT-chain maintenance: opportunistically prune dead members of HOT update
// chains on one page so line pointers and space can be reused without a
// full VACUUM.
void heap_page_prune_opt(Relation relation, Buffer buffer,
TransactionId OldestXmin) {
Page page = BufferGetPage(buffer);
OffsetNumber offnum, maxoff;
bool changed = false;
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum)) {
ItemId lp = PageGetItemId(page, offnum);
HeapTupleHeader tuple;
// Only normal (in-use) line pointers can start a prunable chain.
if (!ItemIdIsNormal(lp))
continue;
tuple = (HeapTupleHeader) PageGetItem(page, lp);
// Consider only heap-only tuples (members of a HOT chain).
if (HeapTupleHeaderIsHeapOnly(tuple)) {
// Walk and prune the chain rooted here; members dead to every
// transaction older than OldestXmin can be reclaimed.
if (heap_prune_chain(relation, buffer, offnum,
OldestXmin, &changed)) {
// Chain was pruned.
}
}
}
if (changed) {
// Page contents changed: mark the buffer dirty so it gets written.
// NOTE(review): the real code also emits a WAL record here; this
// simplified version only dirties the buffer.
MarkBufferDirty(buffer);
}
}
八、算法复杂度分析
| 算法 | 平均复杂度 | 最坏情况 | 空间复杂度 | 适用场景 |
|---|---|---|---|---|
| B树查找 | O(logₘn) | O(logₘn) | O(n) | 通用索引 |
| 哈希连接 | O(n+m) | O(n*m) | O(min(n,m)) | 等值连接 |
| 归并连接 | O(n log n + m log m) | O(n log n + m log m) | O(n+m) | 已排序数据 |
| 动态规划连接 | O(3ⁿ) | O(3ⁿ) | O(2ⁿ) | 小规模连接 |
| 遗传算法连接 | O(g·p·s) | O(g·p·s) | O(p) | 大规模连接 |
| 排序 | O(n log n) | O(n²) | O(n) | 内存排序 |
| 外部排序 | O(n logₘn) | O(n logₘn) | O(B) | 大数据排序 |
| MVCC可见性判断 | O(1) | O(k) | O(t) | 事务隔离 |
其中:
- n, m: 关系基数
- g: 遗传代数
- p: 种群大小
- s: 个体评估代价
- B: 缓冲区大小
- t: 事务数量
- k: 活跃事务数
九、高级算法实现
1. GiST(通用搜索树)算法
// src/backend/access/gist/gist.c
// GiST search: best-first traversal driven by a queue of index pages and
// heap items; delivers the next matching heap TID via the scan descriptor.
void gistgettuple(IndexScanDesc scan, ScanDirection dir) {
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
GISTSearchItem *item;
for (;;) {
// 1. Pop the next item to process.
if (so->queue == NULL || so->queue->head == NULL) {
// Queue exhausted: the search is complete.
ItemPointerSetInvalid(&scan->xs_ctup.t_self);
scan->xs_recheck = false;
return;
}
item = (GISTSearchItem *) linitial(so->queue->head);
so->queue->head = list_delete_first(so->queue->head);
// 2. Heap items are results; index pages are expanded further.
if (GISTSearchItemIsHeap(*item)) {
// Leaf entry: hand its heap TID to the caller.
scan->xs_ctup.t_self = item->data.heap.heapPtr;
if (scan->numberOfOrderBys > 0) {
// Ordered (KNN) scans may need the distance rechecked against
// the actual heap tuple.
scan->xs_recheck = true;
}
pfree(item);
return;
} else {
// Internal page: push its matching children onto the queue.
gistScanPage(scan, item, so->qual, so->orderTypes, dir);
pfree(item);
}
}
}
// GiST consistency test: ask the operator class whether an index entry's
// key could contain matches for the query under the given strategy.
// NOTE(review): 'opclass' and 'index' are not declared anywhere in this
// snippet, and gistentryinit is handed a block id where a key Datum is
// expected; treat this as illustrative pseudo-code, not compilable source.
bool gistindex_keytest(IndexTuple itup,
Datum *query,
StrategyNumber strategy) {
// Delegate to the opclass's 'consistent' support function.
GISTENTRY entry;
gistentryinit(entry, itup->t_tid.ip_blkid, NULL, NULL, 0, false);
return DatumGetBool(FunctionCall2Coll(
opclass->consistentFn,
index->rd_indcollation[0],
PointerGetDatum(&entry),
PointerGetDatum(query)));
}
2. BRIN(块范围索引)算法
// src/backend/access/brin/brin.c
// BRIN bitmap scan: for every block range whose summary may match the scan
// quals, add all of the range's heap pages to the TID bitmap (lossily).
bool bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) {
    BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
    Relation index = scan->indexRelation;
    Buffer metabuf;
    Page metapage;
    BrinMetaPageData *metadata;
    BlockNumber pagesPerRange;

    // Read the metapage once, up front.  PERF FIX: the original re-read and
    // re-locked the metapage on every loop iteration; pagesPerRange is fixed
    // for the index's lifetime, so one read suffices.
    metabuf = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
    LockBuffer(metabuf, BUFFER_LOCK_SHARE);
    metapage = BufferGetPage(metabuf);
    metadata = (BrinMetaPageData *) PageGetContents(metapage);
    pagesPerRange = metadata->pagesPerRange;
    UnlockReleaseBuffer(metabuf);

    // Walk every block range of the indexed relation.
    for (BlockNumber blk = 0; blk < opaque->nblocks; blk++) {
        // Heap-page interval summarized by this range.
        BlockNumber start = blk * pagesPerRange;
        BlockNumber end = (blk + 1) * pagesPerRange - 1;

        // If the stored summary is consistent with the quals, every page of
        // the range must be visited by the heap scan.
        if (brin_check_range(scan, blk, start, end)) {
            for (BlockNumber b = start; b <= end; b++)
                tbm_add_page(tbm, b);
        }
    }
    return true;
}
// BRIN range check: decide whether the stored summary for one block range
// is consistent with the scan's quals.  All summarized columns must pass.
static bool brin_check_range(IndexScanDesc scan, BlockNumber blkno,
                             BlockNumber start, BlockNumber end) {
    BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
    BrinDesc *brdesc = opaque->bd;
    Buffer buf;
    Page page;
    BrinTuple *tup;
    bool matches = true;

    // Read the page holding this range's summary tuple.
    buf = ReadBuffer(scan->indexRelation, blkno);
    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    tup = (BrinTuple *) PageGetItem(page,
                                    PageGetItemId(page, BRIN_TUPLE_ITEMNO));

    // Every stored column summary must satisfy its qual.
    for (int i = 0; i < brdesc->bd_totalstored; i++) {
        BrinValues *col = &tup->bt_columns[i];
        if (!brin_match_qual(scan, i, col)) {
            matches = false;
            break;
        }
    }

    // BUG FIX: the original returned false from inside the loop while still
    // holding the buffer lock and pin, leaking both on every non-matching
    // range; always release before returning.
    UnlockReleaseBuffer(buf);
    return matches;
}
PostgreSQL 的算法设计体现了数据库理论的深度实践,其 MVCC 实现、优化器设计和可扩展的索引框架是其核心技术优势。这些算法经过多年的工业验证,提供了高可靠性和高性能的数据管理能力。