postgresql核心算法详细解析

19 阅读15分钟

我将深入解析 PostgreSQL 内核中的核心算法,涵盖其独特的架构设计和关键技术实现。PostgreSQL 的算法设计体现了学术严谨性和工程实用性的结合。

一、存储结构与数据组织算法

1. 堆表(Heap)存储算法

页面布局与元组结构

// src/include/storage/bufpage.h
// 页面头部结构
typedef struct PageHeaderData {
    PageXLogRecPtr pd_lsn;         // 最后修改的LSN
    uint16      pd_checksum;       // 页面校验和
    uint16      pd_flags;          // 页面标志位
    LocationIndex pd_lower;         // 空闲空间起始位置
    LocationIndex pd_upper;         // 空闲空间结束位置
    LocationIndex pd_special;       // 特殊空间起始位置
    uint16      pd_pagesize_version; // 页面大小和版本
    TransactionId pd_prune_xid;     // 可清理的最老XID
    ItemIdData  pd_linp[FLEXIBLE_ARRAY_MEMBER]; // 行指针数组
} PageHeaderData;

// 行指针(Line Pointer)
typedef struct ItemIdData {
    unsigned    lp_off:15,         // 元组偏移量
                lp_flags:2,        // 状态标志
                lp_len:15;         // 元组长度
} ItemIdData;

// 元组头部
typedef struct HeapTupleHeaderData {
    union {
        HeapTupleFields t_heap;
        DatumTupleFields t_datum;
    } t_choice;
    
    ItemPointerData t_ctid;        // 当前元组ID(块号+偏移量)
    uint16      t_infomask2;       // 属性数量+标志
    uint16      t_infomask;        // 元组状态标志
    uint8       t_hoff;            // 头部长度
    bits8       t_bits[FLEXIBLE_ARRAY_MEMBER]; // NULL位图
} HeapTupleHeaderData;

页面空间管理算法

// src/backend/storage/page/bufpage.c
// 页面空闲空间管理
void PageInit(Page page, Size pageSize, Size specialSize) {
    PageHeader  phdr = (PageHeader) page;
    
    // 初始化页面头部
    phdr->pd_lower = SizeOfPageHeaderData;
    phdr->pd_upper = pageSize - specialSize;
    phdr->pd_special = pageSize - specialSize;
    phdr->pd_pagesize_version = pageSize | PG_PAGE_LAYOUT_VERSION;
    
    // 初始化行指针数组为空
    for (int i = 0; i < (int)((phdr->pd_upper - phdr->pd_lower) / 
                               sizeof(ItemIdData)); i++) {
        phdr->pd_linp[i].lp_flags = LP_UNUSED;
    }
}

// 在页面中插入元组
OffsetNumber
PageAddItemExtended(Page page, Item item, Size size,
                    OffsetNumber offsetNumber, int flags) {
    PageHeader  phdr = (PageHeader) page;
    ItemId      lp;
    Offset      lower;
    Offset      upper;
    int         i;
    
    // 计算空闲空间
    lower = phdr->pd_lower + sizeof(ItemIdData);
    upper = phdr->pd_upper;
    
    // 检查是否有足够空间
    if (lower > upper || (Size)(upper - lower) < size)
        return InvalidOffsetNumber;
    
    // 查找空闲行指针
    for (i = FirstOffsetNumber; i <= (int)phdr->pd_lower; i++) {
        lp = PageGetItemId(phdr, i);
        if (lp->lp_flags == LP_UNUSED)
            break;
    }
    
    if (i > (int)phdr->pd_lower) {
        // 没有空闲行指针,需要扩展
        if (phdr->pd_lower >= phdr->pd_upper - (int)sizeof(ItemIdData))
            return InvalidOffsetNumber;
        phdr->pd_lower += sizeof(ItemIdData);
    }
    
    // 从页面尾部分配空间
    upper -= size;
    phdr->pd_upper = upper;
    
    // 设置行指针
    lp = PageGetItemId(phdr, i);
    lp->lp_off = upper;
    lp->lp_flags = LP_NORMAL;
    lp->lp_len = size;
    
    // 复制元组数据
    memcpy((char *) page + upper, item, size);
    
    return i;
}

2. TOAST(The Oversized-Attribute Storage Technique)算法

大字段存储算法

// src/include/access/heaptoast.h
// TOAST 存储策略
typedef enum ToastStrategy {
    TOAST_STRATEGY_PLAIN = 0,     // 不压缩,不线外存储
    TOAST_STRATEGY_EXTENDED = 1,  // 允许压缩和线外存储
    TOAST_STRATEGY_EXTERNAL = 2,  // 允许线外存储,不压缩
    TOAST_STRATEGY_MAIN = 3       // 允许压缩,尽量避免线外存储
} ToastStrategy;

// TOAST 指针结构
typedef struct varatt_external {
    int32       va_rawsize;       // 原始数据大小
    int32       va_extsize;       // 外部存储大小
    Oid         va_valueid;       // TOAST 值ID
    Oid         va_toastrelid;    // TOAST 表OID
} varatt_external;

// TOAST 压缩与存储决策算法
Datum toast_save_datum(Relation rel, Datum value,
                       struct varlena *oldexternal) {
    struct varlena *new_value = (struct varlena *) value;
    int32   rawsize = VARSIZE_ANY_EXHDR(new_value);
    int32   newsize;
    bool    need_free = false;
    bool    need_detoast = false;
    
    // 1. 检查是否超过页内存储阈值
    if (rawsize <= TOAST_TUPLE_THRESHOLD)
        return value;  // 无需TOAST处理
    
    // 2. 尝试压缩
    if (rel->rd_toastoid != InvalidOid && 
        rawsize > TOAST_COMPRESS_THRESHOLD) {
        struct varlena *tmp = toast_compress_datum(value);
        if (tmp != NULL) {
            newsize = VARSIZE(tmp) - VARHDRSZ;
            if (newsize < rawsize) {
                value = PointerGetDatum(tmp);
                rawsize = newsize;
                need_free = true;
            }
        }
    }
    
    // 3. 检查是否仍需要线外存储
    if (rawsize <= TOAST_TUPLE_THRESHOLD) {
        if (need_free)
            pfree(DatumGetPointer(value));
        return value;
    }
    
    // 4. 执行线外存储
    return toast_insert_or_update(rel, value, oldexternal, 
                                  rel->rd_toastoid, need_free);
}

// TOAST 压缩算法(LZ4或pglz)
static struct varlena *
toast_compress_datum(Datum value) {
    struct varlena *tmp = NULL;
    int32   rawsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
    
    if (ToastCompressionId == TOAST_LZ4_COMPRESSION_ID) {
        // 使用LZ4压缩
        tmp = palloc(rawsize);
        int compressed_size = LZ4_compress_default(
            VARDATA_ANY(DatumGetPointer(value)),
            VARDATA(tmp), rawsize, rawsize - 1);
        
        if (compressed_size > 0 && compressed_size < rawsize) {
            SET_VARSIZE_COMPRESSED(tmp, compressed_size + VARHDRSZ);
            return tmp;
        }
    } else if (ToastCompressionId == TOAST_PGLZ_COMPRESSION_ID) {
        // 使用pglz压缩
        tmp = palloc(rawsize);
        int32 compressed_size = pglz_compress(
            VARDATA_ANY(DatumGetPointer(value)), rawsize,
            VARDATA(tmp), NULL);
        
        if (compressed_size < rawsize) {
            SET_VARSIZE_COMPRESSED(tmp, compressed_size + VARHDRSZ);
            return tmp;
        }
    }
    
    if (tmp)
        pfree(tmp);
    return NULL;
}

二、MVCC(多版本并发控制)算法

1. 快照(Snapshot)算法

// src/include/utils/snapshot.h
// 快照数据结构
typedef struct SnapshotData {
    SnapshotSatisfiesFunc satisfies;  // 可见性判断函数
    
    TransactionId xmin;               // 所有XID < xmin 可见
    TransactionId xmax;               // 所有XID >= xmax 不可见
    TransactionId *xip;               // 进行中的XID数组
    uint32        xcnt;               // 进行中XID数量
    TransactionId *subxip;            // 进行中的子事务XID
    uint32        subxcnt;            // 进行中子事务数量
    
    uint32        takenDuringRecovery; // 是否在恢复中获取
    bool          copied;              // 是否是拷贝的
    CommandId     curcid;              // 当前命令ID
} SnapshotData;

// 快照获取算法
Snapshot GetTransactionSnapshot(void) {
    if (!FirstSnapshotSet) {
        // 第一次获取快照
        if (IsolationUsesXactSnapshot()) {
            // 可重复读或可序列化隔离级别
            return GetSerializableTransactionSnapshot();
        } else {
            // 读已提交隔离级别
            return GetSnapshotData(&CurrentSnapshotData);
        }
    }
    
    if (IsolationUsesXactSnapshot())
        return CurrentSnapshot;
    else
        return GetSnapshotData(&CurrentSnapshotData);
}

// 构建快照
Snapshot GetSnapshotData(Snapshot snapshot) {
    ProcArrayStruct *arrayP = procArray;
    TransactionId xmin;
    TransactionId xmax;
    TransactionId globalxmin;
    int           count = 0;
    int           subcount = 0;
    
    // 获取最老的运行中XID
    xmin = TransactionXmin = GetOldestXmin(NULL, false);
    
    // 获取下一个要分配的XID
    xmax = ShmemVariableCache->nextXid;
    
    // 遍历所有后端进程,收集运行中的XID
    for (int index = 0; index < arrayP->numProcs; index++) {
        int         pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC *proc = &allProcs[pgprocno];
        volatile PGXACT *pgxact = &allPgXact[pgprocno];
        
        // 忽略非运行中的后端
        if (proc->pid == 0)
            continue;
        
        TransactionId xid = pgxact->xid;
        
        if (TransactionIdIsNormal(xid) &&
            NormalTransactionIdPrecedes(xid, xmax)) {
            // 添加到运行中XID列表
            if (xid < xmin) {
                xmin = xid;  // 更新xmin
            }
            snapshot->xip[count++] = xid;
        }
    }
    
    snapshot->xmin = xmin;
    snapshot->xmax = xmax;
    snapshot->xcnt = count;
    
    return snapshot;
}

2. 可见性判断算法

// src/backend/utils/time/combocid.c
// HeapTupleSatisfiesXXX 系列函数
bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                            Buffer buffer) {
    HeapTupleHeader tuple = htup->t_data;
    TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
    TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
    
    // 规则1:由未来事务插入的元组不可见
    if (TransactionIdIsCurrentTransactionId(xmin)) {
        // 由当前事务插入
        if (tuple->t_infomask & HEAP_XMAX_INVALID)
            return true;  // 尚未删除
        
        if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
            return true;  // 仅被锁定,未被删除
        
        if (!TransactionIdIsCurrentTransactionId(xmax))
            return true;  // 被其他事务删除
        
        return false;  // 被当前事务删除
    }
    
    // 规则2:检查插入事务的提交状态
    if (TransactionIdIsInProgress(xmin))
        return false;  // 插入事务仍在进行中
    
    if (TransactionIdDidCommit(xmin)) {
        // 插入事务已提交
        if (tuple->t_infomask & HEAP_XMAX_INVALID)
            return true;  // 尚未删除
        
        // 检查删除事务
        if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
            return true;  // 仅被锁定,未被删除
        
        if (TransactionIdIsCurrentTransactionId(xmax))
            return false;  // 被当前事务删除
        
        if (TransactionIdIsInProgress(xmax))
            return true;  // 删除事务仍在进行中
        
        if (TransactionIdDidCommit(xmax))
            return false;  // 删除事务已提交
        else
            return true;   // 删除事务已回滚
    } else {
        // 插入事务已回滚
        return false;
    }
}

// 针对不同快照的可见性判断
bool HeapTupleSatisfiesSnapshot(HeapTuple htup, Snapshot snapshot,
                                Buffer buffer) {
    // 根据快照类型调用相应的函数
    switch (snapshot->satisfies) {
        case SNAPSHOT_MVCC:
            return HeapTupleSatisfiesMVCC(htup, snapshot, buffer);
        case SNAPSHOT_SELF:
            return HeapTupleSatisfiesSelf(htup, snapshot, buffer);
        case SNAPSHOT_ANY:
            return HeapTupleSatisfiesAny(htup, snapshot, buffer);
        case SNAPSHOT_TOAST:
            return HeapTupleSatisfiesToast(htup, snapshot, buffer);
        case SNAPSHOT_DIRTY:
            return HeapTupleSatisfiesDirty(htup, snapshot, buffer);
        case SNAPSHOT_HISTORIC_MVCC:
            return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer);
        default:
            elog(ERROR, "unrecognized snapshot type: %d", 
                 snapshot->satisfies);
            return false;
    }
}

三、查询优化算法

1. 动态规划连接顺序优化算法

// src/backend/optimizer/path/joinpath.c
// 标准连接搜索算法
static void standard_join_search(PlannerInfo *root, int levels_needed) {
    List      **joinrels;
    int         lev;
    
    // 初始化动态规划表
    joinrels = (List **) palloc0((levels_needed + 1) * sizeof(List *));
    
    // 第1层:单个关系
    for (lev = 1; lev <= levels_needed; lev++) {
        joinrels[lev] = NIL;
    }
    joinrels[1] = initial_rels(root);
    
    // 动态规划主循环
    for (lev = 2; lev <= levels_needed; lev++) {
        ListCell   *lc;
        
        // 考虑所有划分 lev = k + (lev - k)
        for (int k = 1; k < lev; k++) {
            ListCell   *lcr;
            ListCell   *lcs;
            
            // 遍历左子树所有可能的连接关系
            foreach(lcr, joinrels[k]) {
                RelOptInfo *outer_rel = (RelOptInfo *) lfirst(lcr);
                
                // 遍历右子树所有可能的连接关系
                foreach(lcs, joinrels[lev - k]) {
                    RelOptInfo *inner_rel = (RelOptInfo *) lfirst(lcs);
                    
                    // 检查连接兼容性
                    if (!bms_overlap(outer_rel->relids, inner_rel->relids)) {
                        // 尝试不同连接类型
                        try_hashjoin_path(root, outer_rel, inner_rel, 
                                         JOIN_INNER, NIL);
                        try_mergejoin_path(root, outer_rel, inner_rel,
                                          JOIN_INNER, NIL);
                        try_nestloop_path(root, outer_rel, inner_rel,
                                         JOIN_INNER, NIL);
                    }
                }
            }
        }
        
        // 去重和剪枝
        joinrels[lev] = remove_useless_joinrels(joinrels[lev]);
    }
}

2. 遗传算法优化连接顺序

// src/backend/optimizer/geqo/geqo_main.c
// 遗传算法主函数
RelOptInfo *geqo(PlannerInfo *root, int number_of_rels, 
                 List *initial_rels) {
    GeqoPrivateData private;
    int         generation;
    Chromosome *momma;
    Chromosome *daddy;
    Chromosome *kid;
    Pool       *pool;
    int         pool_size, number_generations;
    
    // 1. 初始化参数
    geqo_params(&private, root, number_of_rels);
    pool_size = private.pool_size;
    number_generations = private.generations;
    
    // 2. 初始化种群
    pool = alloc_pool(pool_size);
    random_init_pool(root, pool, initial_rels);
    
    // 3. 评估初始种群适应度
    sort_pool(root, pool);
    
    // 4. 进化循环
    for (generation = 0; generation < number_generations; generation++) {
        // 选择父代
        momma = selection(root, pool, SELECTION_BIAS);
        daddy = selection(root, pool, SELECTION_BIAS);
        
        // 交叉产生子代
        kid = crossover(root, momma, daddy, private.cx_method);
        
        // 变异
        if (geqo_rand() < private.mutation_rate)
            mutation(root, kid, private.mut_method);
        
        // 评估子代适应度
        kid->worth = evaluate_join_order(root, kid->string, initial_rels);
        
        // 替换:用子代替换最差个体
        spread_chromo(root, kid, pool);
        
        // 重新排序种群
        sort_pool(root, pool);
    }
    
    // 5. 返回最佳个体
    return gimme_tree(root, pool->data[0].string, initial_rels);
}

// 顺序交叉(Order Crossover)
Chromosome *order_crossover(PlannerInfo *root, 
                            Chromosome *momma, 
                            Chromosome *daddy) {
    Chromosome *kid = makeNode(Chromosome);
    int         number_of_rels = momma->string_length;
    int         left, right, temp;
    Gene       *mom = momma->string;
    Gene       *dad = daddy->string;
    Gene       *child = kid->string;
    bool       *used;
    
    used = (bool *) palloc0((number_of_rels + 1) * sizeof(bool));
    
    // 随机选择交叉点
    left = geqo_randint(number_of_rels - 1, 0);
    right = geqo_randint(number_of_rels - 1, 0);
    
    if (left > right) {
        temp = left;
        left = right;
        right = temp;
    }
    
    // 1. 从妈妈复制中间段
    for (int i = left; i <= right; i++) {
        child[i] = mom[i];
        used[mom[i]] = true;
    }
    
    // 2. 从爸爸复制剩余基因(保持顺序)
    int pos = 0;
    for (int i = 0; i < number_of_rels; i++) {
        if (pos == left)
            pos = right + 1;
        
        if (!used[dad[i]]) {
            child[pos] = dad[i];
            pos++;
        }
    }
    
    pfree(used);
    kid->string_length = number_of_rels;
    return kid;
}

四、执行引擎算法

1. 火山模型(Volcano)执行算法

// src/backend/executor/execProcnode.c
// 执行节点迭代器接口
TupleTableSlot *ExecProcNode(PlanState *node) {
    if (node->chgParam != NULL)  // 参数变化,需要重新扫描
        ExecReScan(node);
    
    // 调用节点特定的执行函数
    return node->ExecProcNode(node);
}

// 顺序扫描执行算法
static TupleTableSlot *
ExecSeqScan(SeqScanState *node) {
    TableScanDesc scandesc = node->ss.ss_currentScanDesc;
    EState      *estate = node->ss.ps.state;
    ScanDirection direction = estate->es_direction;
    HeapTuple   tuple;
    
    // 获取下一个元组
    tuple = heap_getnext(scandesc, direction);
    
    if (tuple == NULL) {
        // 扫描结束
        ExecClearTuple(node->ss.ss_ScanTupleSlot);
        return NULL;
    }
    
    // 存储元组到槽中
    ExecStoreBufferHeapTuple(tuple, node->ss.ss_ScanTupleSlot,
                             scandesc->rs_cbuf);
    
    return node->ss.ss_ScanTupleSlot;
}

// 哈希连接执行算法
static TupleTableSlot *
ExecHashJoin(HashJoinState *hjstate) {
    ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
    List       *joinqual = hjstate->js.joinqual;
    List       *otherqual = hjstate->js.ps.qual;
    TupleTableSlot *outerTupleSlot;
    
    for (;;) {
        // 1. 获取外元组
        outerTupleSlot = ExecProcNode(outerPlanState(hjstate));
        if (TupIsNull(outerTupleSlot)) {
            // 外表结束
            return NULL;
        }
        
        // 2. 设置表达式上下文
        econtext->ecxt_outertuple = outerTupleSlot;
        
        // 3. 哈希探测
        if (ExecHashGetHashValue(hjstate, econtext,
                                 hjstate->hj_OuterHashKeys,
                                 false,  /* outer tuple */
                                 hjstate->hj_hashvalue,
                                 hjstate->hj_CurHashValue)) {
            int     hashvalue = hjstate->hj_CurHashValue;
            
            // 4. 查找哈希桶
            HashJoinTuple hashTuple = hjstate->hj_CurTuple = 
                hjstate->hj_HashTable->buckets[hashvalue & hjstate->hj_HashTable->nbuckets];
            
            // 5. 遍历哈希链
            while (hashTuple != NULL) {
                // 获取内元组
                TupleTableSlot *innerTupleSlot = 
                    ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
                                          hjstate->hj_HashTupleSlot, false);
                
                econtext->ecxt_innertuple = innerTupleSlot;
                
                // 6. 检查连接条件
                if (joinqual == NIL || ExecQual(joinqual, econtext)) {
                    // 7. 检查其他条件
                    if (otherqual == NIL || ExecQual(otherqual, econtext)) {
                        // 找到匹配,返回连接结果
                        return ExecProject(hjstate->js.ps.ps_ProjInfo);
                    }
                }
                
                hashTuple = hashTuple->next;
                hjstate->hj_CurTuple = hashTuple;
            }
        }
        
        // 没有匹配,继续下一个外元组
        hjstate->hj_CurTuple = NULL;
    }
}

2. 并行查询执行算法

// src/backend/executor/nodeGather.c
// Gather节点 - 并行查询协调器
static TupleTableSlot *
ExecGather(GatherState *node) {
    TupleTableSlot *slot;
    EState      *estate = node->ps.state;
    
    // 检查是否所有工作进程都已完成
    if (node->need_to_scan_locally) {
        // 本地执行
        slot = ExecProcNode(outerPlanState(node));
        if (!TupIsNull(slot))
            return slot;
        node->need_to_scan_locally = false;
    }
    
    // 从工作进程收集结果
    while (true) {
        int         nworkers = node->nworkers_launched;
        int         i;
        
        // 轮询工作进程
        for (i = 0; i < nworkers; i++) {
            ParallelWorkerInfo *w = &node->pei->parallel_workers[i];
            
            if (w->tqueue) {
                // 从共享队列读取
                slot = TupleQueueReaderNext(w->tqueue, true, &w->done);
                if (slot != NULL)
                    return slot;
                
                if (w->done) {
                    // 工作进程已完成
                    DestroyTupleQueueReader(w->tqueue);
                    w->tqueue = NULL;
                }
            }
        }
        
        // 检查所有工作进程是否都已完成
        bool all_done = true;
        for (i = 0; i < nworkers; i++) {
            if (!node->pei->parallel_workers[i].done) {
                all_done = false;
                break;
            }
        }
        
        if (all_done)
            break;
        
        // 等待更多结果
        WaitLatch(&node->ps.state->es_plannedstmt->queryLatch,
                  WL_LATCH_SET | WL_TIMEOUT, 10);
        ResetLatch(&node->ps.state->es_plannedstmt->queryLatch);
    }
    
    return NULL;
}

// 并行顺序扫描
static TupleTableSlot *
ExecParallelSeqScan(ParallelSeqScanState *node) {
    ParallelTableScanDesc pscan = node->ss.ss_currentScanDesc;
    HeapTuple   tuple;
    
    // 获取下一个并行扫描的块
    if (!node->initialized) {
        // 初始化并行扫描
        table_parallelscan_initialize(node->ss.ss_currentRelation, pscan);
        node->initialized = true;
    }
    
    // 从共享进度中获取下一个块
    BlockNumber blockno = table_parallelscan_nextpage(pscan);
    
    if (blockno == InvalidBlockNumber) {
        // 扫描结束
        ExecClearTuple(node->ss.ss_ScanTupleSlot);
        return NULL;
    }
    
    // 读取块
    Buffer buffer = ReadBufferExtended(node->ss.ss_currentRelation,
                                       MAIN_FORKNUM, blockno,
                                       RBM_NORMAL, NULL);
    
    // 扫描块中的元组
    tuple = heapam_scan_getnexttuple(&node->ss, ForwardScanDirection);
    
    if (tuple == NULL) {
        ReleaseBuffer(buffer);
        return ExecParallelSeqScan(node);  // 继续下一个块
    }
    
    // 存储元组
    ExecStoreBufferHeapTuple(tuple, node->ss.ss_ScanTupleSlot, buffer);
    ReleaseBuffer(buffer);
    
    return node->ss.ss_ScanTupleSlot;
}

五、WAL(预写日志)算法

1. LSN(Log Sequence Number)管理

// src/include/access/xlogdefs.h
typedef uint64 XLogRecPtr;

// WAL记录结构
typedef struct XLogRecord {
    uint32      xl_tot_len;        // 总长度
    TransactionId xl_xid;          // 事务ID
    XLogRecPtr  xl_prev;           // 前一个记录的LSN
    uint8       xl_info;           // 标志位
    RmgrId      xl_rmid;           // 资源管理器ID
    pg_crc32c   xl_crc;            // CRC校验
    XLogRecData *xl_blocks[FLEXIBLE_ARRAY_MEMBER]; // 关联的块
} XLogRecord;

// 插入WAL记录
XLogRecPtr
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) {
    XLogRecPtr  StartPos;
    XLogRecPtr  EndPos;
    
    // 1. 保留插入位置
    START_CRIT_SECTION();
    
    do {
        // 2. 保留LSN位置
        StartPos = GetXLogInsertRecPtr();
        
        // 3. 复制数据到WAL缓冲区
        CopyXLogRecordToWALBuffer(rmid, info, StartPos, rdata);
        
        // 4. 更新LSN
        EndPos = StartPos + rdata->len;
        AdvanceXLogInsertRecPtr(EndPos);
        
    } while (LWLockConditionalAcquire(WALBufMappingLock, LW_EXCLUSIVE) == 0);
    
    // 5. 检查是否需要刷写
    if (XLogCheckpointNeeded(EndPos)) {
        XLogBackgroundFlush();
    }
    
    LWLockRelease(WALBufMappingLock);
    END_CRIT_SECTION();
    
    return EndPos;
}

2. WAL写入与刷盘算法

// src/backend/access/transam/xlog.c
// WAL刷写算法
void XLogFlush(XLogRecPtr record) {
    XLogRecPtr  WriteRqstPtr = record;
    XLogwrtRqst WriteRqst;
    static XLogRecPtr lastflush = 0;
    
    // 1. 更新写入请求位置
    SpinLockAcquire(&XLogCtl->info_lck);
    if (WriteRqstPtr <= XLogCtl->LogwrtResult.Write) {
        // 已经写入
        SpinLockRelease(&XLogCtl->info_lck);
        return;
    }
    
    XLogCtl->LogwrtRqst.Write = WriteRqstPtr;
    XLogCtl->LogwrtRqst.Flush = WriteRqstPtr;
    
    // 2. 唤醒WAL写入器
    if (XLogCtl->LogwrtResult.Write < WriteRqstPtr) {
        XLogCtl->Write.waiting = true;
        XLogCtl->Write.event = CreateEvent(NULL, FALSE, FALSE, NULL);
        SpinLockRelease(&XLogCtl->info_lck);
        
        SetLatch(&XLogCtl->Write.latch);
    } else {
        SpinLockRelease(&XLogCtl->info_lck);
    }
    
    // 3. 等待刷写完成
    if (lastflush < WriteRqstPtr) {
        int         wakeEvents = WL_LATCH_SET | WL_TIMEOUT;
        long        timeout = 10000;  // 10秒超时
        
        while (XLogCtl->LogwrtResult.Flush < WriteRqstPtr) {
            WaitLatch(&MyProc->procLatch, wakeEvents, timeout);
            ResetLatch(&MyProc->procLatch);
        }
        
        lastflush = WriteRqstPtr;
    }
}

// 组提交优化
void XLogGroupCommit(void) {
    static XLogRecPtr last_group_commit = 0;
    static int    group_count = 0;
    XLogRecPtr    current_recptr = GetXLogInsertRecPtr();
    
    // 1. 检查是否满足组提交条件
    if (current_recptr - last_group_commit < GROUP_COMMIT_BYTES &&
        group_count < MAX_GROUP_COMMIT_COUNT) {
        group_count++;
        return;  // 等待更多事务
    }
    
    // 2. 执行组提交
    XLogFlush(current_recptr);
    
    // 3. 更新统计
    last_group_commit = current_recptr;
    group_count = 0;
}

六、VACUUM算法

1. 懒惰VACUUM算法

// src/backend/commands/vacuumlazy.c
// 懒惰VACUUM主函数
void lazy_vacuum_rel(Relation onerel, VacuumParams *params) {
    LVRelStats *vacrelstats;
    BlockNumber relblocks;
    bool        scanned_all_unused = false;
    
    // 1. 获取关系统计信息
    vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
    relblocks = RelationGetNumberOfBlocks(onerel);
    
    // 2. 扫描关系,收集死亡元组
    lazy_scan_heap(onerel, params, vacrelstats, &scanned_all_unused);
    
    // 3. 清理索引
    lazy_vacuum_indexes(onerel, params, vacrelstats);
    
    // 4. 清理堆
    if (vacrelstats->num_dead_tuples > 0) {
        lazy_vacuum_heap(onerel, vacrelstats);
    }
    
    // 5. 截断末尾的空闲页面
    if (scanned_all_unused) {
        lazy_truncate_heap(onerel, vacrelstats);
    }
    
    // 6. 更新系统目录
    lazy_update_fsm(onerel, vacrelstats);
    vacuum_update_relstats(onerel, vacrelstats);
}

// 懒惰扫描堆
static void lazy_scan_heap(Relation onerel, VacuumParams *params,
                           LVRelStats *vacrelstats, bool *scanned_all_unused) {
    BlockNumber nblocks = vacrelstats->rel_pages;
    Buffer      buf;
    Page        page;
    
    vacrelstats->dead_tuples = (ItemPointer) palloc(
        MAX_TUPLES_PER_PAGE * nblocks * sizeof(ItemPointerData));
    vacrelstats->num_dead_tuples = 0;
    
    // 逐块扫描
    for (BlockNumber blkno = 0; blkno < nblocks; blkno++) {
        // 读取块
        buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        
        page = BufferGetPage(buf);
        
        // 检查页面是否需要清理
        if (PageIsNew(page) || PageIsEmpty(page)) {
            UnlockReleaseBuffer(buf);
            continue;
        }
        
        // 获取页面中所有行指针
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
        
        for (OffsetNumber offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum)) {
            ItemId      itemid = PageGetItemId(page, offnum);
            
            if (ItemIdIsUsed(itemid)) {
                HeapTupleData tuple;
                
                tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                tuple.t_len = ItemIdGetLength(itemid);
                ItemPointerSet(&(tuple.t_self), blkno, offnum);
                
                // 检查元组是否死亡
                if (heap_tuple_needs_freeze(&tuple, 
                                            GetCurrentTransactionIdIfAny())) {
                    // 需要冻结
                    heap_freeze_tuple(&tuple, GetCurrentTransactionIdIfAny());
                } else if (heap_tuple_is_dead(&tuple, 
                                              GetActiveSnapshot())) {
                    // 记录死亡元组
                    vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = tuple.t_self;
                    vacrelstats->num_dead_tuples++;
                }
            }
        }
        
        UnlockReleaseBuffer(buf);
    }
}

七、HOT(Heap-Only Tuple)优化算法

// src/backend/access/heap/README.HOT
// HOT更新决策算法
bool HeapTupleSatisfiesHOT(HeapTupleHeader oldtuple,
                           HeapTupleHeader newtuple,
                           Buffer oldbuf) {
    // 1. 检查是否在同一页面
    if (BufferGetBlockNumber(oldbuf) != 
        ItemPointerGetBlockNumber(&newtuple->t_ctid))
        return false;
    
    // 2. 检查索引列是否被修改
    Relation    rel = RelationIdGetRelation(oldtuple->t_tableOid);
    TupleDesc   tupdesc = RelationGetDescr(rel);
    
    for (int i = 0; i < tupdesc->natts; i++) {
        Form_pg_attribute attr = TupleDescAttr(tupdesc, i);
        
        if (attr->attnum < 0)  // 系统列
            continue;
        
        // 检查是否为索引列
        if (relation_has_index_on_att(rel, attr->attnum)) {
            Datum oldval, newval;
            bool  oldisnull, newisnull;
            
            oldval = heap_getattr(oldtuple, attr->attnum, tupdesc, &oldisnull);
            newval = heap_getattr(newtuple, attr->attnum, tupdesc, &newisnull);
            
            // 如果索引列被修改,不能使用HOT
            if (oldisnull != newisnull ||
                (!oldisnull && !newisnull && 
                 !datumIsEqual(oldval, newval, attr->attbyval, attr->attlen)))
                return false;
        }
    }
    
    // 3. 检查页面是否有足够空间
    Size        newsize = MAXALIGN(newtuple->t_len);
    Page        page = BufferGetPage(oldbuf);
    
    if (PageGetFreeSpace(page) < newsize)
        return false;
    
    return true;
}

// HOT链管理
void heap_page_prune_opt(Relation relation, Buffer buffer,
                         TransactionId OldestXmin) {
    Page        page = BufferGetPage(buffer);
    OffsetNumber offnum, maxoff;
    bool        changed = false;
    
    maxoff = PageGetMaxOffsetNumber(page);
    
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum)) {
        ItemId      lp = PageGetItemId(page, offnum);
        HeapTupleHeader tuple;
        
        if (!ItemIdIsNormal(lp))
            continue;
        
        tuple = (HeapTupleHeader) PageGetItem(page, lp);
        
        // 检查是否为HOT链的一部分
        if (HeapTupleHeaderIsHeapOnly(tuple)) {
            // 清理HOT链
            if (heap_prune_chain(relation, buffer, offnum, 
                                 OldestXmin, &changed)) {
                // 链被清理
            }
        }
    }
    
    if (changed) {
        // 记录WAL
        MarkBufferDirty(buffer);
    }
}

八、算法复杂度分析

算法平均复杂度最坏情况空间复杂度适用场景
B树查找O(logₘn)O(logₘn)O(n)通用索引
哈希连接O(n+m)O(n*m)O(min(n,m))等值连接
归并连接O(n log n + m log m)O(n log n + m log m)O(n+m)已排序数据
动态规划连接O(3ⁿ)O(3ⁿ)O(2ⁿ)小规模连接
遗传算法连接O(gps)O(gps)O(p)大规模连接
排序O(n log n)O(n²)O(n)内存排序
外部排序O(n logₘn)O(n logₘn)O(B)大数据排序
MVCC可见性判断O(1)O(k)O(t)事务隔离

其中:

  • n, m: 关系基数
  • g: 遗传代数
  • p: 种群大小
  • s: 个体评估代价
  • B: 缓冲区大小
  • t: 事务数量
  • k: 活跃事务数

九、高级算法实现

1. GiST(通用搜索树)算法

// src/backend/access/gist/gist.c
// GiST搜索算法
void gistgettuple(IndexScanDesc scan, ScanDirection dir) {
    GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
    GISTSearchItem *item;
    
    for (;;) {
        // 1. 获取下一个待搜索项
        if (so->queue == NULL || so->queue->head == NULL) {
            // 队列空,搜索结束
            ItemPointerSetInvalid(&scan->xs_ctup.t_self);
            scan->xs_recheck = false;
            return;
        }
        
        item = (GISTSearchItem *) linitial(so->queue->head);
        so->queue->head = list_delete_first(so->queue->head);
        
        // 2. 处理搜索项
        if (GISTSearchItemIsHeap(*item)) {
            // 叶节点项,返回元组
            scan->xs_ctup.t_self = item->data.heap.heapPtr;
            
            if (scan->numberOfOrderBys > 0) {
                // 需要距离重新检查
                scan->xs_recheck = true;
            }
            
            pfree(item);
            return;
        } else {
            // 内部节点,扩展搜索
            gistScanPage(scan, item, so->qual, so->orderTypes, dir);
            pfree(item);
        }
    }
}

// GiST一致性函数
bool gistindex_keytest(IndexTuple itup, 
                       Datum *query, 
                       StrategyNumber strategy) {
    // 调用操作符类的consistent函数
    GISTENTRY entry;
    gistentryinit(entry, itup->t_tid.ip_blkid, NULL, NULL, 0, false);
    
    return DatumGetBool(FunctionCall2Coll(
        opclass->consistentFn,
        index->rd_indcollation[0],
        PointerGetDatum(&entry),
        PointerGetDatum(query)));
}

2. BRIN(块范围索引)算法

// src/backend/access/brin/brin.c
// BRIN扫描算法
bool bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) {
    BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
    Relation    index = scan->indexRelation;
    BlockNumber blk = 0;
    
    // 遍历所有范围
    while (blk < opaque->nblocks) {
        Buffer      buf;
        Page        page;
        BrinMetaPageData *metadata;
        
        // 读取元数据
        buf = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
        LockBuffer(buf, BUFFER_LOCK_SHARE);
        page = BufferGetPage(buf);
        metadata = (BrinMetaPageData *) PageGetContents(page);
        
        // 计算当前范围
        BlockNumber start = blk * metadata->pagesPerRange;
        BlockNumber end = (blk + 1) * metadata->pagesPerRange - 1;
        
        // 检查范围是否匹配查询
        if (brin_check_range(scan, blk, start, end)) {
            // 将整个范围添加到位图
            for (BlockNumber b = start; b <= end; b++) {
                tbm_add_page(tbm, b);
            }
        }
        
        UnlockReleaseBuffer(buf);
        blk++;
    }
    
    return true;
}

// BRIN范围检查
static bool brin_check_range(IndexScanDesc scan, BlockNumber blkno,
                             BlockNumber start, BlockNumber end) {
    BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
    BrinDesc   *brdesc = opaque->bd;
    Buffer      buf;
    Page        page;
    BrinTuple  *tup;
    
    // 读取BRIN元组
    buf = ReadBuffer(scan->indexRelation, blkno);
    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    
    tup = (BrinTuple *) PageGetItem(page, 
                                    PageGetItemId(page, BRIN_TUPLE_ITEMNO));
    
    // 检查每个摘要是否满足查询
    for (int i = 0; i < brdesc->bd_totalstored; i++) {
        BrinValues *col = &tup->bt_columns[i];
        
        if (!brin_match_qual(scan, i, col))
            return false;
    }
    
    UnlockReleaseBuffer(buf);
    return true;
}

PostgreSQL 的算法设计体现了数据库理论的深度实践,其 MVCC 实现、优化器设计和可扩展的索引框架是其核心技术优势。这些算法经过多年的工业验证,提供了高可靠性和高性能的数据管理能力。