PostgreSQL顺序扫描原理

159 阅读12分钟

理解顺序扫描对于理解数据库的运行是非常重要的,对于一款数据库来说,顺序扫描是必须要实现的最小功能集。这里,我们分析一下顺序扫描的过程,以最简单的select * from t1;语句为例,通过分析全表扫描来分析顺序扫描的过程。

postgres@postgres=# \d t1;
                 Table "public.t1"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 a      | integer |           | not null | 
 b      | integer |           |          | 
Indexes:
    "t1_pkey" PRIMARY KEY, btree (a)

-- 顺序扫描
postgres@postgres=# explain select * from t1;
                      QUERY PLAN                      
------------------------------------------------------
 Seq Scan on t1  (cost=0.00..15.00 rows=1000 width=8)
(1 row)

语法解析以及语义分析

解析层面流程如下:

exec_simple_query
--> pg_parse_query              // 生成抽象语法树
    --> raw_parser
        --> base_yyparse
--> pg_analyze_and_rewrite      // 语义分析,转为查询树Query

关键数据结构:

// select语句的抽象语法树表示
typedef struct SelectStmt
{
	NodeTag		type;
    // 对应select *
	List	   *targetList;		/* the target list (of ResTarget) */ 
    // 对应         from t1
	List	   *fromClause;		/* the FROM clause */  // 保存RangeVar节点

    // ...... 

} SelectStmt;

/* RangeVar - range variable, used in FROM clauses */
typedef struct RangeVar
{
	NodeTag		type;
	char	   *catalogname;	/* the catalog (database) name, or NULL */
	char	   *schemaname;		/* the schema name, or NULL */
	char	   *relname;		/* the relation/sequence name */
	bool		inh;			/* expand rel by inheritance? recursively act on children? */
	char		relpersistence; /* see RELPERSISTENCE_* in pg_class.h */
	Alias	   *alias;			/* table alias & optional column aliases */
	int			location;		/* token location, or -1 if unknown */
} RangeVar;    // 存储表的所在库名,模式名,表名信息

语义分析流程:
```c++
pg_analyze_and_rewrite   // 语义分析,生成查询树Query
--> pg_analyze
    --> transformStmt
        --> transformSelectStmt
            --> transformFromClause  // 处理表, 将RangeVar转为RangeTblEntry
                --> transformFromClauseItem
                    --> transformTableEntry     
            --> transformTargetList     // 处理 * ,展开为a, b
--> pg_rewrite_query

SelectStmt经过语义分析后转为Query表示。

查询优化

主要是生成执行计划,前面已经通过EXPLAIN命令得知其执行计划为顺序扫描,让我们具体看一下其是如何生成的。

pg_plan_queries
--> pg_plan_query
    --> planner
        --> standard_planner
            --> subquery_planner
                --> grouping_planner
                    --> query_planner 		// Generate a path (that is, a simplified plan) for a basic query
                        --> setup_simple_rel_arrays
                        --> add_base_rels_to_query
                            --> build_simple_rel
                        --> make_one_rel	// Finds all possible access paths for executing a query
                            --> set_base_rel_sizes
                                --> set_rel_size
                                    --> set_plain_rel_size
                                        --> set_baserel_size_estimates	// 选择率计算,计算代价Cost要用
                            --> set_base_rel_pathlists
                                --> set_rel_pathlist // Build access paths for a base relation
                                    --> set_plain_rel_pathlist
                                        --> create_seqscan_path // 生成顺序扫描路径
                            --> make_rel_from_joinlist
                    --> apply_scanjoin_target_to_paths
            --> create_plan
                --> create_scan_plan
                    --> create_seqscan_plan

这里有个函数要重点说明一下,在涉及多表路径规划时,单表路径是最基础的,单表路径可以是顺序扫描、索引扫描、TID扫描等,这个是最底层的。

// 表路径规划
void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
{
	Relids		required_outer;

	/* We don't support pushing join clauses into the quals of a seqscan, but
	 * it could still have required parameterization due to LATERAL refs in its tlist. */
	required_outer = rel->lateral_relids;

	add_path(rel, create_seqscan_path(root, rel, required_outer, 0));	// 顺序扫描

	if (rel->consider_parallel && required_outer == NULL)		// 尝试并行顺序扫描
		create_plain_partial_paths(root, rel);

	create_index_paths(root, rel);			// 索引扫描

	create_tidscan_paths(root, rel);      // TID扫描
}

最后由最佳路径Path生成顺序扫描执行计划SeqScan

typedef struct Scan
{
	Plan		plan;
	Index		scanrelid;	// 指明扫描那个表,
} Scan;

typedef Scan SeqScan;

执行器

输入执行计划,输出最终结果。执行顺序扫描。这里先不考虑并行顺序扫描的情况,并行顺序扫描后续再进行分析。我们在继续分析顺序扫描前,先自己想一下实现顺序扫描都需要做哪些工作,需要注意哪些问题?首先,我们需要知道要扫描的那个表的OID,我们知道表是以堆表的形式存储的,我们需要根据表OID打开堆表文件,表中的元组是存在堆表的页中,每个页都有页头以及数据部分,我们需要遍历表中的页,然后读取每个页中所有的元组。还需要注意的问题是事务问题,因为我们在读取页的过程中,可能还有其他事务在插入、删除、更新元组,所以我们扫描的过程中需要注意可见性问题。

主流程
PortalStart
--> ExecutorStart
    --> InitPlan
		--> ExecCheckRTPerms(rangeTable, true); // 检查权限
        --> ExecInitSeqScan
			--> ExecOpenScanRelation(estate, node->scanrelid, eflags); // 打开表
				--> ExecGetRangeTableRelation(estate, scanrelid);
					--> table_open(rte->relid, NoLock);	// 打开指定OID的表
						--> relation_open(relationId, lockmode);	// 先查relcache缓存,缓存没有就通过访问pg_class,pg_attribute系统表构造RelationData,并插入缓存
PortalRun
--> ExecutorRun
    --> ExecutePlan
        --> ExecSeqScan   // 顺序扫描算子
            --> ExecScan
                --> ExecScanFetch
                    --> SeqNext 
						// 在正式扫描前,要判断是否进行并行扫描,扫描的总页数是多少,扫描的起点页是哪里
						--> table_beginscan     // 表扫描函数scan_begin
							--> heap_beginscan  // 堆表beginscan
								--> initscan
									// 计算表的总块数,如何计算,后续有详细分析
									--> RelationGetNumberOfBlocks
										--> smgrnblocks
											--> mdnblocks // 磁盘文件获取总块数
						--> table_scan_getnextslot
							--> heap_getnextslot
								--> heapgettup_pagemode
									// 1. 判断表的总块数,如果为0,则返回NULL
									// 2. 是否进行并行处理,这里我们先不分析并行的情况
									// 3. 从buffer中读取指定的页,如果buffer中没有,则从磁盘读取,并插入buffer
									// 4. 判断页中的元组的可见性 scan->rs_vistuples
									--> heapgetpage
										--> scan->rs_cbuf = ReadBufferExtended
											--> ReadBuffer_common	// 从磁盘读取页,并插入buffer,这部分的内容在之前的文章已分析,这里不再展开细述
										--> scan->rs_cblock = page;
										--> buffer = scan->rs_cbuf;
										--> dp = BufferGetPage(buffer);
										--> // 可见性判断
										--> scan->rs_ntuples = ntup; // 页中可见的元组个数
									
								// 从缓冲区中读取元组,存储在Slot中		
								--> ExecStoreBufferHeapTuple
									--> tts_buffer_heap_store_tuple
PortalDrop

具体实现代码如下:

static TupleTableSlot *ExecSeqScan(PlanState *pstate)
{
	SeqScanState *node = castNode(SeqScanState, pstate);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) SeqNext,
					(ExecScanRecheckMtd) SeqRecheck);
}

/* ExecScanFetch -- check interrupts & fetch next potential tuple */
static inline TupleTableSlot *ExecScanFetch(ScanState *node, ExecScanAccessMtd accessMtd, ExecScanRecheckMtd recheckMtd)
{
	EState	   *estate = node->ps.state;
	if (estate->es_epq_active != NULL)
	{
		// ...
	}

	/* Run the node-type-specific access method function to get the next tuple */
	return (*accessMtd) (node);
}

// 具体执行顺序扫描
static TupleTableSlot *SeqNext(SeqScanState *node)
{
	TableScanDesc scandesc;
	EState	   *estate;
	ScanDirection direction;
	TupleTableSlot *slot;

	/* get information from the estate and scan state */
	scandesc = node->ss.ss_currentScanDesc;
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	slot = node->ss.ss_ScanTupleSlot;

	if (scandesc == NULL)
	{
		/* We reach here if the scan is not parallel, or if we're serially
		 * executing a scan that was planned to be parallel. */
		scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, 0, NULL);
		node->ss.ss_currentScanDesc = scandesc;
	}

	// 从表中获取下一个元组
	if (table_scan_getnextslot(scandesc, direction, slot))
		return slot;
	return NULL;
}

/* Return next tuple from `scan`, store in slot. */
static inline bool
table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
	return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);	// heap_getnextslot
}
怎么计算表的总块数

这里分析一下怎么获取堆表的页数,也就是总块数。计算的核心方法是堆表文件大小/BLOCKSZ,每个页大小默认是8k,表文件每超过1G则增加一个段文件,每个段文件大小默认是1G,所以计算堆表总页数需要考虑段文件数量。具体实现如下:

// 获取堆表总页数
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v;
	BlockNumber nblocks;
	BlockNumber segno;

	mdopenfork(reln, forknum, EXTENSION_FAIL);

	// ...

	segno = reln->md_num_open_segs[forknum] - 1;
	v = &reln->md_seg_fds[forknum][segno];

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v); // 获取当前段文件的总页数
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE)) // 如果当前段文件总页数小于段文件大小,则说明是最后一个段文件,直接返回
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		segno++; // 如果当前段文件总页数等于段文件大小,则说明还有下一个段文件,继续计算下一个段文件的总页数

		v = _mdfd_openseg(reln, forknum, segno, 0);
		if (v == NULL)
			return segno * ((BlockNumber) RELSEG_SIZE);
	}
}

// 获取当前段文件的总页数,计算方法是文件大小/BLOCKSZ
static BlockNumber
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
	off_t		len;

	len = FileSize(seg->mdfd_vfd);
	if (len < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to end of file \"%s\": %m",
						FilePathName(seg->mdfd_vfd))));
	/* note that this calculation will ignore any partial block at EOF */
	return (BlockNumber) (len / BLCKSZ);
}
堆表扫描

我们知道在PG中,表的存储是被切成很多页(也可以叫区块)的,默认是8k的大小(引申问题:为什么是8k呢?这个页的大小涉及到性能,涉及到很多方面,比如VM,FSM,页缓存,磁盘读取等)。全表扫描就是从表的第0块开始,一直扫描到最后一块。扫描的时候,先指定要扫描的块号,查找块是否在缓冲区中,如果不在缓冲区中,则从磁盘读取该表指定的块到缓存区中。块扫描完成后,加1,继续扫描下一个块,直到扫描结束。如果表很大,缓冲区不能完全缓冲,则会依据时钟扫描算法,选择淘汰掉某些块。

typedef struct HeapScanDescData
{
	TableScanDescData rs_base;	/* AM independent part of the descriptor */

	/* state set up at initscan time */
	BlockNumber rs_nblocks;		// 表中页总数
	BlockNumber rs_startblock;	// 扫描起始页
	BlockNumber rs_numblocks;	/* max number of blocks to scan */
	/* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */

	/* scan current state */
	bool		rs_inited;		/* false = scan not init'd yet */
	BlockNumber rs_cblock;		/* 当前扫描的页号 */
	Buffer		rs_cbuf;		/* 当前扫描的页的Buffer */
	/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */

	/* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */
	BufferAccessStrategy rs_strategy;	/* access strategy for reads */

	HeapTupleData rs_ctup;		/* 当前扫描的元组 */

	/* these fields only used in page-at-a-time mode and for bitmap scans */
	int			rs_cindex;		/* 指明当前扫描的元祖在rs_vistuples数组中的位置 */
	int			rs_ntuples;		/* 页中有多少可见的元组 */
	OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];	/* 所有可见元祖的offsets */
}			HeapScanDescData;
typedef struct HeapScanDescData *HeapScanDesc;

关键函数实现,heap_getnextslot在执行堆扫描的过程中,尝试从页面中读取可见元组,并将其存储在一个TupleTableSlot中。我们知道PG执行是一次一元组,每次只返回一个元组,再读取下一个元组时,其扫描过程的状态信息都在HeapScanDesc中,这样下次执行我就知道应该读取哪一个元组,所以每次扫描时,都需要将HeapScanDesc中的状态信息更新。

bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	HeapScanDesc scan = (HeapScanDesc) sscan;

	// 获取页,从页中读取元组,存在scan->re_ctup中
	if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
		heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
	else
		heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);

	if (scan->rs_ctup.t_data == NULL)
	{
		ExecClearTuple(slot);
		return false;
	}

	// 将元组放入slot中
	ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
	return true;
}

// 从页中读取元组,存在scan->rs_ctup中
// 为了提高性能,读取元组时,一次性读取页中所有可见的元组,存在scan->rs_vistuples中
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
{
	HeapTuple	tuple = &(scan->rs_ctup);
	bool		backward = ScanDirectionIsBackward(dir);
	BlockNumber page;
	bool		finished;
	Page		dp;
	int			lines;
	int			lineindex;
	OffsetNumber lineoff;
	int			linesleft;
	ItemId		lpp;

	/* calculate next starting lineindex, given scan direction */
	if (ScanDirectionIsForward(dir))	// 正向扫描
	{
		if (!scan->rs_inited)
		{
			// 如果表是空的,则返回NULL
			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
			{
				tuple->t_data = NULL;
				return;
			}
			if (scan->rs_base.rs_parallel != NULL)	// 并行扫描
			{
				ParallelBlockTableScanDesc pbscan =
				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
				ParallelBlockTableScanWorker pbscanwork =
				scan->rs_parallelworkerdata;

				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
														 pbscanwork, pbscan);

				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
														 pbscanwork, pbscan);

				/* Other processes might have already finished the scan. */
				if (page == InvalidBlockNumber)
				{
					Assert(!BufferIsValid(scan->rs_cbuf));
					tuple->t_data = NULL;
					return;
				}
			}
			else
				page = scan->rs_startblock; /* first page */
			heapgetpage((TableScanDesc) scan, page);	// 获取页
			lineindex = 0; // 从页的第一个元组开始扫描
			scan->rs_inited = true;
		}
		else
		{
			/* continue from previously returned page/tuple */
			page = scan->rs_cblock;   // 当前扫描的页
			lineindex = scan->rs_cindex + 1;   // 获取要读的下一个元组的位置
		}

		dp = BufferGetPage(scan->rs_cbuf);
		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
		lines = scan->rs_ntuples;	// 页中可见的元组数量
		/* page and lineindex now reference the next visible tid */

		linesleft = lines - lineindex;		// 页中剩余的元组数量
	}
	else if (backward)		// 反向扫描
	{
		// 并行扫描不支持反向扫描
		Assert(scan->rs_base.rs_parallel == NULL);

		if (!scan->rs_inited)
		{
			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
			{
				Assert(!BufferIsValid(scan->rs_cbuf));
				tuple->t_data = NULL;
				return;
			}

			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;

			/*
			 * Start from last page of the scan.  Ensure we take into account
			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
			 */
			if (scan->rs_numblocks != InvalidBlockNumber)
				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
			else if (scan->rs_startblock > 0)
				page = scan->rs_startblock - 1;
			else
				page = scan->rs_nblocks - 1;
			heapgetpage((TableScanDesc) scan, page);
		}
		else
		{
			/* continue from previously returned page/tuple */
			page = scan->rs_cblock; /* current page */
		}

		dp = BufferGetPage(scan->rs_cbuf);
		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
		lines = scan->rs_ntuples;	// 页中可见的元组数量

		if (!scan->rs_inited)
		{
			lineindex = lines - 1;	// 从页的最后一个元组开始扫描
			scan->rs_inited = true;
		}
		else
		{
			lineindex = scan->rs_cindex - 1; 	// 获取要读的下一个元组的位置,与正向扫描相反,正向是 + 1,反向是 - 1
		}
		/* page and lineindex now reference the previous visible tid */

		linesleft = lineindex + 1;
	}
	else
	{
		/*
		 * ``no movement'' scan direction: refetch prior tuple
		 */
		if (!scan->rs_inited)
		{
			Assert(!BufferIsValid(scan->rs_cbuf));
			tuple->t_data = NULL;
			return;
		}

		page = ItemPointerGetBlockNumber(&(tuple->t_self));
		if (page != scan->rs_cblock)
			heapgetpage((TableScanDesc) scan, page);

		/* Since the tuple was previously fetched, needn't lock page here */
		dp = BufferGetPage(scan->rs_cbuf);
		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
		lpp = PageGetItemId(dp, lineoff);
		Assert(ItemIdIsNormal(lpp));

		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
		tuple->t_len = ItemIdGetLength(lpp);

		/* check that rs_cindex is in sync */
		Assert(scan->rs_cindex < scan->rs_ntuples);
		Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);

		return;
	}

   // 开始页面扫描,循环遍历页中的元组
	for (;;)
	{
		while (linesleft > 0)	// 页中剩余的元组数量
		{
			lineoff = scan->rs_vistuples[lineindex];  // 获取元组在页中的位置
			lpp = PageGetItemId(dp, lineoff); 		  // 页面内部获取元组
			Assert(ItemIdIsNormal(lpp));

			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
			tuple->t_len = ItemIdGetLength(lpp);
			ItemPointerSet(&(tuple->t_self), page, lineoff);

			/*
			 * if current tuple qualifies, return it.
			 */
			if (key != NULL)
			{
				bool		valid;

				HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
							nkeys, key, valid);
				if (valid)
				{
					scan->rs_cindex = lineindex;
					return;
				}
			}
			else
			{
				scan->rs_cindex = lineindex;	// 记录当前元组的位置,更新扫描状态信息
				return;
			}

			/*
			 * otherwise move to the next item on the page
			 */
			--linesleft;
			if (backward)
				--lineindex;
			else
				++lineindex;
		}
		// 如果当前页读完了,则要读下一个页
		if (backward)
		{
			finished = (page == scan->rs_startblock) ||
				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
			if (page == 0)
				page = scan->rs_nblocks;
			page--;	// 反向扫描,页号 - 1
		}
		else if (scan->rs_base.rs_parallel != NULL)
		{
			ParallelBlockTableScanDesc pbscan =
			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
			ParallelBlockTableScanWorker pbscanwork =
			scan->rs_parallelworkerdata;

			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
													 pbscanwork, pbscan);
			finished = (page == InvalidBlockNumber);
		}
		else
		{
			page++;		// 正向扫描,页号 + 1
			if (page >= scan->rs_nblocks)
				page = 0;
			finished = (page == scan->rs_startblock) ||
				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);

			if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)		// 如果允许同步,则报告当前扫描位置
				ss_report_location(scan->rs_base.rs_rd, page);
		}

		if (finished)		// 如果已经扫描完所有页,则返回 NULL
		{
			if (BufferIsValid(scan->rs_cbuf))
				ReleaseBuffer(scan->rs_cbuf);
			scan->rs_cbuf = InvalidBuffer;
			scan->rs_cblock = InvalidBlockNumber;
			tuple->t_data = NULL;
			scan->rs_inited = false;
			return;
		}

		heapgetpage((TableScanDesc) scan, page);	// 获取下一个页

		dp = BufferGetPage(scan->rs_cbuf);
		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
		lines = scan->rs_ntuples;
		linesleft = lines;			// 获取当前页中剩余的元组数量
		if (backward)
			lineindex = lines - 1;	// 反向扫描,从最后一个元组开始
		else
			lineindex = 0;			// 正向扫描,从第一个元组开始
	}
}

总结

对全表扫描的过程,就是首先生成顺序扫描执行计划,给执行器的核心信息是我要扫描的表的OID,以及扫描方式为顺序扫描。执行器执行顺序扫描算子,一次一元组执行,在正式开始扫描前,需要通过OID打开将要扫描的表,计算表的总页数,在开始扫描后,从第一个页开始扫描,从Buffer中获取指定页,如果Buffer中没有指定块号的页,则从文件中读取一个页到Buffer中,从Buffer读取一个页后,需要判断页中元组的可见性,一次性将页中所有可见元组读取放入scan->rs_vistuples中,遍历页中的所有元组,当页中的所有元组遍历完后,再读取下一个页,直到所有页读取完。