关键字:
=======
KingbaseES、Large Object、vacuumlo、金仓数据库
Large Object类型
KES提供了large object数据类型(lo),用户可以通过stream-style access访问用户数据。对于不适合整体操作的大数据,streaming access非常有用。
KES将large object分为多个chunks,将chunks存储在数据库的行中。当对large object进行随机读写时,KES使用B-tree索引快速找到正确的chunk number。
KES还支持TOAST存储系统,它自动将大于一个数据库page的值存储到每个表对应的二级存储。Large object相对于TOAST系统的优势是它允许存储最大至4TB的值,而TOAST系统允许的最大值为1GB。另外,我们可以高效地读取和更新large object的部分内容,而对TOAST value的大部分操作都需要读写整个值。
对large object的访问必须在一个事务块内进行。可以调用setAutoCommit(false)来开启一个事务块。
使用方法
建表时创建一个oid列,该列用于存储large object数据。Large object数据使用lo_create()函数创建,并返回其oid。对象的内容可以使用lo_import()函数导入,使用lo_export()函数导出。请参考如下示例:
CREATE TABLE image (
name text,
raster oid
);
SELECT lo_creat(-1); -- returns OID of new, empty large object
SELECT lo_create(43213); -- attempts to create large object with OID 43213
SELECT lo_unlink(173454); -- deletes large object with OID 173454
INSERT INTO image (name, raster) VALUES ('beautiful image', lo_import('/etc/motd'));
INSERT INTO image (name, raster) VALUES ('beautiful image', lo_import('/etc/motd', 68583)); -- same as above, but specify OID to use
SELECT lo_export(image.raster, '/tmp/motd') FROM image WHERE name = 'beautiful image';
create table t1 (i int, file_id OID);
insert into t1 values (1, lo_import('/home/zn/tmp/schema.sql'));
select * from t1;
i | file_id
---+---------
1 | 16909
(1 row)
select lo_export(16909, '/home/zn/tmp/blob_output');
lo_export
-----------
1
(1 row)
\lo_list
Large objects
ID | Owner | Description
-------+----------+-------------
16909 | postgres |
(1 row)
select lo_unlink( 16909 );
lo_unlink
-----------
1
(1 row)
\lo_list
Large objects
ID | Owner | Description
----+-------+-------------
(0 rows)
select * from t2;
i | photo_id
---+----------
1 | 16909
(1 row)
select lo_export(16909, '/home/zn/tmp/blob_output');
ERROR: large object 16909 does not exist
实现
所有large object都保存在名为”sys_largeobject”的系统表中。每个large object在系统表“sys_largeobject_metadata”也有一个表项。Large object可以使用类似文件操作的API来创建、修改和删除。
3.1 sys_catalog.sys_largeobject
// src/include/dbserver/kernel/meta_data/sys_largeobject.h
METADATA(_lob,2613,LargeObjectRelationId)
{
Oid loid;; /* 大对象标识符 */
int32 pageno;; /* 页码(从0开始) */
/* 数据的长度是可变的,但我们允许直接访问;看到inv_api.c */
bytea data BKI_FORCE_NOT_NULL;; /* 页的数据(可能为零长度) */
} FormData_largeobject;; /* data has variable length, but we allow direct access; see inv_api.c */
bytea data BKI_FORCE_NOT_NULL; /* Data for page (may be zero-length) */
} FormData_pg_largeobject;
每个page的数据大小为LOBLKSIZE (当前为BLCKSZ/4, 或2 kB).
Name
Type
References
Description
loid
oid
sys_largeobject_metadata.oid
Identifier of the large object that includes this page
pageno
int4
number of this page within its large object (counting from zero)
data
bytea
Actual data stored in the large object. This will never be more than LOBLKSIZE bytes and might be less.
3.2 sys_catalog.pg_largeobject_metadata
// src/include/dbserver/kernel/meta_data/sys_largeobject_metadata.h
METADATA(_lob_meta,2995,LargeObjectMetadataRelationId)
{
Oid oid;; /* oid */
Oid lomowner;; /* 大对象所有者的OID */
#ifdef METADATA_VARLEN /* 可变长度字段从这里开始 */
aclitem lomacl[1];; /* 访问权限 */
#endif
} FormData_largeobject_metadata;;
Name
Type
References
Description
loid
oid
Row identifier (hidden attribute; must be explicitly selected)
lomowner
oid
sys_authid.oid
Owner of the large object
lomacl
aclitem[]
Access privileges; see GRANT and REVOKE for details
LoDescData描述了一个已打开的large object:
// src/include/dbserver/sdk/datatype/lo.h
typedef struct LoDescData
{
Oid id;; /* LO的标识符 */
Snapshot snapshot;; /* 要使用的快照 */
SubTransactionId subid;; /* 拥有子事务ID */
uint64 offset;; /* 当前搜索指针 */
int flags;; /* 参见下面的标记位 */
/*
* 标志中的位:
*/
#define IFS_READLOCK (1 << 0) /* LO开放读取 */
#define IFS_WRITELOCK (1 << 1) /* LO开放写作 */
} LoDescData;;
/*
* Each "page" (tuple) of a large object can hold this much data
*/
#define LOBLOCKSIZE (BLCKSZ / 4) // 8192 / 4 = 2048 = 2K
/*
* Maximum length in bytes for a large object.
*/
#define MAX_LO_SIZE ((int64) INT_MAX * LOBLOCKSIZE) // 2^31 * 2K = 2^31 * 2^11 = 2^42 = 4T
Interfaces
下面显示了client用于操作large object的SQL函数:
zn=# \df lo_*
List of functions
Schema | Name | Result data type | Argument data types | Type
------------+---------------+------------------+---------------------------+------
sys_catalog | lo_close | integer | integer | func
sys_catalog | lo_creat | oid | integer | func
sys_catalog | lo_create | oid | oid | func
sys_catalog | lo_export | integer | oid, text | func
sys_catalog | lo_from_bytea | oid | oid, bytea | func
sys_catalog | lo_get | bytea | oid | func
sys_catalog | lo_get | bytea | oid, bigint, integer | func
sys_catalog | lo_import | oid | text | func
sys_catalog | lo_import | oid | text, oid | func
sys_catalog | lo_lseek | integer | integer, integer, integer | func
sys_catalog | lo_lseek64 | bigint | integer, bigint, integer | func
sys_catalog | lo_open | integer | oid, integer | func
sys_catalog | lo_put | void | oid, bigint, bytea | func
sys_catalog | lo_tell | integer | integer | func
sys_catalog | lo_tell64 | bigint | integer | func
sys_catalog | lo_truncate | integer | integer, integer | func
sys_catalog | lo_truncate64 | integer | integer, bigint | func
sys_catalog | lo_unlink | integer | oid | func
(18 rows)
在sys_proc.dat中,sql command与back-end function关联:
# src/include/dbserver/kernel/meta_data/sys_proc.dat
{
oid => '952',
descr => 'large object open',
proname => 'lo_open',
provolatile => 'v',
proparallel => 'u',
prorettype => 'int4',
proargtypes => 'oid int4',
prosrc => 'be_lo_open' },
* client side functions
src/dbconnector/libkci/front_lobj.c: lo_create()...
* server side functions
src/dbserver/kernel/comm/libkci/backend_fsstubs.c: be_lo_create()...
vacuumlo
删除一行包含large object的数据或删除表不会删除large object。删除“orphaned”large object需要使用vacuumlo命令。
实现原理:
- 查询数据库中所有的lo,保存至临时表vacuum_l
- 查询数据库中包含lo数据类型的表,返回schema、table、field(列名)
- 在vacuum_l中删除select field from schema.table
- vacuum_l中剩下的就是orphaned lo,将其在系统中删除
CREATE TEMP TABLE vacuum_l AS SELECT oid AS lo FROM sys_largeobject_metadata;
ANALYZE vacuum_l;
SELECT s.nspname, c.relname, a.attname
FROM sys_class c, sys_attribute a, sys_namespace s, sys_type t
WHERE a.attnum > 0 AND NOT a.attisdropped
AND a.attrelid = c.oid
AND a.atttypid = t.oid
AND c.relnamespace = s.oid
AND t.typname in ('oid', 'lo', 'blob', 'clob')
AND c.relkind in ('r', 'm')
AND s.nspname !~ '^sys_';
DELETE FROM vacuum_l WHERE lo IN (SELECT %s FROM %s.%s) -- field, schema, table
-- 每次删除1000行
begin;
DECLARE myportal CURSOR WITH HOLD FOR SELECT lo FROM vacuum_l;
while (1)
FETCH FORWARD 1000 IN myportal;
-- for each line in result set, call lo_unlink(conn, lo);
commit; -- commit once for 1000 deleted lo
begin;
commit;
vacuumlo的局限性:外部工具,需要使用用户名、密码连接数据库,并使用cron等工具定时启动。