He3DB源码分析——WAL日志的存取

90 阅读9分钟

He3DB与PG架构上一个很大的不同就是存算分离,其WAL日志不是放在本地的pg_wal目录下, 而是将生成的WAL日志存放在了tikv集群中。这里我们就分析一下日志存取部分的代码,即,怎么将日志存储到tikv中,以及如何从tikv中取日志。

这部分代码采用Rust实现,存取WAL日志的实现位于这个工程下:gitee.com/he3db/he3fs… 对Rust不熟悉的读者可以参考文末的参考文档。

设计思考

He3DB是一款云原生数据库,采用存算分离架构,节点分为主节点、推进节点和备节点。推进节点和备节点通过回放WAL日志进行状态同步;主节点接收用户读写请求并产生WAL日志,这些日志不能像PG一样存储在本地盘中,否则推进节点和备节点无法访问。除了要求主节点、推进节点、备节点都能访问之外,日志存储还必须具备可靠性:一旦WAL日志丢失,就可能丢数据,这是绝对不允许的,对此通常采用多副本方案实现高可靠。基于以上两点,开源的tikv可以满足He3DB对日志存取的需求,并且可以缩短开发周期,快速验证He3DB的设计。

tikv rust api

将WAL日志存放在tikv中,第一步就是熟悉tikv,可参考官方文档:Interact with TiKV using Rust

在dev_kv分支he3fs下的pgfsgrpc中,可以看到添加了tikv-client依赖:

[package]
name = "pgfs"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
libc = "0.2"
# NOTE(review): the library code builds its runtime with `Builder::new_multi_thread()`,
# which needs tokio's `rt-multi-thread` feature; it is not listed here, so it is
# presumably enabled transitively by another dependency — confirm.
tokio = {version = "1.12", features = ["macros", "rt", "time", "fs"]}
async-trait = "0.1"
tikv-client = {git = "https://github.com/tikv/client-rust.git", rev="8f54e6114227718e256027df2577bbacdf425f86"}

[lib]
name = "rust_log"             # Produces the static library librust_log.a, which must be placed under `he3pg/src/backend/storage/file/`
crate-type = ["staticlib"]    # Build this crate as a static library
path = "src/lib.rs"

其tikv的API可参考上面的文档。首先是连接到tikv集群,官网上这么说的:With a connected tikv_client::RawClient, you can perform actions such as put, get, delete, and scan:

lazy_static! {
    /// Process-wide multi-threaded Tokio runtime. The exported C entry points
    /// drive their async TiKV calls on it through `block_on`.
    static ref rt1: Runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

    /// Lazily connected TiKV raw client.
    ///
    /// The PD endpoints are read from the `PDADDR` environment variable as a
    /// comma-separated list; when the variable is unset, `127.0.0.1:2379` is
    /// used. A failed connection attempt is retried every 100 ms.
    ///
    /// # Panics
    ///
    /// Panics once more than 100 consecutive connection attempts have failed.
    static ref raw_client: AsyncOnce<RawClient> = AsyncOnce::new(async {
        // Default PD address is only built when PDADDR is missing.
        let pdaddr = var("PDADDR").unwrap_or_else(|_| String::from("127.0.0.1:2379"));
        let pdaddr: Vec<&str> = pdaddr.split(',').collect();

        println!("pd addr is {:?}", pdaddr);

        let mut fail_times: i32 = 0;
        loop {
            let attempt = RawClient::new_with_config(
                pdaddr.clone(),
                Config::default().with_timeout(Duration::from_secs(10)),
                None,
            )
            .await;

            match attempt {
                Ok(client) => break client,
                Err(error) => {
                    fail_times += 1;
                    println!("get tikv client failed {:?}, retry {} times!", error, fail_times);
                    if fail_times > 100 {
                        panic!("fail to get tikv client after retry 100 times")
                    }
                    // Yield to the runtime while waiting instead of blocking
                    // the executor thread (or dropping an un-awaited future).
                    tokio::time::sleep(Duration::from_millis(100)).await;
                }
            }
        }
    });
}

所以在进行He3DB的安装部署的时候,需要设置环境变量PDADDR,否则默认使用127.0.0.1:2379

存取接口

pgfsgrpc会生成一个名为librust_log.a的静态库,对外接口有:

// 将WAL日志写入tikv中
pub extern "C" fn flushwals(mut xlog_item: *mut XLogItem, timeline: libc::uint32_t)->u8
// 批量读取WAL日志
pub extern  "C" fn batchRead(buf: *mut libc::uint8_t, timeline:libc::uint32_t, startPtr:libc::uint64_t, endPtr:libc::uint64_t, needStore: bool)-> Bufrd
// 从tikv中删除WAL日志
pub extern "C" fn DelRangeWals(tl: libc::uint32_t, startlsn: libc::uint64_t, endlsn: libc::uint64_t)
// 读取tikv中的WAL日志,用于pg_waldump工具,PG中的WAL日志存放在pg_wal目录下,而He3DB存放于tikv中,所以对应的工具也需要有变化
pub extern  "C" fn batchReadForTools(buf: *mut libc::uint8_t, timeline:libc::uint32_t, startPtr:libc::uint64_t, endPtr:libc::uint64_t, needStore:bool)->i32
// 根据lsn链表读WAL日志。因历史原因,函数名容易产生歧义:前面几个参数都没有用到,只用到了最后两个参数
pub extern "C" fn ReadWalsByPage(dbid: libc::uint32_t, relid: libc::uint32_t, forkno: libc::uint32_t, blkno: libc::uint32_t,  timeline: libc::uint32_t, mut head: *mut LsnNode) -> Bufrd
// 释放buf
pub unsafe extern "C" fn free_dataRead( buf: *mut libc::uint8_t, count: libc::size_t, cap: libc::size_t)
// 下面3个是给推进节点使用的:每当推进节点推进一批日志,会更新一个一致性点到tikv中,只读节点根据这个lsn去做链表关系的截断
pub extern "C" fn InsertConsistToKV(lsn: libc::uint64_t) -> u8
pub extern "C" fn GetConsistLsn(lsn: libc::uint64_t) -> u64
pub extern "C" fn DelConsistLsns(lsn: libc::uint64_t)

相关定义也可以参考he3pg/src/include/utils/hfs.h中的代码,Rust侧的定义如下:

/// Key of one WAL record as passed in from the C side: just its LSN.
#[repr(C)]  // Rust supports several data-layout strategies; this one requests the C-compatible layout
pub struct XLogKey
{
    lsn: libc::uint64_t,  // the LSN is the key; the XLog record bytes are the value
}

/// One node of the singly linked list of WAL records handed to `flushwals`.
#[repr(C)]
pub struct XLogItem     // a WAL record
{
    xlogKey: XLogKey,                 // LSN key of this record
    begin: *mut libc::c_char,         // start address of the record bytes
    length: libc::c_int,              // number of bytes at `begin`
    next: *mut XLogItem,              // next node; null terminates the list
}

/// Owned, serializable key variants. `to_scoperdkey` maps each variant onto
/// the matching borrowed `ScopedKey` variant.
#[derive(Hash, PartialEq, Eq, Clone, serde_derive::Deserialize, serde_derive::Serialize, Debug)]
pub enum DBKey {
    Meta,
    Inode(u64),
    Block {
        ino: u64,
        block: u64,
    },
    FileHandler {
        ino: u64,
        handler: u64,
    },
    FileIndex {
        parent: u64,
        name: String,
    },
    Wal {
        dbid: u64,
        relid: u64,
        forknum: u64,
        blocknum: u64,
    },
    // Has no dedicated arm in `to_scoperdkey`; it falls into the panicking catch-all there.
    TranCommit,
    // Key of a WAL record: timeline plus LSN (this is what `flushwals` builds).
    XlogKey {
        timeline: u64,
        lsn: u64,
    },
}

flushwals

将WAL日志写入tikv中,具体实现如下:


/// Writes a linked list of WAL records to TiKV with a single batch put.
///
/// `extern "C"` gives the function the C calling convention so that He3PG
/// (written in C) can call it, and `#[no_mangle]` keeps the exported symbol
/// name as `flushwals`.
///
/// Every list node becomes one key/value pair: the key is built from
/// `DBKey::XlogKey { timeline, lsn }` and the value is a copy of the
/// `length` bytes starting at `begin`. The caller keeps ownership of the
/// list and of the record buffers; nothing passed in is freed here.
///
/// Returns `1` on success and `0` when a node is malformed (negative
/// `length`, or a null `begin` with a non-zero `length`) or when the batch
/// put fails.
///
/// The caller must pass either a null pointer or a pointer to a valid,
/// null-terminated `XLogItem` list whose `begin`/`length` pairs describe
/// readable memory.
#[no_mangle]
pub extern "C" fn flushwals(mut xlog_item: *mut XLogItem, timeline: libc::uint32_t)->u8
{
    let curtl = u64::from(timeline);

    let mut xlogvec: Vec<TikvPair> = Vec::new();
    while !xlog_item.is_null()   // walk the list until the terminating null
    {
        // SAFETY: the pointer is non-null and, per the caller contract, points
        // to a valid `XLogItem` that outlives this call.
        let item = unsafe { &*xlog_item };

        let len = match usize::try_from(item.length) {
            Ok(len) => len,
            Err(_) => {
                println!("flush wals got invalid record length {} at lsn {}", item.length, item.xlogKey.lsn);
                return 0;
            }
        };

        // Copy the record bytes out of the caller-owned buffer. The memory was
        // not allocated by Rust, so it must only be borrowed as a slice and
        // never adopted with `Vec::from_raw_parts`.
        let value: Vec<u8> = if len == 0 {
            Vec::new()
        } else if item.begin.is_null() {
            println!("flush wals got null record buffer at lsn {}", item.xlogKey.lsn);
            return 0;
        } else {
            // SAFETY: `begin` is non-null and the caller guarantees `len`
            // readable bytes behind it for the duration of this call.
            unsafe { std::slice::from_raw_parts(item.begin as *const u8, len) }.to_vec()
        };

        // Key layout: timeline + batch id + lsn (see `to_scoperdkey`).
        let key: Key = DBKey::XlogKey{timeline: curtl, lsn: item.xlogKey.lsn}.to_scoperdkey().into();
        xlogvec.push(TikvPair::new(key, value));
        xlog_item = item.next;
    }

    // Drive the async TiKV batch put to completion on the shared runtime.
    let res = rt1.block_on(async { raw_client.get().await.batch_put(xlogvec).await });

    match res {
        Ok(_) => 1,
        Err(error) => {
            println!("batch put wals into kv failed:{:?}",error);
            0
        },
    }
}

// The key design, shown separately here.
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq, Hash, Clone, Copy)]
pub enum ScopedKey<'a> {
    Meta,
    Inode(u64),
    Block { ino: u64, block: u64 },
    FileHandler { ino: u64, handler: u64 },
    FileIndex { parent: u64, name: &'a str },
    Wal{dbid:u64,relid:u64,forknum:u64,blocknum:u64},
    XlogKey{timeline: u64, id: u64, lsn: u64},  // Keying purely by LSN would create a write hotspot in TiKV: the keys are sequential, so most of the load would land in a single region. The `id` field was added for that reason.
}

impl DBKey {
    pub fn to_scoperdkey(&self) -> ScopedKey {
        use DBKey::*;
        match self {
            Meta => ScopedKey::Meta,
            Inode(ino) => ScopedKey::Inode(*ino),
            Block { ino, block } => ScopedKey::Block {                                                    
                ino: *ino,
                block: *block,
            },
            FileHandler { ino, handler } => ScopedKey::FileHandler {
                ino: *ino,
                handler: *handler,
            },
            FileIndex { parent, name } => ScopedKey::FileIndex {
                parent: *parent,
                name: name,
            },
            Wal {
                dbid,
                relid,
                forknum,
                blocknum,
            } => ScopedKey::Wal {
                dbid: *dbid,
                relid: *relid,
                forknum: *forknum,
                blocknum: *blocknum,
            },
            XlogKey {
                timeline,
                lsn,
            } => ScopedKey::XlogKey {
                timeline: *timeline,
                id: (*lsn / MAX_BATCH_XLOG_SIZE as u64) % MAX_BATCHID,  
                lsn: *lsn,
            },
            _ => panic!("tran not void scopekey"),
        }
    }
}

batchRead

批量读取WAL日志,具体实现如下:

/// Reads the WAL records in `[startPtr, endPtr)` of `timeline` from TiKV.
///
/// When `needStore` is set (primary and read-only nodes; the push node does
/// not need it), every record for which `pgrecord::XLogRecordNeedToCache`
/// returns a non-empty key is also handed to `storeWalInLocalBuffer`, retrying
/// every 100 ms.
///
/// The concatenated record bytes are returned in one of two ways:
/// * total length `<= MAX_BATCH_XLOG_SIZE`: the bytes are copied into the
///   caller's `buf` and the returned `Bufrd` points at `buf`;
/// * otherwise: a heap buffer is allocated and its pointer, length and
///   capacity are returned; the caller then owns that allocation.
///
/// `buf` must be valid for writes of at least `MAX_BATCH_XLOG_SIZE` bytes.
///
/// # Panics
///
/// Panics when `startPtr >= endPtr`, and when storing into the local buffer
/// has failed more than 100 times.
#[no_mangle]
pub extern  "C" fn batchRead(
    buf: *mut libc::uint8_t,
    timeline:libc::uint32_t,
    startPtr:libc::uint64_t,
    endPtr:libc::uint64_t,
    needStore: bool
    )-> Bufrd {

    if startPtr >= endPtr {
        panic!("batch read startPtr {} is not less than endPtr {}", startPtr, endPtr);
    }
    // Build the keys from the arguments and fetch the records from TiKV.
    let res = rt1.block_on(pgread::batchRead(
        timeline.into(),
        startPtr,
        endPtr,
    ));

    let length: usize = res.iter().map(|rec| rec.len()).sum();

    if needStore {
        let mut kv_structs: Vec<kvStruct> = Vec::new();
        for d in res.iter()
        {
            let walkey = rt1.block_on(pgrecord::XLogRecordNeedToCache(d));
            if walkey.is_empty() {
                continue;
            }

            // The key carries big-endian integers at fixed offsets.
            let be_u32 = |at: usize| walkey[at..at + 4].iter().fold(0u32, |acc, &b| (acc << 8) | b as u32);
            let lpk = LdPageKey{
                dbid: be_u32(0),
                relid: be_u32(4),
                forkno: be_u32(8),
                blkno: be_u32(12),
            };
            // NOTE(review): byte 16 is skipped between blkno and the LSN —
            // presumably a tag/separator byte in the key; confirm.
            let lsn = walkey[17..25].iter().fold(0u64, |acc, &b| (acc << 8) | b as u64);

            kv_structs.push(kvStruct {
                lpk: lpk,
                buf: d.as_ptr(),       // borrowed from `res`, which outlives the store call below
                length: d.len() as i32,
                lsn: lsn,
            });
        }

        let kvStrLen = kv_structs.len() as i32;
        println!("kvStrLen is {}", kvStrLen);

        let strtPtr = kv_structs.as_ptr();
        println!("before storeWalInLocalBuffer");

        let mut fail_times: i32 = 0;
        // SAFETY: `strtPtr` points at `kvStrLen` initialized `kvStruct`s whose
        // record pointers stay valid while `res` is alive.
        while unsafe { storeWalInLocalBuffer(strtPtr, kvStrLen) } == 0
        {
            fail_times += 1;
            if fail_times > 100 {
                panic!("fail to store wal into localdisk after retry 100 times")
            }
            println!("store wal in localdisk failed, retry {} times!", fail_times);
            sleep(Duration::from_millis(100));
        }
        println!("after storeWalInLocalBuffer");
    }

    if length <= MAX_BATCH_XLOG_SIZE {
        // Small result: copy the records back to back into the caller's buffer.
        let mut written: usize = 0;
        for rec in res.iter().filter(|rec| !rec.is_empty())
        {
            // SAFETY: the caller guarantees `buf` holds MAX_BATCH_XLOG_SIZE
            // bytes and `written + rec.len() <= length <= MAX_BATCH_XLOG_SIZE`;
            // `rec` is a Rust-owned buffer and cannot overlap `buf`.
            unsafe {
                std::ptr::copy_nonoverlapping(rec.as_ptr(), buf.add(written), rec.len());
            }
            written += rec.len();
        }
        // Return the caller's buffer, never a pointer into a local that is
        // dropped when this function returns.
        Bufrd {
            buf: buf as *const u8,
            count: length,
            cap: length,
        }
    } else {
        // Large result: hand a freshly allocated buffer over to the caller.
        let mut tmp: Vec<u8> = Vec::with_capacity(length);
        for rec in res.iter()
        {
            tmp.extend_from_slice(rec);
        }
        let (x, y, z) = tmp.into_raw_parts();
        Bufrd {
            buf: x,
            count: y,
            cap: z,
        }
    }
}

DelRangeWals

从tikv中删除WAL日志,He3DB的WAL日志机制与PostgreSQL不同,后续再分析He3DB日志相关的设计。

/// Deletes the WAL records of timeline `tl` between `startlsn` and `endlsn`
/// from TiKV.
///
/// One `delete_range` call is issued for each batch id 0..8. Both bounds of a
/// range share the prefix `6 | timeline (u64 BE) | id (u64 BE)` and end in
/// the big-endian start or end LSN. Failures are only logged.
#[no_mangle]
pub extern "C" fn DelRangeWals(tl: libc::uint32_t, startlsn: libc::uint64_t, endlsn: libc::uint64_t) {
    let timeline_be = (tl as u64).to_be_bytes();

    // Builds one range bound: tag byte, timeline, batch id, then the LSN.
    // The tag 6 is presumably the key-type tag of WAL keys — TODO confirm.
    let bound_key = |batch_id: u64, lsn: u64| -> Vec<u8> {
        let mut bytes: Vec<u8> = vec![6u8];
        bytes.extend_from_slice(&timeline_be);
        bytes.extend_from_slice(&batch_id.to_be_bytes());
        bytes.extend_from_slice(&lsn.to_be_bytes());
        bytes
    };

    for batch_id in 0..8u64 {
        let range = bound_key(batch_id, startlsn)..bound_key(batch_id, endlsn);
        // Run the async TiKV range delete to completion on the shared runtime.
        let outcome = rt1.block_on(async { raw_client.get().await.delete_range(range).await });
        if let Err(error) = outcome {
            println!("delete range wals between {} and {} from kv failed:{:?}", startlsn, endlsn, error);
        }
    }
}

InsertConsistToKV

给推进节点使用:每当推进节点推进一批日志,会更新一个一致性点到tikv中,只读节点根据这个lsn去做链表关系的截断。

// NOTE(review): `WAL` is not referenced by the code shown alongside it — confirm where it is used.
const WAL: u8 = 2;
// Key prefix of the consistency-point entries written by `InsertConsistToKV`.
const CONSISTLSN: &str = "Clsn";

/// Stores a consistency-point LSN in TiKV.
///
/// The key is the `CONSISTLSN` prefix followed by the big-endian bytes of
/// `lsn`; the value is the big-endian bytes of `lsn`.
///
/// Returns `1` when the put succeeds and `0` (after logging) when it fails.
#[no_mangle]
pub extern "C" fn InsertConsistToKV(lsn: libc::uint64_t) -> u8 {
    let lsn_be = lsn.to_be_bytes();
    let key: Vec<u8> = [CONSISTLSN.as_bytes(), &lsn_be[..]].concat();
    let value: Vec<u8> = lsn_be.to_vec();

    // Run the async TiKV put to completion on the shared runtime.
    let outcome = rt1.block_on(async { raw_client.get().await.put(key, value).await });

    if let Err(error) = outcome {
        println!("put consistlsn into kv failed:{:?}",error);
        return 0;
    }
    1
}

最后

目前He3DB将WAL写入tikv中,可能是一个临时的方案,采用tikv是否过重了?未来将探索新的方案。


参考文档:
云原生数据库He3DB——安装
Rust 参考手册 中文版
c语言调用rust库函数
Rust异步之Future