boltdb 事务实现

写事务执行流程

事务Commit的流程如下：

截屏2023-01-07 上午11.13.55.png

rebalance，重平衡。由于部分节点删了元素，可能导致节点数据量不符合要求，不符合要求的节点需要重平衡。节点只有同时满足如下两个条件才不需要重平衡，否则就需要重平衡：

// Ignore if node is above threshold (25%) and has enough keys.
// The threshold is one quarter of the database page size. A node that is both
// larger than the threshold and holds more than minKeys() inodes is left
// untouched; failing either condition lets execution continue past this check.
var threshold = n.bucket.tx.db.pageSize / 4
if n.size() > threshold && len(n.inodes) > n.minKeys() {
  return
}

对于需要重平衡的节点，会和相邻节点合并，逻辑如下：

当前节点是父节点的第一个子节点时，将右侧相邻节点合并到当前节点

当前节点不是父节点的第一个子节点时，将当前节点合并到左侧相邻节点

// Destination node is right sibling if idx == 0, otherwise left sibling.
// useNextSibling is true exactly when n sits at index 0 among its parent's
// children; it is reused afterwards to decide the direction of the merge.
var target *node
var useNextSibling = (n.parent.childIndex(n) == 0)
if useNextSibling {
  target = n.nextSibling()
} else {
  target = n.prevSibling()
}

// If both this node and the target node are too small then merge them.
// The direction depends on useNextSibling:
//   - true:  target's inodes are appended to n, and target is removed.
//   - false: n's inodes are appended to target, and n is removed.
if useNextSibling {
  // Reparent all child nodes being moved.
  // Only children that are materialized in n.bucket.nodes need fixing up:
  // each is detached from its current parent and attached to n.
  for _, inode := range target.inodes {
    if child, ok := n.bucket.nodes[inode.pgid]; ok {
      child.parent.removeChild(child)
      child.parent = n
      child.parent.children = append(child.parent.children, child)
    }
  }

  // Copy over inodes from target and remove target.
  // target is dropped from the parent's inodes and child list and from the
  // bucket's node cache, and its page is handed to the freelist via free().
  n.inodes = append(n.inodes, target.inodes...)
  n.parent.del(target.key)
  n.parent.removeChild(target)
  delete(n.bucket.nodes, target.pgid)
  target.free()
} else {
  // Reparent all child nodes being moved.
  // Mirror image of the branch above: n's materialized children move to target.
  for _, inode := range n.inodes {
    if child, ok := n.bucket.nodes[inode.pgid]; ok {
      child.parent.removeChild(child)
      child.parent = target
      child.parent.children = append(child.parent.children, child)
    }
  }

  // Copy over inodes to target and remove node.
  // n is dropped from the parent's inodes and child list and from the
  // bucket's node cache, and its page is handed to the freelist via free().
  target.inodes = append(target.inodes, n.inodes...)
  n.parent.del(n.key)
  n.parent.removeChild(n)
  delete(n.bucket.nodes, n.pgid)
  n.free()
}

由于节点已经合并到相邻节点，所以被合并掉的节点占用的pgid可以放到freelist中等待被释放(为什么是等待被释放的状态，后面讲到隔离性时会说到)。

// free hands the page backing this node to the freelist, tagged with the
// current transaction id, and then clears the node's page id. A node whose
// page id is already zero has nothing to give back and is left alone.
func (n *node) free() {
  if n.pgid == 0 {
    return
  }

  tx := n.bucket.tx
  tx.db.freelist.free(tx.meta.txid, tx.page(n.pgid))
  n.pgid = 0
}

spill，将B+树写入到dirty page。这个dirty page是内存中的一个[]byte，spill是把节点数据写到这些字节切片中。由于第一步重平衡的过程中存在两节点合并的操作，可能会导致一个节点的数据超过一个数据页的大小，这个时候spill还会将节点拆分为多个。


// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
//
// Children are spilled before the node itself, and a node that is already
// marked as spilled is skipped. After writing, each resulting node registers
// its new page id with its parent.
func (n *node) spill() error {
  var tx = n.bucket.tx
  // Nothing to do if this node was already written during this spill pass.
  if n.spilled {
    return nil
  }

  // Spill child nodes first. Child nodes can materialize sibling nodes in
  // the case of split-merge so we cannot use a range loop. We have to check
  // the children size on every loop iteration.
  sort.Sort(n.children)
  for i := 0; i < len(n.children); i++ {
    if err := n.children[i].spill(); err != nil {
      return err
    }
  }

  // We no longer need the child list because it's only used for spill tracking.
  n.children = nil

  // Split nodes into appropriate sizes. The first node will always be n.
  var nodes = n.split(uintptr(tx.db.pageSize))
  for _, node := range nodes {
    // Add node's page to the freelist if it's not new.
    // The old page is never overwritten in place; the node always moves to
    // a freshly allocated page below.
    if node.pgid > 0 {
      tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
      node.pgid = 0
    }

    // Allocate contiguous space for the node.
    // The node size is rounded up to a whole number of pages.
    p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
    if err != nil {
      return err
    }

    // Write the node.
    // An allocated page id at or beyond tx.meta.pgid (the high water mark)
    // is treated as an unrecoverable invariant violation.
    if p.id >= tx.meta.pgid {
      panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
    }
    node.pgid = p.id
    node.write(p)
    node.spilled = true

    // Insert into parent inodes.
    if node.parent != nil {
      // A node with no key yet is looked up in the parent under its first
      // inode's key; otherwise its existing key is the one to replace.
      var key = node.key
      if key == nil {
        key = node.inodes[0].key
      }

      // Record the node in the parent under its first key together with the
      // new page id, then remember that key on the node.
      node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
      node.key = node.inodes[0].key
      _assert(len(node.key) > 0, "spill: zero-length node key")
    }

    // Update the statistics.
    tx.stats.Spill++
  }

  // If the root node split and created a new root then we need to spill that
  // as well. We'll clear out the children to make sure it doesn't try to respill.
  // A parent whose pgid is still zero has not been assigned a page yet.
  if n.parent != nil && n.parent.pgid == 0 {
    n.children = nil
    return n.parent.spill()
  }

  return nil
}

在拆分节点的过程中原始数据的pgid都会放到freelist中等待被释放，然后从freelist中申请空闲的pgid存储数据。

// Add node's page to the freelist if it's not new.
if node.pgid > 0 {
  tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
  node.pgid = 0


// Allocate contiguous space for the node.
p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
if err != nil {
  return err
}

commit freelist，新的freelist写入到内存的字节切片中。

// commitFreelist serializes the freelist into newly allocated pages and
// records the location of those pages in the transaction's meta. If the
// allocation raised the high water mark, the database is asked to grow.
// Every failure rolls the transaction back before the error is returned.
func (tx *Tx) commitFreelist() error {
  // Remember the high water mark so that growth can be detected afterwards.
  prevHighWater := tx.meta.pgid

  // The extra page makes this an overestimate of the freelist size, never an
  // underestimate (which would be bad).
  pageCount := (tx.db.freelist.size() / tx.db.pageSize) + 1
  flPage, err := tx.allocate(pageCount)
  if err != nil {
    tx.rollback()
    return err
  }

  if err = tx.db.freelist.write(flPage); err != nil {
    tx.rollback()
    return err
  }
  tx.meta.freelist = flPage.id

  // Done unless the allocation pushed the high water mark up.
  if tx.meta.pgid <= prevHighWater {
    return nil
  }

  if err = tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
    tx.rollback()
    return err
  }
  return nil
}

write dirty page to disk,将所有变更写入到磁盘。


// write writes any dirty pages to disk.
//
// The dirty pages are written in ascending page id order, the file is synced
// unless syncing is disabled, and finally the buffers of single-page entries
// are zeroed and returned to the page pool. The transaction's page cache is
// emptied before any I/O happens.
func (tx *Tx) write() error {
  // Sort pages by id.
  pages := make(pages, 0, len(tx.pages))
  for _, p := range tx.pages {
    pages = append(pages, p)
  }
  // Clear out page cache early.
  tx.pages = make(map[pgid]*page)
  sort.Sort(pages)

  // Write pages to disk in order.
  for _, p := range pages {
    // rem is the number of bytes still to write for this entry: the page
    // itself plus its overflow pages. offset is the file position derived
    // from the page id; written is the byte offset into the in-memory page.
    rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize)
    offset := int64(p.id) * int64(tx.db.pageSize)
    var written uintptr

    // Write out page in "max allocation" sized chunks.
    for {
      // Each chunk is capped at maxAllocSize-1 bytes.
      sz := rem
      if sz > maxAllocSize-1 {
        sz = maxAllocSize - 1
      }
      buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz))

      if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
        return err
      }

      // Update statistics.
      tx.stats.Write++

      // Exit inner for loop if we've written all the chunks.
      rem -= sz
      if rem == 0 {
        break
      }

      // Otherwise move offset forward and move pointer to next chunk.
      offset += int64(sz)
      written += uintptr(sz)
    }
  }

  // Ignore file sync if flag is set on DB.
  // The sync runs when db.NoSync is false, and also whenever IgnoreNoSync is
  // true regardless of db.NoSync.
  if !tx.db.NoSync || IgnoreNoSync {
    if err := fdatasync(tx.db); err != nil {
      return err
    }
  }

  // Put small pages back to page pool.
  for _, p := range pages {
    // Ignore page sizes over 1 page.
    // These are allocated using make() instead of the page pool.
    if int(p.overflow) != 0 {
      continue
    }

    buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize)

    // See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1
    // Zero the buffer so the pool never hands out stale page content.
    for i := range buf {
      buf[i] = 0
    }
    tx.db.pagePool.Put(buf) //nolint:staticcheck
  }

  return nil
}

write meta to disk，写入meta到磁盘。

// writeMeta renders the transaction's meta into a scratch, page-sized buffer,
// writes that buffer to the file and syncs the file unless syncing is
// disabled. The write counter is only bumped once all of that has succeeded.
func (tx *Tx) writeMeta() error {
  pageSize := tx.db.pageSize

  // Build the meta page in a throwaway buffer.
  scratch := make([]byte, pageSize)
  metaPage := tx.db.pageInBuffer(scratch, 0)
  tx.meta.write(metaPage)

  // The file offset comes from the page id as it stands after meta.write.
  fileOffset := int64(metaPage.id) * int64(pageSize)
  if _, err := tx.db.ops.writeAt(scratch, fileOffset); err != nil {
    return err
  }

  // Sync when IgnoreNoSync forces it or when the DB has not opted out.
  if IgnoreNoSync || !tx.db.NoSync {
    if err := fdatasync(tx.db); err != nil {
      return err
    }
  }

  // Count the meta write in the statistics.
  tx.stats.Write++
  return nil
}

$\color{red}{此步骤不管成功与否，都会影响 db.meta0 或者 db.meta1 的值。}$ 因为在db初始化中会执行mmap,其中就会将db.meta0指向文件中的第一个page；db.meta1指向文件中的第二个page。

// Save references to the meta pages.
// meta0 and meta1 are views of pages 0 and 1 obtained through db.page.
db.meta0 = db.page(0).meta()
db.meta1 = db.page(1).meta()

如何保证事务的AID

事务执行的五个步骤可以进一步划分为如下三块：

步骤1.rebalance，2.spill和3.commit freelist都是对内存的操作，不会影响其他事务的读写。
步骤4. write dirty page to disk会将修改过的B+树和freelist写入到磁盘。
- 对于B+树来说，修改的路径会保存下来。包括叶子节点到根节点的路径，以及因为此次事务造成变动的所有节点。
- 对于freelist来说会增加此次事务释放的pgid，并去掉此次事务使用了的pgid。
步骤5.write meta to disk会将meta写入到磁盘。meta中包括如下数据：
```
// meta is the content of a meta page. Each transaction works on its own copy.
type meta struct {
  magic    uint32
  version  uint32
  pageSize uint32
  flags    uint32
  // root is the root bucket; a transaction copies it into its own root bucket.
  root     bucket
  // freelist is the id of the page the freelist was written to.
  freelist pgid
  // pgid is the high water mark: allocated page ids must stay below it.
  pgid     pgid
  // txid is the transaction id; a writable transaction increments its copy.
  txid     txid
  // checksum guards the other fields against corruption, as described in the
  // accompanying text.
  checksum uint64
}
```
其中root决定了boltdb中存储的数据的基本结构，freelist决定了boltdb中空闲pgid。这两部分在最后写入，只要meta写入成功了，那么这次事务就成功提交了。

原子性

要想让写事务要么达到执行完成的状态、要么回滚到执行前的状态，就需要在写事务过程中出现问题的时候可以通过回滚来还原数据。回滚代码如下：

// rollback abandons the transaction and closes it. For a writable transaction
// the freelist changes made under this transaction id are undone and the free
// pages are reloaded, in case a system error such as a failed fsync occurred.
// A transaction that is no longer attached to a DB is ignored.
func (tx *Tx) rollback() {
  if tx.db == nil {
    return
  }

  if tx.writable {
    db := tx.db
    db.freelist.rollback(tx.meta.txid)

    if db.hasSyncedFreelist() {
      // The freelist page is available: reload the free pages from it.
      db.freelist.reload(db.page(db.meta().freelist))
    } else {
      // No synced freelist: rebuild the free page list by scanning the DB.
      // Note: scanning the whole db is heavy when the db is large.
      db.freelist.noSyncReload(db.freepages())
    }
  }

  tx.close()
}

持久性

如果boltdb在写事务提交过程中断电了，需要确保数据库重启的时候不会因为脏数据而损坏。

首先需要解决的问题是如何发现数据损坏。可以通过checksum：每次写入meta时，都会对meta中除checksum字段以外的内容计算校验和并保存到checksum字段。读取的时候重新计算校验和并与保存的值比较，不一致则说明数据损坏了。

发现了数据损坏之后该怎么处理？boltdb有两个特殊的数据页meta0和meta1，pgid分别为0和1。每个Tx初始化的时候需要将db的meta复制一份。在读取db的meta的时候，会校验这两个meta哪个是没问题的。由于写事务是顺序执行的，所以至少有一个meta是没问题的。

// meta returns the meta page that should be treated as current: the one with
// the highest txid that passes validation. Falling back to the other page
// avoids reporting errors while the database is in fact consistent.
func (db *DB) meta() *meta {
  // Order the two candidates by transaction id, newest first. On a tie,
  // meta0 is tried first.
  newer, older := db.meta0, db.meta1
  if db.meta1.txid > db.meta0.txid {
    newer, older = db.meta1, db.meta0
  }

  // Hand back the first candidate that validates.
  if newer.validate() == nil {
    return newer
  }
  if older.validate() == nil {
    return older
  }

  // Both meta0 and meta1 were validated on mmap() and every write is followed
  // by an fsync(), so this point should never be reached.
  panic("bolt.DB.meta(): invalid meta pages")
}

隔离性

boltdb不能并发写，但支持一写多读。限制的场景如下：

不能并发写，boltdb中每个写事务在创建的时候均需要获得互斥锁db.rwlock并在事务提交的时候释放此锁。
在对db数据修改的时候读写事务都需要通过互斥锁db.metalock来修改。
在对db扩容的时候读写事务会对db.mmaplock有竞争

为了实现读写的隔离性，boltdb做了如下事情：

copy on write，每个写事务都会申请新的pgid来存储此次修改部分的数据，而不是原址修改。

每个事务开启的时候(包括读和写)，都会从db.meta复制一份到Tx。下面代码中db.meta().copy(tx.meta)就是复制meta的操作。

  // init initializes the transaction.
  func (tx *Tx) init(db *DB) {
    tx.db = db
    tx.pages = nil

    // Copy the meta page since it can be changed by the writer.
    tx.meta = &meta{}
    db.meta().copy(tx.meta)

    // Copy over the root bucket.
    tx.root = newBucket(tx)
    tx.root.bucket = &bucket{}
    *tx.root.bucket = tx.meta.root

    // Increment the transaction id and add a page cache for writable transactions.
    if tx.writable {
      tx.pages = make(map[pgid]*page)
      tx.meta.txid += txid(1)
    }
  }

对于读事务来说事务开启的时候读取到的meta中的B+树的根节点就决定了此次读取的数据范围了。这也意味着meta中对应的写事务在提交之后所释放的pgid并不能立刻被后续的写事务使用。那么这些被释放的pgid就需要考虑回收的时机了，因为如果一直不释放就会不断扩大磁盘空间。

boltdb在开启写事务的时候，会检查哪些写事务中等待释放的pgid可以释放掉，满足以下任一条件即可释放：

txid小于所有未结束事务中最小txid的写事务
txid落在相邻两个未结束事务的txid之间的写事务

// freePages releases any pages associated with closed read-only transactions,
// i.e. pending pages that fall outside the ids of the transactions in db.txs.
func (db *DB) freePages() {
  maxTxid := txid(0xFFFFFFFFFFFFFFFF)

  // Order the open transactions by id so that the earliest comes first.
  sort.Sort(txsById(db.txs))

  // Everything pending from before the earliest open transaction can go.
  lower := maxTxid
  if len(db.txs) != 0 {
    lower = db.txs[0].meta.txid
  }
  if lower > 0 {
    db.freelist.release(lower - 1)
  }

  // Walk the open transactions and release the unused txid extent in front
  // of each one. Any page both allocated and freed inside such an extent is
  // safe to release.
  for i := range db.txs {
    current := db.txs[i].meta.txid
    db.freelist.releaseRange(lower, current-1)
    lower = current + 1
  }

  // Finally release the extent after the last open transaction.
  db.freelist.releaseRange(lower, maxTxid)
}