导言:本文介绍MVStore是如何执行put操作的。从中可以了解MVStore怎样利用B+树插入元素;另外,教科书上都说B+树能够减少IO,但具体是怎么实现的,我之前并不太了解,本文也会一并说明。下面结合代码和注释进行讲解,整体流程并不复杂。
总的操作如下(已把大部分次要的代码省略):
/**
 * Looks up {@code key} in the tree and applies the action chosen by
 * {@code decisionMaker}. This excerpt only shows the PUT branch; the other
 * branches are elided by the placeholder line inside the switch.
 *
 * The whole body runs in a retry loop: while the root is not locked by this
 * thread, any detected change of the root reference restarts the attempt.
 *
 * @param key the key to operate on
 * @param value the candidate value handed to the decision maker
 * @param decisionMaker decides what to do with the existing and the new value
 * @return the value found for {@code key} before the change, or {@code null}
 *         when the key was not present in the leaf
 */
public V operate(K key, V value, DecisionMaker<? super V> decisionMaker) {
IntValueHolder unsavedMemoryHolder = new IntValueHolder();
int attempt = 0;
// Retry loop: every "continue" below starts over with a freshly fetched root reference.
while(true) {
RootReference<K,V> rootReference = flushAndGetRoot();
boolean locked = rootReference.isLockedByCurrentThread();
if (!locked) {
// beforeWrite() is invoked on the first attempt only
if (attempt++ == 0) {
beforeWrite();
}
// after more than 3 attempts, or when the root is already locked, take the lock
if (attempt > 3 || rootReference.isLocked()) {
rootReference = lockRoot(rootReference, attempt);
locked = true;
}
}
Page<K,V> rootPage = rootReference.root;
long version = rootReference.version;
CursorPos<K,V> tip;
V result;
unsavedMemoryHolder.value = 0;
try {
// walk from the root down to the leaf responsible for key
CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key);
// without the lock, a changed root means the traversal may be stale: retry
if(!locked && rootReference != getRoot()) {
continue;
}
Page<K,V> p = pos.page;
int index = pos.index;
// tip keeps the leaf-level position; pos moves up to the parent level
tip = pos;
pos = pos.parent;
// a negative index means the key is not in the leaf
result = index < 0 ? null : p.getValue(index);
Decision decision = decisionMaker.decide(result, value, tip);
switch (decision) {
........
case PUT: {
value = decisionMaker.selectValue(result, value);
// work on a copy of the leaf instead of the page reachable from the current root
p = p.copy();
if (index < 0) {
// key is absent: insert it at position -index - 1
p.insertLeaf(-index - 1, key, value);
int keyCount;
// split while the page holds too many keys, or uses too much memory
// and still has enough keys to be split
while ((keyCount = p.getKeyCount()) > store.getKeysPerPage()
|| p.getMemory() > store.getMaxPageSize()
&& keyCount > (p.isLeaf() ? 1 : 2)) {
long totalCount = p.getTotalCount();
// split position: the middle key
int at = keyCount >> 1;
K k = p.getKey(at);
Page<K,V> split = p.split(at);
unsavedMemoryHolder.value += p.getMemory() + split.getMemory();
// no parent left: the root itself was split, so build a new root
// with one key and the two halves as children
if (pos == null) {
K[] keys = p.createKeyStorage(1);
keys[0] = k;
Page.PageReference<K,V>[] children = Page.createRefStorage(2);
children[0] = new Page.PageReference<>(p);
children[1] = new Page.PageReference<>(split);
p = Page.createNode(this, keys, children, totalCount, 0);
break;
}
// otherwise step up one level: c is the (left) child, p becomes a copy of its parent
Page<K,V> c = p;
p = pos.page;
index = pos.index;
pos = pos.parent;
p = p.copy();
// register the new right half, then insert the separator key and the left half
p.setChild(index, split);
p.insertNode(index, k, c);
}
} else {
// key already present: overwrite its value in the copied leaf
p.setValue(index, value);
}
break;
}
}
// NOTE(review): replacePage is not shown here; presumably it rebuilds the remaining
// ancestors up to the root and returns the new root page - confirm
rootPage = replacePage(pos, p, unsavedMemoryHolder);
if (!locked) {
// a null result from updateRootPage restarts the attempt after resetting the decision maker
rootReference = rootReference.updateRootPage(rootPage, attempt);
if (rootReference == null) {
decisionMaker.reset();
continue;
}
}
store.registerUnsavedMemory(unsavedMemoryHolder.value + tip.processRemovalInfo(version));
return result;
} finally {
// runs on return, on continue and on exceptions whenever the lock is held
if(locked) {
unlockRoot(rootPage);
}
}
}
}
一、
// First obtain this MVMap's RootReference; the map's root page is reachable through it
RootReference<K,V> rootReference = flushAndGetRoot();
二、
// Get the root page of the MVMap from the root reference
Page<K,V> rootPage = rootReference.root;
三、
// 从rootPage中开始获取key对应的value,这里用了一个CursorPos类来封装
CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key)
该方法如下:
// 方法还是很容易理解的,就是B+树的搜索过程。从根节点一直搜索直到叶子结点。B+树的叶子结点包含了key和value
/**
 * Walks from {@code page} down to a leaf, following the child selected by a
 * binary search for {@code key} on every non-leaf page.
 *
 * @param page the page to start from
 * @param key the key to search for
 * @return the position in the leaf (page plus the raw binary search result),
 *         linked through {@code parent} to the positions taken in the
 *         non-leaf pages above it
 */
static <K,V> CursorPos<K,V> traverseDown(Page<K,V> page, K key) {
    CursorPos<K,V> path = null;
    Page<K,V> current = page;
    while (!current.isLeaf()) {
        int shifted = current.binarySearch(key) + 1;
        // a negative value is turned into the child index by negation
        int childIndex = shifted < 0 ? -shifted : shifted;
        path = new CursorPos<>(current, childIndex, path);
        // fetching the child may read it from storage when it is not in memory
        current = current.getChildPage(childIndex);
    }
    int leafIndex = current.binarySearch(key);
    return new CursorPos<>(current, leafIndex, path);
}
// 叶子结点没有实现getChildPage方法
/**
 * Child access is not available on this page type.
 *
 * @param index ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Page<K,V> getChildPage(int index) {
throw new UnsupportedOperationException();
}
// 非叶子结点NonLeaf实现的getChildPage方法
/**
 * Returns the child page at the given index, reading it through the map
 * when the reference does not hold the page in memory.
 *
 * @param index index into the children array
 * @return the child page
 */
@Override
public Page<K,V> getChildPage(int index) {
    PageReference<K,V> ref = children[index];
    Page<K,V> child = ref.getPage();
    if (child != null) {
        return child;
    }
    // not in memory: load it by its stored position
    child = map.readPage(ref.getPos());
    assert ref.getPos() == child.getPos();
    assert ref.count == child.getTotalCount();
    return child;
}
// PageReference类(省略了部分不重要的属性和方法) 。
/**
 * A reference to a child page: its position and, when loaded, the page itself.
 * This is an abridged excerpt; members not relevant to the discussion are omitted.
 */
public static final class PageReference<K,V> {
/**
 * The position, if known, or 0.
 * For the meaning of the individual segments of a page position see the
 * earlier article https://juejin.cn/post/7011130873634357262.
 */
private long pos;
/**
 * The page, if in memory, or null.
 */
private Page<K,V> page;
// returns the in-memory page, or null when it is not in memory
public Page<K,V> getPage() {
return page;
}
// returns the stored position of the page
long getPos() {
return pos;
}
}
四、非叶子结点NonLeaf实现的getChildPage方法。这个方法就是将Page从外部设备读入内存的方法
/**
 * Fetches the child at {@code index}. When the child reference carries no
 * in-memory page, the page is read via {@code map.readPage} using the
 * position stored in the reference.
 *
 * @param index index into the children array
 * @return the child page
 */
public Page<K,V> getChildPage(int index) {
    final PageReference<K,V> childRef = children[index];
    final Page<K,V> inMemory = childRef.getPage();
    if (inMemory != null) {
        return inMemory;
    }
    // the child is not loaded yet: this is where the read from storage happens
    final Page<K,V> loaded = map.readPage(childRef.getPos());
    assert childRef.getPos() == loaded.getPos();
    assert childRef.count == loaded.getTotalCount();
    return loaded;
}
// 利用了MVStore的readPage方法
/**
 * Reads the page at the given position by delegating to the store.
 *
 * @param pos the page position
 * @return the page returned by {@code store.readPage}
 */
final Page<K,V> readPage(long pos) {
return store.readPage(this, pos);
}
五、
/**
 * Returns the page stored at {@code pos}, serving it from the page cache when
 * possible and otherwise reading it from its chunk and caching it.
 *
 * @param map the map the page belongs to
 * @param pos the page position
 * @return the page; in recovery mode an empty leaf is returned instead of
 *         propagating an MVStoreException
 */
<K,V> Page<K,V> readPage(MVMap<K,V> map, long pos) {
    try {
        if (!DataUtils.isPageSaved(pos)) {
            throw DataUtils.newMVStoreException(
                    DataUtils.ERROR_FILE_CORRUPT, "Position 0");
        }
        // cache hit: nothing to read
        Page<K,V> cached = readPageFromCache(pos);
        if (cached != null) {
            return cached;
        }
        // cache miss: locate the chunk and the page offset encoded in pos
        Chunk chunk = getChunk(pos);
        int pageOffset = DataUtils.getPageOffset(pos);
        Page<K,V> loaded;
        try {
            // read the raw page bytes, then deserialize the page from them
            ByteBuffer buff = chunk.readBufferForPage(fileStore, pageOffset, pos);
            loaded = Page.read(buff, pos, map);
            if (loaded.pageNo < 0) {
                loaded.pageNo = calculatePageNo(pos);
            }
        } catch (MVStoreException e) {
            throw e;
        } catch (Exception e) {
            throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                    "Unable to read the page at position {0}, chunk {1}, offset {2}",
                    pos, chunk.id, pageOffset, e);
        }
        cachePage(loaded);
        return loaded;
    } catch (MVStoreException e) {
        if (recoveryMode) {
            return map.createEmptyLeaf();
        }
        throw e;
    }
}
// 通过pos值获取page所在的chunk
/**
 * Finds the chunk that contains the page at {@code pos}.
 * The chunk id is taken from the position; a chunk missing from the
 * in-memory chunk map is restored from its entry in the layout map
 * (see part 1 of this series for what the layout map stores) and registered.
 *
 * @param pos the page position
 * @return the chunk
 */
private Chunk getChunk(long pos) {
    int chunkId = DataUtils.getPageChunkId(pos);
    Chunk known = chunks.get(chunkId);
    if (known != null) {
        return known;
    }
    checkOpen();
    // the layout map is keyed by Chunk.getMetaKey(chunkId) and holds the chunk in string form
    String serialized = layout.get(Chunk.getMetaKey(chunkId));
    if (serialized == null) {
        throw DataUtils.newMVStoreException(
                DataUtils.ERROR_CHUNK_NOT_FOUND,
                "Chunk {0} not found", chunkId);
    }
    Chunk restored = Chunk.fromString(serialized);
    if (!restored.isSaved()) {
        throw DataUtils.newMVStoreException(
                DataUtils.ERROR_FILE_CORRUPT,
                "Chunk {0} is invalid", chunkId);
    }
    chunks.put(restored.id, restored);
    return restored;
}
layout存储结构例子:
六、
/**
 * Reads the bytes of one page of this chunk into a buffer.
 *
 * The read is repeated until the chunk's {@code block} field has the same
 * value after the read as before it; an MVStoreException raised while the
 * value changed is dropped and the read is retried.
 * NOTE(review): presumably this guards against the chunk being moved
 * concurrently - confirm.
 *
 * @param fileStore the store to read from
 * @param offset the byte offset of the page within the chunk
 * @param pos the page position (used for the page length and in error messages)
 * @return the buffer holding the page bytes
 */
ByteBuffer readBufferForPage(FileStore fileStore, int offset, long pos) {
assert isSaved() : this;
while (true) {
long originalBlock = block;
try {
// originalBlock is the chunk's starting block in the store file;
// multiplied by the block size it gives the chunk's starting byte position
long filePos = originalBlock * MVStore.BLOCK_SIZE;
// the byte position where this chunk ends
long maxPos = filePos + len * MVStore.BLOCK_SIZE;
// starting position plus the page offset
filePos += offset;
if (filePos < 0) {
throw DataUtils.newMVStoreException(
DataUtils.ERROR_FILE_CORRUPT,
"Negative position {0}; p={1}, c={2}", filePos, pos, toString());
}
int length = DataUtils.getPageMaxLength(pos);
if (length == DataUtils.PAGE_LARGE) {
// read the first bytes to figure out actual length
length = fileStore.readFully(filePos, 128).getInt();
// pageNo is deliberately not included into length to preserve compatibility
// TODO: remove this adjustment when page on disk format is re-organized
length += 4;
}
// take the smaller of: bytes from the page start to the end of the chunk,
// and the page length derived above
length = (int) Math.min(maxPos - filePos, length);
if (length < 0) {
throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
"Illegal page length {0} reading at {1}; max pos {2} ", length, filePos, maxPos);
}
// read length bytes starting at filePos into a ByteBuffer
ByteBuffer buff = fileStore.readFully(filePos, length);
// only trust the data if block did not change during the read; otherwise loop again
if (originalBlock == block) {
return buff;
}
} catch (MVStoreException ex) {
// propagate only if block is unchanged; otherwise retry
if (originalBlock == block) {
throw ex;
}
}
}
}
// java.nio.channels.FileChannel 从位置pos读取len字节到ByteBuffer
/**
 * Reads {@code len} bytes starting at file position {@code pos} into a newly
 * allocated buffer and updates the read statistics.
 *
 * @param pos the position in the file
 * @param len the number of bytes to read
 * @return the buffer that was filled by {@code DataUtils.readFully}
 */
public ByteBuffer readFully(long pos, int len) {
ByteBuffer dst = ByteBuffer.allocate(len);
DataUtils.readFully(file, pos, dst);
// statistics: one more read operation, len more bytes read
readCount.incrementAndGet();
readBytes.addAndGet(len);
return dst;
}
七、从Buffer中读入Page。Page.read(buff, pos, map);
/**
 * Creates a page of the kind encoded in {@code pos} (leaf or non-leaf) and
 * fills it from the buffer.
 *
 * @param buff the buffer holding the serialized page
 * @param pos the page position
 * @param map the map the page belongs to
 * @return the page that was read
 */
static <K,V> Page<K,V> read(ByteBuffer buff, long pos, MVMap<K,V> map) {
    Page<K,V> page;
    // the lowest bit of the page type tells leaf from non-leaf
    if ((DataUtils.getPageType(pos) & 1) == PAGE_TYPE_LEAF) {
        page = new Leaf<>(map);
    } else {
        page = new NonLeaf<>(map);
    }
    page.pos = pos;
    page.read(buff);
    return page;
}
// Page$NonLeaf从ByteBuffer中读入。整个过程可以参考<<H2的存储引擎MVStore剖析(2) —— Page的读入.md>>。理解了Page的结构之后就能明白整个流程
/**
 * Fills this page from its serialized form.
 *
 * Fields are consumed in this order: page length, check value, map id, key
 * count, type byte, (for non-leaf pages) the child references, the optionally
 * compressed remainder containing the keys and (for leaf pages) the values.
 * Length, check value, map id and node type are validated and a
 * file-corrupt exception is raised on mismatch.
 *
 * @param buff the buffer positioned at the start of the page
 */
private void read(ByteBuffer buff) {
int chunkId = DataUtils.getPageChunkId(pos);
int offset = DataUtils.getPageOffset(pos);
int start = buff.position();
int pageLength = buff.getInt(); // does not include optional part (pageNo)
// bytes available counted from the start of the page (the 4 length bytes were just consumed)
int remaining = buff.remaining() + 4;
if (pageLength > remaining || pageLength < 4) {
throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
"File corrupted in chunk {0}, expected page length 4..{1}, got {2}", chunkId, remaining,
pageLength);
}
// the stored check value must match the one derived from chunk id, offset and page length
short check = buff.getShort();
int checkTest = DataUtils.getCheckValue(chunkId)
^ DataUtils.getCheckValue(offset)
^ DataUtils.getCheckValue(pageLength);
if (check != (short) checkTest) {
throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
"File corrupted in chunk {0}, expected check value {1}, got {2}", chunkId, checkTest, check);
}
// the page must belong to the map it is being read for
int mapId = DataUtils.readVarInt(buff);
if (mapId != map.getId()) {
throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
"File corrupted in chunk {0}, expected map id {1}, got {2}", chunkId, map.getId(), mapId);
}
// number of keys stored in this page
int len = DataUtils.readVarInt(buff);
keys = createKeyStorage(len);
int type = buff.get();
// the stored node type must agree with the kind of page object being filled
if(isLeaf() != ((type & 1) == PAGE_TYPE_LEAF)) {
throw DataUtils.newMVStoreException(
DataUtils.ERROR_FILE_CORRUPT,
"File corrupted in chunk {0}, expected node type {1}, got {2}",
chunkId, isLeaf() ? "0" : "1" , type);
}
// jump ahead and read pageNo, because if page is compressed,
// buffer will be replaced by uncompressed one
if ((type & DataUtils.PAGE_HAS_PAGE_NO) != 0) {
int position = buff.position();
buff.position(start + pageLength);
pageNo = DataUtils.readVarInt(buff);
buff.position(position);
}
// to restrain hacky GenericDataType, which grabs the whole remainder of the buffer
buff.limit(start + pageLength);
// non-leaf pages read their child references here, before any decompression
if (!isLeaf()) {
readPayLoad(buff);
}
boolean compressed = (type & DataUtils.PAGE_COMPRESSED) != 0;
if (compressed) {
Compressor compressor;
if ((type & DataUtils.PAGE_COMPRESSED_HIGH) ==
DataUtils.PAGE_COMPRESSED_HIGH) {
compressor = map.getStore().getCompressorHigh();
} else {
compressor = map.getStore().getCompressorFast();
}
// lenAdd is the difference between expanded and compressed length
int lenAdd = DataUtils.readVarInt(buff);
int compLen = buff.remaining();
byte[] comp;
// note: this local shadows the page field named pos for the rest of this block
int pos = 0;
if (buff.hasArray()) {
comp = buff.array();
pos = buff.arrayOffset() + buff.position();
} else {
comp = Utils.newBytes(compLen);
buff.get(comp);
}
int l = compLen + lenAdd;
// from here on buff is a new buffer of the expanded size
buff = ByteBuffer.allocate(l);
compressor.expand(comp, pos, compLen, buff.array(),
buff.arrayOffset(), l);
}
// read the keys into the key array (K[] keys)
map.getKeyType().read(buff, keys, len);
if (isLeaf()) {
// leaf pages read their values here; leaf and non-leaf implement readPayLoad differently
readPayLoad(buff);
}
diskSpaceUsed = pageLength;
recalculateMemory();
}
八、readPayLoad()
// Leaf 的读入
/**
 * Leaf payload: allocates the value array, one slot per key, and lets the
 * map's value type read the values from the buffer.
 *
 * @param buff the buffer positioned at the values
 */
protected void readPayLoad(ByteBuffer buff) {
    values = createValueStorage(getKeyCount());
    map.getValueType().read(buff, values, getKeyCount());
}
//NonLeaf 读入。 children是PageReference的数组
/**
 * Non-leaf payload: reads one more child reference than there are keys.
 * All child positions come first as longs, followed by one var-long
 * descendant count per child; the counts are summed into {@code totalCount}.
 *
 * @param buff the buffer positioned at the child positions
 */
protected void readPayLoad(ByteBuffer buff) {
    int childCount = getKeyCount() + 1;
    children = createRefStorage(childCount);
    long[] positions = new long[childCount];
    for (int i = 0; i < childCount; i++) {
        positions[i] = buff.getLong();
    }
    long sum = 0;
    for (int i = 0; i < childCount; i++) {
        long count = DataUtils.readVarLong(buff);
        long position = positions[i];
        assert position == 0 ? count == 0 : count >= 0;
        sum += count;
        // position 0 is represented by the shared empty reference
        if (position == 0) {
            children[i] = PageReference.empty();
        } else {
            children[i] = new PageReference<>(position, count);
        }
    }
    totalCount = sum;
}
九、
traverseDown(Page<K,V> page, K key)方法返回的CursorPos指向目标叶子节点,并通过parent链记录了从根节点到该叶子节点的路径,也就是在B+树上搜索这个key需要读入的所有页。
比如下面搜索key=144,那么就会读入根节点和根节点中key=144和key=168之间的那棵子树(标红部分)。
CursorPos类的主要属性只有3个:page、index和parent。traverseDown(Page<K,V> page, K key)返回的CursorPos对应某个叶子节点:如果该叶子节点包含key,index就是key所在的下标(index >= 0);如果不包含,index为负数,此时-index - 1就是key应当插入的位置。
/**
 * One step of a path through the tree: a page, an index within that page,
 * and a link to the step taken in the page one level up.
 */
public final class CursorPos<K,V> {

    /** The page at this level of the path. */
    public Page<K,V> page;

    /** The index within {@link #page}. */
    public int index;

    /** The position one level closer to the root, or null at the top. */
    public CursorPos<K,V> parent;

    public CursorPos(Page<K,V> page, int index, CursorPos<K,V> parent) {
        this.page = page;
        this.index = index;
        this.parent = parent;
    }

    /**
     * Descends level by level from {@code page} to a leaf, picking the child
     * chosen by a binary search for {@code key} at every non-leaf page.
     *
     * @param page the page to start from
     * @param key the key to search for
     * @return the leaf-level position, chained via {@code parent} to the
     *         positions of the pages above it
     */
    static <K,V> CursorPos<K,V> traverseDown(Page<K,V> page, K key) {
        CursorPos<K,V> trail = null;
        Page<K,V> node = page;
        while (!node.isLeaf()) {
            int slot = node.binarySearch(key) + 1;
            if (slot < 0) {
                slot = -slot;
            }
            trail = new CursorPos<>(node, slot, trail);
            node = node.getChildPage(slot);
        }
        int leafIndex = node.binarySearch(key);
        return new CursorPos<>(node, leafIndex, trail);
    }

    /**
     * Calls {@code removePage(version)} on every page from this position up
     * to the top of the path and adds up the returned values.
     *
     * @param version the version passed to each {@code removePage} call
     * @return the sum of the values returned by the {@code removePage} calls
     */
    int processRemovalInfo(long version) {
        int unsavedMemory = 0;
        CursorPos<K,V> step = this;
        while (step != null) {
            unsavedMemory += step.page.removePage(version);
            step = step.parent;
        }
        return unsavedMemory;
    }
}
// Excerpt from operate(): locate the leaf for key, then act on the decision.
CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key);
// without the lock, a changed root invalidates this traversal: retry
if(!locked && rootReference != getRoot()) {
continue;
}
Page<K,V> p = pos.page;
int index = pos.index;
tip = pos;
// move pos up to the parent of the current position
pos = pos.parent;
// a negative index means the key was not found in the leaf
result = index < 0 ? null : p.getValue(index);
Decision decision = decisionMaker.decide(result, value, tip);
switch (decision) {
case REPEAT:
// start over after resetting the decision maker
decisionMaker.reset();
continue;
case ABORT:
// only return when the root is still the one this attempt started from
if(!locked && rootReference != getRoot()) {
decisionMaker.reset();
continue;
}
return result;
case REMOVE: {
......
十、
// Excerpt from operate(): the PUT branch.
case PUT: {
value = decisionMaker.selectValue(result, value);
// make a copy of the bottom-level leaf page (the article describes this as a shallow copy)
p = p.copy();
if (index < 0) {
// insert key and value into the leaf page at position -index - 1
p.insertLeaf(-index - 1, key, value);
int keyCount;
// B+ tree handling of a full node: split while a threshold is exceeded
while ((keyCount = p.getKeyCount()) > store.getKeysPerPage()
|| p.getMemory() > store.getMaxPageSize()
&& keyCount > (p.isLeaf() ? 1 : 2)) {
long totalCount = p.getTotalCount();
// half of keyCount, i.e. the middle position
int at = keyCount >> 1;
K k = p.getKey(at);
// split the page; leaf and non-leaf pages implement this differently
Page<K,V> split = p.split(at);
unsavedMemoryHolder.value += p.getMemory() + split.getMemory();
// pos == null means the page that was split is the root
if (pos == null) {
// create a key array of length 1
K[] keys = p.createKeyStorage(1);
keys[0] = k;
Page.PageReference<K,V>[] children = Page.createRefStorage(2);
children[0] = new Page.PageReference<>(p);
children[1] = new Page.PageReference<>(split);
p = Page.createNode(this, keys, children, totalCount, 0);
break;
}
Page<K,V> c = p; // p is the child page
p = pos.page; // pos is the CursorPos of the parent
index = pos.index;
pos = pos.parent;
p = p.copy(); // copy the parent page. Author's note: pages are copied like this in several places; the reason is not fully understood yet
p.setChild(index, split); // the parent's slot at index now refers to the newly split-off page
p.insertNode(index, k, c); // insert the separator key into the parent and make children[index] refer to the child page c
}
} else {
p.setValue(index, value);
}
break;
}
//叶子节点insert (key, value)
/**
 * Inserts a key-value pair into this leaf at the given index.
 * The value array is only grown when this page has one.
 *
 * @param index the insertion index
 * @param key the key to insert
 * @param value the value to insert
 */
public void insertLeaf(int index, K key, V value) {
    // must be captured before insertKey() grows the key array
    int previousCount = getKeyCount();
    insertKey(index, key);
    if (values == null) {
        return;
    }
    V[] grown = createValueStorage(previousCount + 1);
    DataUtils.copyWithGap(values, grown, previousCount, index);
    values = grown;
    setValueInternal(index, value);
    if (isPersistent()) {
        addMemory(MEMORY_POINTER + map.evaluateMemoryForValue(value));
    }
}
//非叶子没有实现insert方法
/**
 * Leaf insertion is not available on this page type.
 *
 * @param index ignored
 * @param key ignored
 * @param value ignored
 * @throws UnsupportedOperationException always
 */
public void insertLeaf(int index, K key, V value) {
throw new UnsupportedOperationException();
}
// 叶子节点
/**
 * Adds {@code key} and {@code value} to this leaf at position {@code index}.
 *
 * @param index the insertion index
 * @param key the key to insert
 * @param value the value to insert
 */
public void insertLeaf(int index, K key, V value) {
    // size of the key array before the new key goes in
    final int sizeBefore = getKeyCount();
    // put the key at position index
    insertKey(index, key);
    if (values != null) {
        // build a value array one slot larger, leaving a gap at index
        final V[] widened = createValueStorage(sizeBefore + 1);
        DataUtils.copyWithGap(values, widened, sizeBefore, index);
        values = widened;
        setValueInternal(index, value);
        if (isPersistent()) {
            final int addedMemory = MEMORY_POINTER + map.evaluateMemoryForValue(value);
            addMemory(addedMemory);
        }
    }
}
/**
 * Inserts {@code key} at {@code index} by replacing the key array with a
 * copy that is one element longer.
 *
 * @param index the insertion index, at most the current key count
 * @param key the key to insert
 */
final void insertKey(int index, K key) {
    int oldCount = getKeyCount();
    assert index <= oldCount : index + " > " + oldCount;
    // copy the existing keys into the larger array, leaving a gap at index
    K[] grown = createKeyStorage(oldCount + 1);
    DataUtils.copyWithGap(keys, grown, oldCount, index);
    keys = grown;
    keys[index] = key;
    if (isPersistent()) {
        addMemory(MEMORY_POINTER + map.evaluateMemoryForKey(key));
    }
}
// NonLeaf节点还有一个insertNode方法
/**
 * Inserts a separator key and a child page into this non-leaf page at
 * {@code index}, and adds the child's total count to this page's total.
 *
 * @param index the insertion index
 * @param key the key to insert
 * @param childPage the child page to reference at {@code index}
 */
public void insertNode(int index, K key, Page<K,V> childPage) {
    // child count before the insertion
    int oldChildCount = getRawChildPageCount();
    insertKey(index, key);
    // grow the child reference array, leaving a gap at index for the new child
    PageReference<K,V>[] grown = createRefStorage(oldChildCount + 1);
    DataUtils.copyWithGap(children, grown, oldChildCount, index);
    children = grown;
    children[index] = new PageReference<>(childPage);
    totalCount += childPage.getTotalCount();
    if (isPersistent()) {
        addMemory(MEMORY_POINTER + PAGE_MEMORY_CHILD);
    }
}
十一、Leaf和NonLeaf的分裂操作。返回的都是新分裂出来的页
// Leaf的分裂split操作
/**
 * Splits this leaf at {@code at}: this page keeps the first {@code at}
 * entries and the remaining entries move into a new leaf.
 *
 * @param at the index of the first entry that goes to the new page
 * @return the newly created leaf holding the upper part
 */
public Page<K,V> split(int at) {
    assert !isSaved();
    int upperCount = getKeyCount() - at;
    // this page keeps the lower keys; the upper keys are returned
    K[] upperKeys = splitKeys(at, upperCount);
    V[] upperValues = createValueStorage(upperCount);
    if (values != null) {
        V[] lowerValues = createValueStorage(at);
        System.arraycopy(values, 0, lowerValues, 0, at);
        System.arraycopy(values, at, upperValues, 0, upperCount);
        // this page now holds only the lower values
        values = lowerValues;
    }
    // the new page is built from the upper keys and values
    Page<K,V> sibling = createLeaf(map, upperKeys, upperValues, 0);
    if (isPersistent()) {
        recalculateMemory();
    }
    return sibling;
}
// 分割key。
/**
 * Divides the key array: this page keeps the first {@code aCount} keys and
 * the last {@code bCount} keys are returned for the new page.
 *
 * @param aCount number of leading keys that stay in this page
 * @param bCount number of trailing keys handed to the new page
 * @return a new array with the trailing {@code bCount} keys
 */
final K[] splitKeys(int aCount, int bCount) {
    assert aCount + bCount <= getKeyCount();
    K[] kept = createKeyStorage(aCount);
    K[] moved = createKeyStorage(bCount);
    System.arraycopy(keys, 0, kept, 0, aCount);
    int tailStart = getKeyCount() - bCount;
    System.arraycopy(keys, tailStart, moved, 0, bCount);
    // this page's key array becomes the leading part
    keys = kept;
    // the trailing part is for the new page
    return moved;
}
// NonLeaf的分裂split操作
/**
 * Splits this non-leaf page at {@code at}. This page keeps the first
 * {@code at} keys and {@code at + 1} children; the key at {@code at} is
 * kept by neither page, and the remaining keys and children move into a
 * new non-leaf page. Total counts of both pages are recomputed from their
 * child references.
 *
 * @param at the index of the key at which to split
 * @return the newly created non-leaf page holding the upper part
 */
public Page<K,V> split(int at) {
    assert !isSaved();
    int upperChildCount = getKeyCount() - at;
    // one key fewer than children goes to the new page
    K[] upperKeys = splitKeys(at, upperChildCount - 1);
    // child reference arrays for the two halves
    PageReference<K,V>[] lowerChildren = createRefStorage(at + 1);
    PageReference<K,V>[] upperChildren = createRefStorage(upperChildCount);
    System.arraycopy(children, 0, lowerChildren, 0, at + 1);
    System.arraycopy(children, at + 1, upperChildren, 0, upperChildCount);
    // this page keeps the lower children
    children = lowerChildren;

    long lowerTotal = 0;
    for (int i = 0; i < lowerChildren.length; i++) {
        lowerTotal += lowerChildren[i].count;
    }
    totalCount = lowerTotal;

    long upperTotal = 0;
    for (int i = 0; i < upperChildren.length; i++) {
        upperTotal += upperChildren[i].count;
    }
    // an inner page carries child references instead of values
    Page<K,V> sibling = createNode(map, upperKeys, upperChildren, upperTotal, 0);
    if (isPersistent()) {
        recalculateMemory();
    }
    return sibling;
}