H2的存储引擎MVStore剖析(4) —— PUT操作

683 阅读5分钟

导言:本文介绍MVStore是如何执行put操作的。从中可以了解MVStore怎样利用B+树插入元素;另外,教科书上都说B+树能够减少IO,但具体是如何实现的,我之前并不清楚。下面结合代码和注释进行说明,整体流程并不复杂。

总的操作如下(已把大部分次要的代码省略):

/**
 * Runs a decision-driven operation on the entry for {@code key}.
 * <p>
 * Each iteration of the retry loop takes the current root, walks down to the
 * leaf position for the key, asks {@code decisionMaker} what to do and, for
 * PUT, works on copies of the affected pages before trying to install the new
 * root. The loop is restarted when the root changed underneath an unlocked
 * attempt or when {@code updateRootPage} returns {@code null}.
 * <p>
 * NOTE: this excerpt elides the non-PUT switch cases (the {@code ........} line).
 *
 * @param key the key to operate on
 * @param value the candidate new value handed to the decision maker
 * @param decisionMaker decides the action and selects the value to store
 * @return the value found at the key before the operation, or {@code null}
 *         when the key was not present (negative search index)
 */
public V operate(K key, V value, DecisionMaker<? super V> decisionMaker) {
    IntValueHolder unsavedMemoryHolder = new IntValueHolder();
    int attempt = 0;
    // Retry until the operation has been applied to a root that is still current.
    while(true) {
        RootReference<K,V> rootReference = flushAndGetRoot();
        boolean locked = rootReference.isLockedByCurrentThread();
        if (!locked) {
            // beforeWrite() runs only on the first unlocked attempt.
            if (attempt++ == 0) {
                beforeWrite();
            }
            // After more than 3 attempts, or when the root is already locked,
            // stop retrying optimistically and take the root lock.
            if (attempt > 3 || rootReference.isLocked()) {
                rootReference = lockRoot(rootReference, attempt);
                locked = true;
            }
        }
        Page<K,V> rootPage = rootReference.root;
        long version = rootReference.version;
        CursorPos<K,V> tip;
        V result;
        // Reset the accounting for this attempt.
        unsavedMemoryHolder.value = 0;
        try {
            // Walk from the root to the leaf position for the key.
            CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key);
            // Without the lock, start over if the root was replaced meanwhile.
            if(!locked && rootReference != getRoot()) {
                continue;
            }
            Page<K,V> p = pos.page;
            int index = pos.index;
            // tip keeps the leaf position; pos moves up to the leaf's parent.
            tip = pos;
            pos = pos.parent;
            // A negative index means the key was not found in the leaf.
            result = index < 0 ? null : p.getValue(index);
            Decision decision = decisionMaker.decide(result, value, tip);

            switch (decision) {
              
             ........
                  
                case PUT: {
                    value = decisionMaker.selectValue(result, value);
                    // Modify a copy of the leaf rather than the page that was read.
                    p = p.copy();
                    if (index < 0) {
                        // Key absent: -index - 1 is the insertion point.
                        p.insertLeaf(-index - 1, key, value);
                        int keyCount;
                        // Split while the page has too many keys, or uses too much
                        // memory and still has enough keys to split
                        // (&& binds tighter than ||).
                        while ((keyCount = p.getKeyCount()) > store.getKeysPerPage()
                                || p.getMemory() > store.getMaxPageSize()
                                && keyCount > (p.isLeaf() ? 1 : 2)) {
                            long totalCount = p.getTotalCount();
                            // Split at the middle key.
                            int at = keyCount >> 1;
                            K k = p.getKey(at);
                            Page<K,V> split = p.split(at);
                            unsavedMemoryHolder.value += p.getMemory() + split.getMemory();
                            if (pos == null) {
                                // No parent left: build a new root holding the single
                                // separator key k and the two halves as children.
                                K[] keys = p.createKeyStorage(1);
                                keys[0] = k;
                                Page.PageReference<K,V>[] children = Page.createRefStorage(2);
                                children[0] = new Page.PageReference<>(p);
                                children[1] = new Page.PageReference<>(split);
                                p = Page.createNode(this, keys, children, totalCount, 0);
                                break;
                            }
                            // Otherwise move one level up and insert the separator key
                            // and both halves into a copy of the parent page.
                            Page<K,V> c = p;
                            p = pos.page;
                            index = pos.index;
                            pos = pos.parent;
                            p = p.copy();
                            p.setChild(index, split);
                            p.insertNode(index, k, c);
                        }
                    } else {
                        // Key present: overwrite the value in the copied leaf.
                        p.setValue(index, value);
                    }
                    break;
                }
            }
            // Obtain the new root page from the modified page p and the remaining
            // parent chain (replacePage is not shown in this excerpt).
            rootPage = replacePage(pos, p, unsavedMemoryHolder);
            if (!locked) {
                // Try to install the new root; a null result means the attempt
                // failed, so reset the decision maker and retry.
                rootReference = rootReference.updateRootPage(rootPage, attempt);
                if (rootReference == null) {
                    decisionMaker.reset();
                    continue;
                }
            }
            store.registerUnsavedMemory(unsavedMemoryHolder.value + tip.processRemovalInfo(version));
            return result;
        } finally {
            // Runs on every exit from the try block, including 'continue'.
            if(locked) {
                unlockRoot(rootPage);
            }
        }
    }
}

一、

// First obtain this MVMap's RootReference; the map's root page is reached through it
RootReference<K,V> rootReference = flushAndGetRoot();

二、

// Take the MVMap's root page from the RootReference
Page<K,V> rootPage = rootReference.root;

三、

// 从rootPage中开始获取key对应的value,这里用了一个CursorPos类来封装
CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key)

该方法如下:
// 方法还是很容易理解的,就是B+树的搜索过程。从根节点一直搜索直到叶子结点。B+树的叶子结点包含了key和value
/**
 * Descends from {@code page} to a leaf, following the child selected by a
 * binary search for {@code key} at every non-leaf level.
 *
 * @param page the page to start from
 * @param key the key being searched for
 * @return the position in the leaf (its index is the raw binarySearch result,
 *         so it may be negative), linked through {@code parent} to the
 *         positions of all non-leaf pages visited on the way down
 */
static <K,V> CursorPos<K,V> traverseDown(Page<K,V> page, K key) {
    CursorPos<K,V> path = null;
    Page<K,V> current = page;
    while (!current.isLeaf()) {
        int found = current.binarySearch(key);
        // A hit at i selects child i + 1; a miss encoded as a negative value
        // selects child -(found + 1).
        int childIndex = found < 0 ? -(found + 1) : found + 1;
        path = new CursorPos<>(current, childIndex, path);
        // For non-leaf pages this may read the child from the store when it
        // is not yet in memory.
        current = current.getChildPage(childIndex);
    }
    return new CursorPos<>(current, current.binarySearch(key), path);
}
    
    
	// 叶子结点没有实现getChildPage方法    
     /**
      * Leaf variant: not supported here.
      *
      * @param index ignored
      * @return never returns normally
      * @throws UnsupportedOperationException always
      */
     @Override
        public Page<K,V> getChildPage(int index) {
            throw new UnsupportedOperationException();
        }
        
        
        // 非叶子结点NonLeaf实现的getChildPage方法
         /**
          * Returns the child page at {@code index}, reading it through the map
          * when the reference does not hold it in memory.
          *
          * @param index index into the children array
          * @return the child page
          */
         @Override
        public Page<K,V> getChildPage(int index) {
            PageReference<K,V> childRef = children[index];
            Page<K,V> child = childRef.getPage();
            if (child != null) {
                // Already in memory: nothing to read.
                return child;
            }
            child = map.readPage(childRef.getPos());
            // Sanity checks: the page just read must match its reference.
            assert childRef.getPos() == child.getPos();
            assert childRef.count == child.getTotalCount();
            return child;
        }
        
        
      // PageReference类(省略了部分不重要的属性和方法) 。 
      /**
       * Reference to a child page: a position and, when available, the page
       * object itself. This excerpt omits some members of the class.
       */
      public static final class PageReference<K,V> {

        /**
         * The position (pos) of the page. For the meaning of the individual
         * bit fields of pos, see the earlier article in this series:
         * https://juejin.cn/post/7011130873634357262
         * <p>
         * The position, if known, or 0.
         */
        private long pos;

        /**
         * Non-null once the page is already in memory.
         * <p>
         * The page, if in memory, or null.
         */
        private Page<K,V> page;

        
        /**
         * @return the in-memory page, or null when it is not held by this reference
         */
        public Page<K,V> getPage() {
            return page;
        }

        /**
         * @return the stored position of the page
         */
        long getPos() {
            return pos;
        }

    }

四、非叶子结点NonLeaf实现的getChildPage方法。这个方法就是将Page从外部设备读入内存的方法


/**
 * Returns the child page at {@code index}. When the reference holds no
 * in-memory page, the page is read via {@code map.readPage()}.
 *
 * @param index index into the children array
 * @return the child page
 */
public Page<K,V> getChildPage(int index) {
    PageReference<K,V> childRef = children[index];
    Page<K,V> child = childRef.getPage();
    if (child != null) {
        return child;
    }
    // No in-memory page for this child: read it by its stored position.
    child = map.readPage(childRef.getPos());
    assert childRef.getPos() == child.getPos();
    assert childRef.count == child.getTotalCount();
    return child;
}
// 利用了MVStore的readPage方法
 /**
  * Reads the page at the given position by delegating to the store,
  * passing this map along.
  *
  * @param pos the page position
  * @return the page returned by the store
  */
 final Page<K,V> readPage(long pos) {
        return store.readPage(this, pos);
    }

五、


/**
 * Returns the page stored at {@code pos} for {@code map}, taking it from the
 * page cache when possible and otherwise reading it from its chunk and
 * caching the result.
 *
 * @param map the map the page belongs to
 * @param pos the page position
 * @return the page; in recovery mode an empty leaf is returned instead when
 *         reading fails with an MVStoreException
 * @throws MVStoreException when the position is not a saved page position or
 *         the page cannot be read (unless recoveryMode is set)
 */
<K,V> Page<K,V> readPage(MVMap<K,V> map, long pos) {
    try {
        if (!DataUtils.isPageSaved(pos)) {
            throw DataUtils.newMVStoreException(
                    DataUtils.ERROR_FILE_CORRUPT, "Position 0");
        }
        // First check whether the page for this pos is already cached
        Page<K,V> p = readPageFromCache(pos);
        if (p == null) {
            // Not cached: locate the chunk and the offset within it from pos
            Chunk chunk = getChunk(pos);
            int pageOffset = DataUtils.getPageOffset(pos);
            try {
                // Read the raw page content into a ByteBuffer
                ByteBuffer buff = chunk.readBufferForPage(fileStore, pageOffset, pos);
                // Then build the Page from that buffer
                p = Page.read(buff, pos, map);
                // A negative pageNo is replaced by one calculated from pos
                if (p.pageNo < 0) {
                    p.pageNo = calculatePageNo(pos);
                }
            } catch (MVStoreException e) {
                // Already the right exception type: pass it through unchanged
                throw e;
            } catch (Exception e) {
                // Anything else is reported as file corruption, with e passed along
                throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                        "Unable to read the page at position {0}, chunk {1}, offset {2}",
                        pos, chunk.id, pageOffset, e);
            }
            cachePage(p);
        }
        return p;
    } catch (MVStoreException e) {
        // In recovery mode an unreadable page is replaced by an empty leaf
        if (recoveryMode) {
            return map.createEmptyLeaf();
        }
        throw e;
    }
}


 // 通过pos值获取page所在的chunk
  /**
   * Returns the chunk that contains the page at {@code pos}, loading the
   * chunk's metadata from the layout map when it is not yet in {@code chunks}.
   *
   * @param pos the page position; the chunk id is extracted from it
   * @return the chunk
   * @throws MVStoreException when the layout map has no entry for the chunk,
   *         or the parsed chunk is not saved
   */
  private Chunk getChunk(long pos) {
      int chunkId = DataUtils.getPageChunkId(pos);
      Chunk known = chunks.get(chunkId);
      if (known != null) {
          return known;
      }
      checkOpen();
      // The layout map is keyed by Chunk.getMetaKey(chunkId); the article says
      // this key has the form "chunk.<chunkId>" (see part 1 of this series
      // for what the layout map stores).
      String serialized = layout.get(Chunk.getMetaKey(chunkId));
      if (serialized == null) {
          throw DataUtils.newMVStoreException(
                  DataUtils.ERROR_CHUNK_NOT_FOUND,
                  "Chunk {0} not found", chunkId);
      }
      Chunk loaded = Chunk.fromString(serialized);
      if (!loaded.isSaved()) {
          throw DataUtils.newMVStoreException(
                  DataUtils.ERROR_FILE_CORRUPT,
                  "Chunk {0} is invalid", chunkId);
      }
      // Remember the chunk so later lookups skip the layout map.
      chunks.put(loaded.id, loaded);
      return loaded;
  }

layout存储结构例子:

image-20210908223159745.png

六、

/**
 * Reads the bytes of one page of this chunk from the file store.
 * <p>
 * The read is retried whenever {@code block} changes while it is in progress:
 * a buffer is only returned, and an MVStoreException only propagated, if
 * {@code block} still has the value sampled at the start of the attempt.
 *
 * @param fileStore the file store to read from
 * @param offset offset of the page within this chunk, in bytes
 * @param pos the page position (its encoded maximum length is used)
 * @return a buffer holding the page bytes
 * @throws MVStoreException if the computed file position or length is negative,
 *         or the underlying read fails while {@code block} is unchanged
 */
ByteBuffer readBufferForPage(FileStore fileStore, int offset, long pos) {
    assert isSaved() : this;
    while (true) {
        long originalBlock = block;
        try {
            // Byte position where the chunk starts: block number times block size
            long filePos = originalBlock * MVStore.BLOCK_SIZE;
            // End position of this chunk. The cast forces 64-bit arithmetic so the
            // product cannot overflow if len and BLOCK_SIZE are both ints.
            long maxPos = filePos + (long) len * MVStore.BLOCK_SIZE;
            // Start position plus the page's offset within the chunk
            filePos += offset;
            if (filePos < 0) {
                throw DataUtils.newMVStoreException(
                        DataUtils.ERROR_FILE_CORRUPT,
                        "Negative position {0}; p={1}, c={2}", filePos, pos, toString());
            }

            int length = DataUtils.getPageMaxLength(pos);
            if (length == DataUtils.PAGE_LARGE) {
                // read the first bytes to figure out actual length
                length = fileStore.readFully(filePos, 128).getInt();
                // pageNo is deliberately not included into length to preserve compatibility
                // TODO: remove this adjustment when page on disk format is re-organized
                length += 4;
            }
            // Never read past the end of the chunk: take the smaller of the bytes
            // remaining up to maxPos and the page length derived from pos
            length = (int) Math.min(maxPos - filePos, length);
            if (length < 0) {
                throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                        "Illegal page length {0} reading at {1}; max pos {2} ", length, filePos, maxPos);
            }

            // Read 'length' bytes at filePos into a ByteBuffer (per the article,
            // this goes through java.nio.channels.FileChannel)
            ByteBuffer buff = fileStore.readFully(filePos, length);

            // Only trust the data if block did not change during the read
            if (originalBlock == block) {
                return buff;
            }
        } catch (MVStoreException ex) {
            // A failure counts only if block is unchanged; otherwise retry
            if (originalBlock == block) {
                throw ex;
            }
        }
    }
}

  // java.nio.channels.FileChannel 从位置pos读取len字节到ByteBuffer
  /**
   * Reads {@code len} bytes starting at file position {@code pos} into a newly
   * allocated buffer and updates the read statistics.
   *
   * @param pos the file position to read from
   * @param len the number of bytes to read
   * @return the buffer that was filled by DataUtils.readFully
   */
  public ByteBuffer readFully(long pos, int len) {
      ByteBuffer target = ByteBuffer.allocate(len);
      DataUtils.readFully(file, pos, target);
      // Statistics are updated only after the read completed without throwing.
      readCount.incrementAndGet();
      readBytes.addAndGet(len);
      return target;
  }

七、从Buffer中读入Page。Page.read(buff, pos, map);

/**
 * Creates a page of the kind encoded in {@code pos} and fills it from the buffer.
 *
 * @param buff the buffer holding the serialized page
 * @param pos the page position; the lowest bit of its page type selects
 *        Leaf or NonLeaf
 * @param map the map the page belongs to
 * @return the page that was read
 */
static <K,V> Page<K,V> read(ByteBuffer buff, long pos, MVMap<K,V> map) {
    Page<K,V> page;
    if ((DataUtils.getPageType(pos) & 1) == PAGE_TYPE_LEAF) {
        page = new Leaf<>(map);
    } else {
        page = new NonLeaf<>(map);
    }
    // The position must be set before read(buff): the instance read() derives
    // the chunk id and offset from it for validation.
    page.pos = pos;
    page.read(buff);
    return page;
}


  // Page$NonLeaf从ByteBuffer中读入。整个过程可以参考<<H2的存储引擎MVStore剖析(2)  —— Page的读入.md>>。理解了Page的结构之后就能明白整个流程
   /**
    * Fills this page from its serialized form in {@code buff}.
    * <p>
    * The fields are consumed in order: page length, check value, map id,
    * key count, type byte, then the body. For non-leaf pages the payload is
    * read before the optional decompression step and the keys after it; for
    * leaf pages both the keys and the payload are read after it.
    *
    * @param buff the buffer positioned at the start of the page
    * @throws MVStoreException (ERROR_FILE_CORRUPT) when the page length, the
    *         check value, the map id or the node type does not match
    */
   private void read(ByteBuffer buff) {
        int chunkId = DataUtils.getPageChunkId(pos);
        int offset = DataUtils.getPageOffset(pos);

        int start = buff.position();
        int pageLength = buff.getInt(); // does not include optional part (pageNo)
        // Bytes available counting the 4-byte length field that was just consumed
        int remaining = buff.remaining() + 4;
        if (pageLength > remaining || pageLength < 4) {
            throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                    "File corrupted in chunk {0}, expected page length 4..{1}, got {2}", chunkId, remaining,
                    pageLength);
        }

        // The stored check value must equal the XOR of the check values of
        // chunk id, offset and page length
        short check = buff.getShort();
        int checkTest = DataUtils.getCheckValue(chunkId)
                ^ DataUtils.getCheckValue(offset)
                ^ DataUtils.getCheckValue(pageLength);
        if (check != (short) checkTest) {
            throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                    "File corrupted in chunk {0}, expected check value {1}, got {2}", chunkId, checkTest, check);
        }


        // The page must belong to the map that is reading it
        int mapId = DataUtils.readVarInt(buff);
        if (mapId != map.getId()) {
            throw DataUtils.newMVStoreException(DataUtils.ERROR_FILE_CORRUPT,
                    "File corrupted in chunk {0}, expected map id {1}, got {2}", chunkId, map.getId(), mapId);
        }


        // Number of keys, followed by the type/flags byte
        int len = DataUtils.readVarInt(buff);
        keys = createKeyStorage(len);
        int type = buff.get();
        if(isLeaf() != ((type & 1) == PAGE_TYPE_LEAF)) {
            throw DataUtils.newMVStoreException(
                    DataUtils.ERROR_FILE_CORRUPT,
                    "File corrupted in chunk {0}, expected node type {1}, got {2}",
                    chunkId, isLeaf() ? "0" : "1" , type);
        }
        // jump ahead and read pageNo, because if page is compressed,
        // buffer will be replaced by uncompressed one
        if ((type & DataUtils.PAGE_HAS_PAGE_NO) != 0) {
            int position = buff.position();
            buff.position(start + pageLength);
            pageNo = DataUtils.readVarInt(buff);
            buff.position(position);
        }
        // to restrain hacky GenericDataType, which grabs the whole remainder of the buffer
        buff.limit(start + pageLength);

        // Non-leaf payload (child references) is read before any decompression
        if (!isLeaf()) {
            readPayLoad(buff);
        }
        boolean compressed = (type & DataUtils.PAGE_COMPRESSED) != 0;
        if (compressed) {
            Compressor compressor;
            if ((type & DataUtils.PAGE_COMPRESSED_HIGH) ==
                    DataUtils.PAGE_COMPRESSED_HIGH) {
                compressor = map.getStore().getCompressorHigh();
            } else {
                compressor = map.getStore().getCompressorFast();
            }
            // lenAdd is the difference between expanded and compressed length
            int lenAdd = DataUtils.readVarInt(buff);
            int compLen = buff.remaining();
            byte[] comp;
            // Local offset into comp; shadows the page's pos field in this block
            int pos = 0;
            if (buff.hasArray()) {
                comp = buff.array();
                pos = buff.arrayOffset() + buff.position();
            } else {
                comp = Utils.newBytes(compLen);
                buff.get(comp);
            }
            int l = compLen + lenAdd;
            // From here on buff refers to the newly allocated, expanded data
            buff = ByteBuffer.allocate(l);
            compressor.expand(comp, pos, compLen, buff.array(),
                    buff.arrayOffset(), l);
        }
       // Read the keys into the key array (K[] keys)
        map.getKeyType().read(buff, keys, len);
        if (isLeaf()) {
            // Leaf payload follows the keys (leaf and non-leaf pages
            // implement readPayLoad differently)
            readPayLoad(buff);
        }
        diskSpaceUsed = pageLength;
         recalculateMemory();
    }

八、readPayLoad()

       // Leaf 的读入
        /**
         * Leaf payload: allocates the value array with one slot per key and
         * lets the map's value type read the values from the buffer.
         *
         * @param buff the buffer positioned at the first value
         */
        protected void readPayLoad(ByteBuffer buff) {
            int slots = getKeyCount();
            values = createValueStorage(slots);
            map.getValueType().read(buff, values, getKeyCount());
        }
        
        
        //NonLeaf 读入。 children是PageReference的数组
            /**
             * Non-leaf payload: reads keyCount + 1 child positions (one long
             * each), then the same number of var-long descendant counts, and
             * builds the children array of PageReference from them.
             * totalCount is set to the sum of the counts.
             *
             * @param buff the buffer positioned at the first child position
             */
            protected void readPayLoad(ByteBuffer buff) {
                int childCount = getKeyCount() + 1;
                children = createRefStorage(childCount);
                // All positions are stored first, followed by all counts.
                long[] positions = new long[childCount];
                for (int i = 0; i < childCount; i++) {
                    positions[i] = buff.getLong();
                }
                long sum = 0;
                for (int i = 0; i < childCount; i++) {
                    long count = DataUtils.readVarLong(buff);
                    long position = positions[i];
                    assert position == 0 ? count == 0 : count >= 0;
                    sum += count;
                    if (position == 0) {
                        // Position 0 is represented by the shared empty reference.
                        children[i] = PageReference.empty();
                    } else {
                        children[i] = new PageReference<>(position, count);
                    }
                }
                totalCount = sum;
            }

九、

从traverseDown(Page<K,V> page, K key)方法返回的CursorPos(通过parent引用从叶子节点一直串到根节点)包含了在B+树上搜索这个key需要读入的所有页

比如下面搜索key=144,那么就会读入根节点和根节点中key=144和key=168之间的那棵子树(标红部分)。

image-20210927215327121.png

CursorPos类主要属性就3个。traverseDown(Page<K,V> page, K key)返回的CursorPos指向某个叶子节点:index >= 0表示该叶子节点包含这个key;index < 0表示不包含,此时-index-1就是插入位置。

/**
 * One step of a path through the tree: a page, an index within it, and a link
 * to the position in the parent page (null when there is no parent).
 */
public final class CursorPos<K,V> {

    /** The page at this step of the path. */
    public Page<K,V> page;

    /**
     * For a non-leaf step, the index of the child that was followed; for the
     * leaf step, the raw binarySearch result (may be negative).
     */
    public int index;

    /** The position one level up, or null. */
    public CursorPos<K,V> parent;


    public CursorPos(Page<K,V> page, int index, CursorPos<K,V> parent) {
        this.page = page;
        this.index = index;
        this.parent = parent;
    }

    /**
     * Walks from {@code page} down to a leaf, recording each visited non-leaf
     * page and the child index chosen for {@code key}.
     *
     * @param page the page to start from
     * @param key the key being searched for
     * @return the leaf position, chained to its ancestors via {@code parent}
     */
    static <K,V> CursorPos<K,V> traverseDown(Page<K,V> page, K key) {
        CursorPos<K,V> path = null;
        Page<K,V> current = page;
        while (!current.isLeaf()) {
            int found = current.binarySearch(key);
            // Hit at i -> child i + 1; miss (negative) -> child -(found + 1).
            int childIndex = found < 0 ? -(found + 1) : found + 1;
            path = new CursorPos<>(current, childIndex, path);
            current = current.getChildPage(childIndex);
        }
        return new CursorPos<>(current, current.binarySearch(key), path);
    }

    /**
     * Calls {@code removePage(version)} on every page from this position up
     * through all parents and adds up the returned values.
     *
     * @param version the version passed to each removePage call
     * @return the sum of the values returned by removePage
     */
    int processRemovalInfo(long version) {
        int total = 0;
        CursorPos<K,V> step = this;
        while (step != null) {
            total += step.page.removePage(version);
            step = step.parent;
        }
        return total;
    }
}
// Excerpt from operate(): locate the leaf position, then dispatch on the decision.
CursorPos<K,V> pos = CursorPos.traverseDown(rootPage, key);
// Without the lock, start over if the root was replaced in the meantime
if(!locked && rootReference != getRoot()) {
    continue;
}
Page<K,V> p = pos.page;
int index = pos.index;
// Keep the leaf position in tip
tip = pos;
// Move pos to the parent of the current position
pos = pos.parent;
// A negative index means the key is not in the leaf
result = index < 0 ? null : p.getValue(index);
Decision decision = decisionMaker.decide(result, value, tip);

switch (decision) {
    case REPEAT:
        // Start the whole attempt again
        decisionMaker.reset();
        continue;
    case ABORT:
        // Without the lock, only return if the root is still the one examined
        if(!locked && rootReference != getRoot()) {
            decisionMaker.reset();
            continue;
        }
        return result;
    case REMOVE: {
        ......

十、


case PUT: {
    value = decisionMaker.selectValue(result, value);
    // Work on a copy of the bottom-level leaf (the article describes this as a
    // shallow copy; copy() itself is not shown here)
    p = p.copy();
    if (index < 0) {
        // Insert key and value into the leaf page at position -index - 1
        p.insertLeaf(-index - 1, key, value);
        int keyCount;
        // B+ tree full-node handling: split while a threshold is exceeded
        // (&& binds tighter than ||)
        while ((keyCount = p.getKeyCount()) > store.getKeysPerPage()
                || p.getMemory() > store.getMaxPageSize()
                && keyCount > (p.isLeaf() ? 1 : 2)) {
            long totalCount = p.getTotalCount();
            // Half of keyCount: the middle position
            int at = keyCount >> 1;
            K k = p.getKey(at);
            // Split the page; Leaf and NonLeaf implement split differently
            Page<K,V> split = p.split(at);
            unsavedMemoryHolder.value += p.getMemory() + split.getMemory();
            // pos == null means p has no parent, i.e. it is the root
            if (pos == null) {
                // Create a keys array of length 1 for the new root
                K[] keys = p.createKeyStorage(1);
                keys[0] = k;
                Page.PageReference<K,V>[] children = Page.createRefStorage(2);
                children[0] = new Page.PageReference<>(p);
                children[1] = new Page.PageReference<>(split);
                p = Page.createNode(this, keys, children, totalCount, 0);
                break;
            }
            Page<K,V> c = p; // p is the child page
            p = pos.page; // pos is the CursorPos of the parent
            index = pos.index;  
            pos = pos.parent;
            p = p.copy(); // copy of the parent page. Article note: page copies are used in several places; the author has not fully worked out why
            p.setChild(index, split); // parent slot index now refers to the newly split-off page
            p.insertNode(index, k, c); // insert key k into the parent and make children[index] refer to child page c
        }
    } else {
        // Key already present: overwrite the value in the copied leaf
        p.setValue(index, value);
    }
    break;
}

      //叶子节点insert (key, value)
        /**
         * Inserts a key-value pair into this leaf at {@code index}.
         *
         * @param index the insertion position
         * @param key the key to insert
         * @param value the value to insert; ignored when this page has no
         *        value array
         */
        public void insertLeaf(int index, K key, V value) {
            // Capture the count before insertKey grows the key array.
            int oldCount = getKeyCount();
            insertKey(index, key);

            if (values == null) {
                return;
            }
            V[] grown = createValueStorage(oldCount + 1);
            DataUtils.copyWithGap(values, grown, oldCount, index);
            values = grown;
            setValueInternal(index, value);
            if (isPersistent()) {
                addMemory(MEMORY_POINTER + map.evaluateMemoryForValue(value));
            }
        }

    //非叶子节点没有实现insertLeaf方法,调用时会抛出UnsupportedOperationException
     /**
      * Non-leaf variant: not supported here.
      *
      * @throws UnsupportedOperationException always
      */
     public void insertLeaf(int index, K key, V value) {
            throw new UnsupportedOperationException();
        }

  
		// 叶子节点
        /**
         * Leaf variant: inserts {@code key} at {@code index} and, when this
         * page has a value array, inserts {@code value} at the same index.
         *
         * @param index the insertion position
         * @param key the key to insert
         * @param value the value to insert
         */
        public void insertLeaf(int index, K key, V value) {
            // Key count before the insertion
            int oldCount = getKeyCount();
            // Put the key at position index
            insertKey(index, key);

            if (values == null) {
                return;
            }
            // Copy into a new array one slot larger, leaving a gap at index
            V[] grown = createValueStorage(oldCount + 1);
            DataUtils.copyWithGap(values, grown, oldCount, index);
            values = grown;
            setValueInternal(index, value);
            if (isPersistent()) {
                addMemory(MEMORY_POINTER + map.evaluateMemoryForValue(value));
            }
        }


    /**
     * Replaces the key array by one that is one slot larger and holds
     * {@code key} at {@code index}.
     *
     * @param index the insertion position; must not exceed the current key count
     * @param key the key to insert
     */
    final void insertKey(int index, K key) {
        int oldCount = getKeyCount();
        assert index <= oldCount : index + " > " + oldCount;
        // Copy into a new array, leaving a gap at index
        K[] grown = createKeyStorage(oldCount + 1);
        DataUtils.copyWithGap(keys, grown, oldCount, index);
        keys = grown;

        keys[index] = key;

        if (!isPersistent()) {
            return;
        }
        addMemory(MEMORY_POINTER + map.evaluateMemoryForKey(key));
    }

       // NonLeaf节点还有一个insertNode方法
       /**
        * Inserts a key and a child page into this non-leaf page at {@code index}.
        * The children array grows by one slot and totalCount is increased by
        * the child's total count.
        *
        * @param index the insertion position for both key and child
        * @param key the key to insert
        * @param childPage the child page to reference at {@code index}
        */
       public void insertNode(int index, K key, Page<K,V> childPage) {
           int oldChildCount = getRawChildPageCount();
           insertKey(index, key);

           // Grow the child references by one, leaving a gap at index
           PageReference<K,V>[] grown = createRefStorage(oldChildCount + 1);
           DataUtils.copyWithGap(children, grown, oldChildCount, index);
           children = grown;
           children[index] = new PageReference<>(childPage);

           totalCount += childPage.getTotalCount();
           if (!isPersistent()) {
               return;
           }
           addMemory(MEMORY_POINTER + PAGE_MEMORY_CHILD);
       }

十一、Leaf和NonLeaf的分裂操作。返回的都是新分裂出来的页

// Leaf的分裂split操作
/**
 * Splits this leaf at {@code at}: this page keeps the first {@code at}
 * entries, and a new leaf receives the rest.
 *
 * @param at the index of the first entry that moves to the new page
 * @return the newly created leaf holding the entries from {@code at} onwards
 */
public Page<K,V> split(int at) {
    assert !isSaved();
    // Must be computed before splitKeys() shrinks this page's key array.
    int tailCount = getKeyCount() - at;
    K[] tailKeys = splitKeys(at, tailCount);
    V[] tailValues = createValueStorage(tailCount);
    if (values != null) {
        V[] headValues = createValueStorage(at);
        System.arraycopy(values, 0, headValues, 0, at);
        System.arraycopy(values, at, tailValues, 0, tailCount);
        // This page keeps the head; the new page gets the tail.
        values = headValues;
    }
    // Create the new page from the tail keys and values
    Page<K,V> sibling = createLeaf(map, tailKeys, tailValues, 0);
    if (isPersistent()) {
        recalculateMemory();
    }
    return sibling;
}

// 分割key。
/**
 * Splits the key array: this page keeps the first {@code aCount} keys and the
 * last {@code bCount} keys are returned for a new page. Keys in between
 * (when aCount + bCount is less than the key count) end up in neither array.
 *
 * @param aCount number of leading keys that stay in this page
 * @param bCount number of trailing keys to hand out
 * @return a new array holding the last {@code bCount} keys
 */
final K[] splitKeys(int aCount, int bCount) {
    assert aCount + bCount <= getKeyCount();
    K[] head = createKeyStorage(aCount);
    K[] tail = createKeyStorage(bCount);
    System.arraycopy(keys, 0, head, 0, aCount);
    System.arraycopy(keys, getKeyCount() - bCount, tail, 0, bCount);
    // This page now refers to the head keys only
    keys = head;
    // The tail keys are for the new page
    return tail;
}


 // NonLeaf的分裂split操作
   /**
    * Splits this non-leaf page at {@code at}. This page keeps the first
    * {@code at} keys and {@code at + 1} children; the new page receives the
    * keys after index {@code at} and the remaining children. The key at
    * {@code at} itself stays in neither page.
    *
    * @param at the index of the key at which to split
    * @return the newly created non-leaf page
    */
   public Page<K,V> split(int at) {
       assert !isSaved();
       // Must be computed before splitKeys() shrinks this page's key array.
       int tailChildCount = getKeyCount() - at;
       K[] tailKeys = splitKeys(at, tailChildCount - 1);
       // Child reference arrays for both halves
       PageReference<K,V>[] headChildren = createRefStorage(at + 1);
       PageReference<K,V>[] tailChildren = createRefStorage(tailChildCount);
       System.arraycopy(children, 0, headChildren, 0, at + 1);
       System.arraycopy(children, at + 1, tailChildren, 0, tailChildCount);
       // This page keeps the head children
       children = headChildren;

       // Recompute this page's total from the children it kept
       long headTotal = 0;
       for (int i = 0; i < headChildren.length; i++) {
           headTotal += headChildren[i].count;
       }
       totalCount = headTotal;

       long tailTotal = 0;
       for (int i = 0; i < tailChildren.length; i++) {
           tailTotal += tailChildren[i].count;
       }
       // The new page is an inner node: it has no values, only keys and
       // child references
       Page<K,V> sibling = createNode(map, tailKeys, tailChildren, tailTotal, 0);
       if (isPersistent()) {
           recalculateMemory();
       }
       return sibling;
   }