C#重写Lz4 JavaScript文本压缩算法(二)

428 阅读2分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

C#重写Lz4 JavaScript文本压缩算法(一) - 掘金 (juejin.cn)

C#重写Lz4 JavaScript文本压缩算法(二) - 掘金 (juejin.cn)

C#重写Lz4 JavaScript文本压缩算法(三) - 掘金 (juejin.cn)

lz4.js -> Lz4.cs

using System;
using System.Collections.Generic;

namespace Lz4CSharp
{
    class Lz4
    {
        // lz4.js - An implementation of Lz4 in plain JavaScript.
        //
        // TODO:
        // - Unify header parsing/writing.
        // - Support options (block size, checksums)
        // - Support streams
        // - Better error handling (handle bad offset, etc.)
        // - HC support (better search algorithm)
        // - Tests/benchmarking

        // Constants
        // --

        // Compression format parameters/constants.
        // Minimum match length; decompressBlock adds it back to every decoded match length.
        private const UInt32 minMatch = 4;
        // compressBlock does not search for matches in blocks shorter than this.
        private const UInt32 minLength = 13;
        // Match search and match extension stop this many entries before the block end.
        private const UInt32 searchLimit = 5;
        // Shift applied to the miss counter to derive the search step in compressBlock.
        private const int skipTrigger = 6;
        // Number of entries in the match hash table.
        private const UInt32 hashSize = 1 << 16;

        // Token constants: the token packs the literal run length in the high
        // nibble and the match length in the low nibble.
        private const int mlBits = 4;
        private const int mlMask = (1 << mlBits) - 1;
        private const int runBits = 4;
        private const int runMask = (1 << runBits) - 1;

        // Shared buffers. The references never change; only their contents do.
        // blockBuf receives the output of compressBlock, hashTable is its match table.
        private static readonly UInt32[] blockBuf = new UInt32[5 << 20];
        private static readonly UInt32[] hashTable = new UInt32[hashSize];

        // Frame constants.
        private const UInt32 magicNum = 0x184D2204;

        // Frame descriptor flags.
        private const UInt32 fdContentChksum = 0x4;
        private const UInt32 fdContentSize = 0x8;
        private const UInt32 fdBlockChksum = 0x10;
        // var fdBlockIndep = 0x20;
        private const UInt32 fdVersion = 0x40;
        private const UInt32 fdVersionMask = 0xC0;

        // Block sizes.
        // Flag bit set in a block-size field when the block is stored uncompressed.
        private const UInt32 bsUncompressed = 0x80000000;
        // Block-size index written by compressFrame (key into bsMap).
        private const UInt32 bsDefault = 7;
        // Shift and mask used to extract the block-size index from the descriptor byte.
        private const int bsShift = 4;
        private const UInt32 bsMask = 7;
        // Maps the block-size index stored in the frame descriptor to the maximum
        // block size. Populated at declaration so lookups work even when Init()
        // has never been called.
        static Dictionary<UInt32, UInt32> bsMap = new Dictionary<UInt32, UInt32>
        {
            { 4, 0x10000 },
            { 5, 0x40000 },
            { 6, 0x100000 },
            { 7, 0x400000 },
        };

        // Kept for callers that still invoke it. Uses indexer assignment instead of
        // Add so that calling it more than once (or after the table is already
        // populated) does not throw a duplicate-key ArgumentException.
        public static void Init()
        {
            bsMap[4] = 0x10000;
            bsMap[5] = 0x40000;
            bsMap[6] = 0x100000;
            bsMap[7] = 0x400000;
        }

        // Utility functions/primitives
        // --



        // Clear hashtable.
        // Resets every entry of the supplied table to zero. compressBlock stores
        // positions offset by one, so zero marks an unused slot.
        // Throws ArgumentNullException when table is null.
        static void clearHashTable(UInt32[] table)
        {
            if (table == null)
            {
                throw new ArgumentNullException(nameof(table));
            }

            // Clear the array that was actually passed in rather than a fixed
            // static buffer, so the parameter is honoured for any table length.
            Array.Clear(table, 0, table.Length);
        }



        // Returns a copy of array[start..end) following Uint8Array#slice rules:
        // negative indices count back from the end of the array, out-of-range
        // indices are clamped to [0, array.Length], and an end that falls before
        // start yields an empty array instead of failing.
        static UInt32[] sliceArray(UInt32[] array, int start, int end)
        {
            var len = array.Length;

            // Resolve negative/oversized bounds into [0, len].
            start = (start < 0) ? Math.Max(len + start, 0) : Math.Min(start, len);
            end = (end < 0) ? Math.Max(len + end, 0) : Math.Min(end, len);

            // Never request a negative-length array when the range is inverted.
            var count = Math.Max(end - start, 0);

            var arraySlice = new UInt32[count];
            if (count > 0)
            {
                Array.Copy(array, start, arraySlice, 0, count);
            }

            return arraySlice;
        }


        // Implementation
        // --

        // Calculates an upper bound for lz4 compression.
        // The bound is the input size plus one extra entry per 255 input entries
        // plus a fixed 16-entry allowance.
        static UInt32 compressBound(UInt32 n)
        {
            UInt32 overhead = n / 255 + 16;
            return n + overhead;
        }

        // Calculates an upper bound for lz4 decompression, by reading the data.
        // Walks the frame header and the block-size fields only; no block is decoded.
        // Returns the content size stored in the header when the content-size flag
        // is set; otherwise the sum of the sizes of uncompressed blocks plus the
        // maximum block size for every compressed block.
        // Throws Exception on a wrong magic number, an unsupported descriptor
        // version or an unknown block-size index.
        static UInt64 decompressBound(UInt32[] src)
        {
            int sIndex = 0;

            // Read magic number
            if (Util.readU32(src, sIndex) != magicNum)
            {
                throw new Exception("invalid magic number");
            }

            sIndex += 4;

            // Read descriptor
            var descriptor = src[sIndex++];

            // Check version
            if ((descriptor & fdVersionMask) != fdVersion)
            {
                throw new Exception($"incompatible descriptor version {descriptor & fdVersionMask}");
            }

            // Read flags
            var useBlockSum = (descriptor & fdBlockChksum) != 0;
            var useContentSize = (descriptor & fdContentSize) != 0;

            // Read block size (single dictionary lookup instead of ContainsKey + indexer).
            UInt32 bsIdx = (src[sIndex++] >> bsShift) & bsMask;

            UInt32 maxBlockSize;
            if (!bsMap.TryGetValue(bsIdx, out maxBlockSize))
            {
                throw new Exception($"invalid block size {bsIdx}");
            }

            // Get content size
            if (useContentSize)
            {
                return Util.readU64(src, sIndex);
            }

            // Skip the header checksum byte.
            sIndex++;

            // Read blocks. The accumulator is 64-bit so that frames with many
            // blocks cannot wrap the running total.
            UInt64 maxSize = 0;
            while (true)
            {
                UInt32 blockSize = Util.readU32(src, sIndex);
                sIndex += 4;

                var isUncompressed = (blockSize & bsUncompressed) != 0;
                blockSize &= ~bsUncompressed;

                // A zero size is the end mark; it carries no data, so it must not
                // be counted as a block.
                if (blockSize == 0)
                {
                    return maxSize;
                }

                // Uncompressed blocks expand to exactly their stored size; for
                // compressed blocks only the frame's maximum block size is known.
                maxSize += isUncompressed ? blockSize : maxBlockSize;

                if (useBlockSum)
                {
                    sIndex += 4;
                }

                sIndex += (int)blockSize;
            }
        }

        // Decompresses a block of Lz4.
        // Decodes the sequences found in src[sIndex .. sIndex + sLength) and appends
        // the result to dst starting at dIndex. Returns the index in dst just past
        // the last entry written. Matches are copied from earlier positions of dst,
        // so dst must already contain any data the block refers back to.
        static UInt32 decompressBlock(UInt32[] src, UInt32[] dst, UInt32 sIndex, UInt32 sLength, UInt32 dIndex)
        {
            UInt32 inputEnd = sIndex + sLength;

            while (sIndex < inputEnd)
            {
                UInt32 token = src[sIndex];
                sIndex++;

                // High nibble: literal run length. The value 0xf means the length
                // continues in the following entries until one is not 0xff.
                UInt32 runLength = token >> 4;
                if (runLength == 0xf)
                {
                    UInt32 extra;
                    do
                    {
                        extra = src[sIndex++];
                        runLength += extra;
                    }
                    while (extra == 0xff);
                }

                // Copy the literal run (a no-op when the run length is zero).
                UInt32 literalEnd = sIndex + runLength;
                while (sIndex < literalEnd)
                {
                    dst[dIndex] = src[sIndex];
                    dIndex++;
                    sIndex++;
                }

                // The final sequence of a block has literals only.
                if (sIndex >= inputEnd)
                {
                    break;
                }

                // Low nibble: match length, stored without minMatch.
                UInt32 matchLength = token & 0xf;

                // Match distance, low entry first.
                UInt32 low = src[sIndex++];
                UInt32 high = src[sIndex++];
                UInt32 distance = low | (high << 8);

                // Extended match length uses the same continuation scheme.
                if (matchLength == 0xf)
                {
                    UInt32 extra;
                    do
                    {
                        extra = src[sIndex++];
                        matchLength += extra;
                    }
                    while (extra == 0xff);
                }

                matchLength += minMatch;

                // Copy entry by entry: the source range may overlap the entries
                // being written, which repeats the most recent output.
                UInt32 from = dIndex - distance;
                UInt32 until = from + matchLength;
                while (from < until)
                {
                    dst[dIndex++] = dst[from++];
                }
            }

            return dIndex;
        }

        // Compresses a block with Lz4.
        // Compresses src[sIndex .. sIndex + sLength) into dst starting at index 0
        // and returns the number of entries written to dst.
        // Returns 0 when the anchor is still at index 0 after the search, i.e. a
        // block starting at index 0 for which no match was emitted.
        // hashTable maps a 16-bit hash of four input entries to (position + 1);
        // zero marks an empty slot. It is updated in place.
        // NOTE(review): assumes every entry of src holds one byte value (0-255) and
        // that Util.readU32 combines four consecutive entries - confirm against Util.
        static UInt32 compressBlock(UInt32[] src, UInt32[] dst, int sIndex, int sLength, UInt32[] hashTable)
        {
            int mIndex, mAnchor, mLength, mOffset, mStep;
            int literalCount, dIndex, sEnd, n;

            // Setup initial state.
            dIndex = 0;
            sEnd = sLength + sIndex;
            mAnchor = sIndex;

            // Process only if block is large enough.
            if (sLength >= minLength)
            {
                // Miss counter; its value shifted right by skipTrigger is the
                // search step, so the step grows as misses accumulate.
                var searchMatchCount = (1 << skipTrigger) + 3;

                // Consume until last n literals (Lz4 spec limitation.)
                while (sIndex + minMatch < sEnd - searchLimit)
                {
                    var seq = Util.readU32(src, (int)sIndex);
                    var hash = Util.UInt32MoveRight(Util.hashU32(seq), 0);

                    // Crush hash to 16 bits.
                    hash = Util.UInt32MoveRight((hash >> 16) ^ hash , 0) & 0xffff;

                    // Look for a match in the hashtable. NOTE: remove one; see below.
                    mIndex = (int)hashTable[hash] - 1;

                    // Put pos in hash table. NOTE: add one so that zero = invalid.
                    hashTable[hash] = (UInt32)sIndex + 1;

                    // Determine if there is a match (within range.)
                    if (mIndex < 0 || Util.UInt32MoveRight((UInt32)(sIndex - mIndex) , 16) > 0 || Util.readU32(src, (int)mIndex) != seq)
                    {
                        mStep = searchMatchCount++ >> skipTrigger;
                        sIndex += mStep;
                        continue;
                    }

                    // A match was found: reset the miss counter.
                    searchMatchCount = (1 << skipTrigger) + 3;

                    // Calculate literal count and offset.
                    literalCount = sIndex - mAnchor;
                    mOffset = sIndex - mIndex;

                    // We've already matched one word, so get that out of the way.
                    sIndex += (int)minMatch;
                    mIndex += (int)minMatch;

                    // Determine match length.
                    // N.B.: mLength does not include minMatch, Lz4 adds it back
                    // in decoding.
                    mLength = sIndex;
                    while (sIndex < sEnd - searchLimit && src[sIndex] == src[mIndex])
                    {
                        sIndex++;
                        mIndex++;
                    }
                    mLength = sIndex - mLength;

                    // Write token + literal count.
                    int token = mLength < mlMask ? mLength : mlMask;
                    if (literalCount >= runMask)
                    {
                        dst[dIndex++] = (UInt32)((runMask << mlBits) + token);
                        for (n = literalCount - runMask; n >= 0xff; n -= 0xff)
                        {
                            dst[dIndex++] = 0xff;
                        }
                        dst[dIndex++] = (UInt32)n;
                    }
                    else
                    {
                        dst[dIndex++] = (UInt32)((literalCount << mlBits) + token);
                    }

                    // Write literals.
                    for (var i = 0; i < literalCount; i++)
                    {
                        dst[dIndex++] = src[mAnchor + i];
                    }

                    // Write offset as two separate bytes, low byte first. Each
                    // entry is masked to 8 bits: dst is not a byte array, so an
                    // unmasked store would put the whole offset in the first entry.
                    dst[dIndex++] = (UInt32)(mOffset & 0xff);
                    dst[dIndex++] = (UInt32)((mOffset >> 8) & 0xff);

                    // Write match length.
                    if (mLength >= mlMask)
                    {
                        for (n = mLength - mlMask; n >= 0xff; n -= 0xff)
                        {
                            dst[dIndex++] = 0xff;
                        }
                        dst[dIndex++] = (UInt32)n;
                    }

                    // Move the anchor.
                    mAnchor = sIndex;
                }
            }

            // Nothing was encoded.
            if (mAnchor == 0)
            {
                return 0;
            }

            // Write remaining literals.
            // Write literal token+count.
            literalCount = sEnd - mAnchor;
            if (literalCount >= runMask)
            {
                dst[dIndex++] = (UInt32)(runMask << mlBits);
                for (n = literalCount - runMask; n >= 0xff; n -= 0xff)
                {
                    dst[dIndex++] = 0xff;
                }
                dst[dIndex++] = (UInt32)n;
            }
            else
            {
                dst[dIndex++] = (UInt32)(literalCount << mlBits);
            }

            // Write literals.
            sIndex = mAnchor;
            while (sIndex < sEnd)
            {
                dst[dIndex++] = src[sIndex++];
            }

            return (UInt32)dIndex;
        }

        // Decompresses a frame of Lz4 data.
        // Parses the frame header in src, then decodes every block into dst until
        // the zero-sized end mark. Returns the number of entries written to dst.
        // Checksums and the optional content size are skipped, not verified.
        // Throws Exception on a wrong magic number, an unsupported descriptor
        // version or an unknown block-size index.
        static UInt32 decompressFrame(UInt32[] src, UInt32[] dst)
        {
            bool useBlockSum, useContentSum, useContentSize;
            UInt32 descriptor;
            UInt32 sIndex = 0;
            UInt32 dIndex = 0;

            // Read magic number
            if (Util.readU32(src, (int)sIndex) != magicNum)
            {
                throw new Exception("invalid magic number");
            }

            sIndex += 4;

            // Read descriptor
            descriptor = src[sIndex++];

            // Check version
            if ((descriptor & fdVersionMask) != fdVersion)
            {
                throw new Exception("incompatible descriptor version");
            }

            // Read flags
            useBlockSum = (descriptor & fdBlockChksum) != 0;
            useContentSum = (descriptor & fdContentChksum) != 0;
            useContentSize = (descriptor & fdContentSize) != 0;

            // Read block size
            var bsIdx = (src[sIndex++] >> bsShift) & bsMask;

            if (!bsMap.ContainsKey(bsIdx))
            {
                throw new Exception("invalid block size");
            }

            if (useContentSize)
            {
                // TODO: read content size
                sIndex += 8;
            }

            // Skip the header checksum byte.
            sIndex++;

            // Read blocks.
            while (true)
            {
                UInt32 compSize;

                compSize = Util.readU32(src, (int)sIndex);
                sIndex += 4;

                // A zero size is the end mark.
                if (compSize == 0)
                {
                    break;
                }

                // Check if block is compressed
                if ((compSize & bsUncompressed) != 0)
                {
                    // Mask off the 'uncompressed' bit
                    compSize &= ~bsUncompressed;

                    // Copy uncompressed data into destination buffer.
                    for (var j = 0; j < compSize; j++)
                    {
                        dst[dIndex++] = src[sIndex++];
                    }
                }
                else
                {
                    // Decompress straight into dst; matches may refer back to
                    // output produced by earlier blocks.
                    dIndex = decompressBlock(src, dst, sIndex, compSize, dIndex);
                    sIndex += compSize;
                }

                // The LZ4 frame format stores the optional block checksum AFTER
                // the block data, so it is skipped here rather than before the data.
                if (useBlockSum)
                {
                    // TODO: read block checksum
                    sIndex += 4;
                }
            }

            if (useContentSum)
            {
                // TODO: read content checksum
                sIndex += 4;
            }

            return dIndex;
        }

        // Compresses data to an Lz4 frame.
        // Writes the frame header (magic number, descriptor, header checksum byte),
        // then the input split into blocks of at most bsMap[bsDefault] entries,
        // then a zero-sized end mark. A block is stored uncompressed when
        // compressBlock returns 0 or a size larger than the block itself.
        // Returns the number of entries written to dst. dst must be large enough;
        // no bounds checks are performed here.
        static UInt32 compressFrame(UInt32[] src, UInt32[] dst)
        {
            var dIndex = 0;

            // Write magic number.
            Util.writeU32(dst, dIndex, magicNum);
            dIndex += 4;

            // Descriptor flags.
            dst[dIndex++] = fdVersion;
            dst[dIndex++] = bsDefault << bsShift;

            // Descriptor checksum: second byte of the xxh32 hash of the two
            // descriptor entries. Shift-and-mask is used instead of a narrowing
            // cast so the expression cannot throw in a checked build.
            UInt32 x32 = Xxh32.xxh32(0, dst, 4, dIndex - 4);
            dst[dIndex++] = (x32 >> 8) & 0xff;

            // Write blocks.
            var maxBlockSize = bsMap[bsDefault];
            var remaining = src.Length;
            var sIndex = 0;

            // Clear the hashtable.
            clearHashTable(hashTable);

            // Split input into blocks and write.
            while (remaining > 0)
            {
                int blockSize = remaining > maxBlockSize ? (int)maxBlockSize : remaining;

                UInt32 compSize = compressBlock(src, blockBuf, sIndex, blockSize, hashTable);

                if (compSize > blockSize || compSize == 0)
                {
                    // Output uncompressed: size field carries the 'uncompressed' flag.
                    Util.writeU32(dst, dIndex, bsUncompressed | (UInt32)blockSize);
                    dIndex += 4;

                    // This loop also advances sIndex past the block.
                    for (var z = sIndex + blockSize; sIndex < z;)
                    {
                        dst[dIndex++] = src[sIndex++];
                    }
                }
                else
                {
                    // Output compressed.
                    Util.writeU32(dst, dIndex, compSize);
                    dIndex += 4;

                    for (var j = 0; j < compSize;)
                    {
                        dst[dIndex++] = blockBuf[j++];
                    }

                    sIndex += blockSize;
                }

                remaining -= blockSize;
            }

            // Write blank end block.
            Util.writeU32(dst, dIndex, 0);
            dIndex += 4;

            return (UInt32)dIndex;
        }

        // Decompresses a buffer containing an Lz4 frame. maxSize is optional; if not
        // provided, a maximum size will be determined by examining the data. The
        // buffer returned will always be perfectly-sized.
        // A maxSize of 0 means "not provided": the output capacity is then taken
        // from decompressBound(src). The result is trimmed to the number of
        // entries decompressFrame actually produced.
        public static UInt32[] decompress(UInt32[] src, UInt64 maxSize = 0)
        {
            UInt64 capacity = maxSize != 0 ? maxSize : decompressBound(src);

            var output = new UInt32[capacity];
            UInt32 produced = decompressFrame(src, output);

            // Already exactly full: hand the buffer back without copying.
            if (produced == capacity)
            {
                return output;
            }

            return sliceArray(output, 0, (int)produced);
        }

        // Compresses a buffer to an Lz4 frame. maxSize is optional; if not provided,
        // a buffer will be created based on the theoretical worst output size for a
        // given input size. The buffer returned will always be perfectly-sized.
        // A maxSize of 0 means "not provided": the output capacity is then taken
        // from compressBound(src.Length). The result is trimmed to the number of
        // entries compressFrame actually wrote.
        public static UInt32[] compress(UInt32[] src, UInt32 maxSize = 0)
        {
            UInt32 capacity = maxSize != 0 ? maxSize : compressBound((UInt32)src.Length);

            var frame = new UInt32[capacity];
            UInt32 written = compressFrame(src, frame);

            // Already exactly full: hand the buffer back without copying.
            if (written == capacity)
            {
                return frame;
            }

            return sliceArray(frame, 0, (int)written);
        }
    }
}