本文已参与「新人创作礼」活动,一起开启掘金创作之路。
C#重写Lz4 JavaScript文本压缩算法(一) - 掘金 (juejin.cn)
C#重写Lz4 JavaScript文本压缩算法(二) - 掘金 (juejin.cn)
C#重写Lz4 JavaScript文本压缩算法(三) - 掘金 (juejin.cn)
lz4.js -> Lz4.cs
using System;
using System.Collections.Generic;
namespace Lz4CSharp
{
class Lz4
{
// Lz4.cs - A C# port of lz4.js, an implementation of Lz4 in plain JavaScript.
//
// TODO:
// - Unify header parsing/writing.
// - Support options (block size, checksums)
// - Support streams
// - Better error handling (handle bad offset, etc.)
// - HC support (better search algorithm)
// - Tests/benchmarking
// Constants
// --
// Compression format parameters/constants.
// Shortest match the format encodes; decompressBlock adds it back to every decoded match length.
private static readonly uint minMatch = 4;
// Blocks shorter than this are emitted without searching for matches.
private static readonly uint minLength = 13;
// Number of trailing positions of a block that are excluded from match searching and extension.
private static readonly uint searchLimit = 5;
// Governs how fast the search stride grows after repeated misses (stride = missCounter >> skipTrigger).
private static readonly int skipTrigger = 6;
// Number of slots in the match hash table.
private static readonly uint hashSize = 1 << 16;

// Token layout: the low mlBits bits carry the match length, the bits above them the literal run length.
private static readonly int mlBits = 4;
private static readonly int mlMask = (1 << mlBits) - 1;
private static readonly int runBits = 4;
private static readonly int runMask = (1 << runBits) - 1;

// Shared scratch buffers: blockBuf receives one compressed block, hashTable backs the match search.
private static readonly uint[] blockBuf = new uint[5 << 20];
private static readonly uint[] hashTable = new uint[hashSize];

// Magic number that opens every frame.
private static readonly uint magicNum = 0x184D2204;

// Frame descriptor flag bits.
private static readonly uint fdContentChksum = 0x4;
private static readonly uint fdContentSize = 0x8;
private static readonly uint fdBlockChksum = 0x10;
// The block-independence flag (0x20) is not used by this port.
private static readonly uint fdVersion = 0x40;
private static readonly uint fdVersionMask = 0xC0;

// Block size handling.
// High bit of a block-size word: set when the block is stored uncompressed.
private static readonly uint bsUncompressed = 0x80000000;
// Block-size index written by compressFrame.
private static readonly uint bsDefault = 7;
// Shift and mask that extract the block-size index from the second descriptor byte.
private static readonly int bsShift = 4;
private static readonly uint bsMask = 7;
// Maps the block-size index found in the frame descriptor to the maximum
// block size. The table is filled by its initializer so that the class works
// even when Init() is never called.
static Dictionary<UInt32, UInt32> bsMap = new Dictionary<UInt32, UInt32>
{
    { 4, 0x10000 },
    { 5, 0x40000 },
    { 6, 0x100000 },
    { 7, 0x400000 },
};
// Kept for backward compatibility with callers that invoke it before using
// the class. The block-size table is already populated by its initializer,
// so this is a no-op and may be called any number of times (the previous
// implementation threw on the second call because of duplicate keys).
public static void Init()
{
}
// Utility functions/primitives
// --
// Clear hashtable.
// Resets every slot of the given table to zero.
// table: the hash table to clear; must not be null.
// The previous version ignored this parameter and always cleared the shared
// static table; clearing the argument gives the same result for the existing
// caller (which passes the shared table) and is correct for any other array.
static void clearHashTable(UInt32[] table)
{
    Array.Clear(table, 0, table.Length);
}
// Returns a copy of array[start..end) with the semantics of JavaScript's
// Uint8Array#slice:
//   - a negative start/end counts back from the end of the array;
//   - both bounds are clamped to [0, array.Length];
//   - when the resolved start is at or after the resolved end, the result is
//     an empty array (the previous version threw on a negative length).
// array: source array, left unmodified.
// start: inclusive start index (may be negative).
// end:   exclusive end index (may be negative).
static UInt32[] sliceArray(UInt32[] array, int start, int end)
{
    int len = array.Length;
    // Resolve negative indices relative to the end, then clamp into range.
    int from = (start < 0) ? Math.Max(len + start, 0) : Math.Min(start, len);
    int to = (end < 0) ? Math.Max(len + end, 0) : Math.Min(end, len);
    // An inverted range yields an empty slice rather than a negative size.
    int count = Math.Max(to - from, 0);
    var arraySlice = new UInt32[count];
    Array.Copy(array, from, arraySlice, 0, count);
    return arraySlice;
}
// Implementation
// --
// Calculates an upper bound for lz4 compression.
// Worst-case size estimate for compressing n input elements:
// the input itself, one extra element per 255 of input, plus 16 of slack.
static UInt32 compressBound(UInt32 n)
{
    UInt32 lengthOverhead = n / 255;
    UInt32 bound = n + lengthOverhead + 16;
    return bound;
}
// Calculates an upper bound for lz4 decompression, by reading the data.
// Computes an upper bound for the decompressed size of the frame in src by
// reading only the frame header and the block headers.
//
// If the descriptor carries the content-size flag, the 64-bit value stored in
// the header is returned. Otherwise every block header is visited: an
// uncompressed block contributes its exact size, a compressed block
// contributes the frame's maximum block size.
//
// Throws Exception when the magic number, the descriptor version, or the
// block-size index is not recognised.
static UInt64 decompressBound(UInt32[] src)
{
    int sIndex = 0;
    // Read magic number
    if (Util.readU32(src, sIndex) != magicNum)
    {
        throw new Exception("invalid magic number");
    }
    sIndex += 4;
    // Read descriptor
    var descriptor = src[sIndex++];
    // Check version
    if ((descriptor & fdVersionMask) != fdVersion)
    {
        throw new Exception($"incompatible descriptor version {descriptor & fdVersionMask}");
    }
    // Read flags
    var useBlockSum = (descriptor & fdBlockChksum) != 0;
    var useContentSize = (descriptor & fdContentSize) != 0;
    // Read block size (single lookup instead of ContainsKey + indexer)
    UInt32 bsIdx = (src[sIndex++] >> bsShift) & bsMask;
    UInt32 maxBlockSize;
    if (!bsMap.TryGetValue(bsIdx, out maxBlockSize))
    {
        throw new Exception($"invalid block size {bsIdx}");
    }
    // Get content size
    if (useContentSize)
    {
        return Util.readU64(src, sIndex);
    }
    // Skip the header checksum
    sIndex++;
    // Walk the block headers. The total is accumulated in 64 bits so that it
    // cannot wrap before being returned as UInt64.
    UInt64 maxSize = 0;
    while (true)
    {
        var blockSize = Util.readU32(src, sIndex);
        sIndex += 4;
        if ((blockSize & bsUncompressed) != 0)
        {
            blockSize &= ~bsUncompressed;
            maxSize += blockSize;
        }
        else if (blockSize > 0)
        {
            // Only real compressed blocks count; the zero-sized end mark
            // carries no data and must not add a full block to the bound.
            maxSize += maxBlockSize;
        }
        if (blockSize == 0)
        {
            return maxSize;
        }
        // Skip the block checksum (if present) and the block payload.
        if (useBlockSum)
        {
            sIndex += 4;
        }
        sIndex += (int)blockSize;
    }
}
// Decompresses a block of Lz4.
// Decodes one Lz4 block.
//
// src:     input array; the block occupies src[sIndex .. sIndex + sLength).
// dst:     output array; decoded values are written starting at dIndex.
//          Matches may refer back to anything already present in dst before
//          dIndex.
// sIndex:  position of the first element of the block in src.
// sLength: number of elements in the block.
// dIndex:  position in dst at which to start writing.
// Returns the position in dst just past the last element written.
//
// Throws System.IO.InvalidDataException when a literal run extends past the
// end of the block, or when a match offset is zero or points before the start
// of dst. Previously such offsets wrapped around in unsigned arithmetic and
// either silently skipped the copy or indexed unrelated memory.
static UInt32 decompressBlock(UInt32[] src, UInt32[] dst, UInt32 sIndex, UInt32 sLength, UInt32 dIndex)
{
    UInt32 sEnd = sIndex + sLength;
    // Consume entire input block.
    while (sIndex < sEnd)
    {
        UInt32 token = src[sIndex++];
        // High four bits of the token: literal run length.
        UInt32 literalCount = token >> 4;
        if (literalCount > 0)
        {
            // A value of 15 means the length continues in following
            // elements, each added to the total, until one is not 0xff.
            if (literalCount == 0xf)
            {
                UInt32 extra;
                do
                {
                    extra = src[sIndex++];
                    literalCount += extra;
                } while (extra == 0xff);
            }
            UInt32 literalEnd = sIndex + literalCount;
            // Reject runs that leave the block (or wrap around).
            if (literalEnd > sEnd || literalEnd < sIndex)
            {
                throw new System.IO.InvalidDataException("literal run exceeds block");
            }
            // Copy literals
            while (sIndex < literalEnd)
            {
                dst[dIndex++] = src[sIndex++];
            }
        }
        // The last sequence of a block consists of literals only.
        if (sIndex >= sEnd)
        {
            break;
        }
        // Low four bits of the token: match length (before adding minMatch).
        UInt32 mLength = token & 0xf;
        // Offset is stored as two elements, low part first.
        UInt32 mOffset = src[sIndex++] | (src[sIndex++] << 8);
        // Extended match length, same scheme as for literals.
        if (mLength == 0xf)
        {
            UInt32 extra;
            do
            {
                extra = src[sIndex++];
                mLength += extra;
            } while (extra == 0xff);
        }
        mLength += minMatch;
        // The match must start inside the data already written to dst.
        if (mOffset == 0 || mOffset > dIndex)
        {
            throw new System.IO.InvalidDataException("invalid match offset");
        }
        // Copy match element by element: source and destination may overlap,
        // in which case just-written values are intentionally re-read.
        UInt32 mPos = dIndex - mOffset;
        UInt32 mEnd = mPos + mLength;
        while (mPos < mEnd)
        {
            dst[dIndex++] = dst[mPos++];
        }
    }
    return dIndex;
}
// Compresses a block with Lz4.
// Compresses src[sIndex .. sIndex + sLength) into dst (written from index 0).
//
// src:       input array.
// dst:       output array for the compressed block.
// sIndex:    position of the first input element of the block.
// sLength:   number of input elements in the block.
// hashTable: table mapping a 16-bit hash of four input elements to
//            (position + 1); zero marks an empty slot.
// Returns the number of elements written to dst, or 0 when nothing was
// encoded (the anchor never moved away from position 0).
//
// Fix over the previous version: the low part of the match offset is now
// masked to 8 bits, so every element written to dst is a byte value. Before,
// the first offset element could hold the whole 16-bit offset.
static UInt32 compressBlock(UInt32[] src, UInt32[] dst, int sIndex, int sLength, UInt32[] hashTable)
{
    int mIndex, mAnchor, mLength, mOffset, mStep;
    int literalCount, dIndex, sEnd, n;
    // Setup initial state.
    dIndex = 0;
    sEnd = sLength + sIndex;
    mAnchor = sIndex;
    // Process only if block is large enough.
    if (sLength >= minLength)
    {
        // Miss counter; the search stride is (counter >> skipTrigger), so it
        // starts at 1 and grows as misses accumulate.
        var searchMatchCount = (1 << skipTrigger) + 3;
        // Consume until last n literals (Lz4 spec limitation.)
        while (sIndex + minMatch < sEnd - searchLimit)
        {
            var seq = Util.readU32(src, (int)sIndex);
            var hash = Util.UInt32MoveRight(Util.hashU32(seq), 0);
            // Crush hash to 16 bits.
            hash = Util.UInt32MoveRight((hash >> 16) ^ hash , 0) & 0xffff;
            // Look for a match in the hashtable. NOTE: remove one; see below.
            mIndex = (int)hashTable[hash] - 1;
            // Put pos in hash table. NOTE: add one so that zero = invalid.
            hashTable[hash] = (UInt32)sIndex + 1;
            // Determine if there is a match (within range.)
            if (mIndex < 0 || Util.UInt32MoveRight((UInt32)(sIndex - mIndex) , 16) > 0 || Util.readU32(src, (int)mIndex) != seq)
            {
                mStep = searchMatchCount++ >> skipTrigger;
                sIndex += mStep;
                continue;
            }
            searchMatchCount = (1 << skipTrigger) + 3;
            // Calculate literal count and offset.
            literalCount = sIndex - mAnchor;
            mOffset = sIndex - mIndex;
            // We've already matched one word, so get that out of the way.
            sIndex += (int)minMatch;
            mIndex += (int)minMatch;
            // Determine match length.
            // N.B.: mLength does not include minMatch, Lz4 adds it back
            // in decoding.
            mLength = sIndex;
            while (sIndex < sEnd - searchLimit && src[sIndex] == src[mIndex])
            {
                sIndex++;
                mIndex++;
            }
            mLength = sIndex - mLength;
            // Write token + literal count.
            int token = mLength < mlMask ? mLength : mlMask;
            if (literalCount >= runMask)
            {
                dst[dIndex++] = (UInt32)((runMask << mlBits) + token);
                for (n = literalCount - runMask; n >= 0xff; n -= 0xff)
                {
                    dst[dIndex++] = 0xff;
                }
                dst[dIndex++] = (UInt32)n;
            }
            else
            {
                dst[dIndex++] = (UInt32)((literalCount << mlBits) + token);
            }
            // Write literals.
            for (var i = 0; i < literalCount; i++)
            {
                dst[dIndex++] = src[mAnchor + i];
            }
            // Write offset, low part first, each part limited to 8 bits.
            dst[dIndex++] = (UInt32)(mOffset & 0xff);
            dst[dIndex++] = (UInt32)((mOffset >> 8) & 0xff);
            // Write match length.
            if (mLength >= mlMask)
            {
                for (n = mLength - mlMask; n >= 0xff; n -= 0xff)
                {
                    dst[dIndex++] = 0xff;
                }
                dst[dIndex++] = (UInt32)n;
            }
            // Move the anchor.
            mAnchor = sIndex;
        }
    }
    // Nothing was encoded.
    if (mAnchor == 0)
    {
        return 0;
    }
    // Write remaining literals.
    // Write literal token+count.
    literalCount = sEnd - mAnchor;
    if (literalCount >= runMask)
    {
        dst[dIndex++] = (UInt32)(runMask << mlBits);
        for (n = literalCount - runMask; n >= 0xff; n -= 0xff)
        {
            dst[dIndex++] = 0xff;
        }
        dst[dIndex++] = (UInt32)n;
    }
    else
    {
        dst[dIndex++] = (UInt32)(literalCount << mlBits);
    }
    // Write literals.
    sIndex = mAnchor;
    while (sIndex < sEnd)
    {
        dst[dIndex++] = src[sIndex++];
    }
    return (UInt32)dIndex;
}
// Decompresses a frame of Lz4 data.
// Decodes the Lz4 frame in src into dst.
//
// src: array holding the complete frame (header, blocks, end mark).
// dst: output array; must be large enough for the decoded content.
// Returns the number of elements written to dst.
//
// Throws Exception when the magic number, the descriptor version, or the
// block-size index is not recognised.
//
// Fix over the previous version: the optional block checksum is skipped AFTER
// the block data. The frame format stores it behind the data, so skipping it
// first made every checksummed block decode from the wrong position.
// Checksums are skipped, not verified.
static UInt32 decompressFrame(UInt32[] src, UInt32[] dst)
{
    UInt32 sIndex = 0;
    UInt32 dIndex = 0;
    // Read magic number
    if (Util.readU32(src, (int)sIndex) != magicNum)
    {
        throw new Exception("invalid magic number");
    }
    sIndex += 4;
    // Read descriptor
    UInt32 descriptor = src[sIndex++];
    // Check version
    if ((descriptor & fdVersionMask) != fdVersion)
    {
        throw new Exception("incompatible descriptor version");
    }
    // Read flags
    bool useBlockSum = (descriptor & fdBlockChksum) != 0;
    bool useContentSum = (descriptor & fdContentChksum) != 0;
    bool useContentSize = (descriptor & fdContentSize) != 0;
    // Read block size
    var bsIdx = (src[sIndex++] >> bsShift) & bsMask;
    if (!bsMap.ContainsKey(bsIdx))
    {
        throw new Exception("invalid block size");
    }
    if (useContentSize)
    {
        // TODO: read content size
        sIndex += 8;
    }
    // Skip the header checksum
    sIndex++;
    // Read blocks.
    while (true)
    {
        UInt32 compSize = Util.readU32(src, (int)sIndex);
        sIndex += 4;
        // A zero size word is the end mark.
        if (compSize == 0)
        {
            break;
        }
        // Check if block is compressed
        if ((compSize & bsUncompressed) != 0)
        {
            // Mask off the 'uncompressed' bit
            compSize &= ~bsUncompressed;
            // Copy uncompressed data into destination buffer.
            for (UInt32 j = 0; j < compSize; j++)
            {
                dst[dIndex++] = src[sIndex++];
            }
        }
        else
        {
            // Decompress straight into dst, continuing at dIndex.
            dIndex = decompressBlock(src, dst, sIndex, compSize, dIndex);
            sIndex += compSize;
        }
        if (useBlockSum)
        {
            // TODO: read block checksum (stored after the block data)
            sIndex += 4;
        }
    }
    if (useContentSum)
    {
        // TODO: read content checksum
        sIndex += 4;
    }
    return dIndex;
}
// Compresses data to an Lz4 frame.
// Writes src to dst as a complete Lz4 frame: magic number, a two-element
// descriptor followed by one header-checksum element, the data split into
// blocks of at most the default block size, and a zero end mark.
// Each block is stored compressed when that does not make it larger than the
// input chunk, and uncompressed otherwise.
// Returns the number of elements written to dst.
static UInt32 compressFrame(UInt32[] src, UInt32[] dst)
{
    int outPos = 0;

    // Frame header.
    Util.writeU32(dst, outPos, magicNum);
    outPos += 4;
    dst[outPos++] = fdVersion;
    dst[outPos++] = bsDefault << bsShift;
    // Header checksum: second-lowest byte of the hash over the descriptor.
    UInt32 headerHash = Xxh32.xxh32(0, dst, 4, outPos - 4);
    dst[outPos++] = (UInt32)((UInt16)headerHash >> 8);

    var blockLimit = bsMap[bsDefault];
    int inPos = 0;
    int left = src.Length;

    // Start every frame with an empty match table.
    clearHashTable(hashTable);

    while (left > 0)
    {
        int chunk = left <= blockLimit ? left : (int)blockLimit;
        UInt32 packedSize = compressBlock(src, blockBuf, inPos, chunk, hashTable);
        bool storeRaw = packedSize == 0 || packedSize > chunk;
        if (storeRaw)
        {
            // Size word with the high bit set marks an uncompressed block.
            Util.writeU32(dst, outPos, (UInt32)(0x80000000 | chunk));
            outPos += 4;
            int chunkEnd = inPos + chunk;
            while (inPos < chunkEnd)
            {
                dst[outPos++] = src[inPos++];
            }
        }
        else
        {
            Util.writeU32(dst, outPos, packedSize);
            outPos += 4;
            for (int k = 0; k < packedSize; k++)
            {
                dst[outPos++] = blockBuf[k];
            }
            inPos += chunk;
        }
        left -= chunk;
    }

    // End mark.
    Util.writeU32(dst, outPos, 0);
    outPos += 4;
    return (UInt32)outPos;
}
// Decompresses a buffer containing an Lz4 frame. maxSize is optional; if not
// provided, a maximum size will be determined by examining the data. The
// buffer returned will always be perfectly-sized.
// Decodes the Lz4 frame in src and returns the result in an array of exactly
// the decoded length.
// maxSize: capacity of the working buffer; when 0, it is derived from the
//          frame itself via decompressBound.
public static UInt32[] decompress(UInt32[] src, UInt64 maxSize = 0)
{
    UInt64 capacity = maxSize != 0 ? maxSize : decompressBound(src);
    UInt32[] buffer = new UInt32[capacity];
    UInt32 written = decompressFrame(src, buffer);
    // Trim only when the buffer was larger than what was actually produced.
    if (written == capacity)
    {
        return buffer;
    }
    return sliceArray(buffer, 0, (int)written);
}
// Compresses a buffer to an Lz4 frame. maxSize is optional; if not provided,
// a buffer will be created based on the theoretical worst output size for a
// given input size. The buffer returned will always be perfectly-sized.
// Encodes src as an Lz4 frame and returns it in an array of exactly the
// encoded length.
// maxSize: capacity of the working buffer; when 0, compressBound of the input
//          length is used.
public static UInt32[] compress(UInt32[] src, UInt32 maxSize = 0)
{
    UInt32 capacity = maxSize != 0 ? maxSize : compressBound((UInt32)src.Length);
    UInt32[] buffer = new UInt32[capacity];
    UInt32 written = compressFrame(src, buffer);
    // Trim only when the buffer was larger than what was actually produced.
    if (written == capacity)
    {
        return buffer;
    }
    return sliceArray(buffer, 0, (int)written);
}
}
}