背景
前面我们介绍的两种DocValues都是数值类型的。今天我们要开始介绍字符串类型的DocValues。字符串类型的DocValues在存储的时候都是字节数组,其实也就是二进制,所以在Lucene中叫做BinaryDocValues。当然还有其他跟字符串相关的DocValues,本文先介绍最简单的BinaryDocValues。注意:一个文档不能有同名的BinaryDocValues。
/**
 * Minimal demo that indexes a single document carrying one {@code BinaryDocValuesField}.
 *
 * <p>Compound files are disabled on the writer config. The directory and the writer are managed
 * by try-with-resources so that both are closed even if indexing throws.
 */
public class DocValueDemo {
  /**
   * Writes one document with the binary doc value {@code "zjc"} under the field {@code "name"}.
   *
   * @param args unused
   * @throws IOException if opening the directory or writing the index fails
   */
  public static void main(String[] args) throws IOException {
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
    indexWriterConfig.setUseCompoundFile(false);
    // Resources are closed in reverse order: the writer first, then the directory.
    try (Directory directory =
            FSDirectory.open(new File("D:\\code\\lucene-9.1.0-learning\\data").toPath());
        IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
      Document document = new Document();
      // A document cannot hold two BinaryDocValuesFields with the same name; the value is raw
      // bytes.
      document.add(
          new BinaryDocValuesField("name", new BytesRef("zjc".getBytes(StandardCharsets.UTF_8))));
      indexWriter.addDocument(document);
      indexWriter.flush();
      indexWriter.commit();
    }
  }
}
前置知识
本文涉及到的一些知识在之前的文章中都做了详细的介绍,后续碰到不会重复介绍。
- DirectMonotonicWriter:用来压缩存储单调递增的long集合,详见《多值编码压缩算法》
文件格式
dvm
整体结构
字段详解
- FieldNumber:字段编号
- DocValueType:DocValues类型编号
- DataOffset:数据在dvd文件的起始位置
- DataLength:数据在dvd文件的总长度
- DocsWithField:存在此field的docID集合,分为3种情况:
- 所有doc都不包含
- DocsWithFieldOffset:-2,表示所有doc都不包含情况
- DocsWithFieldLength:0
- JumpTableEntryCount:-1
- DenseRankPower:-1
- 所有doc都包含
- DocsWithFieldOffset:-1,表示所有的doc都包含情况
- DocsWithFieldLength:0
- JumpTableEntryCount:-1
- DenseRankPower:-1
- 部分doc包含:使用IndexedDISI来存储
- DocsWithFieldOffset:IndexedDISI在dvd文件中的起始位置
- DocsWithFieldLength:IndexedDISI在dvd文件中的总长度
- JumpTableEntryCount:IndexedDISI中jump的参数
- DenseRankPower:IndexedDISI中rank的参数
- 所有doc都不包含
- NumDocsWithField:包含此字段的doc数
- MinLength:最短的BinaryDocValues的长度
- MaxLength:最长的BinaryDocValues的长度
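- LengthDataOffset:每个BinaryDocValues长度信息在dvd文件的起始位置(只有MinLength < MaxLength时才存在)。长度信息使用DirectMonotonicWriter存储,一共NumDocsWithField + 1个值:第0个位置存储0,第i个位置(i ≥ 1)存储value0.length + value1.length + ... + value(i-1).length,也就是第i个value在Bytes中的起始偏移,因此第i个value的长度等于第i+1个位置的值减去第i个位置的值。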
- BlockShift:DirectMonotonicWriter的参数
- LengthBlockMetas:DirectMonotonicWriter的元信息
- LengthDataLength:长度信息在dvd文件中的总长度
dvd
整体结构
字段详解
- Bytes:所有的binary都存储在一起
- binary:doc的本字段的BinaryDocValues
源码解析
构建
数据收集
BinaryDocValues临时存储的工具是PagedBytes,它逻辑上就是一个byte数组,只是为了避免大数组对内存不友好,所以分成多个block,或者说page。
在持久化的时候,会根据是否需要对doc进行排序,把BinaryDocValues的值和包含该field的docID集合封装到BufferedBinaryDocValues或者SortingBinaryDocValues中。
BinaryDocValuesWriter
/**
 * Buffers the binary doc values of a single field in memory until they are flushed.
 *
 * <p>Value bytes are appended to a {@code PagedBytes}, the length of every value to a {@code
 * PackedLongValues.Builder}, and the owning doc IDs to a {@code DocsWithFieldSet}.
 */
class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
  /** Upper bound on the length of a single binary value. */
  private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;

  // 4 kB block sizes for PagedBytes storage:
  private static final int BLOCK_BITS = 12;

  /** Logically one byte array, split into blocks (pages) to avoid one huge allocation. */
  private final PagedBytes bytes;

  /** Output stream that appends to {@link #bytes}. */
  private final DataOutput bytesOut;

  private final Counter iwBytesUsed;

  /** Collects the length of every added value until {@link #finalLengths} is built. */
  private final PackedLongValues.Builder lengths;

  /** Doc IDs that have a value for this field. */
  private DocsWithFieldSet docsWithField;

  private final FieldInfo fieldInfo;
  private long bytesUsed;
  private int lastDocID = -1;
  private int maxLength = 0;

  /** Built form of {@link #lengths}; created lazily on first read or flush. */
  private PackedLongValues finalLengths;

  BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
    this.fieldInfo = fieldInfo;
    this.iwBytesUsed = iwBytesUsed;
    this.bytes = new PagedBytes(BLOCK_BITS);
    this.bytesOut = bytes.getDataOutput();
    this.lengths = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
    this.docsWithField = new DocsWithFieldSet();
    // Initial accounting covers only the lengths builder and the doc-ID set.
    this.bytesUsed = lengths.ramBytesUsed() + docsWithField.ramBytesUsed();
    iwBytesUsed.addAndGet(bytesUsed);
  }

  /**
   * Buffers {@code value} as the binary doc value of {@code docID}.
   *
   * @param docID document the value belongs to; must be greater than the previously added one
   * @param value bytes to store; must not be {@code null} nor longer than {@code MAX_LENGTH}
   * @throws IllegalArgumentException if the doc already has a value, or the value is {@code null}
   *     or too long
   * @throws RuntimeException wrapping an {@code IOException} raised while buffering the bytes
   */
  public void addValue(int docID, BytesRef value) {
    if (docID <= lastDocID) {
      throw new IllegalArgumentException(
          "DocValuesField \""
              + fieldInfo.name
              + "\" appears more than once in this document (only one value is allowed per field)");
    }
    if (value == null) {
      throw new IllegalArgumentException(
          "field=\"" + fieldInfo.name + "\": null value not allowed");
    }
    if (value.length > MAX_LENGTH) {
      throw new IllegalArgumentException(
          "DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + MAX_LENGTH);
    }

    // Track the longest value seen so far.
    maxLength = Math.max(value.length, maxLength);
    // Remember the length of this value.
    lengths.add(value.length);
    // Append the value bytes themselves.
    try {
      bytesOut.writeBytes(value.bytes, value.offset, value.length);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
    // Mark the doc as having a value for this field.
    docsWithField.add(docID);
    updateBytesUsed();
    lastDocID = docID;
  }

  /** Reports the change in buffered RAM since the last call to the shared counter. */
  private void updateBytesUsed() {
    final long current =
        lengths.ramBytesUsed() + bytes.ramBytesUsed() + docsWithField.ramBytesUsed();
    final long delta = current - bytesUsed;
    iwBytesUsed.addAndGet(delta);
    bytesUsed = current;
  }

  /** Returns an iterator over the buffered values, building the packed lengths if needed. */
  @Override
  BinaryDocValues getDocValues() {
    if (finalLengths == null) {
      finalLengths = this.lengths.build();
    }
    // Wrap the buffered state in a BufferedBinaryDocValues.
    return new BufferedBinaryDocValues(
        finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
  }

  /**
   * Hands the buffered values to {@code dvConsumer}.
   *
   * <p>When {@code sortMap} is non-null the values are first wrapped in a {@code BinaryDVs} built
   * from that map and exposed through {@code SortingBinaryDocValues}; otherwise they are exposed
   * through {@code BufferedBinaryDocValues}.
   *
   * @throws IOException if the consumer fails to write the field
   */
  @Override
  public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
      throws IOException {
    // Freeze the paged byte buffer.
    bytes.freeze(false);
    if (finalLengths == null) {
      finalLengths = this.lengths.build();
    }

    final BinaryDVs sorted;
    if (sortMap == null) {
      sorted = null;
    } else {
      sorted =
          new BinaryDVs(
              state.segmentInfo.maxDoc(),
              sortMap,
              new BufferedBinaryDocValues(
                  finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
    }

    dvConsumer.addBinaryField(
        fieldInfo,
        new EmptyDocValuesProducer() {
          @Override
          public BinaryDocValues getBinary(FieldInfo fieldInfoIn) {
            if (fieldInfoIn != fieldInfo) {
              throw new IllegalArgumentException("wrong fieldInfo");
            }
            if (sorted != null) {
              return new SortingBinaryDocValues(sorted);
            }
            // A fresh iterator is created on every call.
            return new BufferedBinaryDocValues(
                finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
          }
        });
  }
}
BufferedBinaryDocValues
/**
 * Forward-only {@code BinaryDocValues} view over buffered values.
 *
 * <p>Only {@link #nextDoc()} iteration is supported; {@code advance} and {@code advanceExact}
 * throw {@code UnsupportedOperationException}.
 */
private static class BufferedBinaryDocValues extends BinaryDocValues {
  /** Reusable buffer holding the value of the current doc. */
  final BytesRefBuilder value;

  final PackedLongValues.Iterator lengthsIterator;
  final DocIdSetIterator docsWithField;
  final DataInput bytesIterator;

  BufferedBinaryDocValues(
      PackedLongValues lengths,
      int maxLength,
      DataInput bytesIterator,
      DocIdSetIterator docsWithFields) {
    // Size the buffer once, up front, for the longest value.
    this.value = new BytesRefBuilder();
    this.value.grow(maxLength);
    this.lengthsIterator = lengths.iterator();
    this.bytesIterator = bytesIterator;
    this.docsWithField = docsWithFields;
  }

  @Override
  public int docID() {
    return docsWithField.docID();
  }

  /** Moves to the next doc that has a value and loads that value into the reusable buffer. */
  @Override
  public int nextDoc() throws IOException {
    final int next = docsWithField.nextDoc();
    if (next == NO_MORE_DOCS) {
      return next;
    }
    // The length and byte streams are consumed in lock-step with the doc iterator.
    final int length = Math.toIntExact(lengthsIterator.next());
    value.setLength(length);
    bytesIterator.readBytes(value.bytes(), 0, length);
    return next;
  }

  @Override
  public int advance(int target) {
    throw new UnsupportedOperationException();
  }

  @Override
  public boolean advanceExact(int target) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public long cost() {
    return docsWithField.cost();
  }

  @Override
  public BytesRef binaryValue() {
    return value.get();
  }
}
SortingBinaryDocValues
/**
 * Forward-only {@code BinaryDocValues} view over a {@code BinaryDVs}.
 *
 * <p>A doc is treated as having a value when its entry in {@code dvs.offsets} is greater than
 * zero; that entry minus one is the index passed to {@code dvs.values.get}.
 */
static class SortingBinaryDocValues extends BinaryDocValues {
  /** Wraps the sorted docs together with their doc values. */
  private final BinaryDVs dvs;

  private final BytesRefBuilder spare = new BytesRefBuilder();
  private int docID = -1;

  SortingBinaryDocValues(BinaryDVs dvs) {
    this.dvs = dvs;
  }

  /** Advances to the next doc whose offset entry is positive, or to {@code NO_MORE_DOCS}. */
  @Override
  public int nextDoc() {
    while (true) {
      docID++;
      if (docID == dvs.offsets.length) {
        docID = NO_MORE_DOCS;
        return docID;
      }
      if (dvs.offsets[docID] > 0) {
        return docID;
      }
    }
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int advance(int target) {
    throw new UnsupportedOperationException("use nextDoc instead");
  }

  @Override
  public boolean advanceExact(int target) throws IOException {
    throw new UnsupportedOperationException("use nextDoc instead");
  }

  /** Loads the value of the current doc into the reusable buffer and returns it. */
  @Override
  public BytesRef binaryValue() {
    final int valueIndex = dvs.offsets[docID] - 1;
    dvs.values.get(spare, valueIndex);
    return spare.get();
  }

  @Override
  public long cost() {
    return dvs.values.size();
  }
}
持久化
-
BinaryDocValues持久化的核心逻辑在addBinaryField方法中,主要做了以下几件事:
- 遍历所有的value,存储到dvd文件中
- 记录本字段的docValue元信息
- 记录存在本字段的docID集合
- 如果并非所有value的长度都相等,则额外存储每个value的长度信息(以累加后的地址形式存储)
/**
 * Writes one binary doc-values field: the value bytes go to {@code data}, the describing entry
 * goes to {@code meta}.
 *
 * <p>The producer is asked for a fresh iterator for each pass: once for the value bytes, once
 * for the doc-ID set (only when some but not all docs have a value), and once for the addresses
 * (only when the values differ in length).
 *
 * @param field field being written
 * @param valuesProducer source of the values; {@code getBinary(field)} is called once per pass
 * @throws IOException if writing to {@code meta} or {@code data} fails
 */
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  meta.writeInt(field.number);
  meta.writeByte(Lucene90DocValuesFormat.BINARY);

  // Pass 1: copy every value into the data file while collecting count and min/max length.
  BinaryDocValues values = valuesProducer.getBinary(field);
  final long dataStart = data.getFilePointer();
  // Start of the value bytes in the data file.
  meta.writeLong(dataStart);
  int numDocsWithField = 0;
  int minLength = Integer.MAX_VALUE;
  int maxLength = 0;
  while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    numDocsWithField++;
    final BytesRef current = values.binaryValue();
    data.writeBytes(current.bytes, current.offset, current.length);
    minLength = Math.min(current.length, minLength);
    maxLength = Math.max(current.length, maxLength);
  }
  assert numDocsWithField <= maxDoc;
  // Total length of the value bytes.
  meta.writeLong(data.getFilePointer() - dataStart);

  // Doc-ID set of the docs that have this field.
  if (numDocsWithField == 0 || numDocsWithField == maxDoc) {
    // Sentinel offsets: -2 when no doc has a value, -1 when every doc has one.
    meta.writeLong(numDocsWithField == 0 ? -2L : -1L);
    meta.writeLong(0L);
    meta.writeShort((short) -1);
    meta.writeByte((byte) -1);
  } else {
    // Pass 2: only some docs have a value, so write the set with IndexedDISI.
    final long disiStart = data.getFilePointer();
    meta.writeLong(disiStart);
    values = valuesProducer.getBinary(field);
    final short jumpTableEntryCount =
        IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
    meta.writeLong(data.getFilePointer() - disiStart);
    meta.writeShort(jumpTableEntryCount);
    meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
  }

  meta.writeInt(numDocsWithField);
  meta.writeInt(minLength);
  meta.writeInt(maxLength);

  if (minLength < maxLength) {
    // Pass 3: lengths differ, so also store the running end offset of every value.
    final long addressesStart = data.getFilePointer();
    meta.writeLong(addressesStart);
    meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
    final DirectMonotonicWriter addressWriter =
        DirectMonotonicWriter.getInstance(
            meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
    // The first address is 0; each following one adds the length of the next value.
    long address = 0;
    addressWriter.add(address);
    values = valuesProducer.getBinary(field);
    while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      address += values.binaryValue().length;
      addressWriter.add(address);
    }
    addressWriter.finish();
    meta.writeLong(data.getFilePointer() - addressesStart);
  }
}
读取
读取逻辑比较简单:
- 从dvm文件中解析相关的元信息
- 如果所有文档都不包含此字段,则无需继续处理
- 如果所有文档都包含此字段,则可以用maxDoc来判断doc遍历的结束。
- 如果部分文档包含本字段,则使用IndexedDISI来遍历doc。
- 数据读取,如果长度都一样,则从元信息中可以获取长度,否则就需要先解析doc对应的BinaryDocValues的长度,然后再读取value。
/**
 * Reads the metadata entry of one binary doc-values field from {@code meta}.
 *
 * <p>The address fields are only read when {@code minLength < maxLength}, i.e. when the stored
 * values differ in length.
 *
 * @param meta input positioned at the start of the entry's data offset
 * @return the populated entry
 * @throws IOException if reading from {@code meta} fails
 */
private BinaryEntry readBinary(IndexInput meta) throws IOException {
  final BinaryEntry entry = new BinaryEntry();

  // Location of the value bytes.
  entry.dataOffset = meta.readLong();
  entry.dataLength = meta.readLong();

  // Description of the set of docs that have a value.
  entry.docsWithFieldOffset = meta.readLong();
  entry.docsWithFieldLength = meta.readLong();
  entry.jumpTableEntryCount = meta.readShort();
  entry.denseRankPower = meta.readByte();

  entry.numDocsWithField = meta.readInt();
  entry.minLength = meta.readInt();
  entry.maxLength = meta.readInt();

  if (entry.minLength >= entry.maxLength) {
    // No per-value address data follows.
    return entry;
  }

  // Values differ in length: read the address metadata (one more address than values).
  entry.addressesOffset = meta.readLong();
  final long addressCount = entry.numDocsWithField + 1L;
  final int shift = meta.readVInt();
  entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, addressCount, shift);
  entry.addressesLength = meta.readLong();
  return entry;
}
/**
 * Returns an iterator over the binary doc values of {@code field}.
 *
 * <p>Four reader shapes are produced, depending on whether every doc has a value ({@code
 * docsWithFieldOffset == -1}) or only some do, and on whether all values share one length
 * ({@code minLength == maxLength}) or not. A {@code docsWithFieldOffset} of -2 yields an empty
 * iterator. The returned {@code BytesRef} is reused across calls to {@code binaryValue()}.
 *
 * @param field field to read; its entry is looked up by {@code field.name}
 * @throws IOException if slicing or reading the data file fails
 */
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
  final BinaryEntry entry = binaries.get(field.name);

  // No doc has this field.
  if (entry.docsWithFieldOffset == -2) {
    return DocValues.emptyBinary();
  }

  final IndexInput bytesSlice = data.slice("fixed-binary", entry.dataOffset, entry.dataLength);

  if (entry.docsWithFieldOffset == -1) {
    // Every doc has this field, so the doc ID is also the value index.
    if (entry.minLength == entry.maxLength) {
      // Fixed length: the value of a doc starts at doc * length.
      final int length = entry.maxLength;
      return new DenseBinaryDocValues(maxDoc) {
        final BytesRef bytes = new BytesRef(new byte[length], 0, length);

        @Override
        public BytesRef binaryValue() throws IOException {
          bytesSlice.seek((long) doc * length);
          bytesSlice.readBytes(bytes.bytes, 0, length);
          return bytes;
        }
      };
    }

    // Variable length: look up the start and end address of the value first.
    final RandomAccessInput addressesData =
        this.data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
    final LongValues addresses =
        DirectMonotonicReader.getInstance(entry.addressesMeta, addressesData, merging);
    return new DenseBinaryDocValues(maxDoc) {
      final BytesRef bytes = new BytesRef(new byte[entry.maxLength], 0, entry.maxLength);

      @Override
      public BytesRef binaryValue() throws IOException {
        final long startOffset = addresses.get(doc);
        final long endOffset = addresses.get(doc + 1L);
        bytes.length = (int) (endOffset - startOffset);
        bytesSlice.seek(startOffset);
        bytesSlice.readBytes(bytes.bytes, 0, bytes.length);
        return bytes;
      }
    };
  }

  // Only some docs have this field: iterate them with IndexedDISI and use its index as the
  // value index.
  final IndexedDISI disi =
      new IndexedDISI(
          data,
          entry.docsWithFieldOffset,
          entry.docsWithFieldLength,
          entry.jumpTableEntryCount,
          entry.denseRankPower,
          entry.numDocsWithField);

  if (entry.minLength == entry.maxLength) {
    // Fixed length.
    final int length = entry.maxLength;
    return new SparseBinaryDocValues(disi) {
      final BytesRef bytes = new BytesRef(new byte[length], 0, length);

      @Override
      public BytesRef binaryValue() throws IOException {
        bytesSlice.seek((long) disi.index() * length);
        bytesSlice.readBytes(bytes.bytes, 0, length);
        return bytes;
      }
    };
  }

  // Variable length.
  final RandomAccessInput addressesData =
      this.data.randomAccessSlice(entry.addressesOffset, entry.addressesLength);
  // NOTE(review): unlike the dense branch above, no `merging` argument is passed here — confirm
  // this difference is intended.
  final LongValues addresses =
      DirectMonotonicReader.getInstance(entry.addressesMeta, addressesData);
  return new SparseBinaryDocValues(disi) {
    final BytesRef bytes = new BytesRef(new byte[entry.maxLength], 0, entry.maxLength);

    @Override
    public BytesRef binaryValue() throws IOException {
      final int index = disi.index();
      final long startOffset = addresses.get(index);
      final long endOffset = addresses.get(index + 1L);
      bytes.length = (int) (endOffset - startOffset);
      bytesSlice.seek(startOffset);
      bytesSlice.readBytes(bytes.bytes, 0, bytes.length);
      return bytes;
    }
  };
}
总结
本文介绍了最简单的字符串类型的DocValues:BinaryDocValues。后面继续介绍两种可以存储字符串的DocValues,它们的存储基于本文介绍的BinaryDocValues。