Skip to content

Commit

Permalink
Introduced NativeEngineKNNVectorsFormat as a KNNVectorsFormat for Nat…
Browse files Browse the repository at this point in the history
…ive engines

Signed-off-by: Navneet Verma <navneev@amazon.com>
  • Loading branch information
navneet1v committed Jul 16, 2024
1 parent fe1d86f commit 74114c8
Show file tree
Hide file tree
Showing 6 changed files with 496 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnFieldVectorsWriter;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.RamUsageEstimator;

import java.util.HashMap;
import java.util.Map;

/**
* NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before
* lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate
* all the details related to vectors types and docIds.
*
* @param <T> float[] or byte[]
*/
class NativeEngineVectorFieldsWriter<T> extends KnnFieldVectorsWriter<T> {
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineVectorFieldsWriter.class);
private final FieldInfo fieldInfo;
private final Map<Integer, T> vectors;
private int lastDocID = -1;
private final DocsWithFieldSet docsWithField;
private final InfoStream infoStream;

static NativeEngineVectorFieldsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) {
switch (fieldInfo.getVectorEncoding()) {
case FLOAT32:
return new NativeEngineVectorFieldsWriter<float[]>(fieldInfo, infoStream);
case BYTE:
return new NativeEngineVectorFieldsWriter<byte[]>(fieldInfo, infoStream);
}
throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding());
}

NativeEngineVectorFieldsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) {
this.fieldInfo = fieldInfo;
this.infoStream = infoStream;
vectors = new HashMap<>();
this.docsWithField = new DocsWithFieldSet();
}

/**
* Add new docID with its vector value to the given field for indexing. Doc IDs must be added in
* increasing order.
*
* @param docID int
* @param vectorValue T
*/
@Override
public void addValue(int docID, T vectorValue) {
if (docID == lastDocID) {
throw new IllegalArgumentException(
"[NativeEngineKNNVectorWriter]VectorValuesField \""
+ fieldInfo.name
+ "\" appears more than once in this document (only one value is allowed per field)"
);
}
assert docID > lastDocID;
vectors.put(docID, vectorValue);
docsWithField.add(docID);
lastDocID = docID;
}

/**
* Used to copy values being indexed to internal storage.
*
* @param vectorValue an array containing the vector value to add
* @return a copy of the value; a new array
*/
@Override
public T copyValue(T vectorValue) {
throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation");
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance(
Integer.class
) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

import java.io.IOException;

/**
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector
* related data structures.
*/
public class NativeEngines99KnnVectorsFormat extends KnnVectorsFormat {
/** The format for storing, reading, merging vectors on disk */
private static FlatVectorsFormat flatVectorsFormat;
private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat";

public NativeEngines99KnnVectorsFormat() {
super(FORMAT_NAME);
flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer());
}

public NativeEngines99KnnVectorsFormat(final Lucene99FlatVectorsFormat lucene99FlatVectorsFormat) {
super(FORMAT_NAME);
flatVectorsFormat = lucene99FlatVectorsFormat;
}

/**
* Returns a {@link KnnVectorsWriter} to write the vectors to the index.
*
* @param state {@link SegmentWriteState}
*/
@Override
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
return new NativeEnginesKNNVectorsWriter(state, flatVectorsFormat.fieldsWriter(state));
}

/**
* Returns a {@link KnnVectorsReader} to read the vectors from the index.
*
* @param state {@link SegmentReadState}
*/
@Override
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException {
return new NativeEnginesKnnVectorsReader(state, flatVectorsFormat.fieldsReader(state));
}

@Override
public String toString() {
return "NativeEngines99KnnVectorsFormat(name=NativeEngines99KnnVectorsFormat, flatVectorsFormat=" + flatVectorsFormat + ")";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import lombok.RequiredArgsConstructor;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.codecs.hnsw.FlatVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
* A KNNVectorsWriter class for writing the vector data strcutures and flat vectors for Native Engines.
*/
@RequiredArgsConstructor
public class NativeEnginesKNNVectorsWriter extends KnnVectorsWriter {
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEnginesKNNVectorsWriter.class);
private final SegmentWriteState segmentWriteState;
private final FlatVectorsWriter flatVectorsWriter;
private final List<NativeEngineVectorFieldsWriter<?>> fields = new ArrayList<>();
private boolean finished;

/**
* Add new field for indexing.
* In Lucene, we use single file for all the vector fields so here we need to see how we are going to make things
* work.
* @param fieldInfo {@link FieldInfo}
*/
@Override
public KnnFieldVectorsWriter<?> addField(final FieldInfo fieldInfo) throws IOException {
final NativeEngineVectorFieldsWriter<?> newField = NativeEngineVectorFieldsWriter.create(fieldInfo, segmentWriteState.infoStream);
// TODO: we can build the graph here too iteratively. but right now I am skipping that as we need iterative
// graph build support on the JNI layer.
fields.add(newField);
return flatVectorsWriter.addField(fieldInfo, newField);
}

/**
* Flush all buffered data on disk. This is not fsync. This is lucene flush.
*
* @param maxDoc int
* @param sortMap {@link Sorter.DocMap}
*/
@Override
public void flush(int maxDoc, final Sorter.DocMap sortMap) throws IOException {
// simply write data in the flat file
flatVectorsWriter.flush(maxDoc, sortMap);
// TODO: add code for creating Vector datastructures during lucene flush operation
}

@Override
public void mergeOneField(final FieldInfo fieldInfo, final MergeState mergeState) throws IOException {
// This will ensure that we are merging the FlatIndex during force merge.
flatVectorsWriter.mergeOneField(fieldInfo, mergeState);
// TODO: add code for creating Vector datastructures during merge operation
}

/**
* Called once at the end before close
*/
@Override
public void finish() throws IOException {
if (finished) {
throw new IllegalStateException("NativeEnginesKNNVectorsWriter is already finished");
}
finished = true;
flatVectorsWriter.finish();
}

/**
* Closes this stream and releases any system resources associated
* with it. If the stream is already closed then invoking this
* method has no effect.
*
* <p> As noted in {@link AutoCloseable#close()}, cases where the
* close may fail require careful attention. It is strongly advised
* to relinquish the underlying resources and to internally
* <em>mark</em> the {@code Closeable} as closed, prior to throwing
* the {@code IOException}.
*
* @throws IOException if an I/O error occurs
*/
@Override
public void close() throws IOException {
IOUtils.close(flatVectorsWriter);
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return SHALLOW_SIZE + flatVectorsWriter.ramBytesUsed() + fields.stream()
.mapToLong(NativeEngineVectorFieldsWriter::ramBytesUsed)
.sum();
}

}
Loading

0 comments on commit 74114c8

Please sign in to comment.