forked from opensearch-project/k-NN
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduced NativeEngineKNNVectorsFormat as a KNNVectorsFormat for Nat…
…ive engines Signed-off-by: Navneet Verma <navneev@amazon.com>
- Loading branch information
Showing
6 changed files
with
496 additions
and
0 deletions.
There are no files selected for viewing
98 changes: 98 additions & 0 deletions
98
src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineVectorFieldsWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import org.apache.lucene.codecs.KnnFieldVectorsWriter; | ||
import org.apache.lucene.index.DocsWithFieldSet; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.util.InfoStream; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
/** | ||
* NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before | ||
* lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate | ||
* all the details related to vectors types and docIds. | ||
* | ||
* @param <T> float[] or byte[] | ||
*/ | ||
class NativeEngineVectorFieldsWriter<T> extends KnnFieldVectorsWriter<T> { | ||
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineVectorFieldsWriter.class); | ||
private final FieldInfo fieldInfo; | ||
private final Map<Integer, T> vectors; | ||
private int lastDocID = -1; | ||
private final DocsWithFieldSet docsWithField; | ||
private final InfoStream infoStream; | ||
|
||
static NativeEngineVectorFieldsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) { | ||
switch (fieldInfo.getVectorEncoding()) { | ||
case FLOAT32: | ||
return new NativeEngineVectorFieldsWriter<float[]>(fieldInfo, infoStream); | ||
case BYTE: | ||
return new NativeEngineVectorFieldsWriter<byte[]>(fieldInfo, infoStream); | ||
} | ||
throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding()); | ||
} | ||
|
||
NativeEngineVectorFieldsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) { | ||
this.fieldInfo = fieldInfo; | ||
this.infoStream = infoStream; | ||
vectors = new HashMap<>(); | ||
this.docsWithField = new DocsWithFieldSet(); | ||
} | ||
|
||
/** | ||
* Add new docID with its vector value to the given field for indexing. Doc IDs must be added in | ||
* increasing order. | ||
* | ||
* @param docID int | ||
* @param vectorValue T | ||
*/ | ||
@Override | ||
public void addValue(int docID, T vectorValue) { | ||
if (docID == lastDocID) { | ||
throw new IllegalArgumentException( | ||
"[NativeEngineKNNVectorWriter]VectorValuesField \"" | ||
+ fieldInfo.name | ||
+ "\" appears more than once in this document (only one value is allowed per field)" | ||
); | ||
} | ||
assert docID > lastDocID; | ||
vectors.put(docID, vectorValue); | ||
docsWithField.add(docID); | ||
lastDocID = docID; | ||
} | ||
|
||
/** | ||
* Used to copy values being indexed to internal storage. | ||
* | ||
* @param vectorValue an array containing the vector value to add | ||
* @return a copy of the value; a new array | ||
*/ | ||
@Override | ||
public T copyValue(T vectorValue) { | ||
throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation"); | ||
} | ||
|
||
/** | ||
* Return the memory usage of this object in bytes. Negative values are illegal. | ||
*/ | ||
@Override | ||
public long ramBytesUsed() { | ||
return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF | ||
+ RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance( | ||
Integer.class | ||
) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize; | ||
} | ||
} |
68 changes: 68 additions & 0 deletions
68
...main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines99KnnVectorsFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import org.apache.lucene.codecs.KnnVectorsFormat; | ||
import org.apache.lucene.codecs.KnnVectorsReader; | ||
import org.apache.lucene.codecs.KnnVectorsWriter; | ||
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; | ||
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; | ||
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; | ||
import org.apache.lucene.index.SegmentReadState; | ||
import org.apache.lucene.index.SegmentWriteState; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector | ||
* related data structures. | ||
*/ | ||
public class NativeEngines99KnnVectorsFormat extends KnnVectorsFormat { | ||
/** The format for storing, reading, merging vectors on disk */ | ||
private static FlatVectorsFormat flatVectorsFormat; | ||
private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat"; | ||
|
||
public NativeEngines99KnnVectorsFormat() { | ||
super(FORMAT_NAME); | ||
flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()); | ||
} | ||
|
||
public NativeEngines99KnnVectorsFormat(final Lucene99FlatVectorsFormat lucene99FlatVectorsFormat) { | ||
super(FORMAT_NAME); | ||
flatVectorsFormat = lucene99FlatVectorsFormat; | ||
} | ||
|
||
/** | ||
* Returns a {@link KnnVectorsWriter} to write the vectors to the index. | ||
* | ||
* @param state {@link SegmentWriteState} | ||
*/ | ||
@Override | ||
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException { | ||
return new NativeEnginesKNNVectorsWriter(state, flatVectorsFormat.fieldsWriter(state)); | ||
} | ||
|
||
/** | ||
* Returns a {@link KnnVectorsReader} to read the vectors from the index. | ||
* | ||
* @param state {@link SegmentReadState} | ||
*/ | ||
@Override | ||
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException { | ||
return new NativeEnginesKnnVectorsReader(state, flatVectorsFormat.fieldsReader(state)); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "NativeEngines99KnnVectorsFormat(name=NativeEngines99KnnVectorsFormat, flatVectorsFormat=" + flatVectorsFormat + ")"; | ||
} | ||
} |
115 changes: 115 additions & 0 deletions
115
src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEnginesKNNVectorsWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import lombok.RequiredArgsConstructor; | ||
import org.apache.lucene.codecs.KnnFieldVectorsWriter; | ||
import org.apache.lucene.codecs.KnnVectorsWriter; | ||
import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.index.MergeState; | ||
import org.apache.lucene.index.SegmentWriteState; | ||
import org.apache.lucene.index.Sorter; | ||
import org.apache.lucene.util.IOUtils; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* A KNNVectorsWriter class for writing the vector data strcutures and flat vectors for Native Engines. | ||
*/ | ||
@RequiredArgsConstructor | ||
public class NativeEnginesKNNVectorsWriter extends KnnVectorsWriter { | ||
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEnginesKNNVectorsWriter.class); | ||
private final SegmentWriteState segmentWriteState; | ||
private final FlatVectorsWriter flatVectorsWriter; | ||
private final List<NativeEngineVectorFieldsWriter<?>> fields = new ArrayList<>(); | ||
private boolean finished; | ||
|
||
/** | ||
* Add new field for indexing. | ||
* In Lucene, we use single file for all the vector fields so here we need to see how we are going to make things | ||
* work. | ||
* @param fieldInfo {@link FieldInfo} | ||
*/ | ||
@Override | ||
public KnnFieldVectorsWriter<?> addField(final FieldInfo fieldInfo) throws IOException { | ||
final NativeEngineVectorFieldsWriter<?> newField = NativeEngineVectorFieldsWriter.create(fieldInfo, segmentWriteState.infoStream); | ||
// TODO: we can build the graph here too iteratively. but right now I am skipping that as we need iterative | ||
// graph build support on the JNI layer. | ||
fields.add(newField); | ||
return flatVectorsWriter.addField(fieldInfo, newField); | ||
} | ||
|
||
/** | ||
* Flush all buffered data on disk. This is not fsync. This is lucene flush. | ||
* | ||
* @param maxDoc int | ||
* @param sortMap {@link Sorter.DocMap} | ||
*/ | ||
@Override | ||
public void flush(int maxDoc, final Sorter.DocMap sortMap) throws IOException { | ||
// simply write data in the flat file | ||
flatVectorsWriter.flush(maxDoc, sortMap); | ||
// TODO: add code for creating Vector datastructures during lucene flush operation | ||
} | ||
|
||
@Override | ||
public void mergeOneField(final FieldInfo fieldInfo, final MergeState mergeState) throws IOException { | ||
// This will ensure that we are merging the FlatIndex during force merge. | ||
flatVectorsWriter.mergeOneField(fieldInfo, mergeState); | ||
// TODO: add code for creating Vector datastructures during merge operation | ||
} | ||
|
||
/** | ||
* Called once at the end before close | ||
*/ | ||
@Override | ||
public void finish() throws IOException { | ||
if (finished) { | ||
throw new IllegalStateException("NativeEnginesKNNVectorsWriter is already finished"); | ||
} | ||
finished = true; | ||
flatVectorsWriter.finish(); | ||
} | ||
|
||
/** | ||
* Closes this stream and releases any system resources associated | ||
* with it. If the stream is already closed then invoking this | ||
* method has no effect. | ||
* | ||
* <p> As noted in {@link AutoCloseable#close()}, cases where the | ||
* close may fail require careful attention. It is strongly advised | ||
* to relinquish the underlying resources and to internally | ||
* <em>mark</em> the {@code Closeable} as closed, prior to throwing | ||
* the {@code IOException}. | ||
* | ||
* @throws IOException if an I/O error occurs | ||
*/ | ||
@Override | ||
public void close() throws IOException { | ||
IOUtils.close(flatVectorsWriter); | ||
} | ||
|
||
/** | ||
* Return the memory usage of this object in bytes. Negative values are illegal. | ||
*/ | ||
@Override | ||
public long ramBytesUsed() { | ||
return SHALLOW_SIZE + flatVectorsWriter.ramBytesUsed() + fields.stream() | ||
.mapToLong(NativeEngineVectorFieldsWriter::ramBytesUsed) | ||
.sum(); | ||
} | ||
|
||
} |
Oops, something went wrong.