forked from opensearch-project/k-NN
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduced NativeEngineKNNVectorsFormat as a KNNVectorsFormat for Nat…
…ive engines Signed-off-by: Navneet Verma <navneev@amazon.com>
- Loading branch information
Showing
9 changed files
with
794 additions
and
1 deletion.
There are no files selected for viewing
102 changes: 102 additions & 0 deletions
102
src/main/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngineVectorFieldsWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import lombok.Getter; | ||
import org.apache.lucene.codecs.KnnFieldVectorsWriter; | ||
import org.apache.lucene.index.DocsWithFieldSet; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.util.InfoStream; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
/** | ||
* NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before | ||
* lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate | ||
* all the details related to vectors types and docIds. | ||
* | ||
* @param <T> float[] or byte[] | ||
*/ | ||
class NativeEngineVectorFieldsWriter<T> extends KnnFieldVectorsWriter<T> { | ||
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineVectorFieldsWriter.class); | ||
private final FieldInfo fieldInfo; | ||
@Getter | ||
private final Map<Integer, T> vectors; | ||
private int lastDocID = -1; | ||
|
||
@Getter | ||
private final DocsWithFieldSet docsWithField; | ||
private final InfoStream infoStream; | ||
|
||
static NativeEngineVectorFieldsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) { | ||
switch (fieldInfo.getVectorEncoding()) { | ||
case FLOAT32: | ||
return new NativeEngineVectorFieldsWriter<float[]>(fieldInfo, infoStream); | ||
case BYTE: | ||
return new NativeEngineVectorFieldsWriter<byte[]>(fieldInfo, infoStream); | ||
} | ||
throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding()); | ||
} | ||
|
||
NativeEngineVectorFieldsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) { | ||
this.fieldInfo = fieldInfo; | ||
this.infoStream = infoStream; | ||
vectors = new HashMap<>(); | ||
this.docsWithField = new DocsWithFieldSet(); | ||
} | ||
|
||
/** | ||
* Add new docID with its vector value to the given field for indexing. Doc IDs must be added in | ||
* increasing order. | ||
* | ||
* @param docID int | ||
* @param vectorValue T | ||
*/ | ||
@Override | ||
public void addValue(int docID, T vectorValue) { | ||
if (docID == lastDocID) { | ||
throw new IllegalArgumentException( | ||
"[NativeEngineKNNVectorWriter]VectorValuesField \"" | ||
+ fieldInfo.name | ||
+ "\" appears more than once in this document (only one value is allowed per field)" | ||
); | ||
} | ||
assert docID > lastDocID; | ||
vectors.put(docID, vectorValue); | ||
docsWithField.add(docID); | ||
lastDocID = docID; | ||
} | ||
|
||
/** | ||
* Used to copy values being indexed to internal storage. | ||
* | ||
* @param vectorValue an array containing the vector value to add | ||
* @return a copy of the value; a new array | ||
*/ | ||
@Override | ||
public T copyValue(T vectorValue) { | ||
throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation"); | ||
} | ||
|
||
/** | ||
* Return the memory usage of this object in bytes. Negative values are illegal. | ||
*/ | ||
@Override | ||
public long ramBytesUsed() { | ||
return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF | ||
+ RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance( | ||
Integer.class | ||
) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize; | ||
} | ||
} |
68 changes: 68 additions & 0 deletions
68
...ain/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import org.apache.lucene.codecs.KnnVectorsFormat; | ||
import org.apache.lucene.codecs.KnnVectorsReader; | ||
import org.apache.lucene.codecs.KnnVectorsWriter; | ||
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; | ||
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; | ||
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; | ||
import org.apache.lucene.index.SegmentReadState; | ||
import org.apache.lucene.index.SegmentWriteState; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector | ||
* related data structures. | ||
*/ | ||
public class NativeEngines990KnnVectorsFormat extends KnnVectorsFormat { | ||
/** The format for storing, reading, merging vectors on disk */ | ||
private static FlatVectorsFormat flatVectorsFormat; | ||
private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat"; | ||
|
||
public NativeEngines990KnnVectorsFormat() { | ||
super(FORMAT_NAME); | ||
flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()); | ||
} | ||
|
||
public NativeEngines990KnnVectorsFormat(final Lucene99FlatVectorsFormat lucene99FlatVectorsFormat) { | ||
super(FORMAT_NAME); | ||
flatVectorsFormat = lucene99FlatVectorsFormat; | ||
} | ||
|
||
/** | ||
* Returns a {@link KnnVectorsWriter} to write the vectors to the index. | ||
* | ||
* @param state {@link SegmentWriteState} | ||
*/ | ||
@Override | ||
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException { | ||
return new NativeEngines990KnnVectorsWriter(state, flatVectorsFormat.fieldsWriter(state)); | ||
} | ||
|
||
/** | ||
* Returns a {@link KnnVectorsReader} to read the vectors from the index. | ||
* | ||
* @param state {@link SegmentReadState} | ||
*/ | ||
@Override | ||
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException { | ||
return new NativeEngines990KnnVectorsReader(state, flatVectorsFormat.fieldsReader(state)); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "NativeEngines99KnnVectorsFormat(name=NativeEngines99KnnVectorsFormat, flatVectorsFormat=" + flatVectorsFormat + ")"; | ||
} | ||
} |
162 changes: 162 additions & 0 deletions
162
...ain/java/org/opensearch/knn/index/codec/KNN990Codec/NativeEngines990KnnVectorsReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
* | ||
* Modifications Copyright OpenSearch Contributors. See | ||
* GitHub history for details. | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.KNN990Codec; | ||
|
||
import org.apache.lucene.codecs.KnnVectorsReader; | ||
import org.apache.lucene.codecs.hnsw.FlatVectorsReader; | ||
import org.apache.lucene.index.ByteVectorValues; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.index.FloatVectorValues; | ||
import org.apache.lucene.index.SegmentReadState; | ||
import org.apache.lucene.search.KnnCollector; | ||
import org.apache.lucene.search.ScoreDoc; | ||
import org.apache.lucene.search.TopDocs; | ||
import org.apache.lucene.search.TotalHits; | ||
import org.apache.lucene.util.Bits; | ||
import org.apache.lucene.util.IOUtils; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* Vectors reader class for reading the flat vectors for native engines. The class provides methods for iterating | ||
* over the vectors and retrieving their values. | ||
*/ | ||
public class NativeEngines990KnnVectorsReader extends KnnVectorsReader { | ||
|
||
private final FlatVectorsReader flatVectorsReader; | ||
|
||
public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) { | ||
this.flatVectorsReader = flatVectorsReader; | ||
} | ||
|
||
/** | ||
* Checks consistency of this reader. | ||
* | ||
* <p>Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value | ||
* against large data files. | ||
* | ||
*/ | ||
@Override | ||
public void checkIntegrity() throws IOException { | ||
flatVectorsReader.checkIntegrity(); | ||
} | ||
|
||
/** | ||
* Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if | ||
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is | ||
* never {@code null}. | ||
* | ||
* @param field {@link String} | ||
*/ | ||
@Override | ||
public FloatVectorValues getFloatVectorValues(final String field) throws IOException { | ||
return flatVectorsReader.getFloatVectorValues(field); | ||
} | ||
|
||
/** | ||
* Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if | ||
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is | ||
* never {@code null}. | ||
* | ||
* @param field {@link String} | ||
*/ | ||
@Override | ||
public ByteVectorValues getByteVectorValues(final String field) throws IOException { | ||
return flatVectorsReader.getByteVectorValues(field); | ||
} | ||
|
||
/** | ||
* Return the k nearest neighbor documents as determined by comparison of their vector values for | ||
* this field, to the given vector, by the field's similarity function. The score of each document | ||
* is derived from the vector similarity in a way that ensures scores are positive and that a | ||
* larger score corresponds to a higher ranking. | ||
* | ||
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the | ||
* true k closest neighbors. For large values of k (for example when k is close to the total | ||
* number of documents), the search may also retrieve fewer than k documents. | ||
* | ||
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in | ||
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits} | ||
* contains the number of documents visited during the search. If the search stopped early because | ||
* it hit {@code visitedLimit}, it is indicated through the relation {@code | ||
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}. | ||
* | ||
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link | ||
* FieldInfo}. The return value is never {@code null}. | ||
* | ||
* @param field the vector field to search | ||
* @param target the vector-valued query | ||
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results | ||
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null} | ||
* if they are all allowed to match. | ||
*/ | ||
@Override | ||
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { | ||
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader"); | ||
} | ||
|
||
/** | ||
* Return the k nearest neighbor documents as determined by comparison of their vector values for | ||
* this field, to the given vector, by the field's similarity function. The score of each document | ||
* is derived from the vector similarity in a way that ensures scores are positive and that a | ||
* larger score corresponds to a higher ranking. | ||
* | ||
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the | ||
* true k closest neighbors. For large values of k (for example when k is close to the total | ||
* number of documents), the search may also retrieve fewer than k documents. | ||
* | ||
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in | ||
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits} | ||
* contains the number of documents visited during the search. If the search stopped early because | ||
* it hit {@code visitedLimit}, it is indicated through the relation {@code | ||
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}. | ||
* | ||
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link | ||
* FieldInfo}. The return value is never {@code null}. | ||
* | ||
* @param field the vector field to search | ||
* @param target the vector-valued query | ||
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results | ||
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null} | ||
* if they are all allowed to match. | ||
*/ | ||
@Override | ||
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { | ||
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader"); | ||
} | ||
|
||
/** | ||
* Closes this stream and releases any system resources associated | ||
* with it. If the stream is already closed then invoking this | ||
* method has no effect. | ||
* | ||
* <p> As noted in {@link AutoCloseable#close()}, cases where the | ||
* close may fail require careful attention. It is strongly advised | ||
* to relinquish the underlying resources and to internally | ||
* <em>mark</em> the {@code Closeable} as closed, prior to throwing | ||
* the {@code IOException}. | ||
* | ||
* @throws IOException if an I/O error occurs | ||
*/ | ||
@Override | ||
public void close() throws IOException { | ||
IOUtils.close(flatVectorsReader); | ||
} | ||
|
||
/** | ||
* Return the memory usage of this object in bytes. Negative values are illegal. | ||
*/ | ||
@Override | ||
public long ramBytesUsed() { | ||
return flatVectorsReader.ramBytesUsed(); | ||
} | ||
} |
Oops, something went wrong.