Patch summary. Text before the first "diff --git" header is commentary and is
skipped by git apply / patch.

What the hunks below do:
  * build.gradle: the default of the "simd.enabled" system property changes
    from "true" to "false".
  * KNN80BinaryDocValues: the class becomes public, gains a Lombok @Setter for
    `cost` and @Getter/@Setter for `liveDocs`, and cost() returns the stored
    `cost` instead of throwing UnsupportedOperationException.
  * KNN80DocValuesConsumer: adds an import of org.apache.lucene.util.Bits and a
    commented-out block in addKNNBinaryField; no executable statement is added.
  * KNN80DocValuesReader: adds class-level @Getter and @Log4j2. getBinary adds
    up values.cost() for every producer that yields values. When
    mergeState.liveDocs[i] is non-null it iterates those values to count the
    live documents and then fetches a fresh BinaryDocValues from the producer;
    otherwise it adds values.cost() to the live count. Both totals are set on
    the returned KNN80BinaryDocValues and logged at INFO.
  * KNNCodecUtil: adds @Log4j2. getFloats reads getLiveDocs() when the argument
    is a KNN80BinaryDocValues and logs values.cost(), the number of collected
    doc ids and liveDocs at INFO.

NOTE(review): within this patch the added imports java.util.BitSet and
org.apache.lucene.util.FixedBitSet (KNN80DocValuesReader) and
org.apache.lucene.util.Bits (KNN80DocValuesConsumer) are not referenced by any
added non-comment line. Confirm they are needed.
NOTE(review): leading indentation and blank context lines were reconstructed
from the hunk line counts (4-space Java, 8-space Gradle "ext" body assumed).
If context fails to match, try "git apply --ignore-whitespace". TODO confirm
against the target files.

diff --git a/build.gradle b/build.gradle
index bccbae33e..eee5f2c19 100644
--- a/build.gradle
+++ b/build.gradle
@@ -17,7 +17,7 @@ buildscript {
         version_qualifier = System.getProperty("build.version_qualifier", "")
         opensearch_group = "org.opensearch"
         isSnapshot = "true" == System.getProperty("build.snapshot", "true")
-        simd_enabled = System.getProperty("simd.enabled", "true")
+        simd_enabled = System.getProperty("simd.enabled", "false")
 
         version_tokens = opensearch_version.tokenize('-')
         opensearch_build = version_tokens[0] + '.0'
diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80BinaryDocValues.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80BinaryDocValues.java
index 832737a6d..06421ebee 100644
--- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80BinaryDocValues.java
+++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80BinaryDocValues.java
@@ -5,6 +5,8 @@
 
 package org.opensearch.knn.index.codec.KNN80Codec;
 
+import lombok.Getter;
+import lombok.Setter;
 import org.opensearch.knn.index.codec.util.BinaryDocValuesSub;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DocIDMerger;
@@ -15,10 +17,17 @@
 /**
  * A per-document kNN numeric value.
  */
-class KNN80BinaryDocValues extends BinaryDocValues {
+public class KNN80BinaryDocValues extends BinaryDocValues {
 
     private DocIDMerger docIDMerger;
 
+    @Setter
+    private long cost;
+
+    @Getter
+    @Setter
+    private long liveDocs;
+
     KNN80BinaryDocValues(DocIDMerger docIdMerger) {
         this.docIDMerger = docIdMerger;
     }
@@ -54,7 +63,7 @@ public boolean advanceExact(int target) throws IOException {
 
     @Override
     public long cost() {
-        throw new UnsupportedOperationException();
+        return cost;
     }
 
     @Override
diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java
index 57192beca..708d452f8 100644
--- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java
+++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java
@@ -9,6 +9,7 @@
 import lombok.NonNull;
 import lombok.extern.log4j.Log4j2;
 import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.util.Bits;
 import org.opensearch.common.StopWatch;
 import org.opensearch.common.xcontent.XContentHelper;
 import org.opensearch.core.common.bytes.BytesArray;
@@ -110,6 +111,17 @@ public void addKNNBinaryField(FieldInfo field, DocValuesProducer valuesProducer,
         throws IOException {
         // Get values to be indexed
         BinaryDocValues values = valuesProducer.getBinary(field);
+// int maxNumberOfDocs = this.state.segmentInfo.maxDoc();
+// if (valuesProducer instanceof KNN80DocValuesReader) {
+// MergeState mergeState = ((KNN80DocValuesReader) valuesProducer).getMergeState();
+// Bits[] liveDocsArray = mergeState.liveDocs;
+// int liveDocs = 0;
+// for(Bits bit : liveDocsArray) {
+//
+// }
+// }
+
+
         KNNCodecUtil.Pair pair = KNNCodecUtil.getFloats(values);
         if (pair.vectors.length == 0 || pair.docs.length == 0) {
             logger.info("Skipping engine index creation as there are no vectors or docs in the documents");
diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesReader.java
index ccfaa68fc..affbb31a4 100644
--- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesReader.java
+++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesReader.java
@@ -5,6 +5,11 @@
 
 package org.opensearch.knn.index.codec.KNN80Codec;
 
+import lombok.Getter;
+import lombok.extern.log4j.Log4j2;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
 import org.opensearch.knn.index.codec.util.BinaryDocValuesSub;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.BinaryDocValues;
@@ -15,11 +20,14 @@
 import org.apache.lucene.index.MergeState;
 
 import java.util.ArrayList;
+import java.util.BitSet;
 import java.util.List;
 
 /**
  * Reader for KNNDocValues from the segments
  */
+@Getter
+@Log4j2
 class KNN80DocValuesReader extends EmptyDocValuesProducer {
 
     private final MergeState mergeState;
@@ -30,6 +38,8 @@ class KNN80DocValuesReader extends EmptyDocValuesProducer {
 
     @Override
     public BinaryDocValues getBinary(FieldInfo field) {
+        long cost = 0;
+        long liveDocsCount = 0;
         try {
             List subs = new ArrayList<>(this.mergeState.docValuesProducers.length);
             for (int i = 0; i < this.mergeState.docValuesProducers.length; i++) {
@@ -41,11 +51,33 @@ public BinaryDocValues getBinary(FieldInfo field) {
                         values = docValuesProducer.getBinary(readerFieldInfo);
                     }
                     if (values != null) {
+                        cost += values.cost();
+                        Bits liveDocs = this.mergeState.liveDocs[i];
+                        if (liveDocs != null) {
+                            log.info("There are some deleted docs present");
+                            // so we counted all the live docs here
+                            int docId;
+                            for(docId = values.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId =
+                                values.nextDoc()) {
+                                if (liveDocs.get(docId)) {
+                                    liveDocsCount++;
+                                }
+                            }
+                            // again setting this value as we have already used the older doc values.
+                            values = docValuesProducer.getBinary(readerFieldInfo);
+                        } else {
+                            // no live docs are present so lets use all the docs.
+                            liveDocsCount += values.cost();
+                        }
                         subs.add(new BinaryDocValuesSub(mergeState.docMaps[i], values));
                     }
                 }
             }
-            return new KNN80BinaryDocValues(DocIDMerger.of(subs, mergeState.needsIndexSort));
+            KNN80BinaryDocValues knn80BinaryDocValues = new KNN80BinaryDocValues(DocIDMerger.of(subs, mergeState.needsIndexSort));
+            knn80BinaryDocValues.setCost(cost);
+            knn80BinaryDocValues.setLiveDocs(liveDocsCount);
+            log.info("There are {} live docs, {} cost", liveDocsCount, cost);
+            return knn80BinaryDocValues;
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
diff --git a/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java b/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java
index 02ab2d833..dd8dc50d2 100644
--- a/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java
+++ b/src/main/java/org/opensearch/knn/index/codec/util/KNNCodecUtil.java
@@ -5,14 +5,17 @@
 
 package org.opensearch.knn.index.codec.util;
 
+import lombok.extern.log4j.Log4j2;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.BytesRef;
+import org.opensearch.knn.index.codec.KNN80Codec.KNN80BinaryDocValues;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 
+@Log4j2
 public class KNNCodecUtil {
 
     public static final String HNSW_EXTENSION = ".hnsw";
@@ -42,6 +45,11 @@ public static KNNCodecUtil.Pair getFloats(BinaryDocValues values) throws IOExcep
         ArrayList vectorList = new ArrayList<>();
         ArrayList docIdList = new ArrayList<>();
         SerializationMode serializationMode = SerializationMode.COLLECTION_OF_FLOATS;
+        long liveDocs = 0;
+        if(values instanceof KNN80BinaryDocValues) {
+            liveDocs = ((KNN80BinaryDocValues) values).getLiveDocs();
+        }
+
         for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
             BytesRef bytesref = values.binaryValue();
             try (ByteArrayInputStream byteStream = new ByteArrayInputStream(bytesref.bytes, bytesref.offset, bytesref.length)) {
@@ -52,6 +60,8 @@ public static KNNCodecUtil.Pair getFloats(BinaryDocValues values) throws IOExcep
             }
             docIdList.add(doc);
         }
+        log.info("The cost of the iterator is : {} and docIds are: {} and liveDocs : {}", values.cost(),
+                docIdList.size(), liveDocs);
         return new KNNCodecUtil.Pair(
             docIdList.stream().mapToInt(Integer::intValue).toArray(),
             vectorList.toArray(new float[][] {}),