diff --git a/backend/onyx/document_index/document_index_utils.py b/backend/onyx/document_index/document_index_utils.py index ae79c73d8a0..28529d21c9e 100644 --- a/backend/onyx/document_index/document_index_utils.py +++ b/backend/onyx/document_index/document_index_utils.py @@ -142,6 +142,8 @@ def get_uuid_from_chunk_info( tenant_id: str | None, large_chunk_id: int | None = None, ) -> UUID: + """NOTE: be VERY carefuly about changing this function. If changed without a migration, + this can cause deletion/update/insertion to function incorrectly.""" doc_str = document_id # Web parsing URL duplicate catching diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 6dec8722953..f8c9b58ce4c 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -346,6 +346,14 @@ def index( # IMPORTANT: This must be done one index at a time, do not use secondary index here cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks] + # needed so the final DocumentInsertionRecord returned can have the original document ID + new_document_id_to_original_document_id: dict[str, str] = {} + for ind, chunk in enumerate(cleaned_chunks): + old_chunk = chunks[ind] + new_document_id_to_original_document_id[ + chunk.source_document.id + ] = old_chunk.source_document.id + existing_docs: set[str] = set() # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for @@ -401,14 +409,14 @@ def index( executor=executor, ) - all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks} + all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks} return { DocumentInsertionRecord( - document_id=doc_id, - already_existed=doc_id in existing_docs, + document_id=new_document_id_to_original_document_id[cleaned_doc_id], + already_existed=cleaned_doc_id in existing_docs, ) - for doc_id in all_doc_ids + for cleaned_doc_id in all_cleaned_doc_ids } @classmethod @@ -541,7 +549,7 @@ def update( time.monotonic() - update_start, ) - def update_single_chunk( + def _update_single_chunk( self, doc_chunk_id: UUID, index_name: str, @@ -605,6 +613,8 @@ def update_single( """ doc_chunk_count = 0 + doc_id = replace_invalid_doc_id_characters(doc_id) + with self.httpx_client_context as httpx_client: for ( index_name, @@ -627,7 +637,7 @@ def update_single( doc_chunk_count += len(doc_chunk_ids) for doc_chunk_id in doc_chunk_ids: - self.update_single_chunk( + self._update_single_chunk( doc_chunk_id, index_name, fields, doc_id, httpx_client ) @@ -689,6 +699,18 @@ def id_based_retrieval( batch_retrieval: bool = False, get_large_chunks: bool = False, ) -> list[InferenceChunkUncleaned]: + # make sure to use the vespa-afied document IDs + chunk_requests = [ + VespaChunkRequest( + document_id=replace_invalid_doc_id_characters( + chunk_request.document_id + ), + min_chunk_ind=chunk_request.min_chunk_ind, + max_chunk_ind=chunk_request.max_chunk_ind, + ) + for chunk_request in chunk_requests + ] + if batch_retrieval: return batch_search_api_retrieval( index_name=self.index_name, diff --git a/backend/onyx/document_index/vespa/indexing_utils.py b/backend/onyx/document_index/vespa/indexing_utils.py index f37ebb66e79..cef97c43e03 100644 --- a/backend/onyx/document_index/vespa/indexing_utils.py +++ b/backend/onyx/document_index/vespa/indexing_utils.py @@ -242,9 +242,9 @@ def batch_index_vespa_chunks( def clean_chunk_id_copy( chunk: DocMetadataAwareIndexChunk, ) -> DocMetadataAwareIndexChunk: - clean_chunk = chunk.copy( + clean_chunk = chunk.model_copy( update={ - "source_document": chunk.source_document.copy( + "source_document": chunk.source_document.model_copy( update={ "id": replace_invalid_doc_id_characters(chunk.source_document.id) } diff --git a/backend/onyx/document_index/vespa/shared_utils/utils.py b/backend/onyx/document_index/vespa/shared_utils/utils.py index e8dd83a7603..dabfdf4d4a2 100644 --- a/backend/onyx/document_index/vespa/shared_utils/utils.py +++ b/backend/onyx/document_index/vespa/shared_utils/utils.py @@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool: def replace_invalid_doc_id_characters(text: str) -> str: - """Replaces invalid document ID characters in text.""" + """Replaces invalid document ID characters in text. + NOTE: this must be called at the start of every vespa-related operation or else we + risk discrepancies -> silent failures on deletion/update/insertion.""" # There may be a more complete set of replacements that need to be made but Vespa docs are unclear # and users only seem to be running into this error with single quotes return text.replace("'", "_") diff --git a/web/src/app/admin/connector/[ccPairId]/page.tsx b/web/src/app/admin/connector/[ccPairId]/page.tsx index 7881de4e861..72915426cee 100644 --- a/web/src/app/admin/connector/[ccPairId]/page.tsx +++ b/web/src/app/admin/connector/[ccPairId]/page.tsx @@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal"; import * as Yup from "yup"; -// since the uploaded files are cleaned up after some period of time -// re-indexing will not work for the file connector. Also, it would not -// make sense to re-index, since the files will not have changed. -const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File]; - // synchronize these validations with the SQLAlchemy connector class until we have a // centralized schema for both frontend and backend const RefreshFrequencySchema = Yup.object().shape({ @@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) { {ccPair.is_editable_for_current_user && (