Skip to content

Commit

Permalink
Fix issue causing file connector to fail
Browse files Browse the repository at this point in the history
  • Loading branch information
Weves committed Feb 5, 2025
1 parent 1c12ab3 commit cac3f9d
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 29 deletions.
2 changes: 2 additions & 0 deletions backend/onyx/document_index/document_index_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
tenant_id: str | None,
large_chunk_id: int | None = None,
) -> UUID:
"""NOTE: be VERY carefuly about changing this function. If changed without a migration,
this can cause deletion/update/insertion to function incorrectly."""
doc_str = document_id

# Web parsing URL duplicate catching
Expand Down
34 changes: 28 additions & 6 deletions backend/onyx/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,14 @@ def index(
# IMPORTANT: This must be done one index at a time, do not use secondary index here
cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

# needed so the final DocumentInsertionRecord returned can have the original document ID
new_document_id_to_original_document_id: dict[str, str] = {}
for ind, chunk in enumerate(cleaned_chunks):
old_chunk = chunks[ind]
new_document_id_to_original_document_id[
chunk.source_document.id
] = old_chunk.source_document.id

existing_docs: set[str] = set()

# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
Expand Down Expand Up @@ -401,14 +409,14 @@ def index(
executor=executor,
)

all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}

return {
DocumentInsertionRecord(
document_id=doc_id,
already_existed=doc_id in existing_docs,
document_id=new_document_id_to_original_document_id[cleaned_doc_id],
already_existed=cleaned_doc_id in existing_docs,
)
for doc_id in all_doc_ids
for cleaned_doc_id in all_cleaned_doc_ids
}

@classmethod
Expand Down Expand Up @@ -541,7 +549,7 @@ def update(
time.monotonic() - update_start,
)

def update_single_chunk(
def _update_single_chunk(
self,
doc_chunk_id: UUID,
index_name: str,
Expand Down Expand Up @@ -605,6 +613,8 @@ def update_single(
"""
doc_chunk_count = 0

doc_id = replace_invalid_doc_id_characters(doc_id)

with self.httpx_client_context as httpx_client:
for (
index_name,
Expand All @@ -627,7 +637,7 @@ def update_single(
doc_chunk_count += len(doc_chunk_ids)

for doc_chunk_id in doc_chunk_ids:
self.update_single_chunk(
self._update_single_chunk(
doc_chunk_id, index_name, fields, doc_id, httpx_client
)

Expand Down Expand Up @@ -689,6 +699,18 @@ def id_based_retrieval(
batch_retrieval: bool = False,
get_large_chunks: bool = False,
) -> list[InferenceChunkUncleaned]:
# make sure to use the vespa-afied document IDs
chunk_requests = [
VespaChunkRequest(
document_id=replace_invalid_doc_id_characters(
chunk_request.document_id
),
min_chunk_ind=chunk_request.min_chunk_ind,
max_chunk_ind=chunk_request.max_chunk_ind,
)
for chunk_request in chunk_requests
]

if batch_retrieval:
return batch_search_api_retrieval(
index_name=self.index_name,
Expand Down
4 changes: 2 additions & 2 deletions backend/onyx/document_index/vespa/indexing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
def clean_chunk_id_copy(
chunk: DocMetadataAwareIndexChunk,
) -> DocMetadataAwareIndexChunk:
clean_chunk = chunk.copy(
clean_chunk = chunk.model_copy(
update={
"source_document": chunk.source_document.copy(
"source_document": chunk.source_document.model_copy(
update={
"id": replace_invalid_doc_id_characters(chunk.source_document.id)
}
Expand Down
4 changes: 3 additions & 1 deletion backend/onyx/document_index/vespa/shared_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:


def replace_invalid_doc_id_characters(text: str) -> str:
"""Replaces invalid document ID characters in text."""
"""Replaces invalid document ID characters in text.
NOTE: this must be called at the start of every vespa-related operation or else we
risk discrepancies -> silent failures on deletion/update/insertion."""
# There may be a more complete set of replacements that need to be made but Vespa docs are unclear
# and users only seem to be running into this error with single quotes
return text.replace("'", "_")
Expand Down
32 changes: 12 additions & 20 deletions web/src/app/admin/connector/[ccPairId]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal";

import * as Yup from "yup";

// since the uploaded files are cleaned up after some period of time
// re-indexing will not work for the file connector. Also, it would not
// make sense to re-index, since the files will not have changed.
const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File];

// synchronize these validations with the SQLAlchemy connector class until we have a
// centralized schema for both frontend and backend
const RefreshFrequencySchema = Yup.object().shape({
Expand Down Expand Up @@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) {

{ccPair.is_editable_for_current_user && (
<div className="ml-auto flex gap-x-2">
{!CONNECTOR_TYPES_THAT_CANT_REINDEX.includes(
ccPair.connector.source
) && (
<ReIndexButton
ccPairId={ccPair.id}
connectorId={ccPair.connector.id}
credentialId={ccPair.credential.id}
isDisabled={
ccPair.indexing ||
ccPair.status === ConnectorCredentialPairStatus.PAUSED
}
isIndexing={ccPair.indexing}
isDeleting={isDeleting}
/>
)}
<ReIndexButton
ccPairId={ccPair.id}
connectorId={ccPair.connector.id}
credentialId={ccPair.credential.id}
isDisabled={
ccPair.indexing ||
ccPair.status === ConnectorCredentialPairStatus.PAUSED
}
isIndexing={ccPair.indexing}
isDeleting={isDeleting}
/>

{!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
</div>
)}
Expand Down

0 comments on commit cac3f9d

Please sign in to comment.