Skip to content

Commit

Permalink
Fix Vespa Title Overly Punished when Missing (#995)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Jan 24, 2024
1 parent 7174ea3 commit 5008652
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 1 deletion.
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
SOURCE_LINK = "link"
SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SKIP_TITLE_EMBEDDING = "skip_title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ schema danswer_chunk {
field semantic_identifier type string {
indexing: summary | attribute
}
# A separate field is needed to record whether the title embedding should be skipped.
# This information cannot be derived from either the title field or the title embedding.
field skip_title type bool {
indexing: attribute
}
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
Expand Down Expand Up @@ -149,7 +154,7 @@ schema danswer_chunk {
function vector_score() {
expression {
# If no title, the full vector score comes from the content embedding
(query(title_content_ratio) * if(isNan(attribute(title)) == 1, closeness(field, embeddings), closeness(field, title_embedding))) +
(query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
((1 - query(title_content_ratio)) * closeness(field, embeddings))
}
}
Expand Down
3 changes: 3 additions & 0 deletions backend/danswer/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from danswer.configs.constants import SECONDARY_OWNERS
from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SKIP_TITLE_EMBEDDING
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
Expand Down Expand Up @@ -256,6 +257,7 @@ def _index_vespa_chunk(
CHUNK_ID: chunk.chunk_id,
BLURB: remove_invalid_unicode_chars(chunk.blurb),
TITLE: remove_invalid_unicode_chars(title) if title else None,
SKIP_TITLE_EMBEDDING: not title,
CONTENT: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting :(
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
Expand Down Expand Up @@ -560,6 +562,7 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]

inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
# Good Debugging Spot
return inference_chunks


Expand Down

1 comment on commit 5008652

@vercel
Copy link

@vercel vercel bot commented on 5008652 Jan 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.