Skip to content

Commit

Permalink
weaviate: add deduplication functionality to weaviate
Browse files Browse the repository at this point in the history
The changes introduce a deduplication function to the vectorstores
options. This function is used when adding documents to the store,
allowing the system to filter out duplicate documents before they
are added. This is particularly useful to prevent wasting time on
creating an embedding when one already exists.

The changes also include a test case for the new deduplication
functionality, ensuring that it works as expected. The test case
adds two documents to the store, one of which the deduplicater flags
as a duplicate, and verifies that only the other document is stored.
  • Loading branch information
corani committed Jan 18, 2024
1 parent 52b6d99 commit 5c45780
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 1 deletion.
17 changes: 16 additions & 1 deletion vectorstores/options.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package vectorstores

import "github.com/tmc/langchaingo/embeddings"
import (
"context"

"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
)

// Option is a functional option: a function that receives a pointer to an
// Options value and mutates it to apply a single setting.
type Option func(*Options)
Expand All @@ -11,6 +16,7 @@ type Options struct {
ScoreThreshold float32
Filters any
Embedder embeddings.Embedder
Deduplicater func(context.Context, schema.Document) bool
}

// WithNameSpace returns an Option for setting the name space.
Expand Down Expand Up @@ -44,3 +50,12 @@ func WithEmbedder(embedder embeddings.Embedder) Option {
o.Embedder = embedder
}
}

// WithDeduplicater returns an Option that installs fn as the Deduplicater on
// the Options. A store may consult it when adding documents: fn is called with
// the context and a candidate document and should report true when that
// document is a duplicate that must be skipped, false when it should be added.
// Skipping known documents avoids spending time on creating an embedding that
// already exists.
func WithDeduplicater(fn func(ctx context.Context, doc schema.Document) bool) Option {
	return func(opts *Options) {
		opts.Deduplicater = fn
	}
}
16 changes: 16 additions & 0 deletions vectorstores/weaviate/weaviate.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,22 @@ func (s Store) AddDocuments(ctx context.Context,
opts := s.getOptions(options...)
nameSpace := s.getNameSpace(opts)

if opts.Deduplicater != nil {
filtered := make([]schema.Document, 0, len(docs))
for _, doc := range docs {
if !opts.Deduplicater(ctx, doc) {
filtered = append(filtered, doc)
}
}
docs = filtered
}

if len(docs) == 0 {
// nothing to add (perhaps all documents were duplicates). This is not
// an error.
return nil, nil
}

texts := make([]string, 0, len(docs))
for _, doc := range docs {
texts = append(texts, doc.PageContent)
Expand Down
43 changes: 43 additions & 0 deletions vectorstores/weaviate/weaviate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,49 @@ func TestMetadataSearch(t *testing.T) {
require.Equal(t, "city", docs[0].Metadata["type"])
}

// TestDeduplicater checks that a document flagged by the deduplicater passed
// to AddDocuments is not stored: of the two documents submitted, only the one
// the callback does not flag is returned by a subsequent MetadataSearch.
func TestDeduplicater(t *testing.T) {
	t.Parallel()

	ctx := context.Background()

	scheme, host := getValues(t)
	llm, err := openai.New()
	require.NoError(t, err)
	e, err := embeddings.NewEmbedder(llm)
	require.NoError(t, err)

	store, err := New(
		WithScheme(scheme),
		WithHost(host),
		WithEmbedder(e),
		WithNameSpace(uuid.New().String()),
		WithIndexName(randomizedCamelCaseClass()),
		WithQueryAttrs([]string{"type"}),
	)
	require.NoError(t, err)

	require.NoError(t, createTestClass(ctx, store))

	// Flag "tokyo" as a duplicate so that only "potato" is expected to be stored.
	isDuplicate := func(_ context.Context, doc schema.Document) bool {
		return doc.PageContent == "tokyo"
	}

	input := []schema.Document{
		{
			PageContent: "tokyo",
			Metadata:    map[string]any{"type": "city"},
		},
		{
			PageContent: "potato",
			Metadata:    map[string]any{"type": "vegetable"},
		},
	}

	_, err = store.AddDocuments(ctx, input, vectorstores.WithDeduplicater(isDuplicate))
	require.NoError(t, err)

	// Ask for up to two results; only the non-flagged document should come back.
	found, err := store.MetadataSearch(ctx, 2)
	require.NoError(t, err)
	require.Len(t, found, 1)

	only := found[0]
	require.Equal(t, "potato", only.PageContent)
	require.Equal(t, "vegetable", only.Metadata["type"])
}

func TestSimilaritySearchWithInvalidScoreThreshold(t *testing.T) {
t.Parallel()

Expand Down

0 comments on commit 5c45780

Please sign in to comment.