From 5c2c24de8f90c9c39bfaa3ca38768eabb6d09efc Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 22 Jan 2025 17:14:30 +0200 Subject: [PATCH 1/5] repoID bitmap for speeding up findShard in compound shards We add a new section to shards which contains a roaring bitmap for quickly checking if a shard contains a repo ID. We then can load just this (small amount) of data to rule out a compound shard. We use roaring bitmaps since we already have that dependency in our codebase. The reason we speed up this operation is we found on a large instance which contained thousands of tiny repos we spent so much time in findShard that our indexing queue would always fall behind. It is possible this new section won't speed this up enough and we need some sort of global oracle (or in-memory cache in indexserver?). This is noted in the code for future travellers. Test Plan: the existing unit tests already cover if this is forwards and backwards compatible. Additionally I added some logging to zoekt to test if older version of shards still work correctly in findShard, as well as if older versions of zoekt can read the new shards. I haven't quantified the performance improvements yet. Before landing I will generate some synthetic large compound shards and test on my machine the changes in perf for findShard (will update description). --- index/builder.go | 13 +++++-- index/read.go | 49 ++++++++++++++++++++++++++ index/shard_builder.go | 17 +++++++++ index/toc.go | 5 ++- index/write.go | 14 ++++++++ testdata/shards/repo2_v16.00000.zoekt | Bin 2873 -> 2897 bytes testdata/shards/repo_v16.00000.zoekt | Bin 2681 -> 2705 bytes 7 files changed, 94 insertions(+), 4 deletions(-) diff --git a/index/builder.go b/index/builder.go index ea9dad7a7..4c10c5cfb 100644 --- a/index/builder.go +++ b/index/builder.go @@ -469,14 +469,21 @@ func (o *Options) findShard() string { // Brute force finding the shard in compound shards. We should only hit this // code path for repositories that are not already existing or are in // compound shards. - // - // TODO add an oracle which can speed this up in the case of repositories - // already in compound shards. compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) if err != nil { return "" } for _, fn := range compoundShards { + // PERF: ReadMetadataPathAlive can be relatively slow on instances with + // thousands of tiny repos in compound shards. This is a much faster check + // to see if we need to do more work to check. + // + // If we are still seeing performance issues, we should consider adding + // some sort of global oracle here to avoid filepath.Glob and checking + // each compound shard. + if !MaybeContainRepo(fn, o.RepositoryDescription.ID) { + continue + } repos, _, err := ReadMetadataPathAlive(fn) if err != nil { continue diff --git a/index/read.go b/index/read.go index a44254733..55a927510 100644 --- a/index/read.go +++ b/index/read.go @@ -24,6 +24,7 @@ import ( "slices" "sort" + "github.com/RoaringBitmap/roaring" "github.com/rs/xid" "github.com/sourcegraph/zoekt" ) @@ -649,6 +650,54 @@ func IndexFilePaths(p string) ([]string, error) { return exist, nil } +// MaybeContainRepo returns true if the shard at path p could contain repoID. +// This only returns false if we are certain it does not. You need to double +// check if it returns true. +// +// This function is a performance optimization mainly intended to be used by +// builder (see findShard) to avoid unmarshalling large metadata files for +// compound shards. It is best-effort, so if encounters any error returns true +// (ie indicating you need to do more checks). +func MaybeContainRepo(p string, repoID uint32) bool { + f, err := os.Open(p) + if err != nil { + return true + } + defer f.Close() + + inf, err := NewIndexFile(f) + if err != nil { + return true + } + defer inf.Close() + + rd := &reader{r: inf} + var toc indexTOC + err = rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) + if err != nil { + return true + } + + // shard does not yet contains reposIDsBitmap so we can't tell if it + // contains repo. + if toc.reposIDsBitmap.sz == 0 { + return true + } + + blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz) + if err != nil { + return true + } + + var rb roaring.Bitmap + _, err = rb.FromUnsafeBytes(blob) + if err != nil { + return true + } + + return rb.Contains(repoID) +} + func loadIndexData(r IndexFile) (*indexData, error) { rd := &reader{r: r} diff --git a/index/shard_builder.go b/index/shard_builder.go index 8a3057b63..dbfd77043 100644 --- a/index/shard_builder.go +++ b/index/shard_builder.go @@ -511,6 +511,23 @@ func (b *ShardBuilder) branchMask(br string) uint64 { return 0 } +// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID +// is missing or there are no repos, this returns false. +func (b *ShardBuilder) repoIDs() ([]uint32, bool) { + if len(b.repoList) == 0 { + return nil, false + } + + ids := make([]uint32, 0, len(b.repoList)) + for _, repo := range b.repoList { + if repo.ID == 0 { + return nil, false + } + ids = append(ids, repo.ID) + } + return ids, true +} + type DocChecker struct { // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. trigrams map[ngram]struct{} diff --git a/index/toc.go b/index/toc.go index 1367d9613..87bf3d302 100644 --- a/index/toc.go +++ b/index/toc.go @@ -96,7 +96,8 @@ type indexTOC struct { contentChecksums simpleSection runeDocSections simpleSection - repos simpleSection + repos simpleSection + reposIDsBitmap simpleSection ranks simpleSection } @@ -187,6 +188,8 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection { {"nameBloom", &unusedSimple}, {"contentBloom", &unusedSimple}, {"ranks", &unusedSimple}, + + {"reposIDsBitmap", &t.reposIDsBitmap}, } } diff --git a/index/write.go b/index/write.go index 0a8e1295e..0d9825a04 100644 --- a/index/write.go +++ b/index/write.go @@ -25,6 +25,8 @@ import ( "time" "github.com/sourcegraph/zoekt" + + "github.com/RoaringBitmap/roaring" ) func (w *writer) writeTOC(toc *indexTOC) { @@ -68,6 +70,12 @@ func (s *compoundSection) writeMap(w *writer, m map[string]uint32) { s.writeStrings(w, keys) } +func writeUint32Bitmap(w *writer, dat []uint32) { + rb := roaring.BitmapOf(dat...) + rb.RunOptimize() + rb.WriteTo(w) +} + func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection, ) { @@ -171,6 +179,12 @@ func (b *ShardBuilder) Write(out io.Writer) error { toc.repos.end(w) } + if repoIDs, ok := b.repoIDs(); ok && next { + toc.reposIDsBitmap.start(w) + writeUint32Bitmap(w, repoIDs) + toc.reposIDsBitmap.end(w) + } + indexTime := b.IndexTime if indexTime.IsZero() { indexTime = time.Now().UTC() diff --git a/testdata/shards/repo2_v16.00000.zoekt b/testdata/shards/repo2_v16.00000.zoekt index eb5aa4d6b9c6765e7a39d6ecef05b7d70b04c3fd..1f16fb5c3c936f4cd48562dcbab2917ef887300e 100644 GIT binary patch delta 40 ncmdlfc2R7DA-4iwQEEYcv8PM1Q)WqSVgUmJaO`GaVCn+^^-c+@ delta 16 Xcmca8wo`0_AvXsD1IKO#2BtCqDF_4R diff --git a/testdata/shards/repo_v16.00000.zoekt b/testdata/shards/repo_v16.00000.zoekt index ee051334924b38ce2a07f9381907678d5dcb05db..acf2bf08a6fb6973d6bb5718844f3fbd6150e0c6 100644 GIT binary patch delta 40 ncmewWRYVCn+^|1%0D delta 16 XcmbOz`cq^>Ar}V&1N(0V2BtCqEKdYs From 04e94b34d14adde2a9d397612953c8a9a343f6ce Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Tue, 18 Feb 2025 11:32:03 +0100 Subject: [PATCH 2/5] mild refactor, add tests and benchmark --- go.mod | 1 + index/builder.go | 27 ++------ index/builder_test.go | 93 +++++++++++++++++++++++--- index/read.go | 88 +++++++++++++++++++----- index/read_test.go | 1 + index/toc.go | 3 +- testdata/shards/repo2_v16.00000.zoekt | Bin 2897 -> 2897 bytes testdata/shards/repo_v16.00000.zoekt | Bin 2705 -> 2705 bytes 8 files changed, 165 insertions(+), 48 deletions(-) diff --git a/go.mod b/go.mod index fe02dcd38..824f46ff8 100644 --- a/go.mod +++ b/go.mod @@ -61,6 +61,7 @@ require ( github.com/davidmz/go-pageant v1.0.2 // indirect github.com/go-fed/httpsig v1.1.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect ) diff --git a/index/builder.go b/index/builder.go index 4c10c5cfb..3139fb743 100644 --- a/index/builder.go +++ b/index/builder.go @@ -467,31 +467,18 @@ func (o *Options) findShard() string { } // Brute force finding the shard in compound shards. We should only hit this - // code path for repositories that are not already existing or are in - // compound shards. + // code path for repositories that don't exist yet or are in compound shards. + return o.findCompoundShard() +} + +func (o *Options) findCompoundShard() string { compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) if err != nil { return "" } for _, fn := range compoundShards { - // PERF: ReadMetadataPathAlive can be relatively slow on instances with - // thousands of tiny repos in compound shards. This is a much faster check - // to see if we need to do more work to check. - // - // If we are still seeing performance issues, we should consider adding - // some sort of global oracle here to avoid filepath.Glob and checking - // each compound shard. - if !MaybeContainRepo(fn, o.RepositoryDescription.ID) { - continue - } - repos, _, err := ReadMetadataPathAlive(fn) - if err != nil { - continue - } - for _, repo := range repos { - if repo.ID == o.RepositoryDescription.ID { - return fn - } + if containsRepo(fn, o.RepositoryDescription.ID) { + return fn } } diff --git a/index/builder_test.go b/index/builder_test.go index 91fc8efa2..b48775443 100644 --- a/index/builder_test.go +++ b/index/builder_test.go @@ -16,6 +16,9 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/require" + "github.com/sourcegraph/zoekt" ) @@ -288,6 +291,80 @@ func TestPartialSuccess(t *testing.T) { } } +// Tests that we skip looping over repos in compound shards when we know that +// the repository we are looking for is not in the shard. +func TestSkipCompoundShards(t *testing.T) { + metricCompoundShardLookups.Reset() + + compoundShards := [][]zoekt.Repository{ + { + {Name: "repoA", ID: 1}, + {Name: "repoB", ID: 2}, + {Name: "repoC", ID: 3}, + }, + { + {Name: "repoD", ID: 4}, + {Name: "repoE", ID: 5}, + {Name: "repoF", ID: 6}, + {Name: "repoF", ID: 7}, + {Name: "repoF", ID: 8}, + }, + } + var lookForRepoID uint32 = 99 + wantSkippedCount := 2 + + indexDir := t.TempDir() + for _, repositoryGroup := range compoundShards { + createTestCompoundShard(t, indexDir, repositoryGroup) + } + o := &Options{ + IndexDir: indexDir, + RepositoryDescription: zoekt.Repository{ID: lookForRepoID}, + } + + shard := o.findCompoundShard() + require.Empty(t, shard) + + // Check if the "skipped" counter was incremented + skippedCount := int(testutil.ToFloat64(metricCompoundShardLookups.WithLabelValues("skipped"))) + require.Equal(t, wantSkippedCount, skippedCount) +} + +// With optimization +// BenchmarkFindCompoundShard-16 33505 36016 ns/op +// +// Without optimization +// BenchmarkFindCompoundShard-16 76 15568589 ns/op +func BenchmarkFindCompoundShard(b *testing.B) { + // Generate a large compound shard + const numRepos = 5000 + repositories := make([]zoekt.Repository, numRepos) + for i := 0; i < numRepos; i++ { + repositories[i] = zoekt.Repository{ + Name: fmt.Sprintf("repo%d", i+1), + ID: uint32(i + 1), + } + } + indexDir := b.TempDir() + createTestCompoundShard(b, indexDir, repositories) + + // pick id that is not in the shard + var searchRepoID uint32 = numRepos + 1 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + o := &Options{ + IndexDir: indexDir, + RepositoryDescription: zoekt.Repository{ID: searchRepoID}, + } + + shard := o.findCompoundShard() + if shard != "" { + b.Fatal("expected emtpy result") + } + } +} + func TestOptions_FindAllShards(t *testing.T) { type simpleShard struct { Repository zoekt.Repository @@ -361,17 +438,17 @@ func TestOptions_FindAllShards(t *testing.T) { compoundShards: [][]zoekt.Repository{ { {Name: "repoA", ID: 1}, - {Name: "sameName", ID: 2}, - {Name: "sameName", ID: 3}, + {Name: "repoB", ID: 2}, + {Name: "repoC", ID: 3}, }, { - {Name: "repoB", ID: 4}, - {Name: "sameName", ID: 5}, - {Name: "sameName", ID: 6}, + {Name: "repoD", ID: 4}, + {Name: "repoE", ID: 5}, + {Name: "repoF", ID: 6}, }, }, expectedShardCount: 1, - expectedRepository: zoekt.Repository{Name: "sameName", ID: 5}, + expectedRepository: zoekt.Repository{Name: "something-else", ID: 5}, }, } for _, tt := range tests { @@ -840,7 +917,7 @@ func TestIsLowPriority(t *testing.T) { } } -func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { +func createTestShard(t testing.TB, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { t.Helper() if err := os.MkdirAll(filepath.Dir(indexDir), 0o700); err != nil { @@ -891,7 +968,7 @@ func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShard return o.FindAllShards() } -func createTestCompoundShard(t *testing.T, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { +func createTestCompoundShard(t testing.TB, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { t.Helper() var shardNames []string diff --git a/index/read.go b/index/read.go index 55a927510..b41248ba1 100644 --- a/index/read.go +++ b/index/read.go @@ -25,7 +25,10 @@ import ( "sort" "github.com/RoaringBitmap/roaring" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/rs/xid" + "github.com/sourcegraph/zoekt" ) @@ -651,35 +654,23 @@ func IndexFilePaths(p string) ([]string, error) { } // MaybeContainRepo returns true if the shard at path p could contain repoID. -// This only returns false if we are certain it does not. You need to double -// check if it returns true. +// This only returns false if we are certain it does not. You need to +// double-check if it returns true. // // This function is a performance optimization mainly intended to be used by // builder (see findShard) to avoid unmarshalling large metadata files for // compound shards. It is best-effort, so if encounters any error returns true // (ie indicating you need to do more checks). -func MaybeContainRepo(p string, repoID uint32) bool { - f, err := os.Open(p) - if err != nil { - return true - } - defer f.Close() - - inf, err := NewIndexFile(f) - if err != nil { - return true - } - defer inf.Close() - +func maybeContainsRepo(inf IndexFile, repoID uint32) bool { rd := &reader{r: inf} var toc indexTOC - err = rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) + err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) if err != nil { return true } - // shard does not yet contains reposIDsBitmap so we can't tell if it - // contains repo. + // shard does not yet contain reposIDsBitmap so we can't tell if it contains + // repo. if toc.reposIDsBitmap.sz == 0 { return true } @@ -698,6 +689,67 @@ func MaybeContainRepo(p string, repoID uint32) bool { return rb.Contains(repoID) } +var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "zoekt_compound_shard_lookups", + Help: "Number of compound shard lookups and how much work was done.", +}, []string{"state"}) + +func containsRepo(p string, id uint32) bool { + var err error + earlyReturn := false + + defer func() { + if err != nil { + metricCompoundShardLookups.WithLabelValues("error").Inc() + return + } + if earlyReturn { + metricCompoundShardLookups.WithLabelValues("skipped").Inc() + return + } + metricCompoundShardLookups.WithLabelValues("full_lookup").Inc() + }() + + f, err := os.Open(p) + if err != nil { + return false + } + defer f.Close() + + inf, err := NewIndexFile(f) + if err != nil { + return false + } + defer inf.Close() + + // PERF: Looping over repos can be relatively slow on instances with thousands + // of tiny repos in compound shards. This is a much faster check to see if we + // need to do more work. + // + // If we are still seeing performance issues, we should consider adding + // some sort of global oracle here to avoid filepath.Glob and checking + // each compound shard. + if !maybeContainsRepo(inf, id) { + earlyReturn = true + return false + } + + repos, _, err := ReadMetadata(inf) + if err != nil { + return false + } + for _, repo := range repos { + if repo.Tombstone { + continue + } + if repo.ID == id { + return true + } + } + + return false +} + func loadIndexData(r IndexFile) (*indexData, error) { rd := &reader{r: r} diff --git a/index/read_test.go b/index/read_test.go index 8558a3d9a..6a22bfd9a 100644 --- a/index/read_test.go +++ b/index/read_test.go @@ -29,6 +29,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/query" ) diff --git a/index/toc.go b/index/toc.go index 87bf3d302..d0ef6b6a8 100644 --- a/index/toc.go +++ b/index/toc.go @@ -182,14 +182,13 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection { {"languages", &t.languages}, {"runeDocSections", &t.runeDocSections}, {"repos", &t.repos}, + {"reposIDsBitmap", &t.reposIDsBitmap}, // We no longer write these sections, but we still return them here to avoid // warnings about unknown sections. {"nameBloom", &unusedSimple}, {"contentBloom", &unusedSimple}, {"ranks", &unusedSimple}, - - {"reposIDsBitmap", &t.reposIDsBitmap}, } } diff --git a/testdata/shards/repo2_v16.00000.zoekt b/testdata/shards/repo2_v16.00000.zoekt index 1f16fb5c3c936f4cd48562dcbab2917ef887300e..2bf8381e232972877cfcd6a93a053739f1c866b9 100644 GIT binary patch delta 24 gcmca8c2R7@Pp-*^+!B+2a#?UNFmUW Date: Wed, 19 Feb 2025 16:50:59 +0100 Subject: [PATCH 3/5] improve docstrings --- index/read.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/index/read.go b/index/read.go index b41248ba1..ffb76f05d 100644 --- a/index/read.go +++ b/index/read.go @@ -653,14 +653,10 @@ func IndexFilePaths(p string) ([]string, error) { return exist, nil } -// MaybeContainRepo returns true if the shard at path p could contain repoID. -// This only returns false if we are certain it does not. You need to -// double-check if it returns true. -// -// This function is a performance optimization mainly intended to be used by -// builder (see findShard) to avoid unmarshalling large metadata files for -// compound shards. It is best-effort, so if encounters any error returns true -// (ie indicating you need to do more checks). +// maybeContainsRepo is a performance optimization mainly intended to be used by +// containsRepo to avoid unmarshalling large metadata files for compound shards. +// It is best-effort, so if it encounters any error returns true (ie indicating +// you need to do more checks). func maybeContainsRepo(inf IndexFile, repoID uint32) bool { rd := &reader{r: inf} var toc indexTOC @@ -694,6 +690,9 @@ var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ Help: "Number of compound shard lookups and how much work was done.", }, []string{"state"}) +// containsRepo returns true if the shard at path contains a repo with ID. The +// function returns false if the shard does not contain the repo or if it +// encounters an error. func containsRepo(p string, id uint32) bool { var err error earlyReturn := false From 4c9e0aa5809bb3c64c7abe7c75d7513ded7d8d40 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Wed, 19 Feb 2025 16:51:48 +0100 Subject: [PATCH 4/5] typo --- index/builder_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/builder_test.go b/index/builder_test.go index b48775443..a6f38966a 100644 --- a/index/builder_test.go +++ b/index/builder_test.go @@ -360,7 +360,7 @@ func BenchmarkFindCompoundShard(b *testing.B) { shard := o.findCompoundShard() if shard != "" { - b.Fatal("expected emtpy result") + b.Fatal("expected empty result") } } } From d218d5a6f3fac790cb3c99e80c91af727967bd66 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Wed, 19 Feb 2025 16:54:43 +0100 Subject: [PATCH 5/5] typo --- index/read.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/read.go b/index/read.go index ffb76f05d..607598552 100644 --- a/index/read.go +++ b/index/read.go @@ -690,7 +690,7 @@ var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ Help: "Number of compound shard lookups and how much work was done.", }, []string{"state"}) -// containsRepo returns true if the shard at path contains a repo with ID. The +// containsRepo returns true if the shard at path contains a repo with id. The // function returns false if the shard does not contain the repo or if it // encounters an error. func containsRepo(p string, id uint32) bool {