Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repoID bitmap for speeding up findShard in compound shards #899

Merged
merged 5 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ require (
github.com/davidmz/go-pageant v1.0.2 // indirect
github.com/go-fed/httpsig v1.1.0 // indirect
github.com/hashicorp/go-version v1.7.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
)

Expand Down
20 changes: 7 additions & 13 deletions index/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,24 +467,18 @@ func (o *Options) findShard() string {
}

// Brute force finding the shard in compound shards. We should only hit this
// code path for repositories that are not already existing or are in
// compound shards.
//
// TODO add an oracle which can speed this up in the case of repositories
// already in compound shards.
// code path for repositories that don't exist yet or are in compound shards.
return o.findCompoundShard()
}

func (o *Options) findCompoundShard() string {
compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
if err != nil {
return ""
}
for _, fn := range compoundShards {
repos, _, err := ReadMetadataPathAlive(fn)
if err != nil {
continue
}
for _, repo := range repos {
if repo.ID == o.RepositoryDescription.ID {
return fn
}
if containsRepo(fn, o.RepositoryDescription.ID) {
return fn
}
}

Expand Down
93 changes: 85 additions & 8 deletions index/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"

"github.com/sourcegraph/zoekt"
)

Expand Down Expand Up @@ -288,6 +291,80 @@ func TestPartialSuccess(t *testing.T) {
}
}

// Tests that we skip looping over repos in compound shards when we know that
// the repository we are looking for is not in the shard.
func TestSkipCompoundShards(t *testing.T) {
metricCompoundShardLookups.Reset()

compoundShards := [][]zoekt.Repository{
{
{Name: "repoA", ID: 1},
{Name: "repoB", ID: 2},
{Name: "repoC", ID: 3},
},
{
{Name: "repoD", ID: 4},
{Name: "repoE", ID: 5},
{Name: "repoF", ID: 6},
{Name: "repoF", ID: 7},
{Name: "repoF", ID: 8},
},
}
var lookForRepoID uint32 = 99
wantSkippedCount := 2

indexDir := t.TempDir()
for _, repositoryGroup := range compoundShards {
createTestCompoundShard(t, indexDir, repositoryGroup)
}
o := &Options{
IndexDir: indexDir,
RepositoryDescription: zoekt.Repository{ID: lookForRepoID},
}

shard := o.findCompoundShard()
require.Empty(t, shard)

// Check if the "skipped" counter was incremented
skippedCount := int(testutil.ToFloat64(metricCompoundShardLookups.WithLabelValues("skipped")))
require.Equal(t, wantSkippedCount, skippedCount)
}

// With optimization
// BenchmarkFindCompoundShard-16 33505 36016 ns/op
//
// Without optimization
// BenchmarkFindCompoundShard-16 76 15568589 ns/op
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! I guess this answers my question here:

when you saw a bunch of time spent in findShard, do you know what exactly was slow? Was it reading the metadata, or going through all the repos to check if one existed?

func BenchmarkFindCompoundShard(b *testing.B) {
// Generate a large compound shard
const numRepos = 5000
repositories := make([]zoekt.Repository, numRepos)
for i := 0; i < numRepos; i++ {
repositories[i] = zoekt.Repository{
Name: fmt.Sprintf("repo%d", i+1),
ID: uint32(i + 1),
}
}
indexDir := b.TempDir()
createTestCompoundShard(b, indexDir, repositories)

// pick id that is not in the shard
var searchRepoID uint32 = numRepos + 1

b.ResetTimer()
for i := 0; i < b.N; i++ {
o := &Options{
IndexDir: indexDir,
RepositoryDescription: zoekt.Repository{ID: searchRepoID},
}

shard := o.findCompoundShard()
if shard != "" {
b.Fatal("expected emtpy result")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
b.Fatal("expected emtpy result")
b.Fatal("expected empty result")

}
}
}

func TestOptions_FindAllShards(t *testing.T) {
type simpleShard struct {
Repository zoekt.Repository
Expand Down Expand Up @@ -361,17 +438,17 @@ func TestOptions_FindAllShards(t *testing.T) {
compoundShards: [][]zoekt.Repository{
{
{Name: "repoA", ID: 1},
{Name: "sameName", ID: 2},
{Name: "sameName", ID: 3},
{Name: "repoB", ID: 2},
{Name: "repoC", ID: 3},
},
{
{Name: "repoB", ID: 4},
{Name: "sameName", ID: 5},
{Name: "sameName", ID: 6},
{Name: "repoD", ID: 4},
{Name: "repoE", ID: 5},
{Name: "repoF", ID: 6},
},
},
expectedShardCount: 1,
expectedRepository: zoekt.Repository{Name: "sameName", ID: 5},
expectedRepository: zoekt.Repository{Name: "something-else", ID: 5},
Comment on lines -364 to +451
Copy link
Member

@stefanhengl stefanhengl Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fly-by change to make the intent of the test more obvious. Before, the name and id both matched so it was not obvious that we match by id only.

},
}
for _, tt := range tests {
Expand Down Expand Up @@ -840,7 +917,7 @@ func TestIsLowPriority(t *testing.T) {
}
}

func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {
func createTestShard(t testing.TB, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string {
t.Helper()

if err := os.MkdirAll(filepath.Dir(indexDir), 0o700); err != nil {
Expand Down Expand Up @@ -891,7 +968,7 @@ func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShard
return o.FindAllShards()
}

func createTestCompoundShard(t *testing.T, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) {
func createTestCompoundShard(t testing.TB, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) {
t.Helper()

var shardNames []string
Expand Down
101 changes: 101 additions & 0 deletions index/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ import (
"slices"
"sort"

"github.com/RoaringBitmap/roaring"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/rs/xid"

"github.com/sourcegraph/zoekt"
)

Expand Down Expand Up @@ -649,6 +653,103 @@ func IndexFilePaths(p string) ([]string, error) {
return exist, nil
}

// MaybeContainRepo returns true if the shard at path p could contain repoID.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small comment: this should be maybeContainsRepo. Also, it seems like the method we're encouraging callers to use is actually containsRepo. So we could move the doc comment there instead (and rework it?)

// This only returns false if we are certain it does not. You need to
// double-check if it returns true.
//
// This function is a performance optimization mainly intended to be used by
// builder (see findShard) to avoid unmarshalling large metadata files for
// compound shards. It is best-effort, so if encounters any error returns true
// (ie indicating you need to do more checks).
func maybeContainsRepo(inf IndexFile, repoID uint32) bool {
rd := &reader{r: inf}
var toc indexTOC
err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"})
if err != nil {
return true
}

// shard does not yet contain reposIDsBitmap so we can't tell if it contains
// repo.
if toc.reposIDsBitmap.sz == 0 {
return true
}

blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz)
if err != nil {
return true
}

var rb roaring.Bitmap
_, err = rb.FromUnsafeBytes(blob)
if err != nil {
return true
}

return rb.Contains(repoID)
}

var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "zoekt_compound_shard_lookups",
Help: "Number of compound shard lookups and how much work was done.",
}, []string{"state"})

func containsRepo(p string, id uint32) bool {
var err error
earlyReturn := false

defer func() {
if err != nil {
metricCompoundShardLookups.WithLabelValues("error").Inc()
return
}
if earlyReturn {
metricCompoundShardLookups.WithLabelValues("skipped").Inc()
return
}
metricCompoundShardLookups.WithLabelValues("full_lookup").Inc()
}()

f, err := os.Open(p)
if err != nil {
return false
}
defer f.Close()

inf, err := NewIndexFile(f)
if err != nil {
return false
}
defer inf.Close()

// PERF: Looping over repos can be relatively slow on instances with thousands
// of tiny repos in compound shards. This is a much faster check to see if we
// need to do more work.
//
// If we are still seeing performance issues, we should consider adding
// some sort of global oracle here to avoid filepath.Glob and checking
// each compound shard.
if !maybeContainsRepo(inf, id) {
earlyReturn = true
return false
}

repos, _, err := ReadMetadata(inf)
if err != nil {
return false
}
for _, repo := range repos {
if repo.Tombstone {
continue
}
if repo.ID == id {
return true
}
}

return false
}

func loadIndexData(r IndexFile) (*indexData, error) {
rd := &reader{r: r}

Expand Down
1 change: 1 addition & 0 deletions index/read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"testing"

"github.com/google/go-cmp/cmp"

"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/query"
)
Expand Down
17 changes: 17 additions & 0 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,23 @@ func (b *ShardBuilder) branchMask(br string) uint64 {
return 0
}

// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
// is missing or there are no repos, this returns false.
func (b *ShardBuilder) repoIDs() ([]uint32, bool) {
if len(b.repoList) == 0 {
return nil, false
}

ids := make([]uint32, 0, len(b.repoList))
for _, repo := range b.repoList {
if repo.ID == 0 {
return nil, false
}
ids = append(ids, repo.ID)
}
return ids, true
}

type DocChecker struct {
// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
trigrams map[ngram]struct{}
Expand Down
4 changes: 3 additions & 1 deletion index/toc.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ type indexTOC struct {
contentChecksums simpleSection
runeDocSections simpleSection

repos simpleSection
repos simpleSection
reposIDsBitmap simpleSection

ranks simpleSection
}
Expand Down Expand Up @@ -181,6 +182,7 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection {
{"languages", &t.languages},
{"runeDocSections", &t.runeDocSections},
{"repos", &t.repos},
{"reposIDsBitmap", &t.reposIDsBitmap},

// We no longer write these sections, but we still return them here to avoid
// warnings about unknown sections.
Expand Down
14 changes: 14 additions & 0 deletions index/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"time"

"github.com/sourcegraph/zoekt"

"github.com/RoaringBitmap/roaring"
)

func (w *writer) writeTOC(toc *indexTOC) {
Expand Down Expand Up @@ -68,6 +70,12 @@ func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
s.writeStrings(w, keys)
}

func writeUint32Bitmap(w *writer, dat []uint32) {
rb := roaring.BitmapOf(dat...)
rb.RunOptimize()
rb.WriteTo(w)
}

func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
) {
Expand Down Expand Up @@ -171,6 +179,12 @@ func (b *ShardBuilder) Write(out io.Writer) error {
toc.repos.end(w)
}

if repoIDs, ok := b.repoIDs(); ok && next {
toc.reposIDsBitmap.start(w)
writeUint32Bitmap(w, repoIDs)
toc.reposIDsBitmap.end(w)
}

indexTime := b.IndexTime
if indexTime.IsZero() {
indexTime = time.Now().UTC()
Expand Down
Binary file modified testdata/shards/repo2_v16.00000.zoekt
Binary file not shown.
Binary file modified testdata/shards/repo_v16.00000.zoekt
Binary file not shown.
Loading