Skip to content

Commit

Permalink
repoID bitmap for speeding up findShard in compound shards
Browse files Browse the repository at this point in the history
We add a new section to shards which contains a roaring bitmap for
quickly checking if a shard contains a repo ID. We then can load just
this (small amount) of data to rule out a compound shard. We use roaring
bitmaps since we already have that dependency in our codebase.

The reason we speed up this operation is we found on a large instance
which contained thousands of tiny repos we spent so much time in
findShard that our indexing queue would always fall behind.

It is possible this new section won't speed this up enough and we need
some sort of global oracle (or in-memory cache in indexserver?). This is
noted in the code for future travellers.

Test Plan: the existing unit tests already cover if this is forwards and
backwards compatible. Additionally I added some logging to zoekt to test
if older version of shards still work correctly in findShard, as well as
if older versions of zoekt can read the new shards.

I haven't quantified the performance improvements yet. Before landing I
will generate some synthetic large compound shards and test on my
machine the changes in perf for findShard (will update description).
  • Loading branch information
keegancsmith committed Jan 22, 2025
1 parent e7f0a1a commit 8d65282
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 4 deletions.
13 changes: 10 additions & 3 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -468,14 +468,21 @@ func (o *Options) findShard() string {
// Brute force finding the shard in compound shards. We should only hit this
// code path for repositories that are not already existing or are in
// compound shards.
//
// TODO add an oracle which can speed this up in the case of repositories
// already in compound shards.
compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt"))
if err != nil {
return ""
}
for _, fn := range compoundShards {
// PERF: ReadMetadataPathAlive can be relatively slow on instances with
// thousands of tiny repos in compound shards. This is a much faster check
// to see if we need to do more work to check.
//
// If we are still seeing performance issues, we should consider adding
// some sort of global oracle here to avoid filepath.Glob and checking
// each compound shard.
if !zoekt.MaybeContainRepo(fn, o.RepositoryDescription.ID) {
continue
}
repos, _, err := zoekt.ReadMetadataPathAlive(fn)
if err != nil {
continue
Expand Down
17 changes: 17 additions & 0 deletions indexbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,23 @@ func (b *IndexBuilder) branchMask(br string) uint64 {
return 0
}

// repoIDs returns a list of sourcegraph IDs for the indexed repos. If the ID
// is missing or there are no repos, this returns false.
func (b *IndexBuilder) repoIDs() ([]uint32, bool) {
if len(b.repoList) == 0 {
return nil, false
}

ids := make([]uint32, 0, len(b.repoList))
for _, repo := range b.repoList {
if repo.ID == 0 {
return nil, false
}
ids = append(ids, repo.ID)
}
return ids, true
}

type DocChecker struct {
// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
trigrams map[ngram]struct{}
Expand Down
49 changes: 49 additions & 0 deletions read.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"slices"
"sort"

"github.com/RoaringBitmap/roaring"
"github.com/rs/xid"
)

Expand Down Expand Up @@ -648,6 +649,54 @@ func IndexFilePaths(p string) ([]string, error) {
return exist, nil
}

// MaybeContainRepo returns true if the shard at path p could contain repoID.
// This only returns false if we are certain it does not. You need to double
// check if it returns true.
//
// This function is a performance optimization mainly intended to be used by
// builder (see findShard) to avoid unmarshalling large metadata files for
// compound shards. It is best-effort, so if encounters any error returns true
// (ie indicating you need to do more checks).
func MaybeContainRepo(p string, repoID uint32) bool {
f, err := os.Open(p)
if err != nil {
return true
}
defer f.Close()

inf, err := NewIndexFile(f)
if err != nil {
return true
}
defer inf.Close()

rd := &reader{r: inf}
var toc indexTOC
err = rd.readTOCSections(&toc, []string{"reposIDsBitmap"})
if err != nil {
return true
}

// shard does not yet contains reposIDsBitmap so we can't tell if it
// contains repo.
if toc.reposIDsBitmap.sz == 0 {
return true
}

blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz)
if err != nil {
return true
}

var rb roaring.Bitmap
_, err = rb.FromUnsafeBytes(blob)
if err != nil {
return true
}

return rb.Contains(repoID)
}

func loadIndexData(r IndexFile) (*indexData, error) {
rd := &reader{r: r}

Expand Down
Binary file modified testdata/shards/repo2_v16.00000.zoekt
Binary file not shown.
Binary file modified testdata/shards/repo_v16.00000.zoekt
Binary file not shown.
5 changes: 4 additions & 1 deletion toc.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ type indexTOC struct {
contentChecksums simpleSection
runeDocSections simpleSection

repos simpleSection
repos simpleSection
reposIDsBitmap simpleSection

ranks simpleSection
}
Expand Down Expand Up @@ -187,6 +188,8 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection {
{"nameBloom", &unusedSimple},
{"contentBloom", &unusedSimple},
{"ranks", &unusedSimple},

{"reposIDsBitmap", &t.reposIDsBitmap},
}
}

Expand Down
14 changes: 14 additions & 0 deletions write.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"io"
"sort"
"time"

"github.com/RoaringBitmap/roaring"
)

func (w *writer) writeTOC(toc *indexTOC) {
Expand Down Expand Up @@ -66,6 +68,12 @@ func (s *compoundSection) writeMap(w *writer, m map[string]uint32) {
s.writeStrings(w, keys)
}

func writeUint32Bitmap(w *writer, dat []uint32) {
rb := roaring.BitmapOf(dat...)
rb.RunOptimize()
rb.WriteTo(w)
}

func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection,
charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection,
) {
Expand Down Expand Up @@ -169,6 +177,12 @@ func (b *IndexBuilder) Write(out io.Writer) error {
toc.repos.end(w)
}

if repoIDs, ok := b.repoIDs(); ok && next {
toc.reposIDsBitmap.start(w)
writeUint32Bitmap(w, repoIDs)
toc.reposIDsBitmap.end(w)
}

indexTime := b.IndexTime
if indexTime.IsZero() {
indexTime = time.Now().UTC()
Expand Down

0 comments on commit 8d65282

Please sign in to comment.