diff --git a/build/builder.go b/build/builder.go index 498318699..3ef5b9995 100644 --- a/build/builder.go +++ b/build/builder.go @@ -468,14 +468,21 @@ func (o *Options) findShard() string { // Brute force finding the shard in compound shards. We should only hit this // code path for repositories that are not already existing or are in // compound shards. - // - // TODO add an oracle which can speed this up in the case of repositories - // already in compound shards. compoundShards, err := filepath.Glob(path.Join(o.IndexDir, "compound-*.zoekt")) if err != nil { return "" } for _, fn := range compoundShards { + // PERF: ReadMetadataPathAlive can be relatively slow on instances with + // thousands of tiny repos in compound shards. This is a much faster check + // to see whether we need to do that slower read at all. + // + // If we are still seeing performance issues, we should consider adding + // some sort of global oracle here to avoid filepath.Glob and checking + // each compound shard. + if !zoekt.MaybeContainRepo(fn, o.RepositoryDescription.ID) { + continue + } repos, _, err := zoekt.ReadMetadataPathAlive(fn) if err != nil { continue diff --git a/indexbuilder.go b/indexbuilder.go index 027edf9f4..7dde924ed 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -526,6 +526,23 @@ func (b *IndexBuilder) branchMask(br string) uint64 { return 0 } +// repoIDs returns a list of Sourcegraph IDs for the indexed repos. If any ID +// is missing (zero) or there are no repos, this returns false. +func (b *IndexBuilder) repoIDs() ([]uint32, bool) { + if len(b.repoList) == 0 { + return nil, false + } + + ids := make([]uint32, 0, len(b.repoList)) + for _, repo := range b.repoList { + if repo.ID == 0 { + return nil, false + } + ids = append(ids, repo.ID) + } + return ids, true +} + type DocChecker struct { // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. 
trigrams map[ngram]struct{} diff --git a/read.go b/read.go index 189ec64c4..fdc88dc11 100644 --- a/read.go +++ b/read.go @@ -24,6 +24,7 @@ import ( "slices" "sort" + "github.com/RoaringBitmap/roaring" "github.com/rs/xid" ) @@ -648,6 +649,54 @@ func IndexFilePaths(p string) ([]string, error) { return exist, nil } +// MaybeContainRepo returns true if the shard at path p could contain repoID. +// This only returns false if we are certain it does not. You need to double +// check if it returns true. +// +// This function is a performance optimization mainly intended to be used by +// builder (see findShard) to avoid unmarshalling large metadata files for +// compound shards. It is best-effort, so if it encounters any error it returns true +// (i.e. indicating you need to do more checks). +func MaybeContainRepo(p string, repoID uint32) bool { + f, err := os.Open(p) + if err != nil { + return true + } + defer f.Close() + + inf, err := NewIndexFile(f) + if err != nil { + return true + } + defer inf.Close() + + rd := &reader{r: inf} + var toc indexTOC + err = rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) + if err != nil { + return true + } + + // shard does not yet contain reposIDsBitmap so we can't tell if it + // contains the repo. 
+ if toc.reposIDsBitmap.sz == 0 { + return true + } + + blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz) + if err != nil { + return true + } + + var rb roaring.Bitmap + _, err = rb.FromUnsafeBytes(blob) + if err != nil { + return true + } + + return rb.Contains(repoID) +} + func loadIndexData(r IndexFile) (*indexData, error) { rd := &reader{r: r} diff --git a/testdata/shards/repo2_v16.00000.zoekt b/testdata/shards/repo2_v16.00000.zoekt index eb5aa4d6b..1f16fb5c3 100644 Binary files a/testdata/shards/repo2_v16.00000.zoekt and b/testdata/shards/repo2_v16.00000.zoekt differ diff --git a/testdata/shards/repo_v16.00000.zoekt b/testdata/shards/repo_v16.00000.zoekt index ee0513349..acf2bf08a 100644 Binary files a/testdata/shards/repo_v16.00000.zoekt and b/testdata/shards/repo_v16.00000.zoekt differ diff --git a/toc.go b/toc.go index 8eee56950..427ba0a0b 100644 --- a/toc.go +++ b/toc.go @@ -96,7 +96,8 @@ type indexTOC struct { contentChecksums simpleSection runeDocSections simpleSection - repos simpleSection + repos simpleSection + reposIDsBitmap simpleSection ranks simpleSection } @@ -187,6 +188,8 @@ func (t *indexTOC) sectionsTaggedList() []taggedSection { {"nameBloom", &unusedSimple}, {"contentBloom", &unusedSimple}, {"ranks", &unusedSimple}, + + {"reposIDsBitmap", &t.reposIDsBitmap}, } } diff --git a/write.go b/write.go index 278ebc025..68f6ca853 100644 --- a/write.go +++ b/write.go @@ -23,6 +23,8 @@ import ( "io" "sort" "time" + + "github.com/RoaringBitmap/roaring" ) func (w *writer) writeTOC(toc *indexTOC) { @@ -66,6 +68,12 @@ func (s *compoundSection) writeMap(w *writer, m map[string]uint32) { s.writeStrings(w, keys) } +func writeUint32Bitmap(w *writer, dat []uint32) { + rb := roaring.BitmapOf(dat...) 
+ rb.RunOptimize() + rb.WriteTo(w) +} + func writePostings(w *writer, s *postingsBuilder, ngramText *simpleSection, charOffsets *simpleSection, postings *compoundSection, endRunes *simpleSection, ) { @@ -169,6 +177,12 @@ func (b *IndexBuilder) Write(out io.Writer) error { toc.repos.end(w) } + if repoIDs, ok := b.repoIDs(); ok && next { + toc.reposIDsBitmap.start(w) + writeUint32Bitmap(w, repoIDs) + toc.reposIDsBitmap.end(w) + } + indexTime := b.IndexTime if indexTime.IsZero() { indexTime = time.Now().UTC()