-
Notifications
You must be signed in to change notification settings - Fork 108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
repoID bitmap for speeding up findShard in compound shards #899
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -16,6 +16,9 @@ import ( | |||||
|
||||||
"github.com/google/go-cmp/cmp" | ||||||
"github.com/google/go-cmp/cmp/cmpopts" | ||||||
"github.com/prometheus/client_golang/prometheus/testutil" | ||||||
"github.com/stretchr/testify/require" | ||||||
|
||||||
"github.com/sourcegraph/zoekt" | ||||||
) | ||||||
|
||||||
|
@@ -288,6 +291,80 @@ func TestPartialSuccess(t *testing.T) { | |||||
} | ||||||
} | ||||||
|
||||||
// Tests that we skip looping over repos in compound shards when we know that | ||||||
// the repository we are looking for is not in the shard. | ||||||
func TestSkipCompoundShards(t *testing.T) { | ||||||
metricCompoundShardLookups.Reset() | ||||||
|
||||||
compoundShards := [][]zoekt.Repository{ | ||||||
{ | ||||||
{Name: "repoA", ID: 1}, | ||||||
{Name: "repoB", ID: 2}, | ||||||
{Name: "repoC", ID: 3}, | ||||||
}, | ||||||
{ | ||||||
{Name: "repoD", ID: 4}, | ||||||
{Name: "repoE", ID: 5}, | ||||||
{Name: "repoF", ID: 6}, | ||||||
{Name: "repoF", ID: 7}, | ||||||
{Name: "repoF", ID: 8}, | ||||||
}, | ||||||
} | ||||||
var lookForRepoID uint32 = 99 | ||||||
wantSkippedCount := 2 | ||||||
|
||||||
indexDir := t.TempDir() | ||||||
for _, repositoryGroup := range compoundShards { | ||||||
createTestCompoundShard(t, indexDir, repositoryGroup) | ||||||
} | ||||||
o := &Options{ | ||||||
IndexDir: indexDir, | ||||||
RepositoryDescription: zoekt.Repository{ID: lookForRepoID}, | ||||||
} | ||||||
|
||||||
shard := o.findCompoundShard() | ||||||
require.Empty(t, shard) | ||||||
|
||||||
// Check if the "skipped" counter was incremented | ||||||
skippedCount := int(testutil.ToFloat64(metricCompoundShardLookups.WithLabelValues("skipped"))) | ||||||
require.Equal(t, wantSkippedCount, skippedCount) | ||||||
} | ||||||
|
||||||
// With optimization | ||||||
// BenchmarkFindCompoundShard-16 33505 36016 ns/op | ||||||
// | ||||||
// Without optimization | ||||||
// BenchmarkFindCompoundShard-16 76 15568589 ns/op | ||||||
func BenchmarkFindCompoundShard(b *testing.B) { | ||||||
// Generate a large compound shard | ||||||
const numRepos = 5000 | ||||||
repositories := make([]zoekt.Repository, numRepos) | ||||||
for i := 0; i < numRepos; i++ { | ||||||
repositories[i] = zoekt.Repository{ | ||||||
Name: fmt.Sprintf("repo%d", i+1), | ||||||
ID: uint32(i + 1), | ||||||
} | ||||||
} | ||||||
indexDir := b.TempDir() | ||||||
createTestCompoundShard(b, indexDir, repositories) | ||||||
|
||||||
// pick id that is not in the shard | ||||||
var searchRepoID uint32 = numRepos + 1 | ||||||
|
||||||
b.ResetTimer() | ||||||
for i := 0; i < b.N; i++ { | ||||||
o := &Options{ | ||||||
IndexDir: indexDir, | ||||||
RepositoryDescription: zoekt.Repository{ID: searchRepoID}, | ||||||
} | ||||||
|
||||||
shard := o.findCompoundShard() | ||||||
if shard != "" { | ||||||
b.Fatal("expected emtpy result") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
func TestOptions_FindAllShards(t *testing.T) { | ||||||
type simpleShard struct { | ||||||
Repository zoekt.Repository | ||||||
|
@@ -361,17 +438,17 @@ func TestOptions_FindAllShards(t *testing.T) { | |||||
compoundShards: [][]zoekt.Repository{ | ||||||
{ | ||||||
{Name: "repoA", ID: 1}, | ||||||
{Name: "sameName", ID: 2}, | ||||||
{Name: "sameName", ID: 3}, | ||||||
{Name: "repoB", ID: 2}, | ||||||
{Name: "repoC", ID: 3}, | ||||||
}, | ||||||
{ | ||||||
{Name: "repoB", ID: 4}, | ||||||
{Name: "sameName", ID: 5}, | ||||||
{Name: "sameName", ID: 6}, | ||||||
{Name: "repoD", ID: 4}, | ||||||
{Name: "repoE", ID: 5}, | ||||||
{Name: "repoF", ID: 6}, | ||||||
}, | ||||||
}, | ||||||
expectedShardCount: 1, | ||||||
expectedRepository: zoekt.Repository{Name: "sameName", ID: 5}, | ||||||
expectedRepository: zoekt.Repository{Name: "something-else", ID: 5}, | ||||||
Comment on lines
-364
to
+451
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fly-by change to make the intent of the test more obvious. Before, the name and id both matched so it was not obvious that we match by id only. |
||||||
}, | ||||||
} | ||||||
for _, tt := range tests { | ||||||
|
@@ -840,7 +917,7 @@ func TestIsLowPriority(t *testing.T) { | |||||
} | ||||||
} | ||||||
|
||||||
func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { | ||||||
func createTestShard(t testing.TB, indexDir string, r zoekt.Repository, numShards int, optFns ...func(options *Options)) []string { | ||||||
t.Helper() | ||||||
|
||||||
if err := os.MkdirAll(filepath.Dir(indexDir), 0o700); err != nil { | ||||||
|
@@ -891,7 +968,7 @@ func createTestShard(t *testing.T, indexDir string, r zoekt.Repository, numShard | |||||
return o.FindAllShards() | ||||||
} | ||||||
|
||||||
func createTestCompoundShard(t *testing.T, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { | ||||||
func createTestCompoundShard(t testing.TB, indexDir string, repositories []zoekt.Repository, optFns ...func(options *Options)) { | ||||||
t.Helper() | ||||||
|
||||||
var shardNames []string | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,11 @@ import ( | |
"slices" | ||
"sort" | ||
|
||
"github.com/RoaringBitmap/roaring" | ||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/client_golang/prometheus/promauto" | ||
"github.com/rs/xid" | ||
|
||
"github.com/sourcegraph/zoekt" | ||
) | ||
|
||
|
@@ -649,6 +653,103 @@ func IndexFilePaths(p string) ([]string, error) { | |
return exist, nil | ||
} | ||
|
||
// MaybeContainRepo returns true if the shard at path p could contain repoID. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Small comment: this should be |
||
// This only returns false if we are certain it does not. You need to | ||
// double-check if it returns true. | ||
// | ||
// This function is a performance optimization mainly intended to be used by | ||
// builder (see findShard) to avoid unmarshalling large metadata files for | ||
// compound shards. It is best-effort, so if encounters any error returns true | ||
// (ie indicating you need to do more checks). | ||
func maybeContainsRepo(inf IndexFile, repoID uint32) bool { | ||
rd := &reader{r: inf} | ||
var toc indexTOC | ||
err := rd.readTOCSections(&toc, []string{"reposIDsBitmap"}) | ||
if err != nil { | ||
return true | ||
} | ||
|
||
// shard does not yet contain reposIDsBitmap so we can't tell if it contains | ||
// repo. | ||
if toc.reposIDsBitmap.sz == 0 { | ||
return true | ||
} | ||
|
||
blob, err := inf.Read(toc.reposIDsBitmap.off, toc.reposIDsBitmap.sz) | ||
if err != nil { | ||
return true | ||
} | ||
|
||
var rb roaring.Bitmap | ||
_, err = rb.FromUnsafeBytes(blob) | ||
if err != nil { | ||
return true | ||
} | ||
|
||
return rb.Contains(repoID) | ||
} | ||
|
||
var metricCompoundShardLookups = promauto.NewCounterVec(prometheus.CounterOpts{ | ||
Name: "zoekt_compound_shard_lookups", | ||
Help: "Number of compound shard lookups and how much work was done.", | ||
}, []string{"state"}) | ||
|
||
func containsRepo(p string, id uint32) bool { | ||
var err error | ||
earlyReturn := false | ||
|
||
defer func() { | ||
if err != nil { | ||
metricCompoundShardLookups.WithLabelValues("error").Inc() | ||
return | ||
} | ||
if earlyReturn { | ||
metricCompoundShardLookups.WithLabelValues("skipped").Inc() | ||
return | ||
} | ||
metricCompoundShardLookups.WithLabelValues("full_lookup").Inc() | ||
}() | ||
|
||
f, err := os.Open(p) | ||
if err != nil { | ||
return false | ||
} | ||
defer f.Close() | ||
|
||
inf, err := NewIndexFile(f) | ||
if err != nil { | ||
return false | ||
} | ||
defer inf.Close() | ||
|
||
// PERF: Looping over repos can be relatively slow on instances with thousands | ||
// of tiny repos in compound shards. This is a much faster check to see if we | ||
// need to do more work. | ||
// | ||
// If we are still seeing performance issues, we should consider adding | ||
// some sort of global oracle here to avoid filepath.Glob and checking | ||
// each compound shard. | ||
if !maybeContainsRepo(inf, id) { | ||
earlyReturn = true | ||
return false | ||
} | ||
|
||
repos, _, err := ReadMetadata(inf) | ||
if err != nil { | ||
return false | ||
} | ||
for _, repo := range repos { | ||
if repo.Tombstone { | ||
continue | ||
} | ||
if repo.ID == id { | ||
return true | ||
} | ||
} | ||
|
||
return false | ||
} | ||
|
||
func loadIndexData(r IndexFile) (*indexData, error) { | ||
rd := &reader{r: r} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice! I guess this answers my question here: