Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ranking: add phrase boosting to BM25 #917

Merged
merged 6 commits into from
Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions index/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,7 @@ nextFileMatch:
}

if opts.UseBM25Scoring {
tf := cp.calculateTermFrequency(finalCands)
d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
d.scoreFilesUsingBM25(&fileMatch, nextDoc, finalCands, cp, opts)
} else {
// Use the standard, non-experimental scoring method by default
d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
Expand Down
55 changes: 38 additions & 17 deletions index/score.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,6 @@ func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineN
}
}

// scoreWeight != 1 means it affects score
if !epsilonEqualsOne(m.scoreWeight) {
score = score * m.scoreWeight
if opts.DebugScore {
what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
}
}

if score > bestScore.score {
bestScore.score = score
bestScore.debugScore = what
Expand All @@ -193,6 +185,7 @@ func (p *contentProvider) scoreLine(ms []*candidateMatch, language string, lineN
bestScore.debugScore = fmt.Sprintf("score:%.2f <- %s", bestScore.score, strings.TrimSuffix(bestScore.debugScore, ", "))
}

bestScore.score = boostScore(bestScore.score, ms)
return bestScore, symbolInfo
}

Expand Down Expand Up @@ -223,9 +216,10 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
score += tfScore(k, b, L, f)
}

// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
var symbolInfo []*zoekt.Symbol
for _, m := range ms {
// Check if any index comes from a symbol match tree, and if so hydrate in
// symbol information
if m.symbol {
if sec, si, ok := p.findSymbol(m); ok && si != nil {
// findSymbol does not hydrate in Sym. So we need to store it.
Expand All @@ -235,6 +229,8 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
}
}
}

score = boostScore(score, ms)
return score, symbolInfo
}

Expand Down Expand Up @@ -263,6 +259,25 @@ func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[st
return termFreqs
}

// boostScore scales score by the largest scoreWeight found among the matches in ms.
// Applying the boost once to the final score mirrors the precedent set by search
// engines such as Lucene, where a boost multiplies a whole query clause's final score.
//
// Taking the maximum (rather than multiplying every weight) is a heuristic that keeps
// the same boost from being applied several times. The running maximum starts at 1.0,
// so weights at or below 1 never change the score. The score is returned unchanged
// when epsilonEqualsOne reports that the maximum is still 1.
func boostScore(score float64, ms []*candidateMatch) float64 {
	boost := 1.0
	for i := range ms {
		if w := ms[i].scoreWeight; w > boost {
			boost = w
		}
	}

	// Nothing to apply when no match carried a weight above 1.
	if epsilonEqualsOne(boost) {
		return score
	}
	return score * boost
}

// scoreFile computes a score for the file match using various scoring signals, like
// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *zoekt.SearchOptions) {
Expand Down Expand Up @@ -324,10 +339,11 @@ func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTr
// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
// frequent it appears in the corpus.
//
// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
// Unlike standard file scoring, this scoring strategy ignores the individual LineMatch and ChunkMatch scores, instead
// calculating a score over all matches in the file.
func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, cands []*candidateMatch, cp *contentProvider, opts *zoekt.SearchOptions) {
tf := cp.calculateTermFrequency(cands)

// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
k, b := 1.2, 0.75

Expand All @@ -343,14 +359,16 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,

L := fileLength / averageFileLength

score := 0.0
bm25Score := 0.0
sumTF := 0 // Just for debugging
for _, f := range tf {
sumTF += f
score += tfScore(k, b, L, f)
bm25Score += tfScore(k, b, L, f)
}
// 2 digits of precision
score = math.Trunc(score*100) / 100

score := boostScore(bm25Score, cands)
boosted := score != bm25Score
score = math.Trunc(score*100) / 100 // 2 digits of precision

md := d.repoMetaData[d.repos[doc]]
fileOrderScore := 1.0 - float64(doc)/float64(len(d.boundaries))
Expand All @@ -370,5 +388,8 @@ func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32,
if opts.DebugScore {
// To make the debug output easier to read, we split the score into the query dependent score and the tiebreaker
fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f (repo-rank: %d, file-rank: %.2f) <- sum-termFrequencies: %d, length-ratio: %.2f", score, md.Rank, fileOrderScore, sumTF, L)
if boosted {
fileMatch.Debug += fmt.Sprintf(" (boosted)")
}
}
}
17 changes: 17 additions & 0 deletions internal/e2e/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,23 @@ func TestBM25(t *testing.T) {
wantScore: 3.33,
// line 59: if (System.nanoTime() > System.currentTimeMillis()) {
wantBestLineMatch: 59,
}, {
// phrase boosting
fileName: "example.java",
query: &query.Or{Children: []query.Q{
&query.Boost{Child: &query.Substring{Pattern: "public string apply"}, Boost: 20},
&query.And{Children: []query.Q{
&query.Substring{Pattern: "public"},
&query.Substring{Pattern: "string"},
&query.Substring{Pattern: "apply"},
}},
}},
content: exampleJava,
language: "Java",
// sum-termFrequencies: 40, length-ratio: 1.00
wantScore: 140.80,
// public String apply(String s) {
wantBestLineMatch: 81,
},
{
// Matches only on filename
Expand Down
Loading