Skip to content

Commit

Permalink
scoring: remove IDF from BM25 scoring (#912)
Browse files Browse the repository at this point in the history
We remove IDF from our BM25 scoring, effectively treating it as constant.

This is supported by our evaluations which showed that for keyword style queries, IDF can down-weight the score of important keywords too much, leading to a worse ranking. The intuition is that for code search, each keyword is important independently of how frequent it appears in the corpus.

Removing IDF allows us to apply BM25 scoring to a wider range of query types. Previously, BM25 was limited to queries with individual terms combined using OR, as IDF was calculated on the fly at query time.

Test plan:
updated tests
  • Loading branch information
stefanhengl authored Feb 13, 2025
1 parent 145e161 commit b437dc7
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 80 deletions.
31 changes: 3 additions & 28 deletions index/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

enry_data "github.com/go-enry/go-enry/v2/data"
"github.com/grafana/regexp"

"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/internal/tenant"
"github.com/sourcegraph/zoekt/query"
Expand Down Expand Up @@ -189,12 +190,6 @@ func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOpt
docCount := uint32(len(d.fileBranchMasks))
lastDoc := int(-1)

// document frequency per term
df := make(termDocumentFrequency)

// term frequency per file index
var tfs []termFrequency

nextFileMatch:
for {
canceled := false
Expand Down Expand Up @@ -320,14 +315,9 @@ nextFileMatch:
fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts)
}

var tf map[string]int
if opts.UseBM25Scoring {
// For BM25 scoring, the calculation of the score is split in two parts. Here we
// calculate the term frequencies for the current document and update the
// document frequencies. Since we don't store document frequencies in the index,
// we have to defer the calculation of the final BM25 score to after the whole
// shard has been processed.
tf = cp.calculateTermFrequency(finalCands, df)
tf := cp.calculateTermFrequency(finalCands)
d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts)
} else {
// Use the standard, non-experimental scoring method by default
d.scoreFile(&fileMatch, nextDoc, mt, known, opts)
Expand All @@ -348,28 +338,13 @@ nextFileMatch:
repoMatchCount += len(fileMatch.LineMatches)
repoMatchCount += matchedChunkRanges

if opts.UseBM25Scoring {
// Invariant: tfs[i] belongs to res.Files[i]
tfs = append(tfs, termFrequency{
doc: nextDoc,
tf: tf,
})
}
res.Files = append(res.Files, fileMatch)

res.Stats.MatchCount += len(fileMatch.LineMatches)
res.Stats.MatchCount += matchedChunkRanges
res.Stats.FileCount++
}

// Calculate BM25 score for all file matches in the shard. We assume that we
// have seen all documents containing any of the terms in the query so that df
// correctly reflects the document frequencies. This is true, for example, if
// all terms in the query are ORed together.
if opts.UseBM25Scoring {
d.scoreFilesUsingBM25(res.Files, tfs, df, opts)
}

for _, md := range d.repoMetaData {
r := md
addRepo(&res, &r)
Expand Down
68 changes: 26 additions & 42 deletions index/score.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,9 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
L := float64(lineLength) / 100.0

score := 0.0
tfs := p.calculateTermFrequency(ms, termDocumentFrequency{})
tfs := p.calculateTermFrequency(ms)
for _, f := range tfs {
score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
score += tfScore(k, b, L, f)
}

// Check if any index comes from a symbol match tree, and if so hydrate in symbol information
Expand All @@ -237,14 +237,16 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f
return score, symbolInfo
}

// termDocumentFrequency is a map "term" -> "number of documents that contain the term"
type termDocumentFrequency map[string]int
// tfScore is the term frequency score for BM25.
func tfScore(k float64, b float64, L float64, f int) float64 {
return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
}

// calculateTermFrequency computes the term frequency for the file match.
// Notes:
// - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles.
// - Symbol matches also count more than content matches, to reward matches on symbol definitions.
func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int {
func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int {
// Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and
// ignore whether the index is a word boundary.
termFreqs := map[string]int{}
Expand All @@ -257,9 +259,6 @@ func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df ter
}
}

for term := range termFreqs {
df[term] += 1
}
return termFreqs
}

Expand Down Expand Up @@ -331,19 +330,16 @@ func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTr
}
}

// termFrequency stores the term frequencies for doc.
type termFrequency struct {
doc uint32
tf map[string]int
}

// scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search:
// https://en.wikipedia.org/wiki/Okapi_BM25.
// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This
// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some
// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how
// frequent it appears in the corpus.
//
// Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps
// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also
// ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file.
func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) {
func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) {
// Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html)
k, b := 1.2, 0.75

Expand All @@ -353,34 +349,22 @@ func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []ter
averageFileLength++
}

for i := range tfs {
score := 0.0
// Compute the file length ratio. Usually the calculation would be based on terms, but using
// bytes should work fine, as we're just computing a ratio.
fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])

// Compute the file length ratio. Usually the calculation would be based on terms, but using
// bytes should work fine, as we're just computing a ratio.
doc := tfs[i].doc
fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc])
L := fileLength / averageFileLength

L := fileLength / averageFileLength

sumTF := 0 // Just for debugging
for term, f := range tfs[i].tf {
sumTF += f
tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f))
score += idf(df[term], int(d.numDocs())) * tfScore
}
score := 0.0
sumTF := 0 // Just for debugging
for _, f := range tf {
sumTF += f
score += tfScore(k, b, L, f)
}

fileMatches[i].Score = score
fileMatch.Score = score

if opts.DebugScore {
fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
}
if opts.DebugScore {
fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L)
}
}

// idf computes the inverse document frequency for a term. nq is the number of
// documents that contain the term and documentCount is the total number of
// documents in the corpus.
func idf(nq, documentCount int) float64 {
return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5)))
}
20 changes: 10 additions & 10 deletions internal/e2e/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "example"},
content: exampleJava,
language: "Java",
// bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00
wantScore: 0.58,
// sum-termFrequencyScore: 14.00, length-ratio: 1.00
wantScore: 2.02,
// line 5: private final int exampleField;
wantBestLineMatch: 5,
}, {
Expand All @@ -93,8 +93,8 @@ func TestBM25(t *testing.T) {
}},
content: exampleJava,
language: "Java",
// bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00
wantScore: 1.81,
// sum-termFrequencyScore: 116.00, length-ratio: 1.00
wantScore: 6.30,
// line 54: private static <A, B> B runInnerInterface(InnerInterface<A, B> fn, A a) {
wantBestLineMatch: 54,
}, {
Expand All @@ -106,8 +106,8 @@ func TestBM25(t *testing.T) {
}},
content: exampleJava,
language: "Java",
// bm25-score: 0.96 <- sum-termFrequencies: 12, length-ratio: 1.00
wantScore: 0.96,
// sum-termFrequencies: 12, length-ratio: 1.00
wantScore: 3.33,
// line 59: if (System.nanoTime() > System.currentTimeMillis()) {
wantBestLineMatch: 59,
},
Expand All @@ -117,16 +117,16 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "java"},
content: exampleJava,
language: "Java",
// bm25-score: 0.51 <- sum-termFrequencyScore: 5.00, length-ratio: 1.00
wantScore: 0.51,
// sum-termFrequencyScore: 5.00, length-ratio: 1.00
wantScore: 1.77,
},
{
// Matches only on filename, and content is missing
fileName: "a/b/c/config.go",
query: &query.Substring{Pattern: "config.go"},
language: "Go",
// bm25-score: 0.60 <- sum-termFrequencyScore: 5.00, length-ratio: 0.00
wantScore: 0.60,
// sum-termFrequencyScore: 5.00, length-ratio: 0.00
wantScore: 2.07,
},
}

Expand Down

0 comments on commit b437dc7

Please sign in to comment.