From b437dc7bce01f23414924a825db37561913cbdf4 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 13 Feb 2025 14:04:10 +0100 Subject: [PATCH] scoring: remove IDF from BM25 scoring (#912) We remove IDF from our BM25 scoring, effectively treating it as constant. This is supported by our evaluations which showed that for keyword style queries, IDF can down-weight the score of important keywords too much, leading to a worse ranking. The intuition is that for code search, each keyword is important independently of how frequent it appears in the corpus. Removing IDF allows us to apply BM25 scoring to a wider range of query types. Previously, BM25 was limited to queries with individual terms combined using OR, as IDF was calculated on the fly at query time. Test plan: updated tests --- index/eval.go | 31 ++-------------- index/score.go | 68 ++++++++++++++---------------------- internal/e2e/scoring_test.go | 20 +++++------ 3 files changed, 39 insertions(+), 80 deletions(-) diff --git a/index/eval.go b/index/eval.go index b73d15dc..17007627 100644 --- a/index/eval.go +++ b/index/eval.go @@ -25,6 +25,7 @@ import ( enry_data "github.com/go-enry/go-enry/v2/data" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/internal/tenant" "github.com/sourcegraph/zoekt/query" @@ -189,12 +190,6 @@ func (d *indexData) Search(ctx context.Context, q query.Q, opts *zoekt.SearchOpt docCount := uint32(len(d.fileBranchMasks)) lastDoc := int(-1) - // document frequency per term - df := make(termDocumentFrequency) - - // term frequency per file index - var tfs []termFrequency - nextFileMatch: for { canceled := false @@ -320,14 +315,9 @@ nextFileMatch: fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts) } - var tf map[string]int if opts.UseBM25Scoring { - // For BM25 scoring, the calculation of the score is split in two parts. 
Here we - // calculate the term frequencies for the current document and update the - // document frequencies. Since we don't store document frequencies in the index, - // we have to defer the calculation of the final BM25 score to after the whole - // shard has been processed. - tf = cp.calculateTermFrequency(finalCands, df) + tf := cp.calculateTermFrequency(finalCands) + d.scoreFilesUsingBM25(&fileMatch, nextDoc, tf, opts) } else { // Use the standard, non-experimental scoring method by default d.scoreFile(&fileMatch, nextDoc, mt, known, opts) @@ -348,13 +338,6 @@ nextFileMatch: repoMatchCount += len(fileMatch.LineMatches) repoMatchCount += matchedChunkRanges - if opts.UseBM25Scoring { - // Invariant: tfs[i] belongs to res.Files[i] - tfs = append(tfs, termFrequency{ - doc: nextDoc, - tf: tf, - }) - } res.Files = append(res.Files, fileMatch) res.Stats.MatchCount += len(fileMatch.LineMatches) @@ -362,14 +345,6 @@ nextFileMatch: res.Stats.FileCount++ } - // Calculate BM25 score for all file matches in the shard. We assume that we - // have seen all documents containing any of the terms in the query so that df - // correctly reflects the document frequencies. This is true, for example, if - // all terms in the query are ORed together. 
- if opts.UseBM25Scoring { - d.scoreFilesUsingBM25(res.Files, tfs, df, opts) - } - for _, md := range d.repoMetaData { r := md addRepo(&res, &r) diff --git a/index/score.go b/index/score.go index 84bfab4d..f3608cec 100644 --- a/index/score.go +++ b/index/score.go @@ -217,9 +217,9 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f L := float64(lineLength) / 100.0 score := 0.0 - tfs := p.calculateTermFrequency(ms, termDocumentFrequency{}) + tfs := p.calculateTermFrequency(ms) for _, f := range tfs { - score += ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) + score += tfScore(k, b, L, f) } // Check if any index comes from a symbol match tree, and if so hydrate in symbol information @@ -237,14 +237,16 @@ func (p *contentProvider) scoreLineBM25(ms []*candidateMatch, lineNumber int) (f return score, symbolInfo } -// termDocumentFrequency is a map "term" -> "number of documents that contain the term" -type termDocumentFrequency map[string]int +// tfScore is the term frequency score for BM25. +func tfScore(k float64, b float64, L float64, f int) float64 { + return ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) +} // calculateTermFrequency computes the term frequency for the file match. // Notes: // - Filename matches count more than content matches. This mimics a common text search strategy to 'boost' matches on document titles. // - Symbol matches also count more than content matches, to reward matches on symbol definitions. -func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { +func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch) map[string]int { // Treat each candidate match as a term and compute the frequencies. For now, ignore case sensitivity and // ignore whether the index is a word boundary. 
termFreqs := map[string]int{} @@ -257,9 +259,6 @@ func (p *contentProvider) calculateTermFrequency(cands []*candidateMatch, df ter } } - for term := range termFreqs { - df[term] += 1 - } return termFreqs } @@ -331,19 +330,16 @@ func (d *indexData) scoreFile(fileMatch *zoekt.FileMatch, doc uint32, mt matchTr } } -// termFrequency stores the term frequencies for doc. -type termFrequency struct { - doc uint32 - tf map[string]int -} - // scoreFilesUsingBM25 computes the score according to BM25, the most common scoring algorithm for text search: -// https://en.wikipedia.org/wiki/Okapi_BM25. +// https://en.wikipedia.org/wiki/Okapi_BM25. Note that we treat the inverse document frequency (idf) as constant. This +// is supported by our evaluations which showed that for keyword style queries, idf can down-weight the score of some +// keywords too much, leading to a worse ranking. The intuition is that each keyword is important independently of how +// frequent it appears in the corpus. // // Unlike standard file scoring, this scoring strategy ignores all other signals including document ranks. This keeps -// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also +// things simple for now, since BM25 is not normalized and can be tricky to combine with other scoring signals. It also // ignores the individual LineMatch and ChunkMatch scores, instead calculating a score over all matches in the file. 
-func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *zoekt.SearchOptions) { +func (d *indexData) scoreFilesUsingBM25(fileMatch *zoekt.FileMatch, doc uint32, tf map[string]int, opts *zoekt.SearchOptions) { // Use standard parameter defaults used in Lucene (https://lucene.apache.org/core/10_1_0/core/org/apache/lucene/search/similarities/BM25Similarity.html) k, b := 1.2, 0.75 @@ -353,34 +349,22 @@ func (d *indexData) scoreFilesUsingBM25(fileMatches []zoekt.FileMatch, tfs []ter averageFileLength++ } - for i := range tfs { - score := 0.0 + // Compute the file length ratio. Usually the calculation would be based on terms, but using + // bytes should work fine, as we're just computing a ratio. + fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) - // Compute the file length ratio. Usually the calculation would be based on terms, but using - // bytes should work fine, as we're just computing a ratio. - doc := tfs[i].doc - fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) + L := fileLength / averageFileLength - L := fileLength / averageFileLength - - sumTF := 0 // Just for debugging - for term, f := range tfs[i].tf { - sumTF += f - tfScore := ((k + 1.0) * float64(f)) / (k*(1.0-b+b*L) + float64(f)) - score += idf(df[term], int(d.numDocs())) * tfScore - } + score := 0.0 + sumTF := 0 // Just for debugging + for _, f := range tf { + sumTF += f + score += tfScore(k, b, L, f) + } - fileMatches[i].Score = score + fileMatch.Score = score - if opts.DebugScore { - fileMatches[i].Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) - } + if opts.DebugScore { + fileMatch.Debug = fmt.Sprintf("bm25-score: %.2f <- sum-termFrequencies: %d, length-ratio: %.2f", score, sumTF, L) } } - -// idf computes the inverse document frequency for a term. 
nq is the number of -// documents that contain the term and documentCount is the total number of -// documents in the corpus. -func idf(nq, documentCount int) float64 { - return math.Log(1.0 + ((float64(documentCount) - float64(nq) + 0.5) / (float64(nq) + 0.5))) -} diff --git a/internal/e2e/scoring_test.go b/internal/e2e/scoring_test.go index d80186d9..de3fb91c 100644 --- a/internal/e2e/scoring_test.go +++ b/internal/e2e/scoring_test.go @@ -79,8 +79,8 @@ func TestBM25(t *testing.T) { query: &query.Substring{Pattern: "example"}, content: exampleJava, language: "Java", - // bm25-score: 0.58 <- sum-termFrequencyScore: 14.00, length-ratio: 1.00 - wantScore: 0.58, + // sum-termFrequencyScore: 14.00, length-ratio: 1.00 + wantScore: 2.02, // line 5: private final int exampleField; wantBestLineMatch: 5, }, { @@ -93,8 +93,8 @@ func TestBM25(t *testing.T) { }}, content: exampleJava, language: "Java", - // bm25-score: 1.81 <- sum-termFrequencyScore: 116.00, length-ratio: 1.00 - wantScore: 1.81, + // sum-termFrequencyScore: 116.00, length-ratio: 1.00 + wantScore: 6.30, // line 54: private static B runInnerInterface(InnerInterface fn, A a) { wantBestLineMatch: 54, }, { @@ -106,8 +106,8 @@ func TestBM25(t *testing.T) { }}, content: exampleJava, language: "Java", - // bm25-score: 0.96 <- sum-termFrequencies: 12, length-ratio: 1.00 - wantScore: 0.96, + // sum-termFrequencies: 12, length-ratio: 1.00 + wantScore: 3.33, // line 59: if (System.nanoTime() > System.currentTimeMillis()) { wantBestLineMatch: 59, }, @@ -117,16 +117,16 @@ func TestBM25(t *testing.T) { query: &query.Substring{Pattern: "java"}, content: exampleJava, language: "Java", - // bm25-score: 0.51 <- sum-termFrequencyScore: 5.00, length-ratio: 1.00 - wantScore: 0.51, + // sum-termFrequencyScore: 5.00, length-ratio: 1.00 + wantScore: 1.77, }, { // Matches only on filename, and content is missing fileName: "a/b/c/config.go", query: &query.Substring{Pattern: "config.go"}, language: "Go", - // bm25-score: 0.60 <- 
sum-termFrequencyScore: 5.00, length-ratio: 0.00 - wantScore: 0.60, + // sum-termFrequencies: 5, length-ratio: 0.00 + wantScore: 2.07, }, }