diff --git a/index/builder.go b/index/builder.go
index 34d02f31..cc06603e 100644
--- a/index/builder.go
+++ b/index/builder.go
@@ -609,7 +609,6 @@ func (b *Builder) Add(doc Document) error {
 		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
 	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
 		doc.SkipReason = err.Error()
-		doc.Language = "binary"
 	}
 
 	b.todo = append(b.todo, &doc)
diff --git a/index/shard_builder.go b/index/shard_builder.go
index 15e85780..8a3057b6 100644
--- a/index/shard_builder.go
+++ b/index/shard_builder.go
@@ -396,7 +396,16 @@ func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
 }
 
 func DetermineLanguageIfUnknown(doc *Document) {
-	if doc.Language == "" {
+	if doc.Language != "" {
+		return
+	}
+
+	if doc.SkipReason != "" {
+		// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
+		// In this case, we just guess the language based on file name to avoid examining the contents.
+		// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
+		doc.Language = languages.GetLanguage(doc.Name, nil)
+	} else {
 		doc.Language = languages.GetLanguage(doc.Name, doc.Content)
 	}
 }
@@ -407,16 +416,12 @@ func (b *ShardBuilder) Add(doc Document) error {
 
 	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
 		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
-		doc.Language = "binary"
 	}
 
 	if doc.SkipReason != "" {
 		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
 		doc.Symbols = nil
 		doc.SymbolsMetaData = nil
-		if doc.Language == "" {
-			doc.Language = "skipped"
-		}
 	}
 
 	DetermineLanguageIfUnknown(&doc)
diff --git a/index/shard_builder_test.go b/index/shard_builder_test.go
index e66b3cf8..662ac8cd 100644
--- a/index/shard_builder_test.go
+++ b/index/shard_builder_test.go
@@ -47,3 +47,49 @@ func TestShardName(t *testing.T) {
 		})
 	}
 }
+
+func TestDetermineLanguageIfUnknown(t *testing.T) {
+	tests := []struct {
+		name        string
+		doc         Document
+		wantLang    string
+		skipContent bool
+	}{
+		{
+			name: "already has language",
+			doc: Document{
+				Name:     "test.java",
+				Language: "Go",
+				Content:  []byte("package main"),
+			},
+			wantLang: "Go",
+		},
+		{
+			name: "skipped file",
+			doc: Document{
+				Name:       "large.js",
+				SkipReason: "too large",
+				Content:    []byte(notIndexedMarker + "too large"),
+			},
+			wantLang: "JavaScript",
+		},
+		{
+			name: "skipped file with unknown extension",
+			doc: Document{
+				Name:       "deadb33f",
+				SkipReason: "binary",
+				Content:    []byte(notIndexedMarker + "binary"),
+			},
+			wantLang: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			DetermineLanguageIfUnknown(&tt.doc)
+			if tt.doc.Language != tt.wantLang {
+				t.Errorf("DetermineLanguageIfUnknown() got language = %v, want %v", tt.doc.Language, tt.wantLang)
+			}
+		})
+	}
+}