From 084b5c918c53c4e2eeb51664f3d403095f59f25d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 21 Jan 2026 14:52:28 +0100 Subject: pkg/codesearch: reduce memory consumption when building index With all references in the index, it becomes quite big. Merge and dedup the resulting index on the fly. Also intern all strings because there are tons of duplicates. This also removes unnecessary duplicates (effectively ODR violations in the kernel) due to use of BUILD_BUG_ON. The macro produces different function calls in different translation units, so the same function may contain a __compiletime_assert_N1 call in one TU and __compiletime_assert_N2 in another. Overall, this reduces resource consumption of index building from: time:296.11s user:16993.71s sys:6661.03s memory:82707MB to: time:194.28s user:16860.01s sys:6647.01s memory: 3243MB 25x reduction in memory consumption. --- pkg/codesearch/database.go | 59 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'pkg/codesearch/database.go') diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go index 93f194793..dbea6632c 100644 --- a/pkg/codesearch/database.go +++ b/pkg/codesearch/database.go @@ -4,6 +4,9 @@ package codesearch import ( + "fmt" + "maps" + "slices" "strings" "github.com/google/jsonschema-go/jsonschema" @@ -13,6 +16,10 @@ import ( type Database struct { Definitions []*Definition `json:"definitions,omitempty"` + + mergeCache map[string]*Definition + reverseCache map[*Definition]string + stringCache map[string]string } type Definition struct { @@ -52,21 +59,45 @@ var DatabaseFormatHash = func() string { return hash.String(schema, semanticVersion) }() -func (db *Database) Merge(other *Database) { - db.Definitions = append(db.Definitions, other.Definitions...) 
-} - -func (db *Database) Finalize(v *clangtool.Verifier) { - db.Definitions = clangtool.SortAndDedupSlice(db.Definitions) - - for _, def := range db.Definitions { +func (db *Database) Merge(other *Database, v *clangtool.Verifier) { + if db.mergeCache == nil { + db.mergeCache = make(map[string]*Definition) + db.reverseCache = make(map[*Definition]string) + db.stringCache = make(map[string]string) + } + for _, def := range other.Definitions { + id := fmt.Sprintf("%v-%v-%v", def.Kind, def.Name, def.Body.File) + if _, ok := db.mergeCache[id]; ok { + continue + } + db.mergeCache[id] = def + db.reverseCache[def] = id v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine) if def.Comment.File != "" { v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine) } + db.intern(&def.Kind) + db.intern(&def.Name) + db.intern(&def.Type) + db.intern(&def.Body.File) + db.intern(&def.Comment.File) + for _, ref := range def.Refs { + db.intern(&ref.Kind) + db.intern(&ref.Name) + db.intern(&ref.EntityKind) + } } } +func (db *Database) Finalize(v *clangtool.Verifier) { + db.Definitions = slices.Collect(maps.Values(db.mergeCache)) + slices.SortFunc(db.Definitions, func(a, b *Definition) int { + return strings.Compare(db.reverseCache[a], db.reverseCache[b]) + }) + db.mergeCache = nil + db.reverseCache = nil +} + // SetSoureFile attaches the source file to the entities that need it. // The clang tool could do it, but it looks easier to do it here. func (db *Database) SetSourceFile(file string, updatePath func(string) string) { @@ -78,3 +109,15 @@ func (db *Database) SetSourceFile(file string, updatePath func(string) string) { } } } + +func (db *Database) intern(str *string) { + if *str == "" { + return + } + v, ok := db.stringCache[*str] + if !ok { + v = strings.Clone(*str) + db.stringCache[v] = v + } + *str = v +} -- cgit mrf-deployment