From 084b5c918c53c4e2eeb51664f3d403095f59f25d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 21 Jan 2026 14:52:28 +0100 Subject: pkg/codesearch: reduce memory consumption when building index With all references in the index, it becomes quite big. Merge and dedup the resulting index on the fly. Also intern all strings b/c there are tons of duplicates. This also removes unnecessary duplicates (effectively ODR violations in the kernel) due to use of BUILD_BUG_ON. The macro produces different function calls in different translation units, so the same function may contain __compiletime_assert_N1 call in one TU and __compiletime_assert_N2 in another. Overall this reduces resource consumption of index building from: time:296.11s user:16993.71s sys:6661.03s memory:82707MB to: time:194.28s user:16860.01s sys:6647.01s memory: 3243MB 25x reduction in memory consumption. --- pkg/clangtool/clangtool.go | 31 +++++++++++--------- pkg/clangtool/tooltest/tooltest.go | 6 ++-- pkg/codesearch/database.go | 59 ++++++++++++++++++++++++++++++++------ pkg/declextract/entity.go | 2 +- 4 files changed, 73 insertions(+), 25 deletions(-) (limited to 'pkg') diff --git a/pkg/clangtool/clangtool.go b/pkg/clangtool/clangtool.go index 8711b5411..dd7c22f0a 100644 --- a/pkg/clangtool/clangtool.go +++ b/pkg/clangtool/clangtool.go @@ -32,7 +32,7 @@ type Config struct { type OutputDataPtr[T any] interface { *T - Merge(*T) + Merge(*T, *Verifier) SetSourceFile(string, func(filename string) string) Finalize(*Verifier) } @@ -73,21 +73,22 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e } close(files) + v := NewVerifier(cfg.KernelSrc, cfg.KernelObj) out := OutputPtr(new(Output)) for range cmds { res := <-results if res.err != nil { return nil, res.err } - out.Merge(res.out) + out.Merge(res.out, v) } // Finalize the output (sort, dedup, etc), and let the output verify // that all source file names, line numbers, etc are valid/present. 
// If there are any bogus entries, it's better to detect them early, // than to crash/error much later when the info is used. // Some of the source files (generated) may be in the obj dir. - srcDirs := []string{cfg.KernelSrc, cfg.KernelObj} - if err := Finalize(out, srcDirs); err != nil { + out.Finalize(v) + if err := v.Error(); err != nil { return nil, err } if cfg.CacheFile != "" { @@ -103,24 +104,26 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e return out, nil } -func Finalize[Output any, OutputPtr OutputDataPtr[Output]](out OutputPtr, srcDirs []string) error { - v := &Verifier{ - srcDirs: srcDirs, +type Verifier struct { + srcDirs []string + fileCache map[string]int // file->line count (-1 is cached for missing files) + err strings.Builder +} + +func NewVerifier(src ...string) *Verifier { + return &Verifier{ + srcDirs: src, fileCache: make(map[string]int), } - out.Finalize(v) +} + +func (v *Verifier) Error() error { if v.err.Len() == 0 { return nil } return errors.New(v.err.String()) } -type Verifier struct { - srcDirs []string - fileCache map[string]int // file->line count (-1 is cached for missing files) - err strings.Builder -} - func (v *Verifier) Filename(file string) { if _, ok := v.fileCache[file]; ok { return diff --git a/pkg/clangtool/tooltest/tooltest.go b/pkg/clangtool/tooltest/tooltest.go index 11aae2e88..4f0b3bced 100644 --- a/pkg/clangtool/tooltest/tooltest.go +++ b/pkg/clangtool/tooltest/tooltest.go @@ -42,14 +42,16 @@ func TestClangTool[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *tes func LoadOutput[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *testing.T) OutputPtr { out := OutputPtr(new(Output)) + v := clangtool.NewVerifier("testdata") forEachTestFile(t, func(t *testing.T, file string) { tmp, err := osutil.ReadJSON[OutputPtr](file + ".json") if err != nil { t.Fatal(err) } - out.Merge(tmp) + out.Merge(tmp, v) }) - if err := clangtool.Finalize(out, []string{"testdata"}); err != nil { 
+ out.Finalize(v) + if err := v.Error(); err != nil { t.Fatal(err) } return out diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go index 93f194793..dbea6632c 100644 --- a/pkg/codesearch/database.go +++ b/pkg/codesearch/database.go @@ -4,6 +4,9 @@ package codesearch import ( + "fmt" + "maps" + "slices" "strings" "github.com/google/jsonschema-go/jsonschema" @@ -13,6 +16,10 @@ import ( type Database struct { Definitions []*Definition `json:"definitions,omitempty"` + + mergeCache map[string]*Definition + reverseCache map[*Definition]string + stringCache map[string]string } type Definition struct { @@ -52,21 +59,45 @@ var DatabaseFormatHash = func() string { return hash.String(schema, semanticVersion) }() -func (db *Database) Merge(other *Database) { - db.Definitions = append(db.Definitions, other.Definitions...) -} - -func (db *Database) Finalize(v *clangtool.Verifier) { - db.Definitions = clangtool.SortAndDedupSlice(db.Definitions) - - for _, def := range db.Definitions { +func (db *Database) Merge(other *Database, v *clangtool.Verifier) { + if db.mergeCache == nil { + db.mergeCache = make(map[string]*Definition) + db.reverseCache = make(map[*Definition]string) + db.stringCache = make(map[string]string) + } + for _, def := range other.Definitions { + id := fmt.Sprintf("%v-%v-%v", def.Kind, def.Name, def.Body.File) + if _, ok := db.mergeCache[id]; ok { + continue + } + db.mergeCache[id] = def + db.reverseCache[def] = id v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine) if def.Comment.File != "" { v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine) } + db.intern(&def.Kind) + db.intern(&def.Name) + db.intern(&def.Type) + db.intern(&def.Body.File) + db.intern(&def.Comment.File) + for _, ref := range def.Refs { + db.intern(&ref.Kind) + db.intern(&ref.Name) + db.intern(&ref.EntityKind) + } } } +func (db *Database) Finalize(v *clangtool.Verifier) { + db.Definitions = slices.Collect(maps.Values(db.mergeCache)) + 
slices.SortFunc(db.Definitions, func(a, b *Definition) int { + return strings.Compare(db.reverseCache[a], db.reverseCache[b]) + }) + db.mergeCache = nil + db.reverseCache = nil +} + // SetSoureFile attaches the source file to the entities that need it. // The clang tool could do it, but it looks easier to do it here. func (db *Database) SetSourceFile(file string, updatePath func(string) string) { @@ -78,3 +109,15 @@ func (db *Database) SetSourceFile(file string, updatePath func(string) string) { } } } + +func (db *Database) intern(str *string) { + if *str == "" { + return + } + v, ok := db.stringCache[*str] + if !ok { + v = strings.Clone(*str) + db.stringCache[v] = v + } + *str = v +} diff --git a/pkg/declextract/entity.go b/pkg/declextract/entity.go index 3b5e13a6d..82bf00446 100644 --- a/pkg/declextract/entity.go +++ b/pkg/declextract/entity.go @@ -228,7 +228,7 @@ type EntityGlobalAddr struct { Name string } -func (out *Output) Merge(other *Output) { +func (out *Output) Merge(other *Output, v *clangtool.Verifier) { out.Functions = append(out.Functions, other.Functions...) out.Consts = append(out.Consts, other.Consts...) out.Enums = append(out.Enums, other.Enums...) -- cgit mrf-deployment