aboutsummaryrefslogtreecommitdiffstats
path: root/pkg
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2026-01-21 14:52:28 +0100
committerDmitry Vyukov <dvyukov@google.com>2026-01-22 11:23:54 +0000
commit084b5c918c53c4e2eeb51664f3d403095f59f25d (patch)
tree0b5bffd7ce76ebf64dea016a4caaada43e7c75f9 /pkg
parent7355a8eb9f9228c3c3b5b6874e33968333115b23 (diff)
pkg/codesearch: reduce memory consumption when building index
With all references in the index, it becomes quite big. Merge and dedup the resulting index on the fly. Also intern all strings because there are tons of duplicates. This also removes unnecessary duplicates (effectively ODR violations in the kernel) due to use of BUILD_BUG_ON. The macro produces different function calls in different translation units, so the same function may contain a __compiletime_assert_N1 call in one TU and a __compiletime_assert_N2 call in another. Overall, this reduces resource consumption of index building from: time:296.11s user:16993.71s sys:6661.03s memory:82707MB to: time:194.28s user:16860.01s sys:6647.01s memory: 3243MB 25x reduction in memory consumption.
Diffstat (limited to 'pkg')
-rw-r--r--pkg/clangtool/clangtool.go31
-rw-r--r--pkg/clangtool/tooltest/tooltest.go6
-rw-r--r--pkg/codesearch/database.go59
-rw-r--r--pkg/declextract/entity.go2
4 files changed, 73 insertions, 25 deletions
diff --git a/pkg/clangtool/clangtool.go b/pkg/clangtool/clangtool.go
index 8711b5411..dd7c22f0a 100644
--- a/pkg/clangtool/clangtool.go
+++ b/pkg/clangtool/clangtool.go
@@ -32,7 +32,7 @@ type Config struct {
type OutputDataPtr[T any] interface {
*T
- Merge(*T)
+ Merge(*T, *Verifier)
SetSourceFile(string, func(filename string) string)
Finalize(*Verifier)
}
@@ -73,21 +73,22 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e
}
close(files)
+ v := NewVerifier(cfg.KernelSrc, cfg.KernelObj)
out := OutputPtr(new(Output))
for range cmds {
res := <-results
if res.err != nil {
return nil, res.err
}
- out.Merge(res.out)
+ out.Merge(res.out, v)
}
// Finalize the output (sort, dedup, etc), and let the output verify
// that all source file names, line numbers, etc are valid/present.
// If there are any bogus entries, it's better to detect them early,
// than to crash/error much later when the info is used.
// Some of the source files (generated) may be in the obj dir.
- srcDirs := []string{cfg.KernelSrc, cfg.KernelObj}
- if err := Finalize(out, srcDirs); err != nil {
+ out.Finalize(v)
+ if err := v.Error(); err != nil {
return nil, err
}
if cfg.CacheFile != "" {
@@ -103,24 +104,26 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e
return out, nil
}
-func Finalize[Output any, OutputPtr OutputDataPtr[Output]](out OutputPtr, srcDirs []string) error {
- v := &Verifier{
- srcDirs: srcDirs,
+type Verifier struct {
+ srcDirs []string
+ fileCache map[string]int // file->line count (-1 is cached for missing files)
+ err strings.Builder
+}
+
+func NewVerifier(src ...string) *Verifier {
+ return &Verifier{
+ srcDirs: src,
fileCache: make(map[string]int),
}
- out.Finalize(v)
+}
+
+func (v *Verifier) Error() error {
if v.err.Len() == 0 {
return nil
}
return errors.New(v.err.String())
}
-type Verifier struct {
- srcDirs []string
- fileCache map[string]int // file->line count (-1 is cached for missing files)
- err strings.Builder
-}
-
func (v *Verifier) Filename(file string) {
if _, ok := v.fileCache[file]; ok {
return
diff --git a/pkg/clangtool/tooltest/tooltest.go b/pkg/clangtool/tooltest/tooltest.go
index 11aae2e88..4f0b3bced 100644
--- a/pkg/clangtool/tooltest/tooltest.go
+++ b/pkg/clangtool/tooltest/tooltest.go
@@ -42,14 +42,16 @@ func TestClangTool[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *tes
func LoadOutput[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *testing.T) OutputPtr {
out := OutputPtr(new(Output))
+ v := clangtool.NewVerifier("testdata")
forEachTestFile(t, func(t *testing.T, file string) {
tmp, err := osutil.ReadJSON[OutputPtr](file + ".json")
if err != nil {
t.Fatal(err)
}
- out.Merge(tmp)
+ out.Merge(tmp, v)
})
- if err := clangtool.Finalize(out, []string{"testdata"}); err != nil {
+ out.Finalize(v)
+ if err := v.Error(); err != nil {
t.Fatal(err)
}
return out
diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go
index 93f194793..dbea6632c 100644
--- a/pkg/codesearch/database.go
+++ b/pkg/codesearch/database.go
@@ -4,6 +4,9 @@
package codesearch
import (
+ "fmt"
+ "maps"
+ "slices"
"strings"
"github.com/google/jsonschema-go/jsonschema"
@@ -13,6 +16,10 @@ import (
type Database struct {
Definitions []*Definition `json:"definitions,omitempty"`
+
+ mergeCache map[string]*Definition
+ reverseCache map[*Definition]string
+ stringCache map[string]string
}
type Definition struct {
@@ -52,21 +59,45 @@ var DatabaseFormatHash = func() string {
return hash.String(schema, semanticVersion)
}()
-func (db *Database) Merge(other *Database) {
- db.Definitions = append(db.Definitions, other.Definitions...)
-}
-
-func (db *Database) Finalize(v *clangtool.Verifier) {
- db.Definitions = clangtool.SortAndDedupSlice(db.Definitions)
-
- for _, def := range db.Definitions {
+func (db *Database) Merge(other *Database, v *clangtool.Verifier) {
+ if db.mergeCache == nil {
+ db.mergeCache = make(map[string]*Definition)
+ db.reverseCache = make(map[*Definition]string)
+ db.stringCache = make(map[string]string)
+ }
+ for _, def := range other.Definitions {
+ id := fmt.Sprintf("%v-%v-%v", def.Kind, def.Name, def.Body.File)
+ if _, ok := db.mergeCache[id]; ok {
+ continue
+ }
+ db.mergeCache[id] = def
+ db.reverseCache[def] = id
v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine)
if def.Comment.File != "" {
v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine)
}
+ db.intern(&def.Kind)
+ db.intern(&def.Name)
+ db.intern(&def.Type)
+ db.intern(&def.Body.File)
+ db.intern(&def.Comment.File)
+ for _, ref := range def.Refs {
+ db.intern(&ref.Kind)
+ db.intern(&ref.Name)
+ db.intern(&ref.EntityKind)
+ }
}
}
+func (db *Database) Finalize(v *clangtool.Verifier) {
+ db.Definitions = slices.Collect(maps.Values(db.mergeCache))
+ slices.SortFunc(db.Definitions, func(a, b *Definition) int {
+ return strings.Compare(db.reverseCache[a], db.reverseCache[b])
+ })
+ db.mergeCache = nil
+ db.reverseCache = nil
+}
+
// SetSoureFile attaches the source file to the entities that need it.
// The clang tool could do it, but it looks easier to do it here.
func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
@@ -78,3 +109,15 @@ func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
}
}
}
+
+func (db *Database) intern(str *string) {
+ if *str == "" {
+ return
+ }
+ v, ok := db.stringCache[*str]
+ if !ok {
+ v = strings.Clone(*str)
+ db.stringCache[v] = v
+ }
+ *str = v
+}
diff --git a/pkg/declextract/entity.go b/pkg/declextract/entity.go
index 3b5e13a6d..82bf00446 100644
--- a/pkg/declextract/entity.go
+++ b/pkg/declextract/entity.go
@@ -228,7 +228,7 @@ type EntityGlobalAddr struct {
Name string
}
-func (out *Output) Merge(other *Output) {
+func (out *Output) Merge(other *Output, v *clangtool.Verifier) {
out.Functions = append(out.Functions, other.Functions...)
out.Consts = append(out.Consts, other.Consts...)
out.Enums = append(out.Enums, other.Enums...)