aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/codesearch/database.go
diff options
context:
space:
mode:
author Dmitry Vyukov <dvyukov@google.com> 2026-01-21 14:52:28 +0100
committer Dmitry Vyukov <dvyukov@google.com> 2026-01-22 11:23:54 +0000
commit 084b5c918c53c4e2eeb51664f3d403095f59f25d (patch)
tree 0b5bffd7ce76ebf64dea016a4caaada43e7c75f9 /pkg/codesearch/database.go
parent 7355a8eb9f9228c3c3b5b6874e33968333115b23 (diff)
pkg/codesearch: reduce memory consumption when building index
With all references in the index, it becomes quite big. Merge and dedup the resulting index on the fly. Also intern all strings because there are tons of duplicates. This also removes unnecessary duplicates (effectively ODR violations in the kernel) due to the use of BUILD_BUG_ON. The macro produces different function calls in different translation units, so the same function may contain a __compiletime_assert_N1 call in one TU and a __compiletime_assert_N2 call in another. Overall, this reduces resource consumption of index building from: time:296.11s user:16993.71s sys:6661.03s memory:82707MB to: time:194.28s user:16860.01s sys:6647.01s memory: 3243MB. That is a 25x reduction in memory consumption.
Diffstat (limited to 'pkg/codesearch/database.go')
-rw-r--r-- pkg/codesearch/database.go 59
1 files changed, 51 insertions, 8 deletions
diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go
index 93f194793..dbea6632c 100644
--- a/pkg/codesearch/database.go
+++ b/pkg/codesearch/database.go
@@ -4,6 +4,9 @@
package codesearch
import (
+ "fmt"
+ "maps"
+ "slices"
"strings"
"github.com/google/jsonschema-go/jsonschema"
@@ -13,6 +16,10 @@ import (
type Database struct {
Definitions []*Definition `json:"definitions,omitempty"`
+
+ mergeCache map[string]*Definition
+ reverseCache map[*Definition]string
+ stringCache map[string]string
}
type Definition struct {
@@ -52,21 +59,45 @@ var DatabaseFormatHash = func() string {
return hash.String(schema, semanticVersion)
}()
-func (db *Database) Merge(other *Database) {
- db.Definitions = append(db.Definitions, other.Definitions...)
-}
-
-func (db *Database) Finalize(v *clangtool.Verifier) {
- db.Definitions = clangtool.SortAndDedupSlice(db.Definitions)
-
- for _, def := range db.Definitions {
+func (db *Database) Merge(other *Database, v *clangtool.Verifier) {
+ if db.mergeCache == nil {
+ db.mergeCache = make(map[string]*Definition)
+ db.reverseCache = make(map[*Definition]string)
+ db.stringCache = make(map[string]string)
+ }
+ for _, def := range other.Definitions {
+ id := fmt.Sprintf("%v-%v-%v", def.Kind, def.Name, def.Body.File)
+ if _, ok := db.mergeCache[id]; ok {
+ continue
+ }
+ db.mergeCache[id] = def
+ db.reverseCache[def] = id
v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine)
if def.Comment.File != "" {
v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine)
}
+ db.intern(&def.Kind)
+ db.intern(&def.Name)
+ db.intern(&def.Type)
+ db.intern(&def.Body.File)
+ db.intern(&def.Comment.File)
+ for _, ref := range def.Refs {
+ db.intern(&ref.Kind)
+ db.intern(&ref.Name)
+ db.intern(&ref.EntityKind)
+ }
}
}
+func (db *Database) Finalize(v *clangtool.Verifier) {
+ db.Definitions = slices.Collect(maps.Values(db.mergeCache))
+ slices.SortFunc(db.Definitions, func(a, b *Definition) int {
+ return strings.Compare(db.reverseCache[a], db.reverseCache[b])
+ })
+ db.mergeCache = nil
+ db.reverseCache = nil
+}
+
// SetSourceFile attaches the source file to the entities that need it.
// The clang tool could do it, but it looks easier to do it here.
func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
@@ -78,3 +109,15 @@ func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
}
}
}
+
+func (db *Database) intern(str *string) {
+ if *str == "" {
+ return
+ }
+ v, ok := db.stringCache[*str]
+ if !ok {
+ v = strings.Clone(*str)
+ db.stringCache[v] = v
+ }
+ *str = v
+}