aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/codesearch
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2025-11-17 11:17:23 +0100
committerDmitry Vyukov <dvyukov@google.com>2025-11-20 10:10:05 +0000
commit280ea308c321115445df610f1a75b05bbadca5f3 (patch)
treec195c76723c4a08986d74edbfc9e15a4f07fa6c1 /pkg/codesearch
parent94d1e3f8b1838e8a04074464a957e979a5c5e36b (diff)
pkg/codesearch: add skeleton for code searching tool
Add a clang tool that is used for code indexing (tools/clang/codesearch/). It follows conventions and build procedure of the declextract tool. Add pkg/codesearch package that aggregates the info exposed by the clang tools, and allows doing simple queries: - show source code of an entity (function, struct, etc) - show entity comment - show all entities defined in a source file Add tools/syz-codesearch wrapper tool that allows to create index for a kernel build, and then run code queries on it.
Diffstat (limited to 'pkg/codesearch')
-rw-r--r--pkg/codesearch/codesearch.go190
-rw-r--r--pkg/codesearch/codesearch_test.go61
-rw-r--r--pkg/codesearch/database.go56
-rw-r--r--pkg/codesearch/testdata/query-def-comment-close3
-rw-r--r--pkg/codesearch/testdata/query-def-comment-header3
-rw-r--r--pkg/codesearch/testdata/query-def-comment-open7
-rw-r--r--pkg/codesearch/testdata/query-def-source-close8
-rw-r--r--pkg/codesearch/testdata/query-def-source-header8
-rw-r--r--pkg/codesearch/testdata/query-def-source-missing3
-rw-r--r--pkg/codesearch/testdata/query-def-source-open11
-rw-r--r--pkg/codesearch/testdata/query-def-source-same-name-non-static8
-rw-r--r--pkg/codesearch/testdata/query-def-source-same-name-static8
-rw-r--r--pkg/codesearch/testdata/query-file-index-missing3
-rw-r--r--pkg/codesearch/testdata/query-file-index-source7
-rw-r--r--pkg/codesearch/testdata/source0.c22
-rw-r--r--pkg/codesearch/testdata/source0.c.json41
-rw-r--r--pkg/codesearch/testdata/source0.h10
-rw-r--r--pkg/codesearch/testdata/source1.c7
-rw-r--r--pkg/codesearch/testdata/source1.c.json20
-rw-r--r--pkg/codesearch/testdata/source2.c7
-rw-r--r--pkg/codesearch/testdata/source2.c.json19
21 files changed, 502 insertions, 0 deletions
diff --git a/pkg/codesearch/codesearch.go b/pkg/codesearch/codesearch.go
new file mode 100644
index 000000000..c1e99a174
--- /dev/null
+++ b/pkg/codesearch/codesearch.go
@@ -0,0 +1,190 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+package codesearch
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "github.com/google/syzkaller/pkg/osutil"
+)
+
+// Index answers code search queries over a Database of definitions.
+type Index struct {
+ // db holds the definitions that queries are matched against.
+ db *Database
+ // srcDirs are the directories tried, in order, when resolving a file path
+ // recorded in the database to a file on disk.
+ srcDirs []string
+}
+
+// Command describes one named query that Index.Command can run.
+type Command struct {
+ // Name is the command name that Index.Command matches against.
+ Name string
+ // NArgs is the exact number of arguments the command requires.
+ NArgs int
+ // Func runs the command with the given arguments and returns its textual output.
+ Func func(*Index, []string) (string, error)
+}
+
+// Commands is the table of supported queries; it drives both the unit tests
+// and the syz-codesearch tool. Every handler renders its answer as plain text
+// and returns notFound when the query matches nothing.
+var Commands = []Command{
+	{
+		Name:  "file-index",
+		NArgs: 1,
+		// args: file. Lists the entities defined in the file.
+		Func: func(index *Index, args []string) (string, error) {
+			file := args[0]
+			found, entities, err := index.FileIndex(file)
+			if err != nil {
+				return notFound, err
+			}
+			if !found {
+				return notFound, nil
+			}
+			var out strings.Builder
+			fmt.Fprintf(&out, "file %v defines the following entities:\n\n", file)
+			for i := range entities {
+				fmt.Fprintf(&out, "%v %v\n", entities[i].Kind, entities[i].Name)
+			}
+			return out.String(), nil
+		},
+	},
+	{
+		Name:  "def-comment",
+		NArgs: 2,
+		// args: context file, entity name. Shows the comment of the entity.
+		Func: func(index *Index, args []string) (string, error) {
+			contextFile, name := args[0], args[1]
+			info, err := index.DefinitionComment(contextFile, name)
+			switch {
+			case err != nil, info == nil:
+				return notFound, err
+			case info.Body == "":
+				return fmt.Sprintf("%v %v is defined in %v and is not commented\n",
+					info.Kind, name, info.File), nil
+			}
+			return fmt.Sprintf("%v %v is defined in %v and commented as:\n\n%v",
+				info.Kind, name, info.File, info.Body), nil
+		},
+	},
+	{
+		Name:  "def-source",
+		NArgs: 3,
+		// args: context file, entity name, "yes" to prefix lines with numbers.
+		Func: func(index *Index, args []string) (string, error) {
+			contextFile, name := args[0], args[1]
+			withLines := args[2] == "yes"
+			info, err := index.DefinitionSource(contextFile, name, withLines)
+			if err != nil {
+				return notFound, err
+			}
+			if info == nil {
+				return notFound, nil
+			}
+			return fmt.Sprintf("%v %v is defined in %v:\n\n%v", info.Kind, name, info.File, info.Body), nil
+		},
+	},
+}
+
+// notFound is the output of a command whose query matched nothing.
+const notFound = "not found\n"
+
+// NewIndex reads the definition database from databaseFile and returns an Index
+// that looks up source files in srcDirs. The error from reading the database
+// is returned unchanged.
+func NewIndex(databaseFile string, srcDirs []string) (*Index, error) {
+	db, err := osutil.ReadJSON[*Database](databaseFile)
+	if err != nil {
+		return nil, err
+	}
+	index := &Index{db: db, srcDirs: srcDirs}
+	return index, nil
+}
+
+// Command runs the command named cmd from the Commands table with args.
+// It fails if cmd is unknown or if the number of args differs from what
+// the command requires; otherwise it returns whatever the handler returns.
+func (index *Index) Command(cmd string, args []string) (string, error) {
+	for i := range Commands {
+		meta := &Commands[i]
+		if meta.Name != cmd {
+			continue
+		}
+		if got := len(args); got != meta.NArgs {
+			return "", fmt.Errorf("codesearch command %v requires %v args, but %v provided",
+				cmd, meta.NArgs, got)
+		}
+		return meta.Func(index, args)
+	}
+	return "", fmt.Errorf("unknown codesearch command %v", cmd)
+}
+
+// Entity is a short description of a definition, as returned by FileIndex.
+type Entity struct {
+ // Kind is the kind of the definition (the test data uses "function").
+ Kind string
+ // Name is the name of the definition.
+ Name string
+}
+
+// FileIndex lists the entities whose body is located in file.
+// The boolean result reports whether at least one entity was found;
+// the error result is currently always nil.
+func (index *Index) FileIndex(file string) (bool, []Entity, error) {
+	var found []Entity
+	for _, def := range index.db.Definitions {
+		if def.Body.File != file {
+			continue
+		}
+		found = append(found, Entity{Kind: def.Kind, Name: def.Name})
+	}
+	return len(found) > 0, found, nil
+}
+
+// EntityInfo is the result of a definition query.
+type EntityInfo struct {
+ // File is the file that contains the body of the definition
+ // (also for comment queries).
+ File string
+ // Kind is the kind of the definition (the test data uses "function").
+ Kind string
+ // Body is the requested text: the source of the definition or its comment.
+ // It is empty when the requested line range has no file recorded,
+ // e.g. for a definition without a comment.
+ Body string
+}
+
+// DefinitionComment returns the comment of the definition called name,
+// looked up from the point of view of contextFile.
+// It returns nil, nil if no matching definition exists. The returned Body
+// is empty if the definition has no comment recorded.
+func (index *Index) DefinitionComment(contextFile, name string) (*EntityInfo, error) {
+ return index.definitionSource(contextFile, name, true, false)
+}
+
+// DefinitionSource returns the source code of the definition called name,
+// looked up from the point of view of contextFile.
+// If includeLines is set, every source line is prefixed with a line number.
+// It returns nil, nil if no matching definition exists.
+func (index *Index) DefinitionSource(contextFile, name string, includeLines bool) (*EntityInfo, error) {
+ return index.definitionSource(contextFile, name, false, includeLines)
+}
+
+// definitionSource is the shared implementation of DefinitionComment and DefinitionSource.
+// It locates the definition of name for contextFile and formats either its comment
+// (comment is true) or its body. A missing definition yields nil, nil.
+func (index *Index) definitionSource(contextFile, name string, comment, includeLines bool) (*EntityInfo, error) {
+	def := index.findDefinition(contextFile, name)
+	if def == nil {
+		return nil, nil
+	}
+	target := &def.Body
+	if comment {
+		target = &def.Comment
+	}
+	text, err := index.formatSource(*target, includeLines)
+	if err != nil {
+		return nil, err
+	}
+	// File always refers to the body location, even when the comment was requested.
+	info := &EntityInfo{File: def.Body.File, Kind: def.Kind, Body: text}
+	return info, nil
+}
+
+// findDefinition picks the definition of name that applies to contextFile.
+// A definition located in contextFile itself wins immediately. Otherwise the
+// last non-static definition with that name is used, and static definitions
+// from other files are never returned. The result is nil if nothing matches.
+func (index *Index) findDefinition(contextFile, name string) *Definition {
+	var fallback *Definition
+	for _, candidate := range index.db.Definitions {
+		switch {
+		case candidate.Name != name:
+			// Different entity, not a candidate.
+		case candidate.Body.File == contextFile:
+			return candidate
+		case !candidate.IsStatic:
+			fallback = candidate
+		}
+	}
+	return fallback
+}
+
+// formatSource returns the text of the given line range. The file is searched
+// for in the source directories in order and the first existing one is used.
+// A range without a file produces an empty string; a file that is not present
+// in any source directory is an error.
+func (index *Index) formatSource(lines LineRange, includeLines bool) (string, error) {
+	if lines.File == "" {
+		return "", nil
+	}
+	for _, dir := range index.srcDirs {
+		if path := filepath.Join(dir, lines.File); osutil.IsExist(path) {
+			return formatSourceFile(path, lines.StartLine, lines.EndLine, includeLines)
+		}
+	}
+	return "", fmt.Errorf("codesearch: can't find %q file in any of %v", lines.File, index.srcDirs)
+}
+
+// formatSourceFile returns lines start through end (1-based, inclusive) of file,
+// each terminated with a newline. If includeLines is set, every line is prefixed
+// with a right-aligned number and a tab.
+//
+// It returns an error if the file can't be read or if the range does not lie
+// within the file.
+//
+// NOTE(review): the printed number is the zero-based index of the line, i.e. one
+// less than the line number an editor shows. The golden test files encode this,
+// so it is left as is here; confirm whether it is intended.
+func formatSourceFile(file string, start, end int, includeLines bool) (string, error) {
+	data, err := os.ReadFile(file)
+	if err != nil {
+		return "", err
+	}
+	lines := bytes.Split(data, []byte{'\n'})
+	// Convert the 1-based inclusive range into zero-based slice indices.
+	start--
+	end--
+	// end is used as an index below, so it must be strictly less than len(lines):
+	// accepting end == len(lines) would make the loop index out of range and panic.
+	if start < 0 || end < start || end >= len(lines) {
+		return "", fmt.Errorf("codesearch: bad line range [%v-%v] for file %v with %v lines",
+			start, end, file, len(lines))
+	}
+	b := new(strings.Builder)
+	for line := start; line <= end; line++ {
+		if includeLines {
+			fmt.Fprintf(b, "%4v:\t%s\n", line, lines[line])
+		} else {
+			fmt.Fprintf(b, "%s\n", lines[line])
+		}
+	}
+	return b.String(), nil
+}
diff --git a/pkg/codesearch/codesearch_test.go b/pkg/codesearch/codesearch_test.go
new file mode 100644
index 000000000..7af509294
--- /dev/null
+++ b/pkg/codesearch/codesearch_test.go
@@ -0,0 +1,61 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+package codesearch
+
+import (
+ "bytes"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+
+ "github.com/google/syzkaller/pkg/clangtool/tooltest"
+ "github.com/google/syzkaller/pkg/osutil"
+)
+
+// TestClangTool delegates to the shared tooltest.TestClangTool helper,
+// instantiated for the Database output type of this package.
+func TestClangTool(t *testing.T) {
+ tooltest.TestClangTool[Database](t)
+}
+
+// TestCommands runs every testdata/query* file as a subtest (see testCommand)
+// against an index built from the test database, and then checks that each
+// entry of Commands was exercised by at least one query file.
+func TestCommands(t *testing.T) {
+	db := tooltest.LoadOutput[Database](t)
+	index := &Index{db, []string{"testdata"}}
+	files, err := filepath.Glob(filepath.Join(osutil.Abs("testdata"), "query*"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(files) == 0 {
+		t.Fatal("found no query files")
+	}
+	// covered records the names of the commands used by the query files.
+	covered := make(map[string]bool)
+	for _, file := range files {
+		t.Run(filepath.Base(file), func(t *testing.T) {
+			testCommand(t, index, covered, file)
+		})
+	}
+	for _, cmd := range Commands {
+		if !covered[cmd.Name] {
+			t.Errorf("command %v is not covered, add at least one test", cmd.Name)
+		}
+	}
+}
+
+// testCommand executes the query stored in the first line of file (command name
+// followed by its arguments) and compares the query line, an empty line and the
+// command output against the golden contents of the same file.
+// On completion the command name is recorded in covered.
+func testCommand(t *testing.T, index *Index, covered map[string]bool, file string) {
+	data, err := os.ReadFile(file)
+	if err != nil {
+		t.Fatal(err)
+	}
+	firstLine, _, _ := bytes.Cut(data, []byte{'\n'})
+	fields := strings.Fields(string(firstLine))
+	if len(fields) == 0 {
+		t.Fatal("no command found")
+	}
+	cmd, cmdArgs := fields[0], fields[1:]
+	output, err := index.Command(cmd, cmdArgs)
+	if err != nil {
+		t.Fatal(err)
+	}
+	header := strings.Join(fields, " ") + "\n\n"
+	got := []byte(header + output)
+	tooltest.CompareGoldenData(t, file, got)
+	covered[cmd] = true
+}
diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go
new file mode 100644
index 000000000..4757935e9
--- /dev/null
+++ b/pkg/codesearch/database.go
@@ -0,0 +1,56 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+package codesearch
+
+import (
+ "strings"
+
+ "github.com/google/syzkaller/pkg/clangtool"
+)
+
+// Database is the code index: the list of definitions that queries run against.
+// It is stored as JSON.
+type Database struct {
+ // Definitions lists all known definitions.
+ Definitions []*Definition `json:"definitions,omitempty"`
+}
+
+// Definition describes a single definition of an entity in the indexed sources.
+type Definition struct {
+ // Kind is the kind of the entity (the test data uses "function").
+ Kind string `json:"kind,omitempty"`
+ // Name is the name of the entity.
+ Name string `json:"name,omitempty"`
+ // Type is the type of the entity as text (the test data has e.g. "int ()").
+ Type string `json:"type,omitempty"`
+ // IsStatic marks static definitions; lookups from other files skip them.
+ IsStatic bool `json:"is_static,omitempty"`
+ // Body is the location of the definition source.
+ Body LineRange `json:"body,omitempty"`
+ // Comment is the location of the comment attached to the definition.
+ // Its File is empty if there is no comment.
+ Comment LineRange `json:"comment,omitempty"`
+}
+
+// LineRange identifies a range of lines in a source file.
+type LineRange struct {
+ // File is the path of the file; an empty File means there is no range.
+ File string `json:"file,omitempty"`
+ // StartLine is the first line of the range, 1-based.
+ StartLine int `json:"start_line,omitempty"`
+ // EndLine is the last line of the range, 1-based and inclusive.
+ EndLine int `json:"end_line,omitempty"`
+}
+
+// Merge appends all definitions of other to db.
+// Duplicates are not removed here; Finalize sorts and deduplicates the list.
+func (db *Database) Merge(other *Database) {
+ db.Definitions = append(db.Definitions, other.Definitions...)
+}
+
+// Finalize sorts and deduplicates the definitions, and then passes the body
+// line range of every definition, and its comment line range if it has one,
+// to the verifier v.
+func (db *Database) Finalize(v *clangtool.Verifier) {
+	db.Definitions = clangtool.SortAndDedupSlice(db.Definitions)
+
+	for _, def := range db.Definitions {
+		body, comment := def.Body, def.Comment
+		v.LineRange(body.File, body.StartLine, body.EndLine)
+		if comment.File == "" {
+			continue
+		}
+		v.LineRange(comment.File, comment.StartLine, comment.EndLine)
+	}
+}
+
+// SetSourceFile attaches the source file to the entities that need it.
+// The clang tool could do it, but it looks easier to do it here.
+//
+// The body and comment file paths of every definition are rewritten with updatePath.
+// A definition whose body is located in a .c file other than file loses its static flag.
+func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
+ for _, def := range db.Definitions {
+ def.Body.File = updatePath(def.Body.File)
+ def.Comment.File = updatePath(def.Comment.File)
+ // NOTE(review): presumably such a definition comes from a .c file included
+ // into file, so it must stay findable from file -- confirm.
+ if strings.HasSuffix(def.Body.File, ".c") && def.Body.File != file {
+ def.IsStatic = false
+ }
+ }
+}
diff --git a/pkg/codesearch/testdata/query-def-comment-close b/pkg/codesearch/testdata/query-def-comment-close
new file mode 100644
index 000000000..df6c1c2af
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-comment-close
@@ -0,0 +1,3 @@
+def-comment source0.c close
+
+function close is defined in source0.c and is not commented
diff --git a/pkg/codesearch/testdata/query-def-comment-header b/pkg/codesearch/testdata/query-def-comment-header
new file mode 100644
index 000000000..a940938b8
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-comment-header
@@ -0,0 +1,3 @@
+def-comment source0.c function_with_comment_in_header
+
+function function_with_comment_in_header is defined in source0.c and is not commented
diff --git a/pkg/codesearch/testdata/query-def-comment-open b/pkg/codesearch/testdata/query-def-comment-open
new file mode 100644
index 000000000..64bd21812
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-comment-open
@@ -0,0 +1,7 @@
+def-comment source0.c open
+
+function open is defined in source0.c and commented as:
+
+/*
+ * Comment about open.
+ */
diff --git a/pkg/codesearch/testdata/query-def-source-close b/pkg/codesearch/testdata/query-def-source-close
new file mode 100644
index 000000000..2a9dcefad
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-close
@@ -0,0 +1,8 @@
+def-source source0.c close no
+
+function close is defined in source0.c:
+
+int close()
+{
+ return 0;
+}
diff --git a/pkg/codesearch/testdata/query-def-source-header b/pkg/codesearch/testdata/query-def-source-header
new file mode 100644
index 000000000..fd3ba300b
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-header
@@ -0,0 +1,8 @@
+def-source source0.c function_with_comment_in_header yes
+
+function function_with_comment_in_header is defined in source0.c:
+
+ 18: void function_with_comment_in_header()
+ 19: {
+ 20: same_name_in_several_files();
+ 21: }
diff --git a/pkg/codesearch/testdata/query-def-source-missing b/pkg/codesearch/testdata/query-def-source-missing
new file mode 100644
index 000000000..0b60003c7
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-missing
@@ -0,0 +1,3 @@
+def-source source0.c some_non_existent_function no
+
+not found
diff --git a/pkg/codesearch/testdata/query-def-source-open b/pkg/codesearch/testdata/query-def-source-open
new file mode 100644
index 000000000..bdcec72fd
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-open
@@ -0,0 +1,11 @@
+def-source source0.c open yes
+
+function open is defined in source0.c:
+
+ 5: /*
+ 6: * Comment about open.
+ 7: */
+ 8: int open()
+ 9: {
+ 10: return 0;
+ 11: }
diff --git a/pkg/codesearch/testdata/query-def-source-same-name-non-static b/pkg/codesearch/testdata/query-def-source-same-name-non-static
new file mode 100644
index 000000000..ae09d3313
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-same-name-non-static
@@ -0,0 +1,8 @@
+def-source source0.c same_name_in_several_files no
+
+function same_name_in_several_files is defined in source2.c:
+
+void same_name_in_several_files()
+{
+ // This is non-static version in in source2.c.
+}
diff --git a/pkg/codesearch/testdata/query-def-source-same-name-static b/pkg/codesearch/testdata/query-def-source-same-name-static
new file mode 100644
index 000000000..3d87c010c
--- /dev/null
+++ b/pkg/codesearch/testdata/query-def-source-same-name-static
@@ -0,0 +1,8 @@
+def-source source1.c same_name_in_several_files yes
+
+function same_name_in_several_files is defined in source1.c:
+
+ 3: static void same_name_in_several_files()
+ 4: {
+ 5: // This is static version in source1.c.
+ 6: }
diff --git a/pkg/codesearch/testdata/query-file-index-missing b/pkg/codesearch/testdata/query-file-index-missing
new file mode 100644
index 000000000..1be486378
--- /dev/null
+++ b/pkg/codesearch/testdata/query-file-index-missing
@@ -0,0 +1,3 @@
+file-index some-non-existent-file.c
+
+not found
diff --git a/pkg/codesearch/testdata/query-file-index-source b/pkg/codesearch/testdata/query-file-index-source
new file mode 100644
index 000000000..c238079d0
--- /dev/null
+++ b/pkg/codesearch/testdata/query-file-index-source
@@ -0,0 +1,7 @@
+file-index source0.c
+
+file source0.c defines the following entities:
+
+function close
+function function_with_comment_in_header
+function open
diff --git a/pkg/codesearch/testdata/source0.c b/pkg/codesearch/testdata/source0.c
new file mode 100644
index 000000000..384c4c119
--- /dev/null
+++ b/pkg/codesearch/testdata/source0.c
@@ -0,0 +1,22 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+#include "source0.h"
+
+/*
+ * Comment about open.
+ */
+int open()
+{
+ return 0;
+}
+
+int close()
+{
+ return 0;
+}
+
+void function_with_comment_in_header()
+{
+ same_name_in_several_files();
+}
diff --git a/pkg/codesearch/testdata/source0.c.json b/pkg/codesearch/testdata/source0.c.json
new file mode 100644
index 000000000..d33aa360c
--- /dev/null
+++ b/pkg/codesearch/testdata/source0.c.json
@@ -0,0 +1,41 @@
+{
+ "definitions": [
+ {
+ "kind": "function",
+ "name": "close",
+ "type": "int ()",
+ "body": {
+ "file": "source0.c",
+ "start_line": 14,
+ "end_line": 17
+ },
+ "comment": {}
+ },
+ {
+ "kind": "function",
+ "name": "function_with_comment_in_header",
+ "type": "void ()",
+ "body": {
+ "file": "source0.c",
+ "start_line": 19,
+ "end_line": 22
+ },
+ "comment": {}
+ },
+ {
+ "kind": "function",
+ "name": "open",
+ "type": "int ()",
+ "body": {
+ "file": "source0.c",
+ "start_line": 6,
+ "end_line": 12
+ },
+ "comment": {
+ "file": "source0.c",
+ "start_line": 6,
+ "end_line": 8
+ }
+ }
+ ]
+} \ No newline at end of file
diff --git a/pkg/codesearch/testdata/source0.h b/pkg/codesearch/testdata/source0.h
new file mode 100644
index 000000000..339975b2e
--- /dev/null
+++ b/pkg/codesearch/testdata/source0.h
@@ -0,0 +1,10 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+/*
+ * Comment about the function in header.
+ * Multi-line just in case.
+ */
+void function_with_comment_in_header();
+
+void same_name_in_several_files();
diff --git a/pkg/codesearch/testdata/source1.c b/pkg/codesearch/testdata/source1.c
new file mode 100644
index 000000000..ad7d5792c
--- /dev/null
+++ b/pkg/codesearch/testdata/source1.c
@@ -0,0 +1,7 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+static void same_name_in_several_files()
+{
+ // This is static version in source1.c.
+}
diff --git a/pkg/codesearch/testdata/source1.c.json b/pkg/codesearch/testdata/source1.c.json
new file mode 100644
index 000000000..72278a191
--- /dev/null
+++ b/pkg/codesearch/testdata/source1.c.json
@@ -0,0 +1,20 @@
+{
+ "definitions": [
+ {
+ "kind": "function",
+ "name": "same_name_in_several_files",
+ "type": "void ()",
+ "is_static": true,
+ "body": {
+ "file": "source1.c",
+ "start_line": 4,
+ "end_line": 7
+ },
+ "comment": {
+ "file": "source1.c",
+ "start_line": 1,
+ "end_line": 2
+ }
+ }
+ ]
+} \ No newline at end of file
diff --git a/pkg/codesearch/testdata/source2.c b/pkg/codesearch/testdata/source2.c
new file mode 100644
index 000000000..f7ef3d810
--- /dev/null
+++ b/pkg/codesearch/testdata/source2.c
@@ -0,0 +1,7 @@
+// Copyright 2025 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+void same_name_in_several_files()
+{
+ // This is non-static version in in source2.c.
+}
diff --git a/pkg/codesearch/testdata/source2.c.json b/pkg/codesearch/testdata/source2.c.json
new file mode 100644
index 000000000..4407152db
--- /dev/null
+++ b/pkg/codesearch/testdata/source2.c.json
@@ -0,0 +1,19 @@
+{
+ "definitions": [
+ {
+ "kind": "function",
+ "name": "same_name_in_several_files",
+ "type": "void ()",
+ "body": {
+ "file": "source2.c",
+ "start_line": 4,
+ "end_line": 7
+ },
+ "comment": {
+ "file": "source2.c",
+ "start_line": 1,
+ "end_line": 2
+ }
+ }
+ ]
+} \ No newline at end of file