From 280ea308c321115445df610f1a75b05bbadca5f3 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 17 Nov 2025 11:17:23 +0100 Subject: pkg/codesearch: add skeleton for code searching tool Add a clang tool that is used for code indexing (tools/clang/codesearch/). It follows conventions and build procedure of the declextract tool. Add pkg/codesearch package that aggregates the info exposed by the clang tools, and allows doing simple queries: - show source code of an entity (function, struct, etc) - show entity comment - show all entities defined in a source file Add tools/syz-codesearch wrapper tool that allows to create index for a kernel build, and then run code queries on it. --- pkg/clangtool/clangtool.go | 5 +- pkg/codesearch/codesearch.go | 190 +++++++++++++++++++++ pkg/codesearch/codesearch_test.go | 61 +++++++ pkg/codesearch/database.go | 56 ++++++ pkg/codesearch/testdata/query-def-comment-close | 3 + pkg/codesearch/testdata/query-def-comment-header | 3 + pkg/codesearch/testdata/query-def-comment-open | 7 + pkg/codesearch/testdata/query-def-source-close | 8 + pkg/codesearch/testdata/query-def-source-header | 8 + pkg/codesearch/testdata/query-def-source-missing | 3 + pkg/codesearch/testdata/query-def-source-open | 11 ++ .../testdata/query-def-source-same-name-non-static | 8 + .../testdata/query-def-source-same-name-static | 8 + pkg/codesearch/testdata/query-file-index-missing | 3 + pkg/codesearch/testdata/query-file-index-source | 7 + pkg/codesearch/testdata/source0.c | 22 +++ pkg/codesearch/testdata/source0.c.json | 41 +++++ pkg/codesearch/testdata/source0.h | 10 ++ pkg/codesearch/testdata/source1.c | 7 + pkg/codesearch/testdata/source1.c.json | 20 +++ pkg/codesearch/testdata/source2.c | 7 + pkg/codesearch/testdata/source2.c.json | 19 +++ 22 files changed, 506 insertions(+), 1 deletion(-) create mode 100644 pkg/codesearch/codesearch.go create mode 100644 pkg/codesearch/codesearch_test.go create mode 100644 pkg/codesearch/database.go create mode 100644 pkg/codesearch/testdata/query-def-comment-close create mode 100644 pkg/codesearch/testdata/query-def-comment-header create mode 100644 pkg/codesearch/testdata/query-def-comment-open create mode 100644 pkg/codesearch/testdata/query-def-source-close create mode 100644 pkg/codesearch/testdata/query-def-source-header create mode 100644 pkg/codesearch/testdata/query-def-source-missing create mode 100644 pkg/codesearch/testdata/query-def-source-open create mode 100644 pkg/codesearch/testdata/query-def-source-same-name-non-static create mode 100644 pkg/codesearch/testdata/query-def-source-same-name-static create mode 100644 pkg/codesearch/testdata/query-file-index-missing create mode 100644 pkg/codesearch/testdata/query-file-index-source create mode 100644 pkg/codesearch/testdata/source0.c create mode 100644 pkg/codesearch/testdata/source0.c.json create mode 100644 pkg/codesearch/testdata/source0.h create mode 100644 pkg/codesearch/testdata/source1.c create mode 100644 pkg/codesearch/testdata/source1.c.json create mode 100644 pkg/codesearch/testdata/source2.c create mode 100644 pkg/codesearch/testdata/source2.c.json (limited to 'pkg') diff --git a/pkg/clangtool/clangtool.go b/pkg/clangtool/clangtool.go index 4e3914cf8..8711b5411 100644 --- a/pkg/clangtool/clangtool.go +++ b/pkg/clangtool/clangtool.go @@ -155,7 +155,10 @@ func runTool[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config, dbFile, f cfg.KernelSrc), cfg.KernelObj), "/") // Suppress warning since we may build the tool on a different clang // version that produces more warnings. - data, err := exec.Command(cfg.ToolBin, "-p", dbFile, "--extra-arg=-w", file).Output() + // Comments are needed for codesearch tool, but may be useful for declextract + // in the future if we try to parse them with LLMs. + data, err := exec.Command(cfg.ToolBin, "-p", dbFile, + "--extra-arg=-w", "--extra-arg=-fparse-all-comments", file).Output() if err != nil { var exitErr *exec.ExitError if errors.As(err, &exitErr) { diff --git a/pkg/codesearch/codesearch.go b/pkg/codesearch/codesearch.go new file mode 100644 index 000000000..c1e99a174 --- /dev/null +++ b/pkg/codesearch/codesearch.go @@ -0,0 +1,190 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/google/syzkaller/pkg/osutil" +) + +type Index struct { + db *Database + srcDirs []string +} + +type Command struct { + Name string + NArgs int + Func func(*Index, []string) (string, error) +} + +// Commands are used to run unit tests and for the syz-codesearch tool. +var Commands = []Command{ + {"file-index", 1, func(index *Index, args []string) (string, error) { + ok, entities, err := index.FileIndex(args[0]) + if err != nil || !ok { + return notFound, err + } + b := new(strings.Builder) + fmt.Fprintf(b, "file %v defines the following entities:\n\n", args[0]) + for _, ent := range entities { + fmt.Fprintf(b, "%v %v\n", ent.Kind, ent.Name) + } + return b.String(), nil + }}, + {"def-comment", 2, func(index *Index, args []string) (string, error) { + info, err := index.DefinitionComment(args[0], args[1]) + if err != nil || info == nil { + return notFound, err + } + if info.Body == "" { + return fmt.Sprintf("%v %v is defined in %v and is not commented\n", + info.Kind, args[1], info.File), nil + } + return fmt.Sprintf("%v %v is defined in %v and commented as:\n\n%v", + info.Kind, args[1], info.File, info.Body), nil + }}, + {"def-source", 3, func(index *Index, args []string) (string, error) { + info, err := index.DefinitionSource(args[0], args[1], args[2] == "yes") + if err != nil || info == nil { + return notFound, err + } + return fmt.Sprintf("%v %v is defined in %v:\n\n%v", info.Kind, args[1], info.File, info.Body), nil + }}, +} + +const notFound = "not found\n" + +func NewIndex(databaseFile string, srcDirs []string) (*Index, error) { + db, err := osutil.ReadJSON[*Database](databaseFile) + if err != nil { + return nil, err + } + return &Index{ + db: db, + srcDirs: srcDirs, + }, nil +} + +func (index *Index) Command(cmd string, args []string) (string, error) { + for _, meta := range Commands { + if cmd == meta.Name { + if len(args) != meta.NArgs { + return "", fmt.Errorf("codesearch command %v requires %v args, but %v provided", + cmd, meta.NArgs, len(args)) + } + return meta.Func(index, args) + } + } + return "", fmt.Errorf("unknown codesearch command %v", cmd) +} + +type Entity struct { + Kind string + Name string +} + +func (index *Index) FileIndex(file string) (bool, []Entity, error) { + var entities []Entity + for _, def := range index.db.Definitions { + if def.Body.File == file { + entities = append(entities, Entity{ + Kind: def.Kind, + Name: def.Name, + }) + } + } + return len(entities) != 0, entities, nil +} + +type EntityInfo struct { + File string + Kind string + Body string +} + +func (index *Index) DefinitionComment(contextFile, name string) (*EntityInfo, error) { + return index.definitionSource(contextFile, name, true, false) +} + +func (index *Index) DefinitionSource(contextFile, name string, includeLines bool) (*EntityInfo, error) { + return index.definitionSource(contextFile, name, false, includeLines) +} + +func (index *Index) definitionSource(contextFile, name string, comment, includeLines bool) (*EntityInfo, error) { + def := index.findDefinition(contextFile, name) + if def == nil { + return nil, nil + } + lineRange := def.Body + if comment { + lineRange = def.Comment + } + src, err := index.formatSource(lineRange, includeLines) + if err != nil { + return nil, err + } + return &EntityInfo{ + File: def.Body.File, + Kind: def.Kind, + Body: src, + }, nil +} + +func (index *Index) findDefinition(contextFile, name string) *Definition { + var weakMatch *Definition + for _, def := range index.db.Definitions { + if def.Name == name { + if def.Body.File == contextFile { + return def + } + if !def.IsStatic { + weakMatch = def + } + } + } + return weakMatch +} + +func (index *Index) formatSource(lines LineRange, includeLines bool) (string, error) { + if lines.File == "" { + return "", nil + } + for _, dir := range index.srcDirs { + file := filepath.Join(dir, lines.File) + if !osutil.IsExist(file) { + continue + } + return formatSourceFile(file, lines.StartLine, lines.EndLine, includeLines) + } + return "", fmt.Errorf("codesearch: can't find %q file in any of %v", lines.File, index.srcDirs) +} + +func formatSourceFile(file string, start, end int, includeLines bool) (string, error) { + data, err := os.ReadFile(file) + if err != nil { + return "", err + } + lines := bytes.Split(data, []byte{'\n'}) + start-- + end-- + if start < 0 || end < start || end > len(lines) { + return "", fmt.Errorf("codesearch: bad line range [%v-%v] for file %v with %v lines", + start, end, file, len(lines)) + } + b := new(strings.Builder) + for line := start; line <= end; line++ { + if includeLines { + fmt.Fprintf(b, "%4v:\t%s\n", line, lines[line]) + } else { + fmt.Fprintf(b, "%s\n", lines[line]) + } + } + return b.String(), nil +} diff --git a/pkg/codesearch/codesearch_test.go b/pkg/codesearch/codesearch_test.go new file mode 100644 index 000000000..7af509294 --- /dev/null +++ b/pkg/codesearch/codesearch_test.go @@ -0,0 +1,61 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "bytes" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/google/syzkaller/pkg/clangtool/tooltest" + "github.com/google/syzkaller/pkg/osutil" +) + +func TestClangTool(t *testing.T) { + tooltest.TestClangTool[Database](t) +} + +func TestCommands(t *testing.T) { + db := tooltest.LoadOutput[Database](t) + index := &Index{db, []string{"testdata"}} + files, err := filepath.Glob(filepath.Join(osutil.Abs("testdata"), "query*")) + if err != nil { + t.Fatal(err) + } + if len(files) == 0 { + t.Fatal("found no qeury files") + } + covered := make(map[string]bool) + for _, file := range files { + t.Run(filepath.Base(file), func(t *testing.T) { + testCommand(t, index, covered, file) + }) + } + for _, cmd := range Commands { + if !covered[cmd.Name] { + t.Errorf("command %v is not covered, add at least one test", cmd.Name) + } + } +} + +func testCommand(t *testing.T, index *Index, covered map[string]bool, file string) { + data, err := os.ReadFile(file) + if err != nil { + t.Fatal(err) + } + query, _, _ := bytes.Cut(data, []byte{'\n'}) + args := strings.Fields(string(query)) + if len(args) == 0 { + t.Fatal("no command found") + } + result, err := index.Command(args[0], args[1:]) + if err != nil { + t.Fatal(err) + } + got := append([]byte(strings.Join(args, " ")+"\n\n"), result...) + tooltest.CompareGoldenData(t, file, got) + covered[args[0]] = true +} diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go new file mode 100644 index 000000000..4757935e9 --- /dev/null +++ b/pkg/codesearch/database.go @@ -0,0 +1,56 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "strings" + + "github.com/google/syzkaller/pkg/clangtool" +) + +type Database struct { + Definitions []*Definition `json:"definitions,omitempty"` +} + +type Definition struct { + Kind string `json:"kind,omitempty"` + Name string `json:"name,omitempty"` + Type string `json:"type,omitempty"` + IsStatic bool `json:"is_static,omitempty"` + Body LineRange `json:"body,omitempty"` + Comment LineRange `json:"comment,omitempty"` +} + +type LineRange struct { + File string `json:"file,omitempty"` + StartLine int `json:"start_line,omitempty"` + EndLine int `json:"end_line,omitempty"` +} + +func (db *Database) Merge(other *Database) { + db.Definitions = append(db.Definitions, other.Definitions...) +} + +func (db *Database) Finalize(v *clangtool.Verifier) { + db.Definitions = clangtool.SortAndDedupSlice(db.Definitions) + + for _, def := range db.Definitions { + v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine) + if def.Comment.File != "" { + v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine) + } + } +} + +// SetSoureFile attaches the source file to the entities that need it. +// The clang tool could do it, but it looks easier to do it here. +func (db *Database) SetSourceFile(file string, updatePath func(string) string) { + for _, def := range db.Definitions { + def.Body.File = updatePath(def.Body.File) + def.Comment.File = updatePath(def.Comment.File) + if strings.HasSuffix(def.Body.File, ".c") && def.Body.File != file { + def.IsStatic = false + } + } +} diff --git a/pkg/codesearch/testdata/query-def-comment-close b/pkg/codesearch/testdata/query-def-comment-close new file mode 100644 index 000000000..df6c1c2af --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-close @@ -0,0 +1,3 @@ +def-comment source0.c close + +function close is defined in source0.c and is not commented diff --git a/pkg/codesearch/testdata/query-def-comment-header b/pkg/codesearch/testdata/query-def-comment-header new file mode 100644 index 000000000..a940938b8 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-header @@ -0,0 +1,3 @@ +def-comment source0.c function_with_comment_in_header + +function function_with_comment_in_header is defined in source0.c and is not commented diff --git a/pkg/codesearch/testdata/query-def-comment-open b/pkg/codesearch/testdata/query-def-comment-open new file mode 100644 index 000000000..64bd21812 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-open @@ -0,0 +1,7 @@ +def-comment source0.c open + +function open is defined in source0.c and commented as: + +/* + * Comment about open. + */ diff --git a/pkg/codesearch/testdata/query-def-source-close b/pkg/codesearch/testdata/query-def-source-close new file mode 100644 index 000000000..2a9dcefad --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-close @@ -0,0 +1,8 @@ +def-source source0.c close no + +function close is defined in source0.c: + +int close() +{ + return 0; +} diff --git a/pkg/codesearch/testdata/query-def-source-header b/pkg/codesearch/testdata/query-def-source-header new file mode 100644 index 000000000..fd3ba300b --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-header @@ -0,0 +1,8 @@ +def-source source0.c function_with_comment_in_header yes + +function function_with_comment_in_header is defined in source0.c: + + 18: void function_with_comment_in_header() + 19: { + 20: same_name_in_several_files(); + 21: } diff --git a/pkg/codesearch/testdata/query-def-source-missing b/pkg/codesearch/testdata/query-def-source-missing new file mode 100644 index 000000000..0b60003c7 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-missing @@ -0,0 +1,3 @@ +def-source source0.c some_non_existent_function no + +not found diff --git a/pkg/codesearch/testdata/query-def-source-open b/pkg/codesearch/testdata/query-def-source-open new file mode 100644 index 000000000..bdcec72fd --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-open @@ -0,0 +1,11 @@ +def-source source0.c open yes + +function open is defined in source0.c: + + 5: /* + 6: * Comment about open. + 7: */ + 8: int open() + 9: { + 10: return 0; + 11: } diff --git a/pkg/codesearch/testdata/query-def-source-same-name-non-static b/pkg/codesearch/testdata/query-def-source-same-name-non-static new file mode 100644 index 000000000..ae09d3313 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-same-name-non-static @@ -0,0 +1,8 @@ +def-source source0.c same_name_in_several_files no + +function same_name_in_several_files is defined in source2.c: + +void same_name_in_several_files() +{ + // This is non-static version in in source2.c. +} diff --git a/pkg/codesearch/testdata/query-def-source-same-name-static b/pkg/codesearch/testdata/query-def-source-same-name-static new file mode 100644 index 000000000..3d87c010c --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-same-name-static @@ -0,0 +1,8 @@ +def-source source1.c same_name_in_several_files yes + +function same_name_in_several_files is defined in source1.c: + + 3: static void same_name_in_several_files() + 4: { + 5: // This is static version in source1.c. + 6: } diff --git a/pkg/codesearch/testdata/query-file-index-missing b/pkg/codesearch/testdata/query-file-index-missing new file mode 100644 index 000000000..1be486378 --- /dev/null +++ b/pkg/codesearch/testdata/query-file-index-missing @@ -0,0 +1,3 @@ +file-index some-non-existent-file.c + +not found diff --git a/pkg/codesearch/testdata/query-file-index-source b/pkg/codesearch/testdata/query-file-index-source new file mode 100644 index 000000000..c238079d0 --- /dev/null +++ b/pkg/codesearch/testdata/query-file-index-source @@ -0,0 +1,7 @@ +file-index source0.c + +file source0.c defines the following entities: + +function close +function function_with_comment_in_header +function open diff --git a/pkg/codesearch/testdata/source0.c b/pkg/codesearch/testdata/source0.c new file mode 100644 index 000000000..384c4c119 --- /dev/null +++ b/pkg/codesearch/testdata/source0.c @@ -0,0 +1,22 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +#include "source0.h" + +/* + * Comment about open. + */ +int open() +{ + return 0; +} + +int close() +{ + return 0; +} + +void function_with_comment_in_header() +{ + same_name_in_several_files(); +} diff --git a/pkg/codesearch/testdata/source0.c.json b/pkg/codesearch/testdata/source0.c.json new file mode 100644 index 000000000..d33aa360c --- /dev/null +++ b/pkg/codesearch/testdata/source0.c.json @@ -0,0 +1,41 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "close", + "type": "int ()", + "body": { + "file": "source0.c", + "start_line": 14, + "end_line": 17 + }, + "comment": {} + }, + { + "kind": "function", + "name": "function_with_comment_in_header", + "type": "void ()", + "body": { + "file": "source0.c", + "start_line": 19, + "end_line": 22 + }, + "comment": {} + }, + { + "kind": "function", + "name": "open", + "type": "int ()", + "body": { + "file": "source0.c", + "start_line": 6, + "end_line": 12 + }, + "comment": { + "file": "source0.c", + "start_line": 6, + "end_line": 8 + } + } + ] +} \ No newline at end of file diff --git a/pkg/codesearch/testdata/source0.h b/pkg/codesearch/testdata/source0.h new file mode 100644 index 000000000..339975b2e --- /dev/null +++ b/pkg/codesearch/testdata/source0.h @@ -0,0 +1,10 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +/* + * Comment about the function in header. + * Multi-line just in case. + */ +void function_with_comment_in_header(); + +void same_name_in_several_files(); diff --git a/pkg/codesearch/testdata/source1.c b/pkg/codesearch/testdata/source1.c new file mode 100644 index 000000000..ad7d5792c --- /dev/null +++ b/pkg/codesearch/testdata/source1.c @@ -0,0 +1,7 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +static void same_name_in_several_files() +{ + // This is static version in source1.c. +} diff --git a/pkg/codesearch/testdata/source1.c.json b/pkg/codesearch/testdata/source1.c.json new file mode 100644 index 000000000..72278a191 --- /dev/null +++ b/pkg/codesearch/testdata/source1.c.json @@ -0,0 +1,20 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "same_name_in_several_files", + "type": "void ()", + "is_static": true, + "body": { + "file": "source1.c", + "start_line": 4, + "end_line": 7 + }, + "comment": { + "file": "source1.c", + "start_line": 1, + "end_line": 2 + } + } + ] +} \ No newline at end of file diff --git a/pkg/codesearch/testdata/source2.c b/pkg/codesearch/testdata/source2.c new file mode 100644 index 000000000..f7ef3d810 --- /dev/null +++ b/pkg/codesearch/testdata/source2.c @@ -0,0 +1,7 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +void same_name_in_several_files() +{ + // This is non-static version in in source2.c. +} diff --git a/pkg/codesearch/testdata/source2.c.json b/pkg/codesearch/testdata/source2.c.json new file mode 100644 index 000000000..4407152db --- /dev/null +++ b/pkg/codesearch/testdata/source2.c.json @@ -0,0 +1,19 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "same_name_in_several_files", + "type": "void ()", + "body": { + "file": "source2.c", + "start_line": 4, + "end_line": 7 + }, + "comment": { + "file": "source2.c", + "start_line": 1, + "end_line": 2 + } + } + ] +} \ No newline at end of file -- cgit mrf-deployment