tools/syz-declextract: refine arg types for syscall variants

Use scope-based dataflow analysis for syscall variants (including ioctls). As a result, we only consider code that relates to a particular command/ioctl, and can infer argument/return types for each command/ioctl independently.
author: Dmitry Vyukov <dvyukov@google.com> 2025-04-08 14:27:33 +0200
committer: Dmitry Vyukov <dvyukov@google.com> 2025-04-09 10:27:41 +0000
commit: 988b336c79bf2f92392015e5075e92f0148ad869 (patch)
tree: 1e28f832accba910334c94ff14cf9715e9e378a5 /pkg/declextract
parent: 16f995ffcf2e3469a7e464ac5d486385641df7d8 (diff)
4 files changed, 128 insertions, 44 deletions
diff --git a/pkg/declextract/declextract.go b/pkg/declextract/declextract.go
index 479a40892..3800ad70c 100644
--- a/pkg/declextract/declextract.go
+++ b/pkg/declextract/declextract.go
@@ -161,20 +161,22 @@ func (ctx *context) processSyscalls() {
 	var syscalls []*Syscall
 	for _, call := range ctx.Syscalls {
 		ctx.processFields(call.Args, "", false)
-		call.returnType = ctx.inferReturnType(call.Func, call.SourceFile)
-		for i, arg := range call.Args {
-			typ := ctx.inferArgType(call.Func, call.SourceFile, i)
-			refineFieldType(arg, typ, false)
-		}
-		ctx.emitSyscall(&syscalls, call, "")
-		for i := range call.Args {
-			cmds := ctx.inferCommandVariants(call.Func, call.SourceFile, i)
+		for varArg := range call.Args {
+			cmds := ctx.inferCommandVariants(call.Func, call.SourceFile, varArg)
 			for _, cmd := range cmds {
 				variant := *call
 				variant.Args = slices.Clone(call.Args)
-				newArg := *variant.Args[i]
-				newArg.syzType = fmt.Sprintf("const[%v]", cmd)
-				variant.Args[i] = &newArg
+				for i, oldArg := range variant.Args {
+					arg := *oldArg
+					if i == varArg {
+						arg.syzType = fmt.Sprintf("const[%v]", cmd)
+					} else {
+						typ := ctx.inferArgType(call.Func, call.SourceFile, i, varArg, cmd)
+						refineFieldType(&arg, typ, false)
+					}
+					variant.Args[i] = &arg
+				}
+				variant.returnType = ctx.inferReturnType(call.Func, call.SourceFile, varArg, cmd)
 				suffix := cmd
 				if call.Func == "__do_sys_ioctl" {
 					suffix = ctx.uniqualize("ioctl cmd", cmd)
@@ -182,6 +184,12 @@ func (ctx *context) processSyscalls() {
 				ctx.emitSyscall(&syscalls, &variant, "_"+suffix)
 			}
 		}
+		call.returnType = ctx.inferReturnType(call.Func, call.SourceFile, -1, "")
+		for i, arg := range call.Args {
+			typ := ctx.inferArgType(call.Func, call.SourceFile, i, -1, "")
+			refineFieldType(arg, typ, false)
+		}
+		ctx.emitSyscall(&syscalls, call, "")
 	}
 	ctx.Syscalls = sortAndDedupSlice(syscalls)
 }
diff --git a/pkg/declextract/entity.go b/pkg/declextract/entity.go
index 5562ff570..740530ca9 100644
--- a/pkg/declextract/entity.go
+++ b/pkg/declextract/entity.go
@@ -45,6 +45,8 @@ type FunctionScope struct {
 	LOC    int           `json:"loc,omitempty"`
 	Calls  []string      `json:"calls,omitempty"`
 	Facts  []*TypingFact `json:"facts,omitempty"`
+
+	fn *Function
 }
 
 type ConstInfo struct {
diff --git a/pkg/declextract/fileops.go b/pkg/declextract/fileops.go
index cacdcaa9e..408ccc4fc 100644
--- a/pkg/declextract/fileops.go
+++ b/pkg/declextract/fileops.go
@@ -61,11 +61,15 @@ func (ctx *context) createFops(fops *FileOps, files []string) {
 }
 
 func (ctx *context) createIoctls(fops *FileOps, suffix, fdt string) {
-	const defaultArgType = "ptr[in, array[int8]]"
-	cmds := ctx.inferCommandVariants(fops.Ioctl, fops.SourceFile, 1)
+	const (
+		cmdArg         = 1
+		argArg         = 2
+		defaultArgType = "ptr[in, array[int8]]"
+	)
+	cmds := ctx.inferCommandVariants(fops.Ioctl, fops.SourceFile, cmdArg)
 	if len(cmds) == 0 {
-		retType := ctx.inferReturnType(fops.Ioctl, fops.SourceFile)
-		argType := ctx.inferArgType(fops.Ioctl, fops.SourceFile, 2)
+		retType := ctx.inferReturnType(fops.Ioctl, fops.SourceFile, -1, "")
+		argType := ctx.inferArgType(fops.Ioctl, fops.SourceFile, argArg, -1, "")
 		if argType == "" {
 			argType = defaultArgType
 		}
@@ -80,10 +84,16 @@ func (ctx *context) createIoctls(fops *FileOps, suffix, fdt string) {
 				Type: typ,
 			}
 			argType = ctx.fieldType(f, nil, "", false)
+		} else {
+			argType = ctx.inferArgType(fops.Ioctl, fops.SourceFile, argArg, cmdArg, cmd)
+			if argType == "" {
+				argType = defaultArgType
+			}
 		}
+		retType := ctx.inferReturnType(fops.Ioctl, fops.SourceFile, cmdArg, cmd)
 		name := ctx.uniqualize("ioctl cmd", cmd)
-		ctx.fmt("ioctl%v_%v(fd %v, cmd const[%v], arg %v)\n",
-			autoSuffix, name, fdt, cmd, argType)
+		ctx.fmt("ioctl%v_%v(fd %v, cmd const[%v], arg %v) %v\n",
+			autoSuffix, name, fdt, cmd, argType, retType)
 	}
 }
 
diff --git a/pkg/declextract/typing.go b/pkg/declextract/typing.go
index f29f8e950..3de53ee62 100644
--- a/pkg/declextract/typing.go
+++ b/pkg/declextract/typing.go
@@ -34,10 +34,7 @@ import (
 // - Infer that pointers are file names (they should flow to some known function for path resolution).
 // - Use SSA analysis to track flow via local variables better. Potentiall we can just rename on every next use
 //   and ignore backwards edges (it's unlikely that backwards edges are required for type inference).
-// - Infer ioctl commands in transitively called functions using data flow.
 // - Infer file_operations associated with an fd by tracking flow to alloc_file_pseudo and friends.
-// - Add context-sensitivity at least on switched arguments (ioctl commands).
-// - Infer other switched arguments besides ioctl commands.
 // - Infer netlink arg types by tracking flow from genl_info::attrs[ATTR_FOO].
 // - Infer simple constraints on arguments, e.g. "if (arg != 0) return -EINVAL".
 // - Use kernel typedefs for typing (e.g. pid_t). We can use them for uapi structs, but also for kernel
@@ -48,6 +45,10 @@ import (
 //   For example, these cases lead to false inference of fd type for returned value:
 //   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/core/sock.c#L1870
 //   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/socket.c#L1742
+// - Use const[0] for unused arguments. If an arg is unused, or only flows to functions where it's unused,
+//   we can consider it as unused.
+// - Detect common patterns for "must be 0" or "must be const" arguments, e.g.:
+//     if (flags != 0) return -EINVAL;
 
 var (
 	// Refines types based on data flows...
@@ -96,7 +97,7 @@ type typingNode struct {
 	id    string
 	fn    *Function
 	arg   int
-	flows [2]map[*typingNode]bool
+	flows [2]map[*typingNode][]*FunctionScope
 }
 
 const (
@@ -107,14 +108,16 @@ const (
 func (ctx *context) processTypingFacts() {
 	for _, fn := range ctx.Functions {
 		for _, scope := range fn.Scopes {
+			scope.fn = fn
 			for _, fact := range scope.Facts {
 				src := ctx.canonicalNode(fn, fact.Src)
 				dst := ctx.canonicalNode(fn, fact.Dst)
 				if src == nil || dst == nil {
 					continue
 				}
-				src.flows[flowTo][dst] = true
-				dst.flows[flowFrom][src] = true
+
+				src.flows[flowTo][dst] = append(src.flows[flowTo][dst], scope)
+				dst.flows[flowFrom][src] = append(dst.flows[flowFrom][src], scope)
 			}
 		}
 	}
@@ -156,7 +159,7 @@ func (ctx *context) canonicalNode(fn *Function, ent *TypingEntity) *typingNode {
 		arg: arg,
 	}
 	for i := range n.flows {
-		n.flows[i] = make(map[*typingNode]bool)
+		n.flows[i] = make(map[*typingNode][]*FunctionScope)
 	}
 	facts[id] = n
 	return n
@@ -179,35 +182,43 @@ func (ent *TypingEntity) ID(fn *Function) (string, string) {
 	}
 }
 
-func (ctx *context) inferReturnType(name, file string) string {
-	return ctx.inferFuncNode(name, file, "ret")
+func (ctx *context) inferReturnType(name, file string, scopeArg int, scopeVal string) string {
+	return ctx.inferFuncNode(name, file, "ret", scopeArg, scopeVal)
+}
+
+func (ctx *context) inferArgType(name, file string, arg, scopeArg int, scopeVal string) string {
+	return ctx.inferFuncNode(name, file, fmt.Sprintf("arg%v", arg), scopeArg, scopeVal)
 }
 
-func (ctx *context) inferArgType(name, file string, arg int) string {
-	return ctx.inferFuncNode(name, file, fmt.Sprintf("arg%v", arg))
+type fnArg struct {
+	fn  *Function
+	arg int
 }
 
-func (ctx *context) inferFuncNode(name, file, node string) string {
+func (ctx *context) inferFuncNode(name, file, node string, scopeArg int, scopeVal string) string {
 	fn := ctx.findFunc(name, file)
 	if fn == nil {
 		return ""
 	}
-	return ctx.inferNodeType(fn.facts[node], fmt.Sprintf("%v %v", name, node))
+	scopeFnArgs := ctx.inferArgFlow(fnArg{fn, scopeArg})
+	return ctx.inferNodeType(fn.facts[node], scopeFnArgs, scopeVal, fmt.Sprintf("%v %v", name, node))
 }
 
 func (ctx *context) inferFieldType(structName, field string) string {
 	name := fmt.Sprintf("%v.%v", structName, field)
-	return ctx.inferNodeType(ctx.facts[name], name)
+	return ctx.inferNodeType(ctx.facts[name], nil, "", name)
 }
 
-func (ctx *context) inferNodeType(n *typingNode, what string) string {
+func (ctx *context) inferNodeType(n *typingNode, scopeFnArgs map[fnArg]bool, scopeVal, what string) string {
 	if n == nil {
 		return ""
 	}
 	ic := &inferContext{
-		visited:  make(map[*typingNode]bool),
-		flowType: flowFrom,
-		maxDepth: maxTraversalDepth,
+		scopeFnArgs: scopeFnArgs,
+		scopeVal:    scopeVal,
+		visited:     make(map[*typingNode]bool),
+		flowType:    flowFrom,
+		maxDepth:    maxTraversalDepth,
 	}
 	ic.walk(n)
 	ic.flowType = flowTo
@@ -220,13 +231,15 @@ func (ctx *context) inferNodeType(n *typingNode, what string) string {
 }
 
 type inferContext struct {
-	path       []*typingNode
-	visited    map[*typingNode]bool
-	result     string
-	resultPath []*typingNode
-	resultFlow int
-	flowType   int
-	maxDepth   int
+	path        []*typingNode
+	visited     map[*typingNode]bool
+	scopeFnArgs map[fnArg]bool
+	scopeVal    string
+	result      string
+	resultPath  []*typingNode
+	resultFlow  int
+	flowType    int
+	maxDepth    int
 }
 
 func (ic *inferContext) walk(n *typingNode) {
@@ -246,13 +259,39 @@ func (ic *inferContext) walk(n *typingNode) {
 		}
 	}
 	if len(ic.path) < ic.maxDepth {
-		for e := range n.flows[ic.flowType] {
-			ic.walk(e)
+		for e, scopes := range n.flows[ic.flowType] {
+			if ic.relevantScope(scopes) {
+				ic.walk(e)
+			}
 		}
 	}
 	ic.path = ic.path[:len(ic.path)-1]
 }
 
+func (ic *inferContext) relevantScope(scopes []*FunctionScope) bool {
+	if ic.scopeFnArgs == nil {
+		// We are not doing scope-limited walk, so all scopes are relevant.
+		return true
+	}
+	for _, scope := range scopes {
+		if scope.Arg == -1 {
+			// Always use global scope.
+			return true
+		}
+		if !ic.scopeFnArgs[fnArg{scope.fn, scope.Arg}] {
+			// The scope argument is not related to the current scope.
+			return true
+		}
+		// For the scope argument, check that it has the right value.
+		for _, val := range scope.Values {
+			if val == ic.scopeVal {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 func refineFieldType(f *Field, typ string, preserveSize bool) {
 	// If our manual heuristics have figured out a more precise fd subtype,
 	// don't replace it with generic fd.
@@ -319,3 +358,28 @@ func (ctx *context) walkCommandVariants(n *typingNode, variants *[]string, visit
 		ctx.walkCommandVariants(e, variants, visited, depth+1)
 	}
 }
+
+// inferArgFlow returns transitive closure of all function arguments that the given argument flows to.
+func (ctx *context) inferArgFlow(arg fnArg) map[fnArg]bool {
+	n := arg.fn.facts[fmt.Sprintf("arg%v", arg.arg)]
+	if n == nil {
+		return nil
+	}
+	fnArgs := make(map[fnArg]bool)
+	visited := make(map[*typingNode]bool)
+	ctx.walkArgFlow(n, fnArgs, visited, 0)
+	return fnArgs
+}
+
+func (ctx *context) walkArgFlow(n *typingNode, fnArgs map[fnArg]bool, visited map[*typingNode]bool, depth int) {
+	if visited[n] || depth >= 10 {
+		return
+	}
+	visited[n] = true
+	if n.arg >= 0 {
+		fnArgs[fnArg{n.fn, n.arg}] = true
+	}
+	for e := range n.flows[flowTo] {
+		ctx.walkArgFlow(e, fnArgs, visited, depth+1)
+	}
+}
author	Dmitry Vyukov <dvyukov@google.com>	2025-04-08 14:27:33 +0200
committer	Dmitry Vyukov <dvyukov@google.com>	2025-04-09 10:27:41 +0000
commit	988b336c79bf2f92392015e5075e92f0148ad869 (patch)
tree	1e28f832accba910334c94ff14cf9715e9e378a5 /pkg/declextract
parent	16f995ffcf2e3469a7e464ac5d486385641df7d8 (diff)