// Copyright 2024 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package declextract

import (
	"bytes"
	"fmt"
	"slices"
	"strings"

	"github.com/google/syzkaller/pkg/clangtool"
)

// Argument/field type inference based on data flow analysis.
//
// First, the clang tool produces data flow summary for each function.
// The summary describes how data flows between function arguments, return values, local variables, and struct fields.
// Then, the logic in this file tracks global data flow in the kernel to infer types for syscall arguments,
// return values, and struct fields.
// If data transitively flows from an argument to a known function that accepts a resource of a particular type
// (e.g. __fget_light for file descriptors), then we infer that the original argument is an fd.
// Similarly, if data flows from a known function that creates a resource (e.g. alloc_fd for file descriptors)
// to a syscall return value, then we infer that the syscall returns an fd.
// For struct fields we track data flow in both direction (to/from) to infer their types.
//
// If the inference produces multiple resources, currently we pick the one with the shortest flow path
// (and then additionally pick lexicographically first among them for determinism). Potentially we could
// use a more elaborate strategy that would somehow rank candidates and/or produce multiple candidates
// (that we will then use as a union).
//
// Other potential improvements:
// - Add more functions that consume/produce resources.
// - Refine enum types. If we see an argument is used in bitops with an enum, it has that enum type.
// - Infer pointer types when they flow to copy_from_user (sometimes they are declared as uint64).
// - Infer that pointers are file names (they should flow to some known function for path resolution).
// - Use SSA analysis to track flow via local variables better. Potentiall we can just rename on every next use
//   and ignore backwards edges (it's unlikely that backwards edges are required for type inference).
// - Infer file_operations associated with an fd by tracking flow to alloc_file_pseudo and friends.
// - Infer netlink arg types by tracking flow from genl_info::attrs[ATTR_FOO].
// - Infer simple constraints on arguments, e.g. "if (arg != 0) return -EINVAL".
// - Use kernel typedefs for typing (e.g. pid_t). We can use them for uapi structs, but also for kernel
//   structs and function arguments during dataflow tracking (e.g. if int flows to a pid_t argument, it's a pid).
// - Track side flows. E.g. dup2 argument newfd flows to the return value, and newfd can be inferred to be an fd,
//   but currently we don't infer that the return value is an fd. Potentially we could infer that.
// - Detect cases where returned value is actually an error rather than a resource.
//   For example, these cases lead to false inference of fd type for returned value:
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/core/sock.c#L1870
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/socket.c#L1742
// - Use const[0] for unused arguments. If an arg is unused, or only flows to functions where it's unused,
//   we can consider it as unused.
// - Detect common patterns for "must be 0" or "must be const" arguments, e.g.:
//     if (flags != 0) return -EINVAL;
// - Capture taking address of functions in functions.
//   If code takes a function address, the target function most likely needs to be accounted
//   in LOC/complexity/coverage analysis (effectively called). We won't see this function
//   to be called via a function pointer later, or it may be passed to a very common function
//   that we won't analyze (e.g. single_open(..., show_callback, ...)).
// - Extract file permissions during ifaceprobe and use that to assign interface accessibility
//   for file interfaces, e.g. for:
//     crw-------   1 root    root      10,   239 Apr  8 20:36 uhid
//   we can say that it's root-only.

var (
	// Refines types based on data flows...
	flowResources = [2]map[string]string{
		// ...to function arguments.
		{
			"__fget_light:arg0":       "fd",
			"__fget_files_rcu:arg1":   "fd",
			"make_kuid:arg1":          "uid",
			"make_kgid:arg1":          "gid",
			"find_pid_ns:arg0":        "pid",
			"pidfd_get_pid:arg0":      "fd_pidfd",
			"__dev_get_by_index:arg1": "ifindex",
		},
		// ...from function return value.
		{
			"alloc_fd:ret":  "fd",
			"pid_nr_ns:ret": "pid",
			"from_kuid:ret": "uid",
			"from_kgid:ret": "gid",
		},
	}
	// These functions/structs/files provide very high false connectivity between unrelated nodes.
	flowIgnoreFuncs = map[string]bool{
		"ptr_to_compat": true,
		"compat_ptr":    true,
	}
	flowIgnoreStructs = map[string]bool{
		"pt_regs": true,
		"io_cqe":  true,
		"inode":   true,
	}
	flowIgnoreFiles = map[string]bool{
		"include/linux/err.h":     true, // PTR_ERR/ERR_PTR/ERR_CAST
		"include/linux/byteorder": true, // ntohl/etc
		"include/linux/uaccess.h": true, // copy_to/from_user
		"fs/befs/endian.h":        true, // cpu_to_fs32/etc
		"fs/ufs/swab.h":           true,
	}
)

// Limit on the flow graph traversal depth to avoid false positives due to false weird connections.
const maxTraversalDepth = 18

type typingNode struct {
	id    string
	fn    *Function
	arg   int
	flows [2]map[*typingNode][]*FunctionScope
}

const (
	flowTo = iota
	flowFrom
)

func (ctx *context) processTypingFacts() {
	for _, fn := range ctx.Functions {
		for _, scope := range fn.Scopes {
			scope.fn = fn
			for _, fact := range scope.Facts {
				src := ctx.canonicalNode(fn, fact.Src)
				dst := ctx.canonicalNode(fn, fact.Dst)
				if src == nil || dst == nil {
					continue
				}

				src.flows[flowTo][dst] = append(src.flows[flowTo][dst], scope)
				dst.flows[flowFrom][src] = append(dst.flows[flowFrom][src], scope)
			}
		}
	}
}

func (ctx *context) canonicalNode(fn *Function, ent *TypingEntity) *typingNode {
	scope, id := ent.ID(fn)
	fullID := id
	facts := ctx.facts
	if scope != "" {
		if scope != fn.Name {
			fn = ctx.findFunc(scope, fn.File)
			if fn == nil {
				return nil
			}
		}
		if flowIgnoreFuncs[fn.Name] || flowIgnoreFiles[fn.File] {
			return nil
		}
		if fn.facts == nil {
			fn.facts = make(map[string]*typingNode)
		}
		facts = fn.facts
		fullID = fmt.Sprintf("%v:%v", scope, id)
	} else if ent.Field != nil && flowIgnoreStructs[ent.Field.Struct] {
		return nil
	}
	n := facts[id]
	if n != nil {
		return n
	}
	arg := -1
	if ent.Argument != nil {
		arg = ent.Argument.Arg
	}
	n = &typingNode{
		id:  fullID,
		fn:  fn,
		arg: arg,
	}
	for i := range n.flows {
		n.flows[i] = make(map[*typingNode][]*FunctionScope)
	}
	facts[id] = n
	return n
}

func (ent *TypingEntity) ID(fn *Function) (string, string) {
	switch {
	case ent.Return != nil:
		return ent.Return.Func, "ret"
	case ent.Argument != nil:
		return ent.Argument.Func, fmt.Sprintf("arg%v", ent.Argument.Arg)
	case ent.Local != nil:
		return fn.Name, fmt.Sprintf("loc.%v", ent.Local.Name)
	case ent.Field != nil:
		return "", fmt.Sprintf("%v.%v", ent.Field.Struct, ent.Field.Field)
	case ent.GlobalAddr != nil:
		return "", ent.GlobalAddr.Name
	default:
		panic("unhandled type")
	}
}

func (ctx *context) inferReturnType(name, file string, scopeArg int, scopeVal string) string {
	return ctx.inferFuncNode(name, file, "ret", scopeArg, scopeVal)
}

func (ctx *context) inferArgType(name, file string, arg, scopeArg int, scopeVal string) string {
	return ctx.inferFuncNode(name, file, fmt.Sprintf("arg%v", arg), scopeArg, scopeVal)
}

type fnArg struct {
	fn  *Function
	arg int
}

func (ctx *context) inferFuncNode(name, file, node string, scopeArg int, scopeVal string) string {
	fn := ctx.findFunc(name, file)
	if fn == nil {
		return ""
	}
	scopeFnArgs := ctx.inferArgFlow(fnArg{fn, scopeArg})
	return ctx.inferNodeType(fn.facts[node], scopeFnArgs, scopeVal, fmt.Sprintf("%v %v", name, node))
}

func (ctx *context) inferFieldType(structName, field string) string {
	name := fmt.Sprintf("%v.%v", structName, field)
	return ctx.inferNodeType(ctx.facts[name], nil, "", name)
}

func (ctx *context) inferNodeType(n *typingNode, scopeFnArgs map[fnArg]bool, scopeVal, what string) string {
	if n == nil {
		return ""
	}
	ic := &inferContext{
		scopeFnArgs: scopeFnArgs,
		scopeVal:    scopeVal,
		visited:     make(map[*typingNode]bool),
		flowType:    flowFrom,
		maxDepth:    maxTraversalDepth,
	}
	ic.walk(n)
	ic.flowType = flowTo
	ic.visited = make(map[*typingNode]bool)
	ic.walk(n)
	if ic.result != "" {
		ctx.trace("inferred %v\n  %v%v", what, ic.result, flowString(ic.resultPath, ic.resultFlow))
	}
	return ic.result
}

type inferContext struct {
	path        []*typingNode
	visited     map[*typingNode]bool
	scopeFnArgs map[fnArg]bool
	scopeVal    string
	result      string
	resultPath  []*typingNode
	resultFlow  int
	flowType    int
	maxDepth    int
}

func (ic *inferContext) walk(n *typingNode) {
	if ic.visited[n] {
		return
	}
	ic.visited[n] = true
	ic.path = append(ic.path, n)
	if result, ok := flowResources[ic.flowType][n.id]; ok {
		// Use lexicographical order just to make the result stable.
		if ic.result == "" || len(ic.path) < ic.maxDepth ||
			len(ic.path) == ic.maxDepth && strings.Compare(result, ic.result) < 0 {
			ic.result = result
			ic.resultPath = slices.Clone(ic.path)
			ic.resultFlow = ic.flowType
			ic.maxDepth = len(ic.path)
		}
	}
	if len(ic.path) < ic.maxDepth {
		for e, scopes := range n.flows[ic.flowType] {
			if relevantScopes(ic.scopeFnArgs, ic.scopeVal, scopes) {
				ic.walk(e)
			}
		}
	}
	ic.path = ic.path[:len(ic.path)-1]
}

func relevantScopes(scopeFnArgs map[fnArg]bool, scopeVal string, scopes []*FunctionScope) bool {
	for _, scope := range scopes {
		if relevantScope(scopeFnArgs, scopeVal, scope) {
			return true
		}
	}
	return false
}

func relevantScope(scopeFnArgs map[fnArg]bool, scopeVal string, scope *FunctionScope) bool {
	if scopeFnArgs == nil {
		// We are not doing scope-limited walk, so all scopes are relevant.
		return true
	}
	if scope.Arg == -1 {
		// Always use global scope.
		return true
	}
	if !scopeFnArgs[fnArg{scope.fn, scope.Arg}] {
		// The scope argument is not related to the current scope.
		return true
	}
	// For the scope argument, check that it has the right value.
	for _, val := range scope.Values {
		if val == scopeVal {
			return true
		}
	}
	return false
}

func refineFieldType(f *Field, typ string, preserveSize bool) {
	// If our manual heuristics have figured out a more precise fd subtype,
	// don't replace it with generic fd.
	if typ == "" || typ == f.syzType ||
		typ == "fd" && (strings.HasPrefix(f.syzType, "fd_") || strings.HasPrefix(f.syzType, "sock")) {
		return
	}
	// For struct fields we need to keep the original size.
	// Sometimes fd is passed as uint64.
	if preserveSize {
		typ = fmt.Sprintf("auto_union[%v, %v]", typ, f.syzType)
	}
	f.syzType = typ
}

func flowString(path []*typingNode, flowType int) string {
	w := new(bytes.Buffer)
	dir := [2]string{"->", "<-"}[flowType]
	for _, e := range path {
		fmt.Fprintf(w, " %v %v", dir, e.id)
	}
	return w.String()
}

func (ctx *context) inferCommandVariants(name, file string, arg int) []string {
	ctx.trace("inferring %v:arg%v variants", name, arg)
	fn := ctx.findFunc(name, file)
	if fn == nil {
		return nil
	}
	var variants []string
	n := fn.facts[fmt.Sprintf("arg%v", arg)]
	if n == nil {
		ctx.collectCommandVariants(fn, arg, &variants)
	} else {
		visited := make(map[*typingNode]bool)
		ctx.walkCommandVariants(n, &variants, visited, 0)
	}
	return clangtool.SortAndDedupSlice(variants)
}

func (ctx *context) collectCommandVariants(fn *Function, arg int, variants *[]string) {
	var values []string
	for _, scope := range fn.Scopes {
		if scope.Arg == arg {
			values = append(values, scope.Values...)
		}
	}
	if len(values) != 0 {
		ctx.trace("  function %v:arg%v implements: %v", fn.Name, arg, values)
		*variants = append(*variants, values...)
	}
}

func (ctx *context) walkCommandVariants(n *typingNode, variants *[]string, visited map[*typingNode]bool, depth int) {
	if visited[n] || depth >= 10 {
		return
	}
	visited[n] = true
	if n.arg >= 0 {
		ctx.collectCommandVariants(n.fn, n.arg, variants)
	}
	for e := range n.flows[flowTo] {
		ctx.walkCommandVariants(e, variants, visited, depth+1)
	}
}

// inferArgFlow returns transitive closure of all function arguments that the given argument flows to.
func (ctx *context) inferArgFlow(arg fnArg) map[fnArg]bool {
	n := arg.fn.facts[fmt.Sprintf("arg%v", arg.arg)]
	if n == nil {
		return nil
	}
	fnArgs := make(map[fnArg]bool)
	visited := make(map[*typingNode]bool)
	ctx.walkArgFlow(n, fnArgs, visited, 0)
	return fnArgs
}

func (ctx *context) walkArgFlow(n *typingNode, fnArgs map[fnArg]bool, visited map[*typingNode]bool, depth int) {
	if visited[n] || depth >= 10 {
		return
	}
	visited[n] = true
	if n.arg >= 0 {
		fnArgs[fnArg{n.fn, n.arg}] = true
	}
	for e := range n.flows[flowTo] {
		ctx.walkArgFlow(e, fnArgs, visited, depth+1)
	}
}