aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/declextract/typing.go
blob: e839d7bc48bf875651d558cb409029fc6ad69f6a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
// Copyright 2024 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package declextract

import (
	"bytes"
	"fmt"
	"slices"
	"strings"

	"github.com/google/syzkaller/pkg/clangtool"
)

// Argument/field type inference based on data flow analysis.
//
// First, the clang tool produces data flow summary for each function.
// The summary describes how data flows between function arguments, return values, local variables, and struct fields.
// Then, the logic in this file tracks global data flow in the kernel to infer types for syscall arguments,
// return values, and struct fields.
// If data transitively flows from an argument to a known function that accepts a resource of a particular type
// (e.g. __fget_light for file descriptors), then we infer that the original argument is an fd.
// Similarly, if data flows from a known function that creates a resource (e.g. alloc_fd for file descriptors)
// to a syscall return value, then we infer that the syscall returns an fd.
// For struct fields we track data flow in both directions (to/from) to infer their types.
//
// If the inference produces multiple resources, currently we pick the one with the shortest flow path
// (and then additionally pick lexicographically first among them for determinism). Potentially we could
// use a more elaborate strategy that would somehow rank candidates and/or produce multiple candidates
// (that we will then use as a union).
//
// Other potential improvements:
// - Add more functions that consume/produce resources.
// - Refine enum types. If we see an argument is used in bitops with an enum, it has that enum type.
// - Infer pointer types when they flow to copy_from_user (sometimes they are declared as uint64).
// - Infer that pointers are file names (they should flow to some known function for path resolution).
// - Use SSA analysis to track flow via local variables better. Potentially we can just rename on every next use
//   and ignore backwards edges (it's unlikely that backwards edges are required for type inference).
// - Infer file_operations associated with an fd by tracking flow to alloc_file_pseudo and friends.
// - Infer netlink arg types by tracking flow from genl_info::attrs[ATTR_FOO].
// - Infer simple constraints on arguments, e.g. "if (arg != 0) return -EINVAL".
// - Use kernel typedefs for typing (e.g. pid_t). We can use them for uapi structs, but also for kernel
//   structs and function arguments during dataflow tracking (e.g. if int flows to a pid_t argument, it's a pid).
// - Track side flows. E.g. dup2 argument newfd flows to the return value, and newfd can be inferred to be an fd,
//   but currently we don't infer that the return value is an fd. Potentially we could infer that.
// - Detect cases where returned value is actually an error rather than a resource.
//   For example, these cases lead to false inference of fd type for returned value:
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/core/sock.c#L1870
//   https://elixir.bootlin.com/linux/v6.13-rc2/source/net/socket.c#L1742
// - Use const[0] for unused arguments. If an arg is unused, or only flows to functions where it's unused,
//   we can consider it as unused.
// - Detect common patterns for "must be 0" or "must be const" arguments, e.g.:
//     if (flags != 0) return -EINVAL;
// - Capture taking address of functions in functions.
//   If code takes a function address, the target function most likely needs to be accounted
//   in LOC/complexity/coverage analysis (effectively called). We won't see this function
//   to be called via a function pointer later, or it may be passed to a very common function
//   that we won't analyze (e.g. single_open(..., show_callback, ...)).
// - Extract file permissions during ifaceprobe and use that to assign interface accessibility
//   for file interfaces, e.g. for:
//     crw-------   1 root    root      10,   239 Apr  8 20:36 uhid
//   we can say that it's root-only.

var (
	// Refines types based on data flows...
	// Keys are node IDs in the "function:node" format produced by canonicalNode
	// (e.g. "__fget_light:arg0"), values are resource type names.
	// The array is indexed by flow direction (flowTo/flowFrom): inferContext.walk
	// consults flowResources[flowTo] when following flows forward from a node
	// (the data is consumed by a known function), and flowResources[flowFrom]
	// when following flows backwards (the data comes from a known producer).
	flowResources = [2]map[string]string{
		// ...to function arguments.
		{
			"__fget_light:arg0":       "fd",
			"__fget_files_rcu:arg1":   "fd",
			"make_kuid:arg1":          "uid",
			"make_kgid:arg1":          "gid",
			"find_pid_ns:arg0":        "pid",
			"pidfd_get_pid:arg0":      "fd_pidfd",
			"__dev_get_by_index:arg1": "ifindex",
		},
		// ...from function return value.
		{
			"alloc_fd:ret":  "fd",
			"pid_nr_ns:ret": "pid",
			"from_kuid:ret": "uid",
			"from_kgid:ret": "gid",
		},
	}
	// These functions/structs/files provide very high false connectivity between unrelated nodes.
	// Flow facts that mention them are dropped in canonicalNode.
	flowIgnoreFuncs = map[string]bool{
		"ptr_to_compat": true,
		"compat_ptr":    true,
	}
	flowIgnoreStructs = map[string]bool{
		"pt_regs": true,
		"io_cqe":  true,
		"inode":   true,
	}
	flowIgnoreFiles = map[string]bool{
		"include/linux/err.h":     true, // PTR_ERR/ERR_PTR/ERR_CAST
		"include/linux/byteorder": true, // ntohl/etc
		"include/linux/uaccess.h": true, // copy_to/from_user
		"fs/befs/endian.h":        true, // cpu_to_fs32/etc
		"fs/ufs/swab.h":           true,
	}
)

// Limit on the flow graph traversal depth to avoid false positives due to weird false connections.
const maxTraversalDepth = 18

// typingNode is a node in the global data flow graph: a function argument,
// return value, local variable, struct field, or global variable.
// id is the globally unique node name ("fn:argN"/"fn:ret"/"fn:loc.name" for
// function-scoped entities, "struct.field" or the global name otherwise).
// fn is the function the node was attributed to, and arg is the argument index
// for argument nodes (-1 otherwise).
// flows holds adjacency lists for both traversal directions; each edge is
// annotated with the scopes in which the flow was observed.
type typingNode struct {
	id    string
	fn    *Function
	arg   int
	flows [2]map[*typingNode][]*FunctionScope
}

// Flow directions, used to index typingNode.flows and flowResources.
const (
	flowTo = iota
	flowFrom
)

// processTypingFacts builds the global data flow graph from the per-function
// flow facts produced by the clang tool. Every fact becomes a pair of edges
// (forward and backward) between the canonicalized src/dst nodes, annotated
// with the scope the fact was observed in.
func (ctx *context) processTypingFacts() {
	for _, fn := range ctx.Functions {
		for _, scope := range fn.Scopes {
			// Remember the owning function, relevantScope uses it later.
			scope.fn = fn
			for _, fact := range scope.Facts {
				from, to := ctx.canonicalNode(fn, fact.Src), ctx.canonicalNode(fn, fact.Dst)
				if from == nil || to == nil {
					// At least one endpoint is ignored for flow analysis
					// (unknown function, or an ignore-listed function/struct/file).
					continue
				}
				from.flows[flowTo][to] = append(from.flows[flowTo][to], scope)
				to.flows[flowFrom][from] = append(to.flows[flowFrom][from], scope)
			}
		}
	}
}

// canonicalNode returns the unique typingNode for the given typing entity,
// creating it on first use. Function-scoped entities (arguments, return values,
// locals) are stored in the facts map of the owning function, while struct
// fields and globals live in the global ctx.facts map.
// Returns nil if the entity should be ignored for flow analysis
// (unknown function, or an ignore-listed function/struct/file).
func (ctx *context) canonicalNode(fn *Function, ent *TypingEntity) *typingNode {
	scope, id := ent.ID(fn)
	fullID := id
	facts := ctx.facts
	if scope != "" {
		// The entity belongs to some function (possibly not the current one,
		// e.g. an argument/return value of a callee).
		if scope != fn.Name {
			fn = ctx.findFunc(scope, fn.File)
			if fn == nil {
				return nil
			}
		}
		if flowIgnoreFuncs[fn.Name] || flowIgnoreFiles[fn.File] {
			return nil
		}
		if fn.facts == nil {
			fn.facts = make(map[string]*typingNode)
		}
		facts = fn.facts
		// Prefix with the function name to make the node ID globally unique.
		fullID = fmt.Sprintf("%v:%v", scope, id)
	} else if ent.Field != nil && flowIgnoreStructs[ent.Field.Struct] {
		return nil
	}
	// Note: facts is keyed by the short id (unique within the chosen map),
	// while the node itself carries the fully qualified fullID.
	n := facts[id]
	if n != nil {
		return n
	}
	arg := -1
	if ent.Argument != nil {
		arg = ent.Argument.Arg
	}
	n = &typingNode{
		id:  fullID,
		fn:  fn,
		arg: arg,
	}
	for i := range n.flows {
		n.flows[i] = make(map[*typingNode][]*FunctionScope)
	}
	facts[id] = n
	return n
}

// ID returns the scope and the in-scope ID of the typing entity.
// The scope is the name of the owning function for function-scoped entities
// (arguments, return values, locals), and empty for struct fields and globals.
func (ent *TypingEntity) ID(fn *Function) (scope, id string) {
	switch {
	case ent.Return != nil:
		scope, id = ent.Return.Func, "ret"
	case ent.Argument != nil:
		scope, id = ent.Argument.Func, fmt.Sprintf("arg%v", ent.Argument.Arg)
	case ent.Local != nil:
		scope, id = fn.Name, fmt.Sprintf("loc.%v", ent.Local.Name)
	case ent.Field != nil:
		id = fmt.Sprintf("%v.%v", ent.Field.Struct, ent.Field.Field)
	case ent.GlobalAddr != nil:
		id = ent.GlobalAddr.Name
	default:
		panic("unhandled type")
	}
	return scope, id
}

// inferReturnType infers a resource type for the return value of the given
// function (see inferFuncNode for the meaning of scopeArg/scopeVal).
func (ctx *context) inferReturnType(name, file string, scopeArg int, scopeVal string) string {
	const node = "ret"
	return ctx.inferFuncNode(name, file, node, scopeArg, scopeVal)
}

// inferArgType infers a resource type for the given argument of the given
// function (see inferFuncNode for the meaning of scopeArg/scopeVal).
func (ctx *context) inferArgType(name, file string, arg, scopeArg int, scopeVal string) string {
	node := fmt.Sprintf("arg%v", arg)
	return ctx.inferFuncNode(name, file, node, scopeArg, scopeVal)
}

// fnArg identifies a concrete argument of a concrete function.
type fnArg struct {
	fn  *Function
	arg int
}

// inferFuncNode infers a resource type for the given node ("ret"/"argN") of the
// function name/file. When scopeArg/scopeVal are given, the walk is limited to
// scopes guarded by that argument having that value (used for ioctl-like
// command dispatch).
func (ctx *context) inferFuncNode(name, file, node string, scopeArg int, scopeVal string) string {
	fn := ctx.findFunc(name, file)
	if fn == nil {
		return ""
	}
	what := fmt.Sprintf("%v %v", name, node)
	scopeFnArgs := ctx.inferArgFlow(fnArg{fn, scopeArg})
	return ctx.inferNodeType(fn.facts[node], scopeFnArgs, scopeVal, what)
}

// inferFieldType infers a resource type for the given struct field.
// Field nodes live in the global facts map under "struct.field" names.
func (ctx *context) inferFieldType(structName, field string) string {
	node := fmt.Sprintf("%v.%v", structName, field)
	return ctx.inferNodeType(ctx.facts[node], nil, "", node)
}

// inferNodeType runs the flow graph walk for node n in both directions and
// returns the inferred resource type ("" if nothing was inferred).
// scopeFnArgs/scopeVal optionally limit the walk to relevant scopes.
func (ctx *context) inferNodeType(n *typingNode, scopeFnArgs map[fnArg]bool, scopeVal, what string) string {
	if n == nil {
		return ""
	}
	ic := &inferContext{
		scopeFnArgs: scopeFnArgs,
		scopeVal:    scopeVal,
		maxDepth:    maxTraversalDepth,
	}
	// First look for resource producers (walking flows backwards), then for
	// resource consumers (walking flows forward). The walks share ic.result,
	// so the second pass can only improve on the first one.
	for _, dir := range [2]int{flowFrom, flowTo} {
		ic.flowType = dir
		ic.visited = make(map[*typingNode]bool)
		ic.walk(n)
	}
	if ic.result != "" {
		ctx.trace("inferred %v\n  %v%v", what, ic.result, flowString(ic.resultPath, ic.resultFlow))
	}
	return ic.result
}

// inferContext holds the state of a single type inference walk over the flow graph.
// path is the current DFS stack; visited guards against cycles within one walk;
// result/resultPath/resultFlow describe the best match found so far;
// maxDepth shrinks to the best path length found so far, pruning longer paths;
// flowType is the direction of the current walk (flowTo/flowFrom).
// scopeFnArgs/scopeVal optionally limit the walk to scopes guarded by
// a particular argument value (see relevantScope).
type inferContext struct {
	path        []*typingNode
	visited     map[*typingNode]bool
	scopeFnArgs map[fnArg]bool
	scopeVal    string
	result      string
	resultPath  []*typingNode
	resultFlow  int
	flowType    int
	maxDepth    int
}

// walk does a depth-limited DFS over the flow graph in the ic.flowType
// direction, looking for nodes listed in flowResources. The best match
// (shortest flow path) is recorded in ic.result/resultPath/resultFlow,
// and ic.maxDepth is lowered to the best path length so longer paths are pruned.
func (ic *inferContext) walk(n *typingNode) {
	if ic.visited[n] {
		return
	}
	ic.visited[n] = true
	ic.path = append(ic.path, n)
	if result, ok := flowResources[ic.flowType][n.id]; ok {
		// Prefer a shorter flow path; among equal-length paths use
		// lexicographical order just to make the result stable.
		// Note: built-in string comparison instead of strings.Compare —
		// the latter is discouraged by its own documentation.
		if ic.result == "" || len(ic.path) < ic.maxDepth ||
			len(ic.path) == ic.maxDepth && result < ic.result {
			ic.result = result
			ic.resultPath = slices.Clone(ic.path)
			ic.resultFlow = ic.flowType
			ic.maxDepth = len(ic.path)
		}
	}
	if len(ic.path) < ic.maxDepth {
		for e, scopes := range n.flows[ic.flowType] {
			// Follow only edges observed in scopes relevant to the current walk.
			if relevantScopes(ic.scopeFnArgs, ic.scopeVal, scopes) {
				ic.walk(e)
			}
		}
	}
	ic.path = ic.path[:len(ic.path)-1]
}

// relevantScopes reports whether at least one of the scopes is relevant
// to the current (possibly scope-limited) walk.
func relevantScopes(scopeFnArgs map[fnArg]bool, scopeVal string, scopes []*FunctionScope) bool {
	return slices.ContainsFunc(scopes, func(scope *FunctionScope) bool {
		return relevantScope(scopeFnArgs, scopeVal, scope)
	})
}

// relevantScope reports whether a flow edge observed in the given scope should
// be followed. scopeFnArgs is the transitive closure of function arguments that
// the scope-defining argument flows to, and scopeVal is the value it must have
// (e.g. a particular ioctl command).
func relevantScope(scopeFnArgs map[fnArg]bool, scopeVal string, scope *FunctionScope) bool {
	if scopeFnArgs == nil {
		// We are not doing scope-limited walk, so all scopes are relevant.
		return true
	}
	if scope.Arg == -1 {
		// Always use global scope.
		return true
	}
	if !scopeFnArgs[fnArg{scope.fn, scope.Arg}] {
		// The scope argument is not related to the current scope.
		return true
	}
	// For the scope argument, check that it has the right value.
	return slices.Contains(scope.Values, scopeVal)
}

// refineFieldType replaces f's syzlang type with the inferred typ when it is
// a genuine refinement of the current type.
func refineFieldType(f *Field, typ string, preserveSize bool) {
	// Nothing inferred, or nothing would change.
	if typ == "" || typ == f.syzType {
		return
	}
	// If our manual heuristics have figured out a more precise fd subtype,
	// don't replace it with generic fd.
	if typ == "fd" && (strings.HasPrefix(f.syzType, "fd_") || strings.HasPrefix(f.syzType, "sock")) {
		return
	}
	// For struct fields we need to keep the original size.
	// Sometimes fd is passed as uint64.
	if preserveSize {
		typ = fmt.Sprintf("auto_union[%v, %v]", typ, f.syzType)
	}
	f.syzType = typ
}

// flowString renders a flow path for tracing,
// e.g. " -> some_fn:arg0 -> __fget_light:arg0".
func flowString(path []*typingNode, flowType int) string {
	arrow := "->"
	if flowType == flowFrom {
		arrow = "<-"
	}
	buf := new(bytes.Buffer)
	for _, n := range path {
		fmt.Fprintf(buf, " %v %v", arrow, n.id)
	}
	return buf.String()
}

// inferCommandVariants collects all scope values (e.g. ioctl command numbers)
// that the given function argument is compared against, transitively following
// flows of the argument into callees. The result is sorted and deduplicated.
func (ctx *context) inferCommandVariants(name, file string, arg int) []string {
	ctx.trace("inferring %v:arg%v variants", name, arg)
	fn := ctx.findFunc(name, file)
	if fn == nil {
		return nil
	}
	var variants []string
	if n := fn.facts[fmt.Sprintf("arg%v", arg)]; n != nil {
		ctx.walkCommandVariants(n, &variants, make(map[*typingNode]bool), 0)
	} else {
		// No flow facts for this argument, collect values from the function itself.
		ctx.collectCommandVariants(fn, arg, &variants)
	}
	return clangtool.SortAndDedupSlice(variants)
}

// collectCommandVariants appends to variants all scope values that fn compares
// its arg-th argument against.
func (ctx *context) collectCommandVariants(fn *Function, arg int, variants *[]string) {
	var vals []string
	for _, scope := range fn.Scopes {
		if scope.Arg != arg {
			continue
		}
		vals = append(vals, scope.Values...)
	}
	if len(vals) == 0 {
		return
	}
	ctx.trace("  function %v:arg%v implements: %v", fn.Name, arg, vals)
	*variants = append(*variants, vals...)
}

// walkCommandVariants follows forward flows from n (depth-limited, cycle-safe)
// and collects command variants from every function argument node reached.
func (ctx *context) walkCommandVariants(n *typingNode, variants *[]string, visited map[*typingNode]bool, depth int) {
	const depthLimit = 10
	if depth >= depthLimit || visited[n] {
		return
	}
	visited[n] = true
	if n.arg >= 0 {
		ctx.collectCommandVariants(n.fn, n.arg, variants)
	}
	for next := range n.flows[flowTo] {
		ctx.walkCommandVariants(next, variants, visited, depth+1)
	}
}

// inferArgFlow returns transitive closure of all function arguments that the given argument flows to.
// Returns nil if there are no flow facts for the argument.
func (ctx *context) inferArgFlow(arg fnArg) map[fnArg]bool {
	node := arg.fn.facts[fmt.Sprintf("arg%v", arg.arg)]
	if node == nil {
		return nil
	}
	closure := make(map[fnArg]bool)
	ctx.walkArgFlow(node, closure, make(map[*typingNode]bool), 0)
	return closure
}

// walkArgFlow follows forward flows from n (depth-limited, cycle-safe) and
// records every function argument node reached in fnArgs.
func (ctx *context) walkArgFlow(n *typingNode, fnArgs map[fnArg]bool, visited map[*typingNode]bool, depth int) {
	const depthLimit = 10
	if depth >= depthLimit || visited[n] {
		return
	}
	visited[n] = true
	if n.arg >= 0 {
		fnArgs[fnArg{n.fn, n.arg}] = true
	}
	for next := range n.flows[flowTo] {
		ctx.walkArgFlow(next, fnArgs, visited, depth+1)
	}
}