aboutsummaryrefslogtreecommitdiffstats
path: root/prog/heatmap.go
blob: 5063d51eaa9ec25b721ff4668fb4bfa9f7a497f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// Copyright 2022 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package prog

import (
	"fmt"
	"math/rand"
)

// Our heatmaps are a flexible mechanism to assign a probability distribution to
// some collection of bytes. Usage:
//  1. Choose a heatmap and initialize it: `hm := MakeXYZHeatmap(data, r)`.
//     Different heatmaps implement different probability distributions
//     (for now there is only one).
//  2. Decide how many mutations to perform: `n := hm.NumMutations()`.
//  3. Select random indices according to the probability distribution:
//     `idx := hm.ChooseLocation()`.
type Heatmap interface {
	// NumMutations returns how many mutations should be performed.
	NumMutations() int
	// ChooseLocation returns a random index into the underlying data,
	// drawn according to the heatmap's probability distribution.
	ChooseLocation() int
}

// MakeGenericHeatmap builds a Heatmap that models a probability distribution
// based on sparse data, prioritising selection of regions which are not a
// single repeated byte. The data is viewed as a series of chunks of length
// `granularity`; chunks consisting of one repeated byte are ignored, and
// indices are chosen uniformly amongst the remaining "interesting" segments.
// Panics if data is empty.
func MakeGenericHeatmap(data []byte, r *rand.Rand) Heatmap {
	if len(data) == 0 {
		panic("cannot create a GenericHeatmap with no data")
	}
	length, segments := calculateLengthAndSegments(data, granularity)
	return &GenericHeatmap{
		r:        r,
		segments: segments,
		length:   length,
	}
}

// NumMutations decides how many mutations should be applied to the data
// backing this heatmap.
func (hm *GenericHeatmap) NumMutations() int {
	// Start from one guaranteed mutation, add roughly one more per 4 KB of
	// heatmap size, then up to 4 extra at random so that even small images
	// can get more than one mutation.
	n := 1 + hm.r.Intn(hm.length/(4<<10)+1) + hm.r.Intn(5)
	// Cap the total, since too many mutations will most likely corrupt the image.
	if n > 10 {
		return 10
	}
	return n
}

// ChooseLocation returns a random index into the raw data, selected uniformly
// across the concatenated "interesting" segments and then translated back to
// a position in the original data.
func (hm *GenericHeatmap) ChooseLocation() int {
	return translateIdx(hm.r.Intn(hm.length), hm.segments)
}

// GenericHeatmap implements Heatmap by choosing locations uniformly amongst
// the "interesting" (non-constant) segments of the underlying data.
type GenericHeatmap struct {
	r        *rand.Rand // Source of randomness for mutation counts and locations.
	segments []segment  // "Interesting" parts of the data.
	length   int        // Sum of all segment lengths.
}

// segment describes one contiguous "interesting" region of the raw data.
type segment struct {
	offset int // Start of the segment within the raw data.
	length int // Number of bytes in the segment.
}

const granularity = 64 // Chunk size in bytes for processing the data.

// calculateLengthAndSegments determines the "interesting" segments of data —
// maximal runs of chunks (each `granularity` bytes, the final one possibly
// shorter) that are not a single repeated byte — and returns them together
// with their combined length. If no chunk is interesting, the whole input is
// returned as one segment so callers still get a uniform distribution over it.
func calculateLengthAndSegments(data []byte, granularity int) (int, []segment) {
	rawLength := len(data)
	segments := []segment{}
	// segStart/segLen track the segment currently being accumulated;
	// totalLength is the sum of all saved segment lengths.
	segStart, segLen, totalLength := 0, 0, 0

	// flush saves the in-progress segment (if any) and advances past it.
	flush := func() {
		if segLen > 0 {
			segments = append(segments, segment{offset: segStart, length: segLen})
			totalLength += segLen
			segStart += segLen
			segLen = 0
		}
	}

	for base := 0; base < rawLength; base += granularity {
		end := base + granularity
		if end > rawLength {
			end = rawLength
		}
		chunk := data[base:end]

		// A chunk is "boring" if every byte equals its first byte.
		constant := true
		for _, b := range chunk[1:] {
			if b != chunk[0] {
				constant = false
				break
			}
		}

		if constant {
			// End the current segment and skip past the constant bytes.
			flush()
			segStart += len(chunk)
		} else {
			// Non-constant — extend the current segment.
			segLen += len(chunk)
		}
	}

	// Save the final segment.
	flush()

	if len(segments) == 0 {
		// We found no segments, i.e. the data is all "boring". Fall back to a
		// uniform probability distribution over the original data by considering it
		// as one long segment.
		return rawLength, append(segments, segment{offset: 0, length: rawLength})
	}
	return totalLength, segments
}

// translateIdx converts an index into the "interesting" segments to an index
// into the raw data. I.e. it views `idx` as an index into the concatenated
// segments and translates it to an index into the original underlying data. E.g.:
//
//	segs = []segment{{offset: 10, length: 20}, {offset: 50, length: 10}}
//	translateIdx(25, segs) = 55
//
// I.e. we index element 5 of the second segment, so element 55 of the raw data.
// Panics if idx is negative or beyond the combined segment length.
func translateIdx(idx int, segs []segment) int {
	if idx < 0 {
		panic(fmt.Sprintf("translateIdx: negative index %v", idx))
	}
	remaining := idx
	for _, seg := range segs {
		if remaining >= seg.length {
			// Not in this segment — skip forward.
			remaining -= seg.length
			continue
		}
		return seg.offset + remaining
	}
	panic(fmt.Sprintf("translateIdx: index out of range %v", idx))
}