6 files changed, 178 insertions, 17 deletions
diff --git a/executor/common_zlib.h b/executor/common_zlib.h
index 24382ac0c..5d37cd0e0 100644
--- a/executor/common_zlib.h
+++ b/executor/common_zlib.h
@@ -476,6 +476,7 @@ static int puff_zlib_to_file(const unsigned char* source, unsigned long sourcele
 	source += ZLIB_HEADER_WIDTH;
 	sourcelen -= ZLIB_HEADER_WIDTH;
 
+	// Note: pkg/image/compression.go also knows this const.
 	const unsigned long max_destlen = 132 << 20;
 	void* ret = mmap(0, max_destlen, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
 	if (ret == MAP_FAILED)
diff --git a/pkg/image/compression.go b/pkg/image/compression.go
index a2a51b146..83a479b62 100644
--- a/pkg/image/compression.go
+++ b/pkg/image/compression.go
@@ -29,14 +29,6 @@ func Compress(rawData []byte) []byte {
 	return buffer.Bytes()
 }
 
-func MustDecompress(compressed []byte) (data []byte, dtor func()) {
-	buf := new(bytes.Buffer)
-	if err := decompressWriter(buf, compressed); err != nil {
-		panic(err)
-	}
-	return buf.Bytes(), func() {}
-}
-
 func DecompressCheck(compressed []byte) error {
 	return decompressWriter(ioutil.Discard, compressed)
 }
diff --git a/pkg/image/compression_nonoptimized.go b/pkg/image/compression_nonoptimized.go
new file mode 100644
index 000000000..81b26350a
--- /dev/null
+++ b/pkg/image/compression_nonoptimized.go
@@ -0,0 +1,25 @@
+// Copyright 2022 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+//go:build windows || 386 || arm
+// +build windows 386 arm
+
+package image
+
+import (
+	"bytes"
+	"sync"
+)
+
+var decompressMu sync.Mutex
+
+func MustDecompress(compressed []byte) (data []byte, dtor func()) {
+	// Don't decompress more than one image at a time since it can consume lots of memory.
+	// Reconsider when/if we move mutation to the host process.
+	decompressMu.Lock()
+	buf := new(bytes.Buffer)
+	if err := decompressWriter(buf, compressed); err != nil {
+		panic(err)
+	}
+	return buf.Bytes(), decompressMu.Unlock
+}
diff --git a/pkg/image/compression_optimized.go b/pkg/image/compression_optimized.go
new file mode 100644
index 000000000..bfb9dece2
--- /dev/null
+++ b/pkg/image/compression_optimized.go
@@ -0,0 +1,117 @@
+// Copyright 2022 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+//go:build !windows && !386 && !arm
+// +build !windows,!386,!arm
+
+package image
+
+import (
+	"bytes"
+	"compress/zlib"
+	"fmt"
+	"io"
+	"sync"
+	"syscall"
+	"unsafe"
+)
+
+// Temporary scratch data used by the decompression procedure.
+type decompressScratch struct {
+	r   bytes.Reader
+	zr  io.Reader
+	buf []byte
+}
+
+var decompressPool = sync.Pool{New: func() interface{} {
+	return &decompressScratch{
+		buf: make([]byte, 8<<10),
+	}
+}}
+
+func MustDecompress(compressed []byte) (data []byte, dtor func()) {
+	// Optimized decompression procedure that is ~2x faster than a naive version
+	// and consumes significantly less memory and generates less garbage.
+	// Images tend to contain lots of 0s, especially the larger images.
+	// The main idea is that we mmap a buffer and then don't write 0s into it
+	// (since it already contains all 0s). As the result if a page is all 0s
+	// then we don't page it in and don't consume memory for it.
+	// Executor uses the same optimization during decompression.
+	scratch := decompressPool.Get().(*decompressScratch)
+	defer decompressPool.Put(scratch)
+	scratch.r.Reset(compressed)
+	if scratch.zr == nil {
+		zr, err := zlib.NewReader(&scratch.r)
+		if err != nil {
+			panic(err)
+		}
+		scratch.zr = zr
+	} else {
+		if err := scratch.zr.(zlib.Resetter).Reset(&scratch.r, nil); err != nil {
+			panic(err)
+		}
+	}
+	// We don't know the size of the uncompressed image.
+	// We could uncompress it into ioutil.Discard first, then allocate memory and uncompress second time
+	// (and it's still faster than the naive uncompress into bytes.Buffer!).
+	// But we know maximum size of images, so just mmap the max size.
+	// It's fast and unused part does not consume memory.
+	// Note: executor/common_zlib.h also knows this const.
+	const maxImageSize = 132 << 20
+	var err error
+	data, err = syscall.Mmap(-1, 0, maxImageSize, syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_ANON|syscall.MAP_PRIVATE)
+	if err != nil {
+		panic(err)
+	}
+	dtor = func() {
+		if err := syscall.Munmap(data[:maxImageSize]); err != nil {
+			panic(err)
+		}
+	}
+	offset := 0
+	for {
+		n, err := scratch.zr.Read(scratch.buf)
+		if err != nil && err != io.EOF {
+			panic(err)
+		}
+		if n == 0 {
+			break
+		}
+		if offset+n > len(data) {
+			panic(fmt.Sprintf("bad image size: offset=%v n=%v data=%v", offset, n, len(data)))
+		}
+		// Copy word-at-a-time and avoid bounds checks in the loop,
+		// this is considerably faster than a naive byte loop.
+		// We already checked bounds above.
+		type word uint64
+		const wordSize = unsafe.Sizeof(word(0))
+		// Don't copy the last word b/c otherwise we calculate pointer outside of scratch.buf object
+		// on the last iteration. We don't use it, but unsafe rules prohibit even calculating
+		// such pointers. Alternatively we could add 8 unused bytes to scratch.buf, but it will
+		// play badly with memory allocator size classes (it will consume whole additional page,
+		// or whatever is the alignment for such large objects). We could also break from the middle
+		// of the loop before updating src/dst pointers, but it hurts codegen a lot (compilers like
+		// canonical loop forms).
+		words := uintptr(n-1) / wordSize
+		src := (*word)(unsafe.Pointer(&scratch.buf[0]))
+		dst := (*word)(unsafe.Pointer(&data[offset]))
+		for i := uintptr(0); i < words; i++ {
+			if *src != 0 {
+				*dst = *src
+			}
+			src = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(src)) + wordSize))
+			dst = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(dst)) + wordSize))
+		}
+		// Copy any remaining trailing bytes.
+		for i := words * wordSize; i < uintptr(n); i++ {
+			v := scratch.buf[i]
+			if v != 0 {
+				data[uintptr(offset)+i] = v
+			}
+		}
+		offset += n
+	}
+	data = data[:offset]
+	return
+}
diff --git a/pkg/image/compression_test.go b/pkg/image/compression_test.go
index 0107f82aa..15a225052 100644
--- a/pkg/image/compression_test.go
+++ b/pkg/image/compression_test.go
@@ -1,15 +1,20 @@
 // Copyright 2022 syzkaller project authors. All rights reserved.
 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
 
-package image
+package image_test
 
 import (
 	"bytes"
 	"fmt"
+	"io/ioutil"
 	"math/rand"
+	"path/filepath"
 	"testing"
 
+	. "github.com/google/syzkaller/pkg/image"
 	"github.com/google/syzkaller/pkg/testutil"
+	"github.com/google/syzkaller/prog"
+	_ "github.com/google/syzkaller/sys/linux/gen"
 )
 
 func TestCompress(t *testing.T) {
@@ -43,3 +48,32 @@ func TestEncode(t *testing.T) {
 		}
 	}
 }
+
+func BenchmarkDecompress(b *testing.B) {
+	// Extract the largest image seed.
+	data, err := ioutil.ReadFile(filepath.FromSlash("../../sys/linux/test/syz_mount_image_gfs2_0"))
+	if err != nil {
+		b.Fatal(err)
+	}
+	target, err := prog.GetTarget("linux", "amd64")
+	if err != nil {
+		b.Fatal(err)
+	}
+	p, err := target.Deserialize(data, prog.Strict)
+	if err != nil {
+		b.Fatalf("failed to deserialize the program: %v", err)
+	}
+	compressed := p.Calls[0].Args[6].(*prog.PointerArg).Res.(*prog.DataArg).Data()
+	if len(compressed) < 100<<10 {
+		b.Fatalf("compressed data is too small: %v", len(compressed))
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		uncompressed, dtor := MustDecompress(compressed)
+		if len(uncompressed) < 10<<20 {
+			b.Fatalf("uncompressed data is too small: %v", len(uncompressed))
+		}
+		dtor()
+	}
+}
diff --git a/prog/mutation.go b/prog/mutation.go
index 5a38cfa88..09fdfa69c 100644
--- a/prog/mutation.go
+++ b/prog/mutation.go
@@ -9,7 +9,6 @@ import (
 	"math"
 	"math/rand"
 	"sort"
-	"sync"
 
 	"github.com/google/syzkaller/pkg/image"
 )
@@ -385,17 +384,10 @@ func (t *BufferType) mutate(r *randGen, s *state, arg Arg, ctx ArgCtx) (calls []
 	return
 }
 
-var imageMu sync.Mutex
-
 func (r *randGen) mutateImage(compressed []byte) (data []byte, retry bool) {
 	if len(compressed) == 0 {
 		return compressed, true
 	}
-	// Don't decompress more than one image at a time
-	// since it can consume lots of memory.
-	// Reconsider when/if we move mutation to the host process.
-	imageMu.Lock()
-	defer imageMu.Unlock()
 	data, dtor := image.MustDecompress(compressed)
 	defer dtor()
 	if len(data) == 0 {