author    Dmitry Vyukov <dvyukov@google.com>    2022-12-17 13:00:48 +0100
committer Dmitry Vyukov <dvyukov@google.com>    2022-12-22 10:11:08 +0100
commit    1f82f0f8c44b9035420b1d9b9800fc9008fa15d7 (patch)
tree      f77e5257a6056cea9e271127a0d7dd8b472d996f /pkg/image/compression_optimized.go
parent    a0df376348d2ad1d3e557ea221e75c78a5d9fd96 (diff)
pkg/image: optimize image decompression
Benchmark results:

name          old time/op    new time/op    delta
Decompress-8  24.7ms ± 1%    13.4ms ± 4%    -45.81%  (p=0.000 n=16+19)

name          old alloc/op   new alloc/op   delta
Decompress-8  67.2MB ± 0%    0.0MB ± 1%     -99.98%  (p=0.000 n=18+20)

name          old allocs/op  new allocs/op  delta
Decompress-8  188 ± 0%       167 ± 0%       -11.17%  (p=0.000 n=20+20)

Test process memory consumption drops from 220MB to 80MB.
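The table reads as benchstat output. A minimal harness that could produce measurements of this shape is sketched below; the 64MB mostly-zero payload and the benchmark body are illustrative assumptions, not the repository's actual benchmark code.

package image

import (
	"bytes"
	"compress/zlib"
	"testing"
)

// Sketch of a decompression benchmark; payload size and contents are assumed.
func BenchmarkDecompress(b *testing.B) {
	// Compress a mostly-zero buffer, roughly mimicking a disk image.
	var w bytes.Buffer
	zw := zlib.NewWriter(&w)
	zw.Write(make([]byte, 64<<20))
	zw.Close()
	compressed := w.Bytes()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		data, dtor := MustDecompress(compressed)
		_ = data
		dtor()
	}
}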
Diffstat (limited to 'pkg/image/compression_optimized.go')
 -rw-r--r--  pkg/image/compression_optimized.go  117
 1 file changed, 117 insertions(+), 0 deletions(-)
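For context, the "naive version" that the old numbers measure (and that a comment in the new code compares against) is essentially decompression into a growing heap buffer. A sketch for comparison, assuming the same imports as the file below (bytes, compress/zlib, io); this is not the code the commit replaces:

func mustDecompressNaive(compressed []byte) []byte {
	zr, err := zlib.NewReader(bytes.NewReader(compressed))
	if err != nil {
		panic(err)
	}
	defer zr.Close()
	// Every byte, zeros included, lands in a heap buffer that repeatedly
	// grows and copies itself; this is exactly what the mmap trick avoids.
	var buf bytes.Buffer
	if _, err := io.Copy(&buf, zr); err != nil {
		panic(err)
	}
	return buf.Bytes()
}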
diff --git a/pkg/image/compression_optimized.go b/pkg/image/compression_optimized.go
new file mode 100644
index 000000000..bfb9dece2
--- /dev/null
+++ b/pkg/image/compression_optimized.go
@@ -0,0 +1,117 @@
+// Copyright 2022 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+//go:build !windows && !386 && !arm
+// +build !windows,!386,!arm
+
+package image
+
+import (
+	"bytes"
+	"compress/zlib"
+	"fmt"
+	"io"
+	"sync"
+	"syscall"
+	"unsafe"
+)
+
+// Temporary scratch data used by the decompression procedure.
+type decompressScratch struct {
+	r   bytes.Reader
+	zr  io.Reader
+	buf []byte
+}
+
+var decompressPool = sync.Pool{New: func() interface{} {
+	return &decompressScratch{
+		buf: make([]byte, 8<<10),
+	}
+}}
+
+func MustDecompress(compressed []byte) (data []byte, dtor func()) {
+	// Optimized decompression procedure that is ~2x faster than a naive version
+	// and consumes significantly less memory and generates less garbage.
+	// Images tend to contain lots of 0s, especially the larger images.
+	// The main idea is that we mmap a buffer and then don't write 0s into it
+	// (since it already contains all 0s). As a result, if a page is all 0s,
+	// then we don't page it in and don't consume memory for it.
+	// The executor uses the same optimization during decompression.
+	scratch := decompressPool.Get().(*decompressScratch)
+	defer decompressPool.Put(scratch)
+	scratch.r.Reset(compressed)
+	if scratch.zr == nil {
+		zr, err := zlib.NewReader(&scratch.r)
+		if err != nil {
+			panic(err)
+		}
+		scratch.zr = zr
+	} else {
+		if err := scratch.zr.(zlib.Resetter).Reset(&scratch.r, nil); err != nil {
+			panic(err)
+		}
+	}
+	// We don't know the size of the uncompressed image.
+	// We could uncompress it into ioutil.Discard first, then allocate memory and uncompress a second time
+	// (and that is still faster than the naive uncompress into a bytes.Buffer!).
+	// But we know the maximum size of images, so just mmap the max size.
+	// It's fast, and the unused part does not consume memory.
+	// Note: executor/common_zlib.h also knows this const.
+	const maxImageSize = 132 << 20
+	var err error
+	data, err = syscall.Mmap(-1, 0, maxImageSize, syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_ANON|syscall.MAP_PRIVATE)
+	if err != nil {
+		panic(err)
+	}
+	dtor = func() {
+		if err := syscall.Munmap(data[:maxImageSize]); err != nil {
+			panic(err)
+		}
+	}
+	offset := 0
+	for {
+		n, err := scratch.zr.Read(scratch.buf)
+		if err != nil && err != io.EOF {
+			panic(err)
+		}
+		if n == 0 {
+			break
+		}
+		if offset+n > len(data) {
+			panic(fmt.Sprintf("bad image size: offset=%v n=%v data=%v", offset, n, len(data)))
+		}
+		// Copy word-at-a-time and avoid bounds checks in the loop;
+		// this is considerably faster than a naive byte loop.
+		// We already checked bounds above.
+		type word uint64
+		const wordSize = unsafe.Sizeof(word(0))
+		// Don't copy the last word, because otherwise we would calculate a pointer outside of the
+		// scratch.buf object on the last iteration. We don't dereference it, but unsafe rules prohibit
+		// even calculating such pointers. Alternatively we could add 8 unused bytes to scratch.buf,
+		// but that would play badly with memory allocator size classes (it would consume a whole
+		// additional page, or whatever the alignment is for such large objects). We could also break
+		// from the middle of the loop before updating the src/dst pointers, but that hurts codegen
+		// a lot (compilers like canonical loop forms).
+		words := uintptr(n-1) / wordSize
+		src := (*word)(unsafe.Pointer(&scratch.buf[0]))
+		dst := (*word)(unsafe.Pointer(&data[offset]))
+		for i := uintptr(0); i < words; i++ {
+			if *src != 0 {
+				*dst = *src
+			}
+			src = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(src)) + wordSize))
+			dst = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(dst)) + wordSize))
+		}
+		// Copy any remaining trailing bytes.
+		for i := words * wordSize; i < uintptr(n); i++ {
+			v := scratch.buf[i]
+			if v != 0 {
+				data[uintptr(offset)+i] = v
+			}
+		}
+		offset += n
+	}
+	data = data[:offset]
+	return
+}
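A minimal caller sketch (process is a hypothetical consumer): the returned dtor must run exactly once, after the data is no longer needed, because data points into the mapping that dtor unmaps.

	data, dtor := image.MustDecompress(compressed)
	defer dtor()
	process(data) // hypothetical consumer; must not retain data past dtor()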