diff options
| author | Dmitry Vyukov <dvyukov@google.com> | 2022-12-17 13:00:48 +0100 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2022-12-22 10:11:08 +0100 |
| commit | 1f82f0f8c44b9035420b1d9b9800fc9008fa15d7 (patch) | |
| tree | f77e5257a6056cea9e271127a0d7dd8b472d996f /pkg/image/compression_optimized.go | |
| parent | a0df376348d2ad1d3e557ea221e75c78a5d9fd96 (diff) | |
pkg/image: optimize image decompression
Benchmark results:
name old time/op new time/op delta
Decompress-8 24.7ms ± 1% 13.4ms ± 4% -45.81% (p=0.000 n=16+19)
name old alloc/op new alloc/op delta
Decompress-8 67.2MB ± 0% 0.0MB ± 1% -99.98% (p=0.000 n=18+20)
name old allocs/op new allocs/op delta
Decompress-8 188 ± 0% 167 ± 0% -11.17% (p=0.000 n=20+20)
Test process memory consumption drops from 220MB to 80MB.
Diffstat (limited to 'pkg/image/compression_optimized.go')
| -rw-r--r-- | pkg/image/compression_optimized.go | 117 |
1 file changed, 117 insertions, 0 deletions
diff --git a/pkg/image/compression_optimized.go b/pkg/image/compression_optimized.go
new file mode 100644
index 000000000..bfb9dece2
--- /dev/null
+++ b/pkg/image/compression_optimized.go
@@ -0,0 +1,117 @@
+// Copyright 2022 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+// Build-constrained to 64-bit non-Windows targets: the code below relies on
+// syscall.Mmap with MAP_ANON and copies words as uint64, neither of which
+// suits Windows or 32-bit (386/arm) platforms.
+//go:build !windows && !386 && !arm
+// +build !windows,!386,!arm
+
+package image
+
+import (
+	"bytes"
+	"compress/zlib"
+	"fmt"
+	"io"
+	"sync"
+	"syscall"
+	"unsafe"
+)
+
+// Temporary scratch data used by the decompression procedure.
+// Pooled (see decompressPool) so that the zlib reader and the read buffer
+// are reused across calls instead of being reallocated each time.
+type decompressScratch struct {
+	r   bytes.Reader // reads from the caller-supplied compressed slice
+	zr  io.Reader    // cached zlib reader; reused via zlib.Resetter after first creation
+	buf []byte       // 8KB chunk buffer for reads from zr
+}
+
+// decompressPool recycles decompressScratch objects across MustDecompress calls.
+var decompressPool = sync.Pool{New: func() interface{} {
+	return &decompressScratch{
+		buf: make([]byte, 8<<10),
+	}
+}}
+
+// MustDecompress decompresses zlib-compressed image data.
+// It returns the uncompressed data and a dtor function that the caller must
+// invoke to release the data — the buffer is anonymously mmap-ed rather than
+// heap-allocated, so it is freed with munmap, not by the GC.
+// Any failure (malformed input, mmap error) panics, hence the Must prefix.
+func MustDecompress(compressed []byte) (data []byte, dtor func()) {
+	// Optimized decompression procedure that is ~2x faster than a naive version
+	// and consumes significantly less memory and generates less garbage.
+	// Images tend to contain lots of 0s, especially the larger images.
+	// The main idea is that we mmap a buffer and then don't write 0s into it
+	// (since it already contains all 0s). As the result if a page is all 0s
+	// then we don't page it in and don't consume memory for it.
+	// Executor uses the same optimization during decompression.
+	scratch := decompressPool.Get().(*decompressScratch)
+	defer decompressPool.Put(scratch)
+	scratch.r.Reset(compressed)
+	if scratch.zr == nil {
+		// First use of this scratch object: create the zlib reader.
+		zr, err := zlib.NewReader(&scratch.r)
+		if err != nil {
+			panic(err)
+		}
+		scratch.zr = zr
+	} else {
+		// Reuse the cached reader; zlib readers implement zlib.Resetter.
+		if err := scratch.zr.(zlib.Resetter).Reset(&scratch.r, nil); err != nil {
+			panic(err)
+		}
+	}
+	// We don't know the size of the uncompressed image.
+	// We could uncompress it into ioutil.Discard first, then allocate memory and uncompress second time
+	// (and it's still faster than the naive uncompress into bytes.Buffer!).
+	// But we know maximum size of images, so just mmap the max size.
+	// It's fast and unused part does not consume memory.
+	// Note: executor/common_zlib.h also knows this const.
+	const maxImageSize = 132 << 20
+	var err error
+	data, err = syscall.Mmap(-1, 0, maxImageSize, syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_ANON|syscall.MAP_PRIVATE)
+	if err != nil {
+		panic(err)
+	}
+	dtor = func() {
+		// Munmap the full mapping: data is resliced to data[:offset] below,
+		// but its capacity is still maxImageSize, so data[:maxImageSize]
+		// recovers the whole mapped region.
+		if err := syscall.Munmap(data[:maxImageSize]); err != nil {
+			panic(err)
+		}
+	}
+	offset := 0
+	for {
+		n, err := scratch.zr.Read(scratch.buf)
+		if err != nil && err != io.EOF {
+			panic(err)
+		}
+		if n == 0 {
+			// EOF (or a zero-length read at EOF): decompression done.
+			break
+		}
+		if offset+n > len(data) {
+			panic(fmt.Sprintf("bad image size: offset=%v n=%v data=%v", offset, n, len(data)))
+		}
+		// Copy word-at-a-time and avoid bounds checks in the loop,
+		// this is considerably faster than a naive byte loop.
+		// We already checked bounds above.
+		type word uint64
+		const wordSize = unsafe.Sizeof(word(0))
+		// Don't copy the last word b/c otherwise we calculate pointer outside of scratch.buf object
+		// on the last iteration. We don't use it, but unsafe rules prohibit even calculating
+		// such pointers. Alternatively we could add 8 unused bytes to scratch.buf, but it will
+		// play badly with memory allocator size classes (it will consume whole additional page,
+		// or whatever is the alignment for such large objects). We could also break from the middle
+		// of the loop before updating src/dst pointers, but it hurts codegen a lot (compilers like
+		// canonical loop forms).
+		words := uintptr(n-1) / wordSize
+		src := (*word)(unsafe.Pointer(&scratch.buf[0]))
+		dst := (*word)(unsafe.Pointer(&data[offset]))
+		for i := uintptr(0); i < words; i++ {
+			// Skip zero words: the mmap-ed destination is already zeroed,
+			// and not writing keeps all-zero pages unmapped (the whole point
+			// of this optimization).
+			if *src != 0 {
+				*dst = *src
+			}
+			// Pointer arithmetic performed in a single expression per the
+			// unsafe.Pointer conversion rules.
+			src = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(src)) + wordSize))
+			dst = (*word)(unsafe.Pointer(uintptr(unsafe.Pointer(dst)) + wordSize))
+		}
+		// Copy any remaining trailing bytes (the partial last word skipped above),
+		// again writing only non-zero bytes.
+		for i := words * wordSize; i < uintptr(n); i++ {
+			v := scratch.buf[i]
+			if v != 0 {
+				data[uintptr(offset)+i] = v
+			}
+		}
+		offset += n
+	}
+	// Shrink the caller-visible slice to the actual decompressed length;
+	// the dtor above still frees the full mapping via the slice capacity.
+	data = data[:offset]
+	return
+}
