aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/apache/arrow/go/v15/internal
diff options
context:
space:
mode:
authorTaras Madan <tarasmadan@google.com>2024-09-10 12:16:33 +0200
committerTaras Madan <tarasmadan@google.com>2024-09-10 14:05:26 +0000
commitc97c816133b42257d0bcf1ee4bd178bb2a7a2b9e (patch)
tree0bcbc2e540bbf8f62f6c17887cdd53b8c2cee637 /vendor/github.com/apache/arrow/go/v15/internal
parent54e657429ab892ad06c90cd7c1a4eb33ba93a3dc (diff)
vendor: update
Diffstat (limited to 'vendor/github.com/apache/arrow/go/v15/internal')
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go452
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go151
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go361
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go109
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go90
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go26
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go37
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata42
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go2833
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl349
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go443
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/json/json.go51
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go51
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile80
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go212
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go30
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go33
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/math.go33
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go212
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go55
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go65
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go90
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s927
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go56
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s324
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go31
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go30
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go30
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go88
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s1044
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go407
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl34
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata34
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go325
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl75
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go96
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go473
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s3074
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go227
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go96
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl34
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go96
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go96
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl34
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl42
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go473
-rw-r--r--vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s3074
47 files changed, 17025 insertions, 0 deletions
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go
new file mode 100644
index 000000000..50996b10e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go
@@ -0,0 +1,452 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutils
+
+import (
+ "math"
+ "math/bits"
+ "unsafe"
+
+ "github.com/apache/arrow/go/v15/arrow/bitutil"
+ "github.com/apache/arrow/go/v15/internal/utils"
+)
+
+func loadWord(byt []byte) uint64 {
+ return utils.ToLEUint64(*(*uint64)(unsafe.Pointer(&byt[0])))
+}
+
+func shiftWord(current, next uint64, shift int64) uint64 {
+ if shift == 0 {
+ return current
+ }
+ return (current >> shift) | (next << (64 - shift))
+}
+
// BitBlockCount reports on a run of bits examined by the bit block counter
// utilities: Len is how many bits the run covered and Popcnt is how many of
// those bits were set to 1.
type BitBlockCount struct {
	Len    int16
	Popcnt int16
}

// NoneSet reports whether no bit in the block was set, i.e. Popcnt == 0.
func (b BitBlockCount) NoneSet() bool {
	return b.Popcnt == 0
}

// AllSet reports whether every bit in the block was set, i.e. Popcnt == Len.
func (b BitBlockCount) AllSet() bool {
	return b.Popcnt == b.Len
}
+
// BitBlockCounter is a utility for grabbing chunks of a bitmap at a time and efficiently
// counting the number of bits which are 1.
type BitBlockCounter struct {
	bitmap        []byte // remaining bitmap bytes; advanced as words are consumed
	bitsRemaining int64  // bits not yet returned to the caller
	bitOffset     int8   // bit position (0-7) within bitmap[0] where the data starts
}

const (
	wordBits      int64 = 64           // bits in one 64-bit word
	fourWordsBits int64 = wordBits * 4 // bits in a four-word (256-bit) block
)
+
+// NewBitBlockCounter returns a BitBlockCounter for the passed bitmap starting at startOffset
+// of length nbits.
+func NewBitBlockCounter(bitmap []byte, startOffset, nbits int64) *BitBlockCounter {
+ return &BitBlockCounter{
+ bitmap: bitmap[startOffset/8:],
+ bitsRemaining: nbits,
+ bitOffset: int8(startOffset % 8),
+ }
+}
+
// getBlockSlow is for returning a block of the requested size when there aren't
// enough bits remaining to do a full word computation. It counts
// min(bitsRemaining, blockSize) bits via bitutil.CountSetBits and advances
// the reader past the counted run.
//
// NOTE(review): bitOffset is not advanced here and the bitmap slice moves by
// whole bytes only (runlen/8); this appears to rely on runlen being a
// multiple of 8 except on the final call — confirm against callers.
func (b *BitBlockCounter) getBlockSlow(blockSize int64) BitBlockCount {
	runlen := int16(utils.Min(b.bitsRemaining, blockSize))
	popcnt := int16(bitutil.CountSetBits(b.bitmap, int(b.bitOffset), int(runlen)))
	b.bitsRemaining -= int64(runlen)
	b.bitmap = b.bitmap[runlen/8:]
	return BitBlockCount{runlen, popcnt}
}
+
+// NextFourWords returns the next run of available bits, usually 256. The
+// returned pair contains the size of run and the number of true values.
+// The last block will have a length less than 256 if the bitmap length
+// is not a multiple of 256, and will return 0-length blocks in subsequent
+// invocations.
+func (b *BitBlockCounter) NextFourWords() BitBlockCount {
+ if b.bitsRemaining == 0 {
+ return BitBlockCount{0, 0}
+ }
+
+ totalPopcnt := 0
+ if b.bitOffset == 0 {
+ // if we're aligned at 0 bitoffset, then we can easily just jump from
+ // word to word nice and easy.
+ if b.bitsRemaining < fourWordsBits {
+ return b.getBlockSlow(fourWordsBits)
+ }
+ totalPopcnt += bits.OnesCount64(loadWord(b.bitmap))
+ totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[8:]))
+ totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[16:]))
+ totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[24:]))
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if b.bitsRemaining < 5*fourWordsBits-int64(b.bitOffset) {
+ return b.getBlockSlow(fourWordsBits)
+ }
+
+ current := loadWord(b.bitmap)
+ next := loadWord(b.bitmap[8:])
+ totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
+
+ current = next
+ next = loadWord(b.bitmap[16:])
+ totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
+
+ current = next
+ next = loadWord(b.bitmap[24:])
+ totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
+
+ current = next
+ next = loadWord(b.bitmap[32:])
+ totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset)))
+ }
+ b.bitmap = b.bitmap[bitutil.BytesForBits(fourWordsBits):]
+ b.bitsRemaining -= fourWordsBits
+ return BitBlockCount{256, int16(totalPopcnt)}
+}
+
// NextWord returns the next run of available bits, usually 64. The returned
// pair contains the size of run and the number of true values. The last
// block will have a length less than 64 if the bitmap length is not a
// multiple of 64, and will return 0-length blocks in subsequent
// invocations.
func (b *BitBlockCounter) NextWord() BitBlockCount {
	if b.bitsRemaining == 0 {
		return BitBlockCount{0, 0}
	}
	popcnt := 0
	if b.bitOffset == 0 {
		if b.bitsRemaining < wordBits {
			// fewer than 64 bits left: count the tail bit-by-bit.
			return b.getBlockSlow(wordBits)
		}
		popcnt = bits.OnesCount64(loadWord(b.bitmap))
	} else {
		// When the offset is > 0, we need there to be a word beyond the last
		// aligned word in the bitmap for the bit shifting logic.
		if b.bitsRemaining < (2*wordBits - int64(b.bitOffset)) {
			return b.getBlockSlow(wordBits)
		}
		// Stitch a full 64-bit window out of two adjacent words.
		popcnt = bits.OnesCount64(shiftWord(loadWord(b.bitmap), loadWord(b.bitmap[8:]), int64(b.bitOffset)))
	}
	b.bitmap = b.bitmap[wordBits/8:]
	b.bitsRemaining -= wordBits
	return BitBlockCount{64, int16(popcnt)}
}
+
// OptionalBitBlockCounter is a useful counter to iterate through a possibly
// nonexistent validity bitmap to allow us to write one code path for both
// the with-nulls and no-nulls cases without giving up a lot of performance.
type OptionalBitBlockCounter struct {
	hasBitmap bool             // true when a validity bitmap was supplied
	pos       int64            // bits consumed so far
	len       int64            // total bits to iterate
	counter   *BitBlockCounter // nil when hasBitmap is false
}
+
+// NewOptionalBitBlockCounter constructs and returns a new bit block counter that
+// can properly handle the case when a bitmap is null, if it is guaranteed that the
+// the bitmap is not nil, then prefer NewBitBlockCounter here.
+func NewOptionalBitBlockCounter(bitmap []byte, offset, length int64) *OptionalBitBlockCounter {
+ var counter *BitBlockCounter
+ if bitmap != nil {
+ counter = NewBitBlockCounter(bitmap, offset, length)
+ }
+ return &OptionalBitBlockCounter{
+ hasBitmap: bitmap != nil,
+ pos: 0,
+ len: length,
+ counter: counter,
+ }
+}
+
+// NextBlock returns block count for next word when the bitmap is available otherwise
+// return a block with length up to INT16_MAX when there is no validity
+// bitmap (so all the referenced values are not null).
+func (obc *OptionalBitBlockCounter) NextBlock() BitBlockCount {
+ const maxBlockSize = math.MaxInt16
+ if obc.hasBitmap {
+ block := obc.counter.NextWord()
+ obc.pos += int64(block.Len)
+ return block
+ }
+
+ blockSize := int16(utils.Min(maxBlockSize, obc.len-obc.pos))
+ obc.pos += int64(blockSize)
+ // all values are non-null
+ return BitBlockCount{blockSize, blockSize}
+}
+
+// NextWord is like NextBlock, but returns a word-sized block even when there is no
+// validity bitmap
+func (obc *OptionalBitBlockCounter) NextWord() BitBlockCount {
+ const wordsize = 64
+ if obc.hasBitmap {
+ block := obc.counter.NextWord()
+ obc.pos += int64(block.Len)
+ return block
+ }
+ blockSize := int16(utils.Min(wordsize, obc.len-obc.pos))
+ obc.pos += int64(blockSize)
+ // all values are non-null
+ return BitBlockCount{blockSize, blockSize}
+}
+
+// VisitBitBlocks is a utility for easily iterating through the blocks of bits in a bitmap,
+// calling the appropriate visitValid/visitInvalid function as we iterate through the bits.
+// visitValid is called with the bitoffset of the valid bit. Don't use this inside a tight
+// loop when performance is needed and instead prefer manually constructing these loops
+// in that scenario.
+func VisitBitBlocks(bitmap []byte, offset, length int64, visitValid func(pos int64), visitInvalid func()) {
+ counter := NewOptionalBitBlockCounter(bitmap, offset, length)
+ pos := int64(0)
+ for pos < length {
+ block := counter.NextBlock()
+ if block.AllSet() {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
+ visitValid(pos)
+ }
+ } else if block.NoneSet() {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
+ visitInvalid()
+ }
+ } else {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
+ if bitutil.BitIsSet(bitmap, int(offset+pos)) {
+ visitValid(pos)
+ } else {
+ visitInvalid()
+ }
+ }
+ }
+ }
+}
+
// VisitBitBlocksShort is like VisitBitBlocks, except the visit functions
// return an error: iteration stops and the error is propagated as soon as
// either visitor fails. visitValid is called with the bit offset of the
// valid bit. Don't use this inside a tight loop when performance is needed
// and instead prefer manually constructing these loops in that scenario.
func VisitBitBlocksShort(bitmap []byte, offset, length int64, visitValid func(pos int64) error, visitInvalid func() error) error {
	counter := NewOptionalBitBlockCounter(bitmap, offset, length)
	pos := int64(0)
	for pos < length {
		block := counter.NextBlock()
		if block.AllSet() {
			// Every bit set: skip per-bit testing.
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				if err := visitValid(pos); err != nil {
					return err
				}
			}
		} else if block.NoneSet() {
			// Every bit unset.
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				if err := visitInvalid(); err != nil {
					return err
				}
			}
		} else {
			// Mixed block: test each bit individually.
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				if bitutil.BitIsSet(bitmap, int(offset+pos)) {
					if err := visitValid(pos); err != nil {
						return err
					}
				} else {
					if err := visitInvalid(); err != nil {
						return err
					}
				}
			}
		}
	}
	return nil
}
+
// VisitTwoBitBlocks iterates the conjunction (AND) of two validity bitmaps,
// calling visitValid with the position when the bit is set in both bitmaps
// and visitNull otherwise. If either bitmap is nil, iteration degenerates to
// VisitBitBlocks over the one that is present (and if both are nil, every
// position is visited as valid).
//
// NOTE(review): the parameter name "len" shadows the builtin; harmless here
// since the builtin is not used in this function.
func VisitTwoBitBlocks(leftBitmap, rightBitmap []byte, leftOffset, rightOffset int64, len int64, visitValid func(pos int64), visitNull func()) {
	if leftBitmap == nil || rightBitmap == nil {
		// at most one is present
		if leftBitmap == nil {
			VisitBitBlocks(rightBitmap, rightOffset, len, visitValid, visitNull)
		} else {
			VisitBitBlocks(leftBitmap, leftOffset, len, visitValid, visitNull)
		}
		return
	}

	bitCounter := NewBinaryBitBlockCounter(leftBitmap, rightBitmap, leftOffset, rightOffset, len)
	var pos int64
	for pos < len {
		// Popcount of the AND of the next (up to) 64 bits of both bitmaps.
		block := bitCounter.NextAndWord()
		if block.AllSet() {
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				visitValid(pos)
			}
		} else if block.NoneSet() {
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				visitNull()
			}
		} else {
			// Mixed block: a value is valid only when set in both bitmaps.
			for i := 0; i < int(block.Len); i, pos = i+1, pos+1 {
				if bitutil.BitIsSet(leftBitmap, int(leftOffset+pos)) && bitutil.BitIsSet(rightBitmap, int(rightOffset+pos)) {
					visitValid(pos)
				} else {
					visitNull()
				}
			}
		}
	}
}
+
// bitOp pairs a per-bit boolean operation with its word-wise (64 bits at a
// time) equivalent so nextWord can use whichever granularity fits the run.
type bitOp struct {
	bit  func(bool, bool) bool       // the operation on a single pair of bits
	word func(uint64, uint64) uint64 // the same operation across a whole word
}

var (
	// bitBlockAnd computes a AND b.
	bitBlockAnd = bitOp{
		bit:  func(a, b bool) bool { return a && b },
		word: func(a, b uint64) uint64 { return a & b },
	}
	// bitBlockAndNot computes a AND NOT b.
	bitBlockAndNot = bitOp{
		bit:  func(a, b bool) bool { return a && !b },
		word: func(a, b uint64) uint64 { return a &^ b },
	}
	// bitBlockOr computes a OR b.
	bitBlockOr = bitOp{
		bit:  func(a, b bool) bool { return a || b },
		word: func(a, b uint64) uint64 { return a | b },
	}
	// bitBlockOrNot computes a OR NOT b.
	bitBlockOrNot = bitOp{
		bit:  func(a, b bool) bool { return a || !b },
		word: func(a, b uint64) uint64 { return a | ^b },
	}
)
+
// BinaryBitBlockCounter computes popcounts on the result of bitwise
// operations between two bitmaps, 64 bits at a time. A 64-bit word
// is loaded from each bitmap, then the popcount is computed on
// e.g. the bitwise-and of the two words
type BinaryBitBlockCounter struct {
	left                    []byte // remaining bytes of the left bitmap
	right                   []byte // remaining bytes of the right bitmap
	bitsRemaining           int64  // bits not yet consumed
	leftOffset, rightOffset int64  // sub-byte bit offsets (0-7) into each bitmap

	// minimum bits that must remain for the word-at-a-time fast path to be
	// safe; computed once in NewBinaryBitBlockCounter.
	bitsRequiredForWords int64
}
+
+// NewBinaryBitBlockCounter constructs a binary bit block counter for
+// computing the popcounts on the results of operations between
+// the passed in bitmaps, with their respective offsets.
+func NewBinaryBitBlockCounter(left, right []byte, leftOffset, rightOffset int64, length int64) *BinaryBitBlockCounter {
+ ret := &BinaryBitBlockCounter{
+ left: left[leftOffset/8:],
+ right: right[rightOffset/8:],
+ leftOffset: leftOffset % 8,
+ rightOffset: rightOffset % 8,
+ bitsRemaining: length,
+ }
+
+ leftBitsReq := int64(64)
+ if ret.leftOffset != 0 {
+ leftBitsReq = 64 + (64 - ret.leftOffset)
+ }
+ rightBitsReq := int64(64)
+ if ret.rightOffset != 0 {
+ rightBitsReq = 64 + (64 - ret.rightOffset)
+ }
+
+ if leftBitsReq > rightBitsReq {
+ ret.bitsRequiredForWords = leftBitsReq
+ } else {
+ ret.bitsRequiredForWords = rightBitsReq
+ }
+
+ return ret
+}
+
// NextAndWord returns the popcount of the bitwise-and of the next run
// of available bits, up to 64. The returned pair contains the size of
// the run and the number of true values. the last block will have a
// length less than 64 if the bitmap length is not a multiple of 64,
// and will return 0-length blocks in subsequent invocations
func (b *BinaryBitBlockCounter) NextAndWord() BitBlockCount { return b.nextWord(bitBlockAnd) }

// NextAndNotWord is like NextAndWord but performs x &^ y on each run
func (b *BinaryBitBlockCounter) NextAndNotWord() BitBlockCount { return b.nextWord(bitBlockAndNot) }

// NextOrWord is like NextAndWord but performs x | y on each run
func (b *BinaryBitBlockCounter) NextOrWord() BitBlockCount { return b.nextWord(bitBlockOr) }

// NextOrNotWord is like NextAndWord but performs x | ^y on each run
func (b *BinaryBitBlockCounter) NextOrNotWord() BitBlockCount { return b.nextWord(bitBlockOrNot) }
+
// nextWord advances both bitmaps by up to one 64-bit word, applies op to the
// pair of words, and returns the run length plus the popcount of the result.
// Runs shorter than bitsRequiredForWords fall back to bit-by-bit evaluation.
func (b *BinaryBitBlockCounter) nextWord(op bitOp) BitBlockCount {
	if b.bitsRemaining == 0 {
		return BitBlockCount{}
	}

	// when offset is >0, we need there to be a word beyond the last
	// aligned word in the bitmap for the bit shifting logic
	if b.bitsRemaining < b.bitsRequiredForWords {
		runLength := int16(b.bitsRemaining)
		if runLength > int16(wordBits) {
			runLength = int16(wordBits)
		}

		var popcount int16
		for i := int16(0); i < runLength; i++ {
			if op.bit(bitutil.BitIsSet(b.left, int(b.leftOffset)+int(i)),
				bitutil.BitIsSet(b.right, int(b.rightOffset)+int(i))) {
				popcount++
			}
		}
		// this code path should trigger _at most_ 2 times. in the "two times"
		// case, the first time the run length will be a multiple of 8.
		// NOTE(review): leftOffset/rightOffset are not advanced here while the
		// slices move by whole bytes only, consistent with the byte-aligned
		// assumption above — confirm against callers.
		b.left = b.left[runLength/8:]
		b.right = b.right[runLength/8:]
		b.bitsRemaining -= int64(runLength)
		return BitBlockCount{Len: runLength, Popcnt: popcount}
	}

	var popcount int
	if b.leftOffset == 0 && b.rightOffset == 0 {
		// Both bitmaps byte-aligned: load each word directly.
		popcount = bits.OnesCount64(op.word(loadWord(b.left), loadWord(b.right)))
	} else {
		// Unaligned: stitch each 64-bit window out of two adjacent words.
		leftWord := shiftWord(loadWord(b.left), loadWord(b.left[8:]), b.leftOffset)
		rightWord := shiftWord(loadWord(b.right), loadWord(b.right[8:]), b.rightOffset)
		popcount = bits.OnesCount64(op.word(leftWord, rightWord))
	}
	b.left = b.left[wordBits/8:]
	b.right = b.right[wordBits/8:]
	b.bitsRemaining -= wordBits
	return BitBlockCount{Len: int16(wordBits), Popcnt: int16(popcount)}
}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go
new file mode 100644
index 000000000..f09149d7e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutils
+
+import (
+ "encoding/binary"
+ "fmt"
+ "math/bits"
+ "unsafe"
+
+ "github.com/apache/arrow/go/v15/arrow"
+ "github.com/apache/arrow/go/v15/arrow/bitutil"
+ "github.com/apache/arrow/go/v15/internal/utils"
+)
+
// BitRun describes a maximal run of identical bits: Len is the number of
// bits in the run and Set reports whether those bits were 1 (true) or 0
// (false).
type BitRun struct {
	Len int64
	Set bool
}

// BitRunReader is an interface usable by multiple callers to provide
// multiple kinds of bit run readers, such as a reverse reader and so on.
//
// It is a convenience interface for counting contiguous set/unset bits in a
// bitmap. Where BitBlockCounter can be used instead, prefer it, as it is
// faster than a BitRunReader.
type BitRunReader interface {
	NextRun() BitRun
}

// String returns a human-readable representation of the run for debugging.
func (b BitRun) String() string {
	return fmt.Sprintf("{Length: %d, set=%t}", b.Len, b.Set)
}
+
// bitRunReader is the forward implementation of BitRunReader.
type bitRunReader struct {
	bitmap       []byte // remaining bitmap bytes; advanced a word at a time
	pos          int64  // current bit position relative to bitmap[0]
	length       int64  // end position (initial sub-byte offset + run length)
	word         uint64 // current 64-bit window, pre-inverted when scanning set runs
	curRunBitSet bool   // value of the bits in the run currently being read
}
+
// NewBitRunReader returns a reader for the given bitmap, offset and length that
// grabs runs of the same value bit at a time for easy iteration.
func NewBitRunReader(bitmap []byte, offset int64, length int64) BitRunReader {
	ret := &bitRunReader{
		bitmap: bitmap[offset/8:],
		pos:    offset % 8,
		length: (offset % 8) + length,
	}

	if length == 0 {
		return ret
	}

	// NextRun flips curRunBitSet before reporting, so seed it with the
	// inverse of the first bit of the bitmap.
	ret.curRunBitSet = bitutil.BitIsNotSet(bitmap, int(offset))
	bitsRemaining := length + ret.pos
	ret.loadWord(bitsRemaining)
	// Clear the bits below pos so they cannot affect the first run.
	ret.word = ret.word &^ LeastSignificantBitMask(ret.pos)
	return ret
}
+
// NextRun returns a new BitRun containing the number of contiguous bits with the
// same value. Len == 0 indicates the end of the bitmap.
func (b *bitRunReader) NextRun() BitRun {
	if b.pos >= b.length {
		return BitRun{0, false}
	}

	// This implementation relies on an efficient implementation of
	// CountTrailingZeros and assumes that runs are more often long than
	// not. The logic is to incrementally find the next bit change
	// from the current position. This is done by zeroing all
	// bits in word up to pos and using the trailing-zero count
	// to find the index of the next set bit.

	// The runs alternate on each call, so flip the bit.
	b.curRunBitSet = !b.curRunBitSet

	start := b.pos
	startOffset := start & 63

	// Invert the word for proper use of CountTrailingZeros and
	// clear bits so CountTrailingZeros can do its magic.
	b.word = ^b.word &^ LeastSignificantBitMask(startOffset)

	// Go forward until the next change from unset to set.
	newbits := int64(bits.TrailingZeros64(b.word)) - startOffset
	b.pos += newbits

	// If the run reached the end of the word without changing, keep
	// consuming whole words until it does.
	if IsMultipleOf64(b.pos) && b.pos < b.length {
		b.advanceUntilChange()
	}
	return BitRun{b.pos - start, b.curRunBitSet}
}
+
// advanceUntilChange consumes whole 64-bit words until the current run of
// identical bits ends, the bitmap is exhausted, or a freshly loaded word
// contributes no further run bits.
func (b *bitRunReader) advanceUntilChange() {
	newbits := int64(0)
	for {
		// Advance past the word just consumed and load the next one.
		b.bitmap = b.bitmap[arrow.Uint64SizeBytes:]
		b.loadNextWord()
		newbits = int64(bits.TrailingZeros64(b.word))
		b.pos += newbits
		// Stop unless the run filled this entire word too.
		if !IsMultipleOf64(b.pos) || b.pos >= b.length || newbits <= 0 {
			break
		}
	}
}
+
// loadNextWord loads the next word of the bitmap into b.word, limited to the
// number of bits that remain unread.
func (b *bitRunReader) loadNextWord() {
	b.loadWord(b.length - b.pos)
}
+
// loadWord reads the next (up to) 64 bits of the bitmap into b.word in
// little-endian bit order. When fewer than 64 bits remain, the bit just past
// the end is set to the inverse of the final valid bit so that the trailing
// zero count in NextRun terminates exactly at the end of the data. If the
// current run is of set bits, the word is inverted so CountTrailingZeros
// can be used to locate the end of the run.
func (b *bitRunReader) loadWord(bitsRemaining int64) {
	b.word = 0
	if bitsRemaining >= 64 {
		b.word = binary.LittleEndian.Uint64(b.bitmap)
	} else {
		nbytes := bitutil.BytesForBits(bitsRemaining)
		// View b.word as its 8 underlying bytes so the partial tail can be
		// copied in directly.
		wordptr := (*(*[8]byte)(unsafe.Pointer(&b.word)))[:]
		copy(wordptr, b.bitmap[:nbytes])

		// Plant the sentinel bit that forces a "change" at end-of-data.
		bitutil.SetBitTo(wordptr, int(bitsRemaining), bitutil.BitIsNotSet(wordptr, int(bitsRemaining-1)))
		// reset the value to little endian for big endian architectures
		b.word = utils.ToLEUint64(b.word)
	}

	// Two cases:
	// 1. For unset runs, CountTrailingZeros works naturally so we don't
	// invert the word.
	// 2. Otherwise invert so we can use CountTrailingZeros.
	if b.curRunBitSet {
		b.word = ^b.word
	}
}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go
new file mode 100644
index 000000000..374b8d4aa
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go
@@ -0,0 +1,361 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutils
+
+import (
+ "encoding/binary"
+ "math/bits"
+
+ "github.com/apache/arrow/go/v15/arrow/bitutil"
+ "github.com/apache/arrow/go/v15/internal/utils"
+)
+
+// IsMultipleOf64 returns whether v is a multiple of 64.
+func IsMultipleOf64(v int64) bool { return v&63 == 0 }
+
+// LeastSignificantBitMask returns a bit mask to return the least significant
+// bits for a value starting from the bit index passed in. ie: if you want a
+// mask for the 4 least significant bits, you call LeastSignificantBitMask(4)
+func LeastSignificantBitMask(index int64) uint64 {
+ return (uint64(1) << index) - 1
+}
+
+// SetBitRun describes a run of contiguous set bits in a bitmap with Pos being
+// the starting position of the run and Length being the number of bits.
+type SetBitRun struct {
+ Pos int64
+ Length int64
+}
+
+// AtEnd returns true if this bit run is the end of the set by checking
+// that the length is 0.
+func (s SetBitRun) AtEnd() bool {
+ return s.Length == 0
+}
+
+// Equal returns whether rhs is the same run as s
+func (s SetBitRun) Equal(rhs SetBitRun) bool {
+ return s.Pos == rhs.Pos && s.Length == rhs.Length
+}
+
+// SetBitRunReader is an interface for reading groups of contiguous set bits
+// from a bitmap. The interface allows us to create different reader implementations
+// that share the same interface easily such as a reverse set reader.
+type SetBitRunReader interface {
+ // NextRun will return the next run of contiguous set bits in the bitmap
+ NextRun() SetBitRun
+ // Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments
+ // match the New function for the reader being used.
+ Reset([]byte, int64, int64)
+ // VisitSetBitRuns calls visitFn for each set bit run in a loop starting from the current position
+ // it's roughly equivalent to simply looping, calling NextRun and calling visitFn on the run
+ // for each run.
+ VisitSetBitRuns(visitFn VisitFn) error
+}
+
+type baseSetBitRunReader struct {
+ bitmap []byte
+ pos int64
+ length int64
+ remaining int64
+ curWord uint64
+ curNumBits int32
+ reversed bool
+
+ firstBit uint64
+}
+
+// NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read
+// numValues bits.
+func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
+ return newBaseSetBitRunReader(validBits, startOffset, numValues, false)
+}
+
+// NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will
+// return runs starting from the end of the bitmap until it reaches startOffset rather than starting
+// at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos
+// will still be the position of the "left-most" bit of the run or the "start" of the run. It
+// just returns runs starting from the end instead of starting from the beginning.
+func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
+ return newBaseSetBitRunReader(validBits, startOffset, numValues, true)
+}
+
+func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader {
+ ret := &baseSetBitRunReader{reversed: reverse}
+ ret.Reset(bitmap, startOffset, length)
+ return ret
+}
+
+func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) {
+ br.bitmap = bitmap
+ br.length = length
+ br.remaining = length
+ br.curNumBits = 0
+ br.curWord = 0
+
+ if !br.reversed {
+ br.pos = startOffset / 8
+ br.firstBit = 1
+
+ bitOffset := int8(startOffset % 8)
+ if length > 0 && bitOffset != 0 {
+ br.curNumBits = int32(utils.Min(int(length), int(8-bitOffset)))
+ br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits))
+ }
+ return
+ }
+
+ br.pos = (startOffset + length) / 8
+ br.firstBit = uint64(0x8000000000000000)
+ endBitOffset := int8((startOffset + length) % 8)
+ if length > 0 && endBitOffset != 0 {
+ br.pos++
+ br.curNumBits = int32(utils.Min(int(length), int(endBitOffset)))
+ br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits))
+ }
+}
+
+func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 {
+ if br.reversed {
+ return word << nbits
+ }
+ return word >> nbits
+}
+
+func (br *baseSetBitRunReader) countFirstZeros(word uint64) int32 {
+ if br.reversed {
+ return int32(bits.LeadingZeros64(word))
+ }
+ return int32(bits.TrailingZeros64(word))
+}
+
+func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 {
+ var word [8]byte
+ nbytes := bitutil.BytesForBits(numBits)
+ if br.reversed {
+ br.pos -= nbytes
+ copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes])
+ return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits)
+ }
+
+ copy(word[:], br.bitmap[br.pos:br.pos+nbytes])
+ br.pos += nbytes
+ return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits)
+}
+
+func (br *baseSetBitRunReader) findCurrentRun() SetBitRun {
+ nzeros := br.countFirstZeros(br.curWord)
+ if nzeros >= br.curNumBits {
+ br.remaining -= int64(br.curNumBits)
+ br.curWord = 0
+ br.curNumBits = 0
+ return SetBitRun{0, 0}
+ }
+
+ br.curWord = br.consumeBits(br.curWord, nzeros)
+ br.curNumBits -= nzeros
+ br.remaining -= int64(nzeros)
+ pos := br.position()
+
+ numOnes := br.countFirstZeros(^br.curWord)
+ br.curWord = br.consumeBits(br.curWord, numOnes)
+ br.curNumBits -= numOnes
+ br.remaining -= int64(numOnes)
+ return SetBitRun{pos, int64(numOnes)}
+}
+
+func (br *baseSetBitRunReader) position() int64 {
+ if br.reversed {
+ return br.remaining
+ }
+ return br.length - br.remaining
+}
+
+func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun {
+ if br.reversed {
+ run.Pos -= run.Length
+ }
+ return run
+}
+
+func (br *baseSetBitRunReader) loadFull() (ret uint64) {
+ if br.reversed {
+ br.pos -= 8
+ }
+ ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8])
+ if !br.reversed {
+ br.pos += 8
+ }
+ return
+}
+
+func (br *baseSetBitRunReader) skipNextZeros() {
+ for br.remaining >= 64 {
+ br.curWord = br.loadFull()
+ nzeros := br.countFirstZeros(br.curWord)
+ if nzeros < 64 {
+ br.curWord = br.consumeBits(br.curWord, nzeros)
+ br.curNumBits = 64 - nzeros
+ br.remaining -= int64(nzeros)
+ return
+ }
+ br.remaining -= 64
+ }
+ // run of zeros continues in last bitmap word
+ if br.remaining > 0 {
+ br.curWord = br.loadPartial(0, br.remaining)
+ br.curNumBits = int32(br.remaining)
+ nzeros := int32(utils.Min(int(br.curNumBits), int(br.countFirstZeros(br.curWord))))
+ br.curWord = br.consumeBits(br.curWord, nzeros)
+ br.curNumBits -= nzeros
+ br.remaining -= int64(nzeros)
+ }
+}
+
+func (br *baseSetBitRunReader) countNextOnes() int64 {
+ var length int64
+ if ^br.curWord != 0 {
+ numOnes := br.countFirstZeros(^br.curWord)
+ br.remaining -= int64(numOnes)
+ br.curWord = br.consumeBits(br.curWord, numOnes)
+ br.curNumBits -= numOnes
+ if br.curNumBits != 0 {
+ return int64(numOnes)
+ }
+ length = int64(numOnes)
+ } else {
+ br.remaining -= 64
+ br.curNumBits = 0
+ length = 64
+ }
+
+ for br.remaining >= 64 {
+ br.curWord = br.loadFull()
+ numOnes := br.countFirstZeros(^br.curWord)
+ length += int64(numOnes)
+ br.remaining -= int64(numOnes)
+ if numOnes < 64 {
+ br.curWord = br.consumeBits(br.curWord, numOnes)
+ br.curNumBits = 64 - numOnes
+ return length
+ }
+ }
+
+ if br.remaining > 0 {
+ br.curWord = br.loadPartial(0, br.remaining)
+ br.curNumBits = int32(br.remaining)
+ numOnes := br.countFirstZeros(^br.curWord)
+ br.curWord = br.consumeBits(br.curWord, numOnes)
+ br.curNumBits -= numOnes
+ br.remaining -= int64(numOnes)
+ length += int64(numOnes)
+ }
+ return length
+}
+
+func (br *baseSetBitRunReader) NextRun() SetBitRun {
+ var (
+ pos int64 = 0
+ length int64 = 0
+ )
+
+ if br.curNumBits != 0 {
+ run := br.findCurrentRun()
+ if run.Length != 0 && br.curNumBits != 0 {
+ return br.adjustRun(run)
+ }
+ pos = run.Pos
+ length = run.Length
+ }
+
+ if length == 0 {
+ // we didn't get any ones in curWord, so we can skip any zeros
+ // in the following words
+ br.skipNextZeros()
+ if br.remaining == 0 {
+ return SetBitRun{0, 0}
+ }
+ pos = br.position()
+ } else if br.curNumBits == 0 {
+ if br.remaining >= 64 {
+ br.curWord = br.loadFull()
+ br.curNumBits = 64
+ } else if br.remaining > 0 {
+ br.curWord = br.loadPartial(0, br.remaining)
+ br.curNumBits = int32(br.remaining)
+ } else {
+ return br.adjustRun(SetBitRun{pos, length})
+ }
+ if (br.curWord & br.firstBit) == 0 {
+ return br.adjustRun(SetBitRun{pos, length})
+ }
+ }
+
+ length += br.countNextOnes()
+ return br.adjustRun(SetBitRun{pos, length})
+}
+
+// VisitFn is a callback function for visiting runs of contiguous bits
+type VisitFn func(pos int64, length int64) error
+
+func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error {
+ for {
+ run := br.NextRun()
+ if run.Length == 0 {
+ break
+ }
+
+ if err := visitFn(run.Pos, run.Length); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns
+func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error {
+ if bitmap == nil {
+ return visitFn(0, length)
+ }
+ rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
+ for {
+ run := rdr.NextRun()
+ if run.Length == 0 {
+ break
+ }
+
+ if err := visitFn(run.Pos, run.Length); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func VisitSetBitRunsNoErr(bitmap []byte, bitmapOffset int64, length int64, visitFn func(pos, length int64)) {
+ if bitmap == nil {
+ visitFn(0, length)
+ return
+ }
+ rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
+ for {
+ run := rdr.NextRun()
+ if run.Length == 0 {
+ break
+ }
+ visitFn(run.Pos, run.Length)
+ }
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go
new file mode 100644
index 000000000..08b5fceab
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutils
+
+import "github.com/apache/arrow/go/v15/arrow/bitutil"
+
+// GenerateBits writes sequential bits to a bitmap. Bits preceding the
+// initial start offset are preserved, bits following the bitmap may
+// get clobbered.
+func GenerateBits(bitmap []byte, start, length int64, g func() bool) {
+ if length == 0 {
+ return
+ }
+
+ cur := bitmap[start/8:]
+ mask := bitutil.BitMask[start%8]
+ curbyte := cur[0] & bitutil.PrecedingBitmask[start%8]
+
+ for i := int64(0); i < length; i++ {
+ bit := g()
+ if bit {
+ curbyte = curbyte | mask
+ }
+ mask <<= 1
+ if mask == 0 {
+ mask = 1
+ cur[0] = curbyte
+ cur = cur[1:]
+ curbyte = 0
+ }
+ }
+
+ if mask != 1 {
+ cur[0] = curbyte
+ }
+}
+
+// GenerateBitsUnrolled is like GenerateBits but unrolls its main loop for
+// higher performance.
+//
+// See the benchmarks for evidence.
+func GenerateBitsUnrolled(bitmap []byte, start, length int64, g func() bool) {
+ if length == 0 {
+ return
+ }
+
+ var (
+ curbyte byte
+ cur = bitmap[start/8:]
+ startBitOffset uint64 = uint64(start % 8)
+ mask = bitutil.BitMask[startBitOffset]
+ remaining = length
+ )
+
+ if mask != 0x01 {
+ curbyte = cur[0] & bitutil.PrecedingBitmask[startBitOffset]
+ for mask != 0 && remaining > 0 {
+ if g() {
+ curbyte |= mask
+ }
+ mask <<= 1
+ remaining--
+ }
+ cur[0] = curbyte
+ cur = cur[1:]
+ }
+
+ var outResults [8]byte
+ for remainingBytes := remaining / 8; remainingBytes > 0; remainingBytes-- {
+ for i := 0; i < 8; i++ {
+ if g() {
+ outResults[i] = 1
+ } else {
+ outResults[i] = 0
+ }
+ }
+ cur[0] = (outResults[0] | outResults[1]<<1 | outResults[2]<<2 |
+ outResults[3]<<3 | outResults[4]<<4 | outResults[5]<<5 |
+ outResults[6]<<6 | outResults[7]<<7)
+ cur = cur[1:]
+ }
+
+ remainingBits := remaining % 8
+ if remainingBits > 0 {
+ curbyte = 0
+ mask = 0x01
+ for ; remainingBits > 0; remainingBits-- {
+ if g() {
+ curbyte |= mask
+ }
+ mask <<= 1
+ }
+ cur[0] = curbyte
+ }
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go
new file mode 100644
index 000000000..c1bdfeb6d
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "math/bits"
+ "unsafe"
+
+ "github.com/zeebo/xxh3"
+)
+
+func hashInt(val uint64, alg uint64) uint64 {
+ // Two of xxhash's prime multipliers (which are chosen for their
+ // bit dispersion properties)
+ var multipliers = [2]uint64{11400714785074694791, 14029467366897019727}
+ // Multiplying by the prime number mixes the low bits into the high bits,
+ // then byte-swapping (which is a single CPU instruction) allows the
+ // combined high and low bits to participate in the initial hash table index.
+ return bits.ReverseBytes64(multipliers[alg] * val)
+}
+
+func hashFloat32(val float32, alg uint64) uint64 {
+ // grab the raw byte pattern of the float32 value
+ bt := *(*[4]byte)(unsafe.Pointer(&val))
+ x := uint64(*(*uint32)(unsafe.Pointer(&bt[0])))
+ hx := hashInt(x, alg)
+ hy := hashInt(x, alg^1)
+ return 4 ^ hx ^ hy
+}
+
+func hashFloat64(val float64, alg uint64) uint64 {
+ bt := *(*[8]byte)(unsafe.Pointer(&val))
+ hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg)
+ hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1)
+ return 8 ^ hx ^ hy
+}
+
+// prime constants used for slightly increasing the hash quality further
+var exprimes = [2]uint64{1609587929392839161, 9650029242287828579}
+
+// for smaller amounts of bytes this is faster than even calling into
+// xxh3 to do the Hash, so we specialize in order to get the benefits
+// of that performance.
+func Hash(b []byte, alg uint64) uint64 {
+ n := uint32(len(b))
+ if n <= 16 {
+ switch {
+ case n > 8:
+ // 8 < length <= 16
+ // apply same principle as above, but as two 64-bit ints
+ x := *(*uint64)(unsafe.Pointer(&b[n-8]))
+ y := *(*uint64)(unsafe.Pointer(&b[0]))
+ hx := hashInt(x, alg)
+ hy := hashInt(y, alg^1)
+ return uint64(n) ^ hx ^ hy
+ case n >= 4:
+ // 4 < length <= 8
+ // we can read the bytes as two overlapping 32-bit ints, apply different
+ // hash functions to each in parallel
+ // then xor the results
+ x := *(*uint32)(unsafe.Pointer(&b[n-4]))
+ y := *(*uint32)(unsafe.Pointer(&b[0]))
+ hx := hashInt(uint64(x), alg)
+ hy := hashInt(uint64(y), alg^1)
+ return uint64(n) ^ hx ^ hy
+ case n > 0:
+ x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
+ return hashInt(uint64(x), alg)
+ case n == 0:
+ return 1
+ }
+ }
+
+ // increase differentiation enough to improve hash quality
+ return xxh3.Hash(b) + exprimes[alg]
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go
new file mode 100644
index 000000000..b772c7d7f
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build go1.20 || tinygo
+
+package hashing
+
+import "unsafe"
+
+func hashString(val string, alg uint64) uint64 {
+ buf := unsafe.Slice(unsafe.StringData(val), len(val))
+ return Hash(buf, alg)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go
new file mode 100644
index 000000000..f38eb5c52
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !go1.20 && !tinygo
+
+package hashing
+
+import (
+ "reflect"
+ "unsafe"
+)
+
+func hashString(val string, alg uint64) uint64 {
+ if val == "" {
+ return Hash([]byte{}, alg)
+ }
+ // highly efficient way to get byte slice without copy before
+ // the introduction of unsafe.StringData in go1.20
+ // (https://stackoverflow.com/questions/59209493/how-to-use-unsafe-get-a-byte-slice-from-a-string-without-memory-copy)
+ const MaxInt32 = 1<<31 - 1
+ buf := (*[MaxInt32]byte)(unsafe.Pointer((*reflect.StringHeader)(
+ unsafe.Pointer(&val)).Data))[: len(val)&MaxInt32 : len(val)&MaxInt32]
+ return Hash(buf, alg)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata b/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata
new file mode 100644
index 000000000..0ba6f765d
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata
@@ -0,0 +1,42 @@
+[
+ {
+ "Name": "Int8",
+ "name": "int8"
+ },
+ {
+ "Name": "Uint8",
+ "name": "uint8"
+ },
+ {
+ "Name": "Int16",
+ "name": "int16"
+ },
+ {
+ "Name": "Uint16",
+ "name": "uint16"
+ },
+ {
+ "Name": "Int32",
+ "name": "int32"
+ },
+ {
+ "Name": "Int64",
+ "name": "int64"
+ },
+ {
+ "Name": "Uint32",
+ "name": "uint32"
+ },
+ {
+ "Name": "Uint64",
+ "name": "uint64"
+ },
+ {
+ "Name": "Float32",
+ "name": "float32"
+ },
+ {
+ "Name": "Float64",
+ "name": "float64"
+ }
+]
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go
new file mode 100644
index 000000000..39b82cdef
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go
@@ -0,0 +1,2833 @@
+// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "math"
+
+ "github.com/apache/arrow/go/v15/arrow"
+ "github.com/apache/arrow/go/v15/arrow/bitutil"
+ "github.com/apache/arrow/go/v15/internal/utils"
+)
+
+type payloadInt8 struct {
+ val int8
+ memoIdx int32
+}
+
+type entryInt8 struct {
+ h uint64
+ payload payloadInt8
+}
+
+func (e entryInt8) Valid() bool { return e.h != sentinel }
+
+// Int8HashTable is a hashtable specifically for int8 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int8HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entryInt8
+}
+
+// NewInt8HashTable returns a new hash table for int8 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt8HashTable(cap uint64) *Int8HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Int8HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryInt8, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int8HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entryInt8, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int8HashTable) CopyValues(out []int8) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Int8HashTable) CopyValuesSubset(start int, out []int8) {
+ h.VisitEntries(func(e *entryInt8) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Int8HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *Int8HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.Int8Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entryInt8) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Int8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Int8HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int8HashTable) Lookup(v uint64, cmp func(int8) bool) (*entryInt8, bool) {
+ idx, ok := h.lookup(v, h.capMask, cmp)
+ return &h.entries[idx], ok
+}
+
+func (h *Int8HashTable) lookup(v uint64, szMask uint64, cmp func(int8) bool) (uint64, bool) {
+ const perturbShift uint8 = 5
+
+ var (
+ idx uint64
+ perturb uint64
+ e *entryInt8
+ )
+
+ v = h.fixHash(v)
+ idx = v & szMask
+ perturb = (v >> uint64(perturbShift)) + 1
+
+ for {
+ e = &h.entries[idx]
+ if e.h == v && cmp(e.payload.val) {
+ return idx, true
+ }
+
+ if e.h == sentinel {
+ return idx, false
+ }
+
+ // perturbation logic inspired from CPython's set/dict object
+ // the goal is that all 64 bits of unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering
+ idx = (idx + perturb) & szMask
+ perturb = (perturb >> uint64(perturbShift)) + 1
+ }
+}
+
+func (h *Int8HashTable) upsize(newcap uint64) error {
+ newMask := newcap - 1
+
+ oldEntries := h.entries
+ h.entries = make([]entryInt8, newcap)
+ for _, e := range oldEntries {
+ if e.Valid() {
+ idx, _ := h.lookup(e.h, newMask, func(int8) bool { return false })
+ h.entries[idx] = e
+ }
+ }
+ h.cap = newcap
+ h.capMask = newMask
+ return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int8HashTable) Insert(e *entryInt8, v uint64, val int8, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Int8HashTable) VisitEntries(visit func(*entryInt8)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
+// Int8MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int8MemoTable struct {
+ tbl *Int8HashTable
+ nullIdx int32
+}
+
+// NewInt8MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt8MemoTable(num int64) *Int8MemoTable {
+ return &Int8MemoTable{tbl: NewInt8HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+func (Int8MemoTable) TypeTraits() TypeTraits {
+ return arrow.Int8Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int8MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int8MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int8MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int8MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int8MemoTable) CopyValues(out interface{}) {
+ s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int8MemoTable) CopyValuesSubset(start int, out interface{}) {
+ s.tbl.CopyValuesSubset(start, out.([]int8))
+}
+
+func (s *Int8MemoTable) WriteOut(out []byte) {
+ s.tbl.CopyValues(arrow.Int8Traits.CastFromBytes(out))
+}
+
+func (s *Int8MemoTable) WriteOutSubset(start int, out []byte) {
+ s.tbl.CopyValuesSubset(start, arrow.Int8Traits.CastFromBytes(out))
+}
+
+func (s *Int8MemoTable) WriteOutLE(out []byte) {
+ s.tbl.WriteOut(out)
+}
+
+func (s *Int8MemoTable) WriteOutSubsetLE(start int, out []byte) {
+ s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int8MemoTable) Get(val interface{}) (int, bool) {
+
+ h := hashInt(uint64(val.(int8)), 0)
+ if e, ok := s.tbl.Lookup(h, func(v int8) bool { return val.(int8) == v }); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ h := hashInt(uint64(val.(int8)), 0)
+ e, ok := s.tbl.Lookup(h, func(v int8) bool {
+ return val.(int8) == v
+ })
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(int8), int32(idx))
+ }
+ return
+}
+
+// GetOrInsertBytes is unimplemented
+func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+ panic("unimplemented")
+}
+
+type payloadUint8 struct {
+	val     uint8
+	memoIdx int32
+}
+
+type entryUint8 struct {
+	h       uint64
+	payload payloadUint8
+}
+
+// Valid reports whether this entry holds a value; h == sentinel marks an empty slot.
+func (e entryUint8) Valid() bool { return e.h != sentinel }
+
+// Uint8HashTable is a hashtable specifically for uint8 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Uint8HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryUint8
+}
+
+// NewUint8HashTable returns a new hash table for uint8 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewUint8HashTable(cap uint64) *Uint8HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Uint8HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryUint8, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Uint8HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryUint8, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Uint8HashTable) CopyValues(out []uint8) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Uint8HashTable) CopyValuesSubset(start int, out []uint8) {
+	h.VisitEntries(func(e *entryUint8) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values of the hash table to the byte slice, in insertion order.
+func (h *Uint8HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset writes the values with memo index >= start to the byte slice,
+// in insertion order.
+func (h *Uint8HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Uint8Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryUint8) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = e.payload.val
+		}
+	})
+}
+
+// needUpsize reports whether the load-factor threshold has been reached and the
+// table should grow before accepting more inserts.
+func (h *Uint8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps the reserved sentinel hash (which marks empty slots) to an
+// arbitrary non-sentinel value so real entries never look empty.
+func (Uint8HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Uint8HashTable) Lookup(v uint64, cmp func(uint8) bool) (*entryUint8, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Uint8HashTable) lookup(v uint64, szMask uint64, cmp func(uint8) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryUint8
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize rehashes all valid entries into a freshly allocated table of newcap
+// slots; newcap is expected to be a power of two since newcap-1 is used as the
+// probe mask.
+func (h *Uint8HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryUint8, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			idx, _ := h.lookup(e.h, newMask, func(uint8) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Uint8HashTable) Insert(e *entryUint8, v uint64, val uint8, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+// Note: visit receives a pointer to a loop-local copy of the entry, so mutations
+// through it do not modify the table.
+func (h *Uint8HashTable) VisitEntries(visit func(*entryUint8)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Uint8MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Uint8MemoTable struct {
+	tbl     *Uint8HashTable
+	nullIdx int32
+}
+
+// NewUint8MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewUint8MemoTable(num int64) *Uint8MemoTable {
+	return &Uint8MemoTable{tbl: NewUint8HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+func (Uint8MemoTable) TypeTraits() TypeTraits {
+	return arrow.Uint8Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Uint8MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Uint8MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Uint8MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Uint8MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Uint8MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Uint8MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]uint8))
+}
+
+// WriteOut writes the values to out in insertion order (native byte order).
+func (s *Uint8MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Uint8Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes values with memo index >= start.
+func (s *Uint8MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Uint8Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out in insertion order as little-endian bytes.
+func (s *Uint8MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes values with memo index >= start.
+func (s *Uint8MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Uint8MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(uint8)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v uint8) bool { return val.(uint8) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(uint8)), 0)
+	e, ok := s.tbl.Lookup(h, func(v uint8) bool {
+		return val.(uint8) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		// Not found: insert at the next memo index. Size() includes a
+		// potential null entry, so new values always get a fresh index.
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(uint8), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented for this scalar table; it always panics.
+func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+type payloadInt16 struct {
+	val     int16
+	memoIdx int32
+}
+
+type entryInt16 struct {
+	h       uint64
+	payload payloadInt16
+}
+
+// Valid reports whether this entry holds a value; h == sentinel marks an empty slot.
+func (e entryInt16) Valid() bool { return e.h != sentinel }
+
+// Int16HashTable is a hashtable specifically for int16 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int16HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryInt16
+}
+
+// NewInt16HashTable returns a new hash table for int16 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt16HashTable(cap uint64) *Int16HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Int16HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryInt16, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int16HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryInt16, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int16HashTable) CopyValues(out []int16) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Int16HashTable) CopyValuesSubset(start int, out []int16) {
+	h.VisitEntries(func(e *entryInt16) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values of the hash table to the byte slice as
+// little-endian, in insertion order.
+func (h *Int16HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset writes the values with memo index >= start to the byte slice
+// as little-endian, in insertion order.
+func (h *Int16HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Int16Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryInt16) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEInt16(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the load-factor threshold has been reached and the
+// table should grow before accepting more inserts.
+func (h *Int16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps the reserved sentinel hash (which marks empty slots) to an
+// arbitrary non-sentinel value so real entries never look empty.
+func (Int16HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int16HashTable) Lookup(v uint64, cmp func(int16) bool) (*entryInt16, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Int16HashTable) lookup(v uint64, szMask uint64, cmp func(int16) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryInt16
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize rehashes all valid entries into a freshly allocated table of newcap
+// slots; newcap is expected to be a power of two since newcap-1 is used as the
+// probe mask.
+func (h *Int16HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryInt16, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			idx, _ := h.lookup(e.h, newMask, func(int16) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int16HashTable) Insert(e *entryInt16, v uint64, val int16, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+// Note: visit receives a pointer to a loop-local copy of the entry, so mutations
+// through it do not modify the table.
+func (h *Int16HashTable) VisitEntries(visit func(*entryInt16)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Int16MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int16MemoTable struct {
+	tbl     *Int16HashTable
+	nullIdx int32
+}
+
+// NewInt16MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt16MemoTable(num int64) *Int16MemoTable {
+	return &Int16MemoTable{tbl: NewInt16HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+func (Int16MemoTable) TypeTraits() TypeTraits {
+	return arrow.Int16Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int16MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int16MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int16MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int16MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int16MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int16MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]int16))
+}
+
+// WriteOut writes the values to out in insertion order (native byte order).
+func (s *Int16MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Int16Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes values with memo index >= start.
+func (s *Int16MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Int16Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out in insertion order as little-endian bytes.
+func (s *Int16MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes values with memo index >= start.
+func (s *Int16MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int16MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(int16)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v int16) bool { return val.(int16) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(int16)), 0)
+	e, ok := s.tbl.Lookup(h, func(v int16) bool {
+		return val.(int16) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		// Not found: insert at the next memo index. Size() includes a
+		// potential null entry, so new values always get a fresh index.
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(int16), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented for this scalar table; it always panics.
+func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+type payloadUint16 struct {
+	val     uint16
+	memoIdx int32
+}
+
+type entryUint16 struct {
+	h       uint64
+	payload payloadUint16
+}
+
+// Valid reports whether this entry holds a value; h == sentinel marks an empty slot.
+func (e entryUint16) Valid() bool { return e.h != sentinel }
+
+// Uint16HashTable is a hashtable specifically for uint16 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Uint16HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryUint16
+}
+
+// NewUint16HashTable returns a new hash table for uint16 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewUint16HashTable(cap uint64) *Uint16HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Uint16HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryUint16, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Uint16HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryUint16, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Uint16HashTable) CopyValues(out []uint16) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Uint16HashTable) CopyValuesSubset(start int, out []uint16) {
+	h.VisitEntries(func(e *entryUint16) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values of the hash table to the byte slice as
+// little-endian, in insertion order.
+func (h *Uint16HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset writes the values with memo index >= start to the byte slice
+// as little-endian, in insertion order.
+func (h *Uint16HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Uint16Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryUint16) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEUint16(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the load-factor threshold has been reached and the
+// table should grow before accepting more inserts.
+func (h *Uint16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps the reserved sentinel hash (which marks empty slots) to an
+// arbitrary non-sentinel value so real entries never look empty.
+func (Uint16HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Uint16HashTable) Lookup(v uint64, cmp func(uint16) bool) (*entryUint16, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Uint16HashTable) lookup(v uint64, szMask uint64, cmp func(uint16) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryUint16
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize rehashes all valid entries into a freshly allocated table of newcap
+// slots; newcap is expected to be a power of two since newcap-1 is used as the
+// probe mask.
+func (h *Uint16HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryUint16, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			idx, _ := h.lookup(e.h, newMask, func(uint16) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Uint16HashTable) Insert(e *entryUint16, v uint64, val uint16, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+// Note: visit receives a pointer to a loop-local copy of the entry, so mutations
+// through it do not modify the table.
+func (h *Uint16HashTable) VisitEntries(visit func(*entryUint16)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Uint16MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Uint16MemoTable struct {
+	tbl     *Uint16HashTable
+	nullIdx int32
+}
+
+// NewUint16MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewUint16MemoTable(num int64) *Uint16MemoTable {
+	return &Uint16MemoTable{tbl: NewUint16HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+func (Uint16MemoTable) TypeTraits() TypeTraits {
+	return arrow.Uint16Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Uint16MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Uint16MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Uint16MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Uint16MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Uint16MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Uint16MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]uint16))
+}
+
+// WriteOut writes the values to out in insertion order (native byte order).
+func (s *Uint16MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Uint16Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes values with memo index >= start.
+func (s *Uint16MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Uint16Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out in insertion order as little-endian bytes.
+func (s *Uint16MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes values with memo index >= start.
+func (s *Uint16MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Uint16MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(uint16)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v uint16) bool { return val.(uint16) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(uint16)), 0)
+	e, ok := s.tbl.Lookup(h, func(v uint16) bool {
+		return val.(uint16) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		// Not found: insert at the next memo index. Size() includes a
+		// potential null entry, so new values always get a fresh index.
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(uint16), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented for this scalar table; it always panics.
+func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+type payloadInt32 struct {
+	val     int32
+	memoIdx int32
+}
+
+type entryInt32 struct {
+	h       uint64
+	payload payloadInt32
+}
+
+// Valid reports whether this entry holds a value; h == sentinel marks an empty slot.
+func (e entryInt32) Valid() bool { return e.h != sentinel }
+
+// Int32HashTable is a hashtable specifically for int32 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int32HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryInt32
+}
+
+// NewInt32HashTable returns a new hash table for int32 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt32HashTable(cap uint64) *Int32HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryInt32, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int32HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryInt32, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int32HashTable) CopyValues(out []int32) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) {
+	h.VisitEntries(func(e *entryInt32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values of the hash table to the byte slice as
+// little-endian, in insertion order.
+func (h *Int32HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset writes the values with memo index >= start to the byte slice
+// as little-endian, in insertion order.
+func (h *Int32HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Int32Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryInt32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEInt32(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the load-factor threshold has been reached and the
+// table should grow before accepting more inserts.
+func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps the reserved sentinel hash (which marks empty slots) to an
+// arbitrary non-sentinel value so real entries never look empty.
+func (Int32HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryInt32
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize rehashes all valid entries into a freshly allocated table of newcap
+// slots; newcap is expected to be a power of two since newcap-1 is used as the
+// probe mask.
+func (h *Int32HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryInt32, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+// Note: visit receives a pointer to a loop-local copy of the entry, so mutations
+// through it do not modify the table.
+func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int32MemoTable struct {
+	tbl     *Int32HashTable
+	nullIdx int32
+}
+
+// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt32MemoTable(num int64) *Int32MemoTable {
+	return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+func (Int32MemoTable) TypeTraits() TypeTraits {
+	return arrow.Int32Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int32MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int32MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int32MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int32MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]int32))
+}
+
+// WriteOut writes the values to out in insertion order (native byte order).
+func (s *Int32MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Int32Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes values with memo index >= start.
+func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Int32Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out in insertion order as little-endian bytes.
+func (s *Int32MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes values with memo index >= start.
+func (s *Int32MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int32MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(int32)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(int32)), 0)
+	e, ok := s.tbl.Lookup(h, func(v int32) bool {
+		return val.(int32) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		// Not found: insert at the next memo index. Size() includes a
+		// potential null entry, so new values always get a fresh index.
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(int32), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented for this scalar table; it always panics.
+func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+// payloadInt64 pairs a stored value with the memo index it was assigned
+// on first insertion.
+type payloadInt64 struct {
+	val     int64
+	memoIdx int32
+}
+
+// entryInt64 is a single open-addressing slot; h == sentinel marks it empty.
+type entryInt64 struct {
+	h       uint64
+	payload payloadInt64
+}
+
+// Valid reports whether the entry is occupied (its hash is not the
+// empty-slot sentinel).
+func (e entryInt64) Valid() bool { return e.h != sentinel }
+
+// Int64HashTable is a hashtable specifically for int64 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int64HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryInt64
+}
+
+// NewInt64HashTable returns a new hash table for int64 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt64HashTable(cap uint64) *Int64HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryInt64, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int64HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryInt64, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int64HashTable) CopyValues(out []int64) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+// out must be sized to hold every value with a memo index >= start.
+func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) {
+	h.VisitEntries(func(e *entryInt64) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values, little-endian encoded, to out in insertion order.
+func (h *Int64HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (h *Int64HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Int64Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryInt64) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEInt64(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the table has reached its load factor and must
+// grow before further inserts.
+func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps a hash that collides with the reserved sentinel (the
+// empty-slot marker) to an arbitrary non-sentinel value so it can be stored.
+func (Int64HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryInt64
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize grows the table to newcap slots (expected to be a power of two)
+// and rehashes every valid entry into the new slot array.
+func (h *Int64HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryInt64, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			// the always-false cmp makes lookup return the first empty
+			// slot in e's probe sequence
+			idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int64MemoTable struct {
+	tbl     *Int64HashTable
+	nullIdx int32
+}
+
+// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt64MemoTable(num int64) *Int64MemoTable {
+	return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// TypeTraits returns the type traits describing the underlying value type.
+func (Int64MemoTable) TypeTraits() TypeTraits {
+	return arrow.Int64Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int64MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int64MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int64MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		// null takes the next memo index; it is counted by Size via GetNull
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int64MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]int64))
+}
+
+// WriteOut copies the values to out, reinterpreting the bytes as a []int64,
+// in native byte order.
+func (s *Int64MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Int64Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Int64Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out as little-endian encoded bytes.
+func (s *Int64MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes the values with a memo
+// index >= start.
+func (s *Int64MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int64MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(int64)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(int64)), 0)
+	e, ok := s.tbl.Lookup(h, func(v int64) bool {
+		return val.(int64) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(int64), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented
+func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+// payloadUint32 pairs a stored value with the memo index it was assigned
+// on first insertion.
+type payloadUint32 struct {
+	val     uint32
+	memoIdx int32
+}
+
+// entryUint32 is a single open-addressing slot; h == sentinel marks it empty.
+type entryUint32 struct {
+	h       uint64
+	payload payloadUint32
+}
+
+// Valid reports whether the entry is occupied (its hash is not the
+// empty-slot sentinel).
+func (e entryUint32) Valid() bool { return e.h != sentinel }
+
+// Uint32HashTable is a hashtable specifically for uint32 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Uint32HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryUint32
+}
+
+// NewUint32HashTable returns a new hash table for uint32 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewUint32HashTable(cap uint64) *Uint32HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Uint32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryUint32, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Uint32HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryUint32, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Uint32HashTable) CopyValues(out []uint32) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+// out must be sized to hold every value with a memo index >= start.
+func (h *Uint32HashTable) CopyValuesSubset(start int, out []uint32) {
+	h.VisitEntries(func(e *entryUint32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values, little-endian encoded, to out in insertion order.
+func (h *Uint32HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (h *Uint32HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Uint32Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryUint32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEUint32(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the table has reached its load factor and must
+// grow before further inserts.
+func (h *Uint32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps a hash that collides with the reserved sentinel (the
+// empty-slot marker) to an arbitrary non-sentinel value so it can be stored.
+func (Uint32HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Uint32HashTable) Lookup(v uint64, cmp func(uint32) bool) (*entryUint32, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Uint32HashTable) lookup(v uint64, szMask uint64, cmp func(uint32) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryUint32
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize grows the table to newcap slots (expected to be a power of two)
+// and rehashes every valid entry into the new slot array.
+func (h *Uint32HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryUint32, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			// the always-false cmp makes lookup return the first empty
+			// slot in e's probe sequence
+			idx, _ := h.lookup(e.h, newMask, func(uint32) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Uint32HashTable) Insert(e *entryUint32, v uint64, val uint32, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Uint32HashTable) VisitEntries(visit func(*entryUint32)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Uint32MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Uint32MemoTable struct {
+	tbl     *Uint32HashTable
+	nullIdx int32
+}
+
+// NewUint32MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewUint32MemoTable(num int64) *Uint32MemoTable {
+	return &Uint32MemoTable{tbl: NewUint32HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// TypeTraits returns the type traits describing the underlying value type.
+func (Uint32MemoTable) TypeTraits() TypeTraits {
+	return arrow.Uint32Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Uint32MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Uint32MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Uint32MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Uint32MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		// null takes the next memo index; it is counted by Size via GetNull
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Uint32MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Uint32MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]uint32))
+}
+
+// WriteOut copies the values to out, reinterpreting the bytes as a []uint32,
+// in native byte order.
+func (s *Uint32MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Uint32Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (s *Uint32MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Uint32Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out as little-endian encoded bytes.
+func (s *Uint32MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes the values with a memo
+// index >= start.
+func (s *Uint32MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Uint32MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(uint32)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v uint32) bool { return val.(uint32) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(uint32)), 0)
+	e, ok := s.tbl.Lookup(h, func(v uint32) bool {
+		return val.(uint32) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(uint32), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented
+func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+// payloadUint64 pairs a stored value with the memo index it was assigned
+// on first insertion.
+type payloadUint64 struct {
+	val     uint64
+	memoIdx int32
+}
+
+// entryUint64 is a single open-addressing slot; h == sentinel marks it empty.
+type entryUint64 struct {
+	h       uint64
+	payload payloadUint64
+}
+
+// Valid reports whether the entry is occupied (its hash is not the
+// empty-slot sentinel).
+func (e entryUint64) Valid() bool { return e.h != sentinel }
+
+// Uint64HashTable is a hashtable specifically for uint64 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Uint64HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryUint64
+}
+
+// NewUint64HashTable returns a new hash table for uint64 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewUint64HashTable(cap uint64) *Uint64HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Uint64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryUint64, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Uint64HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryUint64, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Uint64HashTable) CopyValues(out []uint64) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+// out must be sized to hold every value with a memo index >= start.
+func (h *Uint64HashTable) CopyValuesSubset(start int, out []uint64) {
+	h.VisitEntries(func(e *entryUint64) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values, little-endian encoded, to out in insertion order.
+func (h *Uint64HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (h *Uint64HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Uint64Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryUint64) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEUint64(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the table has reached its load factor and must
+// grow before further inserts.
+func (h *Uint64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps a hash that collides with the reserved sentinel (the
+// empty-slot marker) to an arbitrary non-sentinel value so it can be stored.
+func (Uint64HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Uint64HashTable) Lookup(v uint64, cmp func(uint64) bool) (*entryUint64, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Uint64HashTable) lookup(v uint64, szMask uint64, cmp func(uint64) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryUint64
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize grows the table to newcap slots (expected to be a power of two)
+// and rehashes every valid entry into the new slot array.
+func (h *Uint64HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryUint64, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			// the always-false cmp makes lookup return the first empty
+			// slot in e's probe sequence
+			idx, _ := h.lookup(e.h, newMask, func(uint64) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Uint64HashTable) Insert(e *entryUint64, v uint64, val uint64, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Uint64HashTable) VisitEntries(visit func(*entryUint64)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Uint64MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Uint64MemoTable struct {
+	tbl     *Uint64HashTable
+	nullIdx int32
+}
+
+// NewUint64MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewUint64MemoTable(num int64) *Uint64MemoTable {
+	return &Uint64MemoTable{tbl: NewUint64HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// TypeTraits returns the type traits describing the underlying value type.
+func (Uint64MemoTable) TypeTraits() TypeTraits {
+	return arrow.Uint64Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Uint64MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Uint64MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Uint64MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Uint64MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		// null takes the next memo index; it is counted by Size via GetNull
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Uint64MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Uint64MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]uint64))
+}
+
+// WriteOut copies the values to out, reinterpreting the bytes as a []uint64,
+// in native byte order.
+func (s *Uint64MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Uint64Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (s *Uint64MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Uint64Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out as little-endian encoded bytes.
+func (s *Uint64MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes the values with a memo
+// index >= start.
+func (s *Uint64MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Uint64MemoTable) Get(val interface{}) (int, bool) {
+
+	h := hashInt(uint64(val.(uint64)), 0)
+	if e, ok := s.tbl.Lookup(h, func(v uint64) bool { return val.(uint64) == v }); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+	h := hashInt(uint64(val.(uint64)), 0)
+	e, ok := s.tbl.Lookup(h, func(v uint64) bool {
+		return val.(uint64) == v
+	})
+
+	if ok {
+		idx = int(e.payload.memoIdx)
+		found = true
+	} else {
+		idx = s.Size()
+		s.tbl.Insert(e, h, val.(uint64), int32(idx))
+	}
+	return
+}
+
+// GetOrInsertBytes is unimplemented
+func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	panic("unimplemented")
+}
+
+// payloadFloat32 pairs a stored value with the memo index it was assigned
+// on first insertion.
+type payloadFloat32 struct {
+	val     float32
+	memoIdx int32
+}
+
+// entryFloat32 is a single open-addressing slot; h == sentinel marks it empty.
+type entryFloat32 struct {
+	h       uint64
+	payload payloadFloat32
+}
+
+// Valid reports whether the entry is occupied (its hash is not the
+// empty-slot sentinel).
+func (e entryFloat32) Valid() bool { return e.h != sentinel }
+
+// Float32HashTable is a hashtable specifically for float32 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Float32HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entryFloat32
+}
+
+// NewFloat32HashTable returns a new hash table for float32 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewFloat32HashTable(cap uint64) *Float32HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entryFloat32, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Float32HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entryFloat32, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Float32HashTable) CopyValues(out []float32) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+// out must be sized to hold every value with a memo index >= start.
+func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) {
+	h.VisitEntries(func(e *entryFloat32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			out[idx] = e.payload.val
+		}
+	})
+}
+
+// WriteOut writes the values, little-endian encoded, to out in insertion order.
+func (h *Float32HashTable) WriteOut(out []byte) {
+	h.WriteOutSubset(0, out)
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (h *Float32HashTable) WriteOutSubset(start int, out []byte) {
+	data := arrow.Float32Traits.CastFromBytes(out)
+	h.VisitEntries(func(e *entryFloat32) {
+		idx := e.payload.memoIdx - int32(start)
+		if idx >= 0 {
+			data[idx] = utils.ToLEFloat32(e.payload.val)
+		}
+	})
+}
+
+// needUpsize reports whether the table has reached its load factor and must
+// grow before further inserts.
+func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+// fixHash remaps a hash that collides with the reserved sentinel (the
+// empty-slot marker) to an arbitrary non-sentinel value so it can be stored.
+func (Float32HashTable) fixHash(v uint64) uint64 {
+	if v == sentinel {
+		return 42
+	}
+	return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) {
+	idx, ok := h.lookup(v, h.capMask, cmp)
+	return &h.entries[idx], ok
+}
+
+func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) {
+	const perturbShift uint8 = 5
+
+	var (
+		idx     uint64
+		perturb uint64
+		e       *entryFloat32
+	)
+
+	v = h.fixHash(v)
+	idx = v & szMask
+	perturb = (v >> uint64(perturbShift)) + 1
+
+	for {
+		e = &h.entries[idx]
+		if e.h == v && cmp(e.payload.val) {
+			return idx, true
+		}
+
+		if e.h == sentinel {
+			return idx, false
+		}
+
+		// perturbation logic inspired from CPython's set/dict object
+		// the goal is that all 64 bits of unmasked hash value eventually
+		// participate in the probing sequence, to minimize clustering
+		idx = (idx + perturb) & szMask
+		perturb = (perturb >> uint64(perturbShift)) + 1
+	}
+}
+
+// upsize grows the table to newcap slots (expected to be a power of two)
+// and rehashes every valid entry into the new slot array.
+func (h *Float32HashTable) upsize(newcap uint64) error {
+	newMask := newcap - 1
+
+	oldEntries := h.entries
+	h.entries = make([]entryFloat32, newcap)
+	for _, e := range oldEntries {
+		if e.Valid() {
+			// the always-false cmp makes lookup return the first empty
+			// slot in e's probe sequence
+			idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false })
+			h.entries[idx] = e
+		}
+	}
+	h.cap = newcap
+	h.capMask = newMask
+	return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error {
+	e.h = h.fixHash(v)
+	e.payload.val = val
+	e.payload.memoIdx = memoIdx
+	h.size++
+
+	if h.needUpsize() {
+		h.upsize(h.cap * uint64(loadFactor) * 2)
+	}
+	return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) {
+	for _, e := range h.entries {
+		if e.Valid() {
+			visit(&e)
+		}
+	}
+}
+
+// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Float32MemoTable struct {
+	tbl     *Float32HashTable
+	nullIdx int32
+}
+
+// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewFloat32MemoTable(num int64) *Float32MemoTable {
+	return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// TypeTraits returns the type traits describing the underlying value type.
+func (Float32MemoTable) TypeTraits() TypeTraits {
+	return arrow.Float32Traits
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Float32MemoTable) Reset() {
+	s.tbl.Reset(32)
+	s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Float32MemoTable) Size() int {
+	sz := int(s.tbl.size)
+	if _, ok := s.GetNull(); ok {
+		sz++
+	}
+	return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Float32MemoTable) GetNull() (int, bool) {
+	return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = s.GetNull()
+	if !found {
+		// null takes the next memo index; it is counted by Size via GetNull
+		idx = s.Size()
+		s.nullIdx = int32(idx)
+	}
+	return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Float32MemoTable) CopyValues(out interface{}) {
+	s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) {
+	s.tbl.CopyValuesSubset(start, out.([]float32))
+}
+
+// WriteOut copies the values to out, reinterpreting the bytes as a []float32,
+// in native byte order.
+func (s *Float32MemoTable) WriteOut(out []byte) {
+	s.tbl.CopyValues(arrow.Float32Traits.CastFromBytes(out))
+}
+
+// WriteOutSubset is like WriteOut but only writes the values with a memo
+// index >= start.
+func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) {
+	s.tbl.CopyValuesSubset(start, arrow.Float32Traits.CastFromBytes(out))
+}
+
+// WriteOutLE writes the values to out as little-endian encoded bytes.
+func (s *Float32MemoTable) WriteOutLE(out []byte) {
+	s.tbl.WriteOut(out)
+}
+
+// WriteOutSubsetLE is like WriteOutLE but only writes the values with a memo
+// index >= start.
+func (s *Float32MemoTable) WriteOutSubsetLE(start int, out []byte) {
+	s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Float32MemoTable) Get(val interface{}) (int, bool) {
+	var cmp func(float32) bool
+
+	if math.IsNaN(float64(val.(float32))) {
+		cmp = isNan32Cmp
+		// use consistent internal bit pattern for NaN regardless of the pattern
+		// that is passed to us. NaN is NaN is NaN
+		val = float32(math.NaN())
+	} else {
+		cmp = func(v float32) bool { return val.(float32) == v }
+	}
+
+	h := hashFloat32(val.(float32), 0)
+	if e, ok := s.tbl.Lookup(h, cmp); ok {
+		return int(e.payload.memoIdx), ok
+	}
+	return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ var cmp func(float32) bool
+
+ if math.IsNaN(float64(val.(float32))) {
+ cmp = isNan32Cmp
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = float32(math.NaN())
+ } else {
+ cmp = func(v float32) bool { return val.(float32) == v }
+ }
+
+ h := hashFloat32(val.(float32), 0)
+ e, ok := s.tbl.Lookup(h, cmp)
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(float32), int32(idx))
+ }
+ return
+}
+
// GetOrInsertBytes is unimplemented: raw byte keys only apply to the
// BinaryMemoTable, so the numeric tables panic here.
func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}
+
// payloadFloat64 is the value/index pair stored in an occupied slot.
type payloadFloat64 struct {
	val     float64 // the inserted value
	memoIdx int32   // insertion-order index into the dictionary
}

// entryFloat64 is a single hash table slot: the (fixed-up) hash and its payload.
type entryFloat64 struct {
	h       uint64
	payload payloadFloat64
}

// Valid reports whether the slot is occupied; an occupied slot never stores
// the sentinel hash (see fixHash).
func (e entryFloat64) Valid() bool { return e.h != sentinel }

// Float64HashTable is a hashtable specifically for float64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Float64HashTable struct {
	cap     uint64 // current capacity; always a power of two
	capMask uint64 // cap - 1, used to mask hashes into slot indices
	size    uint64 // number of occupied slots

	entries []entryFloat64
}
+
+// NewFloat64HashTable returns a new hash table for float64 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewFloat64HashTable(cap uint64) *Float64HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryFloat64, initCap)
+ return ret
+}
+
// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Float64HashTable) Reset(cap uint64) {
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entryFloat64, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Float64HashTable) CopyValues(out []float64) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
// out must be at least Size()-start elements long; entries with memoIdx < start
// are skipped.
func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) {
	h.VisitEntries(func(e *entryFloat64) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

// WriteOut writes all values into out as little-endian float64, in insertion order.
func (h *Float64HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

// WriteOutSubset is like WriteOut but only writes values whose insertion
// index is >= start, encoding each via utils.ToLEFloat64.
func (h *Float64HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.Float64Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entryFloat64) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			data[idx] = utils.ToLEFloat64(e.payload.val)
		}
	})
}
+
+func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Float64HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup is the open-addressed probe loop: it returns the index of either the
// matching occupied slot (true) or the first empty slot found for this hash
// (false), which is where a subsequent Insert should place the value.
func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryFloat64
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			// empty slot: the probe chain ends here, value not present
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}
+
// upsize rehashes every valid entry into a new slot array of capacity newcap.
// newcap is expected to be a power of two (the mask arithmetic relies on it).
func (h *Float64HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryFloat64, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// an always-false cmp makes lookup return the first empty slot
			// for this hash, which is where the entry is re-placed
			idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Float64MemoTable struct {
	tbl     *Float64HashTable // holds all inserted non-null values
	nullIdx int32             // dictionary index of an inserted null, or KeyNotFound
}

// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewFloat64MemoTable(num int64) *Float64MemoTable {
	return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// TypeTraits returns the arrow traits for float64, used by callers to size output buffers.
func (Float64MemoTable) TypeTraits() TypeTraits {
	return arrow.Float64Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
// The table shrinks back to the minimum capacity of 32.
func (s *Float64MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Float64MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Float64MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Float64MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]float64))
}

// WriteOut writes the values into out, reinterpreting out as a []float64.
func (s *Float64MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.Float64Traits.CastFromBytes(out))
}

// WriteOutSubset is like WriteOut but only writes values starting at index start.
func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out))
}

// WriteOutLE writes the values to out in little-endian byte order: the hash
// table's WriteOut path encodes via utils.ToLEFloat64.
func (s *Float64MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubsetLE is like WriteOutLE but starts writing at index start.
func (s *Float64MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Float64MemoTable) Get(val interface{}) (int, bool) {
+ var cmp func(float64) bool
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ } else {
+ cmp = func(v float64) bool { return val.(float64) == v }
+ }
+
+ h := hashFloat64(val.(float64), 0)
+ if e, ok := s.tbl.Lookup(h, cmp); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ var cmp func(float64) bool
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ } else {
+ cmp = func(v float64) bool { return val.(float64) == v }
+ }
+
+ h := hashFloat64(val.(float64), 0)
+ e, ok := s.tbl.Lookup(h, cmp)
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(float64), int32(idx))
+ }
+ return
+}
+
// GetOrInsertBytes is unimplemented: raw byte keys only apply to the
// BinaryMemoTable, so the numeric tables panic here.
func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl
new file mode 100644
index 000000000..527008ad6
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "github.com/apache/arrow/go/v15/arrow/bitutil"
+ "github.com/apache/arrow/go/v15/internal/utils"
+)
+
{{range .In}}
type payload{{.Name}} struct {
	val     {{.name}}
	memoIdx int32
}

type entry{{.Name}} struct {
	h       uint64
	payload payload{{.Name}}
}

func (e entry{{.Name}}) Valid() bool { return e.h != sentinel }

// {{.Name}}HashTable is a hashtable specifically for {{.name}} that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type {{.Name}}HashTable struct {
	cap     uint64
	capMask uint64
	size    uint64

	entries []entry{{.Name}}
}

// New{{.Name}}HashTable returns a new hash table for {{.name}} values
// initialized with the passed in capacity or 32 whichever is larger.
func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable {
	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0}
	ret.entries = make([]entry{{.Name}}, initCap)
	return ret
}

// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *{{.Name}}HashTable) Reset(cap uint64) {
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entry{{.Name}}, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) {
	h.VisitEntries(func(e *entry{{.Name}}) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

func (h *{{.Name}}HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.{{.Name}}Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entry{{.Name}}) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
{{if and (ne .Name "Int8") (ne .Name "Uint8") -}}
			data[idx] = utils.ToLE{{.Name}}(e.payload.val)
{{else -}}
			data[idx] = e.payload.val
{{end -}}
		}
	})
}

func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }

func ({{.Name}}HashTable) fixHash(v uint64) uint64 {
	if v == sentinel {
		return 42
	}
	return v
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entry{{.Name}}
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

func (h *{{.Name}}HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entry{{.Name}}, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) {
	for _, e := range h.entries {
		if e.Valid() {
			visit(&e)
		}
	}
}

// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type {{.Name}}MemoTable struct {
	tbl     *{{.Name}}HashTable
	nullIdx int32
}

// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable {
	return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound}
}

func ({{.Name}}MemoTable) TypeTraits() TypeTraits {
	return arrow.{{.Name}}Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *{{.Name}}MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *{{.Name}}MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *{{.Name}}MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *{{.Name}}MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]{{.name}}))
}

func (s *{{.Name}}MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.{{.Name}}Traits.CastFromBytes(out))
}

func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.{{.Name}}Traits.CastFromBytes(out))
}

func (s *{{.Name}}MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

func (s *{{.Name}}MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) {
{{if and (ne .Name "Float32") (ne .Name "Float64") }}
	h := hashInt(uint64(val.({{.name}})), 0)
	if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok {
{{ else -}}
	var cmp func({{.name}}) bool
	{{if eq .Name "Float32"}}
	if math.IsNaN(float64(val.(float32))) {
		cmp = isNan32Cmp
		// use consistent internal bit pattern for NaN regardless of the pattern
		// that is passed to us. NaN is NaN is NaN
		val = float32(math.NaN())
	{{ else -}}
	if math.IsNaN(val.(float64)) {
		cmp = math.IsNaN
		// use consistent internal bit pattern for NaN regardless of the pattern
		// that is passed to us. NaN is NaN is NaN
		val = math.NaN()
	{{end -}}
	} else {
		cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
	}

	h := hash{{.Name}}(val.({{.name}}), 0)
	if e, ok := s.tbl.Lookup(h, cmp); ok {
{{ end -}}
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
	{{if and (ne .Name "Float32") (ne .Name "Float64") }}
	h := hashInt(uint64(val.({{.name}})), 0)
	e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool {
		return val.({{.name}}) == v
	})
{{ else }}
	var cmp func({{.name}}) bool
	{{if eq .Name "Float32"}}
	if math.IsNaN(float64(val.(float32))) {
		cmp = isNan32Cmp
		// use consistent internal bit pattern for NaN regardless of the pattern
		// that is passed to us. NaN is NaN is NaN
		val = float32(math.NaN())
	{{ else -}}
	if math.IsNaN(val.(float64)) {
		cmp = math.IsNaN
		// use consistent internal bit pattern for NaN regardless of the pattern
		// that is passed to us. NaN is NaN is NaN
		val = math.NaN()
	{{end -}}
	} else {
		cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
	}

	h := hash{{.Name}}(val.({{.name}}), 0)
	e, ok := s.tbl.Lookup(h, cmp)
{{ end }}
	if ok {
		idx = int(e.payload.memoIdx)
		found = true
	} else {
		idx = s.Size()
		s.tbl.Insert(e, h, val.({{.name}}), int32(idx))
	}
	return
}


// GetOrInsertBytes is unimplemented
func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}
{{end}}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go
new file mode 100644
index 000000000..283bc1a95
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go
@@ -0,0 +1,443 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hashing provides utilities for and an implementation of a hash
+// table which is more performant than the default go map implementation
+// by leveraging xxh3 and some custom hash functions.
+package hashing
+
+import (
+ "bytes"
+ "math"
+ "reflect"
+ "unsafe"
+)
+
+//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl
+
// TypeTraits exposes the byte-size requirements of the values a memo table
// stores (satisfied by the arrow type traits returned from each table's
// TypeTraits method).
type TypeTraits interface {
	// BytesRequired returns the number of bytes needed to hold n values.
	BytesRequired(n int) int
}

// ByteSlice is implemented by any value that can expose its contents as a
// byte slice; such values can be used as keys in a BinaryMemoTable.
type ByteSlice interface {
	Bytes() []byte
}
+
// MemoTable interface for hash tables and dictionary encoding.
//
// Values will remember the order they are inserted to generate a valid
// dictionary.
type MemoTable interface {
	TypeTraits() TypeTraits
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in
	// the table, including whether or not a null value has been
	// inserted via GetOrInsertNull.
	Size() int
	// GetOrInsert returns the index of the table the specified value is,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetOrInsertBytes returns the index of the table the specified value is,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table. This function is intended to be used by
	// the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}.
	GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
	// GetOrInsertNull returns the index of the null value in the table,
	// inserting one if it hasn't already been inserted. It returns a boolean
	// indicating if the null value already existed or not in the table.
	GetOrInsertNull() (idx int, existed bool)
	// GetNull returns the index of the null value in the table, but does not
	// insert one if it doesn't already exist. Will return -1 (KeyNotFound)
	// if it doesn't exist, indicated by a false value for the boolean.
	GetNull() (idx int, exists bool)
	// WriteOut copies the unique values of the memotable out to the byte slice
	// provided. Must have allocated enough bytes for all the values.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut, but only writes a subset of values
	// starting with the index offset.
	WriteOutSubset(offset int, out []byte)
}

// NumericMemoTable extends MemoTable with explicitly little-endian output
// for the fixed-width numeric tables.
type NumericMemoTable interface {
	MemoTable
	WriteOutLE(out []byte)
	WriteOutSubsetLE(offset int, out []byte)
}
+
const (
	// sentinel is the hash value reserved to mark an empty slot; fixHash
	// remaps legitimate hashes that collide with it.
	sentinel uint64 = 0
	// loadFactor triggers a grow once size*loadFactor reaches capacity
	// (i.e. tables are kept at most half full).
	loadFactor int64 = 2
)
+
// max returns the larger of the two unsigned values a and b.
func max(a, b uint64) uint64 {
	if a < b {
		return b
	}
	return a
}
+
// isNan32Cmp adapts math.IsNaN (which takes a float64) for float32 values.
var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1
+
// BinaryBuilderIFace is the subset of binary-builder operations that the
// BinaryMemoTable needs (see NewBinaryMemoTable, which is documented as
// taking a BinaryBuilder).
type BinaryBuilderIFace interface {
	Reserve(int)
	ReserveData(int)
	Retain()
	Resize(int)
	ResizeData(int)
	Release()
	DataLen() int
	Value(int) []byte
	Len() int
	AppendNull()
	AppendString(string)
	Append([]byte)
}

// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	tbl     *Int32HashTable // payload val is the builder index of each value
	builder BinaryBuilderIFace
	nullIdx int // builder index of an inserted null, or KeyNotFound
}
+
+// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
+// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
+// initial and valuesize can be used to pre-allocate the table to reduce allocations. With
+// initial being the initial number of entries to allocate for and valuesize being the starting
+// amount of space allocated for writing the actual binary data.
+func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable {
+ bldr.Reserve(int(initial))
+ datasize := valuesize
+ if datasize <= 0 {
+ datasize = initial * 4
+ }
+ bldr.ReserveData(datasize)
+ return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
+}
+
// unimplementedtraits is a placeholder TypeTraits for tables whose elements
// have no fixed width (binary data).
type unimplementedtraits struct{}

// BytesRequired panics: byte requirements are undefined for variable-length data.
func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") }

// TypeTraits returns a placeholder whose methods panic; binary tables have
// no fixed-width traits.
func (BinaryMemoTable) TypeTraits() TypeTraits {
	return unimplementedtraits{}
}
+
+// Reset dumps all of the data in the table allowing it to be reutilized.
+func (s *BinaryMemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.builder.Resize(0)
+ s.builder.ResizeData(0)
+ s.builder.Reserve(int(32))
+ s.builder.ReserveData(int(32) * 4)
+ s.nullIdx = KeyNotFound
+}
+
+// GetNull returns the index of a null that has been inserted into the table or
+// KeyNotFound. The bool returned will be true if there was a null inserted into
+// the table, and false otherwise.
+func (s *BinaryMemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// Size returns the current size of the memo table including the null value
+// if one has been inserted.
+func (s *BinaryMemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
// helper function to easily return a byte slice for any given value
// regardless of the type if it's a []byte, string, or fulfills the
// ByteSlice interface.
func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
	switch v := val.(type) {
	case []byte:
		return v
	case ByteSlice:
		return v.Bytes()
	case string:
		// zero-copy string -> []byte conversion: the returned slice aliases
		// the string's backing bytes and must never be mutated.
		// NOTE(review): reflect.StringHeader/SliceHeader are deprecated;
		// unsafe.Slice(unsafe.StringData(v), len(v)) is the modern form but
		// needs Go >= 1.20 — confirm the module's minimum Go version (a
		// sibling file here still targets go1.19) before switching.
		var out []byte
		h := (*reflect.StringHeader)(unsafe.Pointer(&v))
		s := (*reflect.SliceHeader)(unsafe.Pointer(&out))
		s.Data = h.Data
		s.Len = h.Len
		s.Cap = h.Len
		return out
	default:
		panic("invalid type for binarymemotable")
	}
}
+
+// helper function to get the hash value regardless of the underlying binary type
+func (BinaryMemoTable) getHash(val interface{}) uint64 {
+ switch v := val.(type) {
+ case string:
+ return hashString(v, 0)
+ case []byte:
+ return Hash(v, 0)
+ case ByteSlice:
+ return Hash(v.Bytes(), 0)
+ default:
+ panic("invalid type for binarymemotable")
+ }
+}
+
+// helper function to append the given value to the builder regardless
+// of the underlying binary type.
+func (b *BinaryMemoTable) appendVal(val interface{}) {
+ switch v := val.(type) {
+ case string:
+ b.builder.AppendString(v)
+ case []byte:
+ b.builder.Append(v)
+ case ByteSlice:
+ b.builder.Append(v.Bytes())
+ }
+}
+
// lookup finds the slot for hash h whose stored builder index refers to a
// value byte-equal to val; the bool reports whether such an entry exists.
func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
	return b.tbl.Lookup(h, func(i int32) bool {
		return bytes.Equal(val, b.builder.Value(int(i)))
	})
}

// Get returns the index of the specified value in the table or KeyNotFound,
// and a boolean indicating whether it was found in the table.
func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
	if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
		return int(p.payload.val), ok
	}
	return KeyNotFound, false
}
+
+// GetOrInsertBytes returns the index of the given value in the table, if not found
+// it is inserted into the table. The return value 'found' indicates whether the value
+// was found in the table (true) or inserted (false) along with any possible error.
+// As written err is always nil; it exists to satisfy the MemoTable interface.
+func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
+	h := Hash(val, 0)
+	p, found := b.lookup(h, val)
+	if found {
+		idx = int(p.payload.val)
+	} else {
+		// new value: its index is the current size, and the entry pointer
+		// returned by the failed lookup is reused for the insertion slot.
+		idx = b.Size()
+		b.builder.Append(val)
+		b.tbl.Insert(p, h, int32(idx), -1)
+	}
+	return
+}
+
+// GetOrInsert returns the index of the given value in the table, if not found
+// it is inserted into the table. The return value 'found' indicates whether the value
+// was found in the table (true) or inserted (false) along with any possible error.
+// val may be a string, []byte or ByteSlice; any other type panics (via getHash).
+// As written err is always nil; it exists to satisfy the MemoTable interface.
+func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+	h := b.getHash(val)
+	p, found := b.lookup(h, b.valAsByteSlice(val))
+	if found {
+		idx = int(p.payload.val)
+	} else {
+		// new value: index is the current size; reuse the miss entry slot.
+		idx = b.Size()
+		b.appendVal(val)
+		b.tbl.Insert(p, h, int32(idx), -1)
+	}
+	return
+}
+
+// GetOrInsertNull retrieves the index of a null in the table or inserts
+// null into the table, returning the index and a boolean indicating if it was
+// found in the table (true) or was inserted (false). The null occupies a
+// regular (zero-length) slot in the builder; its index is cached in nullIdx.
+func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
+	idx, found = b.GetNull()
+	if !found {
+		idx = b.Size()
+		b.nullIdx = idx
+		b.builder.AppendNull()
+	}
+	return
+}
+
+// Value returns the raw bytes stored at index i in the table.
+func (b *BinaryMemoTable) Value(i int) []byte {
+	return b.builder.Value(i)
+}
+
+// findOffset returns the address (as a uintptr into the builder's data
+// buffer) of the value at idx. Empty values have no bytes of their own, so
+// the search advances to the next non-empty value, whose start address is
+// also the offset of every empty value before it.
+func (b *BinaryMemoTable) findOffset(idx int) uintptr {
+	if b.builder.DataLen() == 0 {
+		// only empty strings, short circuit
+		return 0
+	}
+
+	val := b.builder.Value(idx)
+	for len(val) == 0 {
+		idx++
+		if idx >= b.builder.Len() {
+			break
+		}
+		val = b.builder.Value(idx)
+	}
+	if len(val) != 0 {
+		return uintptr(unsafe.Pointer(&val[0]))
+	}
+	// every value from idx to the end is empty: the offset is one past the
+	// last data byte, i.e. base address (findOffset(0)) plus total data length.
+	return uintptr(b.builder.DataLen()) + b.findOffset(0)
+}
+
+// CopyOffsets copies the list of offsets into the passed in slice, the offsets
+// being the start and end values of the underlying allocated bytes in the builder
+// for the individual values of the table. out should be at least sized to Size()+1
+// (the final element is the end offset of the last value).
+func (b *BinaryMemoTable) CopyOffsets(out []int32) {
+	b.CopyOffsetsSubset(0, out)
+}
+
+// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
+// it gets a subset of the offsets in the table starting at the index provided by "start".
+// Offsets are rebased so out[0] is 0; out must hold at least Size()-start+1 elements.
+func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) {
+	if b.builder.Len() <= start {
+		return
+	}
+
+	// delta is the address of the first copied value; subtracting it turns
+	// absolute buffer addresses into offsets relative to the subset start.
+	first := b.findOffset(0)
+	delta := b.findOffset(start)
+	sz := b.Size()
+	for i := start; i < sz; i++ {
+		offset := int32(b.findOffset(i) - delta)
+		out[i-start] = offset
+	}
+
+	// trailing end-offset: total data length minus the bytes skipped before start.
+	out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first)))
+}
+
+// CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets
+// being the start and end values of the underlying allocated bytes in the builder
+// for the individual values of the table. out should be at least sized to Size()+1.
+// It is the 64-bit-offset (LargeBinary) counterpart of CopyOffsets.
+func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) {
+	b.CopyLargeOffsetsSubset(0, out)
+}
+
+// CopyLargeOffsetsSubset is like CopyLargeOffsets but instead of copying all of
+// the offsets, it gets a subset of the offsets in the table starting at the index
+// provided by "start". Offsets are rebased so out[0] is 0; out must hold at least
+// Size()-start+1 elements. Mirrors CopyOffsetsSubset with int64 offsets.
+func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) {
+	if b.builder.Len() <= start {
+		return
+	}
+
+	first := b.findOffset(0)
+	delta := b.findOffset(start)
+	sz := b.Size()
+	for i := start; i < sz; i++ {
+		offset := int64(b.findOffset(i) - delta)
+		out[i-start] = offset
+	}
+
+	// trailing end-offset: total data length minus the bytes skipped before start.
+	out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first)))
+}
+
+// CopyValues copies the raw binary data bytes out, out should be a []byte
+// with at least ValuesSize bytes allocated to copy into.
+func (b *BinaryMemoTable) CopyValues(out interface{}) {
+	b.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies the raw binary data bytes out starting with the value
+// at the index start, out should be a []byte with at least ValuesSize bytes allocated.
+// out is typed interface{} (asserted to []byte) to satisfy the MemoTable interface.
+func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
+	if b.builder.Len() <= start {
+		return
+	}
+
+	var (
+		first  = b.findOffset(0)
+		offset = b.findOffset(int(start))
+		length = b.builder.DataLen() - int(offset-first)
+	)
+
+	outval := out.([]byte)
+	// NOTE(review): this reslices Value(start) past its own length out to the
+	// end of the data buffer — it relies on the builder storing all values
+	// contiguously in one allocation; confirm before changing builder internals.
+	copy(outval, b.builder.Value(start)[0:length])
+}
+
+// WriteOut copies all raw value bytes into out (see CopyValues).
+func (b *BinaryMemoTable) WriteOut(out []byte) {
+	b.CopyValues(out)
+}
+
+// WriteOutSubset copies the raw value bytes starting at index start into out
+// (see CopyValuesSubset).
+func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
+	b.CopyValuesSubset(start, out)
+}
+
+// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
+// track of the fixed width when inserting the null value: the databuffer holds a
+// zero length byte slice for the null value (if found), so when copying out
+// fixed-width values a width-sized gap must be left in out at the null's slot.
+func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
+	if start >= b.Size() {
+		return
+	}
+
+	null, exists := b.GetNull()
+	if !exists || null < start {
+		// nothing to skip, proceed as usual
+		b.CopyValuesSubset(start, out)
+		return
+	}
+
+	// the null splits the copy into two ranges: [start, null) and (null, end].
+	var (
+		leftOffset  = b.findOffset(start)
+		nullOffset  = b.findOffset(null)
+		leftSize    = nullOffset - leftOffset
+		rightOffset = leftOffset + uintptr(b.ValuesSize())
+	)
+
+	if leftSize > 0 {
+		// NOTE(review): like CopyValuesSubset, these copies reslice a single
+		// value past its length, relying on contiguous builder storage.
+		copy(out, b.builder.Value(start)[0:leftSize])
+	}
+
+	rightSize := rightOffset - nullOffset
+	if rightSize > 0 {
+		// skip the null fixed size value
+		copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize])
+	}
+}
+
+// VisitValues exists to run the visitFn on each value currently in the hash
+// table, beginning at index start. The visited slices alias builder storage
+// and should not be retained by visitFn.
+func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
+	for i := int(start); i < b.Size(); i++ {
+		visitFn(b.builder.Value(i))
+	}
+}
+
+// Release is used to tell the underlying builder that it can release the memory allocated
+// when the reference count reaches 0, this is safe to be called from multiple goroutines
+// simultaneously.
+func (b *BinaryMemoTable) Release() { b.builder.Release() }
+
+// Retain increases the ref count, it is safe to call it from multiple goroutines
+// simultaneously.
+func (b *BinaryMemoTable) Retain() { b.builder.Retain() }
+
+// ValuesSize returns the current total size of all the raw bytes that have been inserted
+// into the memotable so far.
+func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/json/json.go b/vendor/github.com/apache/arrow/go/v15/internal/json/json.go
new file mode 100644
index 000000000..319b12c55
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/json/json.go
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !tinygo
+// +build !tinygo
+
+package json
+
+import (
+ "io"
+
+ "github.com/goccy/go-json"
+)
+
+// Type aliases re-exporting goccy/go-json so the rest of the module imports a
+// single internal json package regardless of the underlying implementation
+// (the tinygo build swaps in encoding/json — see json_tinygo.go).
+type Decoder = json.Decoder
+type Encoder = json.Encoder
+type Marshaler = json.Marshaler
+type Delim = json.Delim
+type UnmarshalTypeError = json.UnmarshalTypeError
+type Number = json.Number
+type Unmarshaler = json.Unmarshaler
+type RawMessage = json.RawMessage
+
+// Marshal delegates to the underlying json implementation.
+func Marshal(v interface{}) ([]byte, error) {
+	return json.Marshal(v)
+}
+
+// Unmarshal delegates to the underlying json implementation.
+func Unmarshal(data []byte, v interface{}) error {
+	return json.Unmarshal(data, v)
+}
+
+// NewDecoder returns a decoder reading from r.
+func NewDecoder(r io.Reader) *Decoder {
+	return json.NewDecoder(r)
+}
+
+// NewEncoder returns an encoder writing to w.
+func NewEncoder(w io.Writer) *Encoder {
+	return json.NewEncoder(w)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go b/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go
new file mode 100644
index 000000000..8e4f447b3
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build tinygo
+// +build tinygo
+
+package json
+
+import (
+ "io"
+
+ "encoding/json"
+)
+
+// Type aliases re-exporting encoding/json for tinygo builds, which cannot use
+// goccy/go-json; mirrors the non-tinygo file json.go exactly so callers are
+// unaffected by the build tag.
+type Decoder = json.Decoder
+type Encoder = json.Encoder
+type Marshaler = json.Marshaler
+type Delim = json.Delim
+type UnmarshalTypeError = json.UnmarshalTypeError
+type Number = json.Number
+type Unmarshaler = json.Unmarshaler
+type RawMessage = json.RawMessage
+
+// Marshal delegates to encoding/json.
+func Marshal(v interface{}) ([]byte, error) {
+	return json.Marshal(v)
+}
+
+// Unmarshal delegates to encoding/json.
+func Unmarshal(data []byte, v interface{}) error {
+	return json.Unmarshal(data, v)
+}
+
+// NewDecoder returns a decoder reading from r.
+func NewDecoder(r io.Reader) *Decoder {
+	return json.NewDecoder(r)
+}
+
+// NewEncoder returns an encoder writing to w.
+func NewEncoder(w io.Writer) *Encoder {
+	return json.NewEncoder(w)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile b/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile
new file mode 100644
index 000000000..fded9d1d5
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this converts rotate instructions from "ro[lr] <reg>" -> "ro[lr] <reg>, 1" for yasm compatibility
+PERL_FIXUP_ROTATE=perl -i -pe 's/(ro[rl]\s+\w{2,3})$$/\1, 1/'
+
+C2GOASM=c2goasm
+CC=clang-11
+C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 \
+ -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib
+ASM_FLAGS_AVX2=-mavx2 -mfma
+ASM_FLAGS_SSE4=-msse4
+ASM_FLAGS_BMI2=-mbmi2
+ASM_FLAGS_POPCNT=-mpopcnt
+
+C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unwind-tables -mno-red-zone -mstackrealign -fno-exceptions \
+ -fno-rtti -fno-builtin -ffast-math -fno-jump-tables -I_lib
+
+GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go')
+ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go')
+
+# ".PHONEY" was a typo: make only recognizes the special target ".PHONY",
+# so "assembly" (and "clean", which also names no file) were not marked phony.
+.PHONY: assembly clean
+
+INTEL_SOURCES := \
+ min_max_avx2_amd64.s min_max_sse4_amd64.s transpose_ints_avx2_amd64.s transpose_ints_sse4_amd64.s
+
+#
+# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
+# min_max_neon_arm64.s was generated by asm2plan9s.
+# And manually formatted it as the Arm64 Plan9.
+#
+
+assembly: $(INTEL_SOURCES)
+
+_lib/min_max_avx2_amd64.s: _lib/min_max.c
+ $(CC) -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/min_max_sse4_amd64.s: _lib/min_max.c
+ $(CC) -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/min_max_neon.s: _lib/min_max.c
+ $(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/transpose_ints_avx2_amd64.s: _lib/transpose_ints.c
+ $(CC) -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/transpose_ints_sse4_amd64.s: _lib/transpose_ints.c
+ $(CC) -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/transpose_ints_neon.s: _lib/transpose_ints.c
+ $(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+min_max_avx2_amd64.s: _lib/min_max_avx2_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+min_max_sse4_amd64.s: _lib/min_max_sse4_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+transpose_ints_avx2_amd64.s: _lib/transpose_ints_avx2_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+transpose_ints_sse4_amd64.s: _lib/transpose_ints_sse4_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+clean:
+ rm -f $(INTEL_SOURCES)
+ rm -f $(addprefix _lib/,$(INTEL_SOURCES))
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go
new file mode 100644
index 000000000..0b2381da1
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "bufio"
+ "errors"
+ "fmt"
+ "io"
+)
+
+// bufferedReader is similar to bufio.Reader except
+// it will expand the buffer if necessary when asked to Peek
+// more bytes than are in the buffer.
+type bufferedReader struct {
+	bufferSz int       // target size of buf; applied by resetBuffer
+	buf      []byte    // internal buffer
+	r, w     int       // buf[r:w] holds the currently buffered, unread bytes
+	rd       io.Reader // underlying reader
+	err      error     // sticky read error, surfaced once via readErr
+}
+
+// NewBufferedReader returns a buffered reader with similar semantics to bufio.Reader
+// except Peek will expand the internal buffer if needed rather than return
+// an error.
+func NewBufferedReader(rd io.Reader, sz int) *bufferedReader {
+	// if rd is already a bufferedReader whose buffer is >= the requested size
+	// then just return it as is. no need to make a new object.
+	b, ok := rd.(*bufferedReader)
+	if ok && len(b.buf) >= sz {
+		return b
+	}
+
+	r := &bufferedReader{
+		rd: rd,
+	}
+	r.resizeBuffer(sz)
+	return r
+}
+
+// resetBuffer (re)allocates buf to bufferSz bytes: allocating fresh when nil,
+// growing with a copy of existing contents, or shrinking in place by reslicing.
+func (b *bufferedReader) resetBuffer() {
+	if b.buf == nil {
+		b.buf = make([]byte, b.bufferSz)
+	} else if b.bufferSz > cap(b.buf) {
+		buf := b.buf
+		b.buf = make([]byte, b.bufferSz)
+		copy(b.buf, buf)
+	} else {
+		b.buf = b.buf[:b.bufferSz]
+	}
+}
+
+// resizeBuffer sets the target buffer size and applies it via resetBuffer.
+func (b *bufferedReader) resizeBuffer(newSize int) {
+	b.bufferSz = newSize
+	b.resetBuffer()
+}
+
+// fill reads at least one more byte from the underlying reader into buf,
+// sliding any unread bytes to the front first. A read error from the
+// underlying reader is stored in b.err (sticky) rather than returned; the
+// returned error is only for bufferedReader's own invariant violations.
+func (b *bufferedReader) fill() error {
+	// slide existing data to the beginning
+	if b.r > 0 {
+		copy(b.buf, b.buf[b.r:b.w])
+		b.w -= b.r
+		b.r = 0
+	}
+
+	if b.w >= len(b.buf) {
+		return fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrBufferFull)
+	}
+
+	n, err := io.ReadAtLeast(b.rd, b.buf[b.w:], 1)
+	if n < 0 {
+		return fmt.Errorf("arrow/bufferedreader: filling buffer: %w", bufio.ErrNegativeCount)
+	}
+
+	b.w += n
+	b.err = err
+	return nil
+}
+
+// readErr returns and clears the sticky error from the underlying reader,
+// so each error is surfaced to the caller exactly once.
+func (b *bufferedReader) readErr() error {
+	err := b.err
+	b.err = nil
+	return err
+}
+
+// Buffered returns the number of bytes currently buffered
+func (b *bufferedReader) Buffered() int { return b.w - b.r }
+
+// SetBufferSize resets the size of the internal buffer to the desired size.
+// Will return an error if newSize is <= 0 or if newSize is less than the size
+// of the buffered data (shrinking would drop unread bytes).
+func (b *bufferedReader) SetBufferSize(newSize int) error {
+	if newSize <= 0 {
+		return errors.New("buffer size should be positive")
+	}
+
+	if b.w >= newSize {
+		return errors.New("cannot shrink read buffer if buffered data remains")
+	}
+
+	b.resizeBuffer(newSize)
+	return nil
+}
+
+// Peek will buffer and return n bytes from the underlying reader without advancing
+// the reader itself. If n is larger than the current buffer size, the buffer will
+// be expanded to accommodate the extra bytes rather than error.
+func (b *bufferedReader) Peek(n int) ([]byte, error) {
+	if n < 0 {
+		return nil, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+	}
+
+	if n > len(b.buf) {
+		if err := b.SetBufferSize(n); err != nil {
+			return nil, err
+		}
+	}
+
+	// keep filling until n bytes are buffered, the buffer is full, or a read
+	// error was recorded.
+	for b.w-b.r < n && b.w-b.r < len(b.buf) && b.err == nil {
+		b.fill() // b.w-b.r < len(b.buf) => buffer is not full
+	}
+
+	// NOTE(review): if a read error stopped the loop before n bytes arrived,
+	// the returned slice extends past b.w into unfilled buffer space; callers
+	// appear expected to check the returned error first — confirm.
+	return b.buf[b.r : b.r+n], b.readErr()
+}
+
+// Discard skips the next n bytes either by advancing the internal buffer
+// or by reading that many bytes in and throwing them away. It returns the
+// number of bytes actually discarded, which is < n only on a read error.
+func (b *bufferedReader) Discard(n int) (discarded int, err error) {
+	if n < 0 {
+		return 0, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	remain := n
+	for {
+		skip := b.Buffered()
+		if skip == 0 {
+			// fill's return value is deliberately ignored: a read failure is
+			// recorded in b.err and handled at the bottom of the loop.
+			b.fill()
+			skip = b.Buffered()
+		}
+		if skip > remain {
+			skip = remain
+		}
+		b.r += skip
+		remain -= skip
+		if remain == 0 {
+			return n, nil
+		}
+		if b.err != nil {
+			return n - remain, b.readErr()
+		}
+	}
+}
+
+// Read implements io.Reader. Like bufio.Reader, it serves from the internal
+// buffer when possible, reads directly into p for large reads on an empty
+// buffer, and performs at most one read of the underlying reader per call.
+func (b *bufferedReader) Read(p []byte) (n int, err error) {
+	n = len(p)
+	if n == 0 {
+		if b.Buffered() > 0 {
+			return 0, nil
+		}
+		return 0, b.readErr()
+	}
+
+	if b.r == b.w {
+		if b.err != nil {
+			return 0, b.readErr()
+		}
+		if len(p) >= len(b.buf) {
+			// large read, empty buffer
+			// read directly into p to avoid extra copy
+			n, b.err = b.rd.Read(p)
+			if n < 0 {
+				return n, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+			}
+			return n, b.readErr()
+		}
+
+		// one read
+		// don't use b.fill
+		b.r, b.w = 0, 0
+		n, b.err = b.rd.Read(b.buf)
+		if n < 0 {
+			return n, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+		}
+		if n == 0 {
+			return 0, b.readErr()
+		}
+		b.w += n
+	}
+
+	// copy as much as we can
+	n = copy(p, b.buf[b.r:b.w])
+	b.r += n
+	return n, nil
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go
new file mode 100644
index 000000000..5fd257f52
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !s390x
+
+package utils
+
+// On every supported platform except s390x the native byte order is already
+// little-endian, so the ToLE* conversions are identity functions; the s390x
+// build (endians_s390x.go) substitutes byte-swapping implementations.
+var (
+	ToLEInt16   = func(x int16) int16 { return x }
+	ToLEUint16  = func(x uint16) uint16 { return x }
+	ToLEUint32  = func(x uint32) uint32 { return x }
+	ToLEUint64  = func(x uint64) uint64 { return x }
+	ToLEInt32   = func(x int32) int32 { return x }
+	ToLEInt64   = func(x int64) int64 { return x }
+	ToLEFloat32 = func(x float32) float32 { return x }
+	ToLEFloat64 = func(x float64) float64 { return x }
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go
new file mode 100644
index 000000000..7bb27cd81
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "math"
+ "math/bits"
+)
+
+// s390x is big-endian, so converting to little-endian requires reversing the
+// bytes of each value; floats round-trip through their bit patterns.
+var (
+	ToLEInt16   = func(x int16) int16 { return int16(bits.ReverseBytes16(uint16(x))) }
+	ToLEUint16  = bits.ReverseBytes16
+	ToLEUint32  = bits.ReverseBytes32
+	ToLEUint64  = bits.ReverseBytes64
+	ToLEInt32   = func(x int32) int32 { return int32(bits.ReverseBytes32(uint32(x))) }
+	ToLEInt64   = func(x int64) int64 { return int64(bits.ReverseBytes64(uint64(x))) }
+	ToLEFloat32 = func(x float32) float32 { return math.Float32frombits(bits.ReverseBytes32(math.Float32bits(x))) }
+	ToLEFloat64 = func(x float64) float64 { return math.Float64frombits(bits.ReverseBytes64(math.Float64bits(x))) }
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go
new file mode 100644
index 000000000..c8311750e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import "golang.org/x/exp/constraints"
+
+// Min returns the smaller of a and b.
+// NOTE(review): Go 1.21 added builtin min/max; these helpers could be retired
+// once the module's minimum Go version allows — confirm go.mod before changing.
+func Min[T constraints.Ordered](a, b T) T {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// Max returns the larger of a and b.
+func Max[T constraints.Ordered](a, b T) T {
+	if a > b {
+		return a
+	}
+	return b
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go
new file mode 100644
index 000000000..3d7b0024a
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "math"
+)
+
+// this file contains pure go implementations of the min_max functions that are
+// SIMD accelerated so that we can fallback to these if the cpu doesn't support
+// AVX2 or SSE4 instructions.
+
+// int8MinMax is the pure-Go fallback returning the min and max of values.
+// For an empty slice it returns (MaxInt8, MinInt8); same pattern below.
+func int8MinMax(values []int8) (min, max int8) {
+	min = math.MaxInt8
+	max = math.MinInt8
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// uint8MinMax is the pure-Go fallback for uint8 slices.
+func uint8MinMax(values []uint8) (min, max uint8) {
+	min = math.MaxUint8
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// int16MinMax is the pure-Go fallback for int16 slices.
+func int16MinMax(values []int16) (min, max int16) {
+	min = math.MaxInt16
+	max = math.MinInt16
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// uint16MinMax is the pure-Go fallback for uint16 slices.
+func uint16MinMax(values []uint16) (min, max uint16) {
+	min = math.MaxUint16
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// int32MinMax is the pure-Go fallback for int32 slices.
+func int32MinMax(values []int32) (min, max int32) {
+	min = math.MaxInt32
+	max = math.MinInt32
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// uint32MinMax is the pure-Go fallback for uint32 slices.
+func uint32MinMax(values []uint32) (min, max uint32) {
+	min = math.MaxUint32
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// int64MinMax is the pure-Go fallback for int64 slices.
+func int64MinMax(values []int64) (min, max int64) {
+	min = math.MaxInt64
+	max = math.MinInt64
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// uint64MinMax is the pure-Go fallback for uint64 slices.
+func uint64MinMax(values []uint64) (min, max uint64) {
+	min = math.MaxUint64
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// minmaxFuncs holds the per-type min/max implementations. The fields are
+// populated at init time by the architecture-specific files (SIMD kernels on
+// amd64/arm64 when available, otherwise the pure-Go fallbacks above).
+var minmaxFuncs = struct {
+	i8   func([]int8) (int8, int8)
+	ui8  func([]uint8) (uint8, uint8)
+	i16  func([]int16) (int16, int16)
+	ui16 func([]uint16) (uint16, uint16)
+	i32  func([]int32) (int32, int32)
+	ui32 func([]uint32) (uint32, uint32)
+	i64  func([]int64) (int64, int64)
+	ui64 func([]uint64) (uint64, uint64)
+}{}
+
+// GetMinMaxInt8 returns the min and max for a int8 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt8(v []int8) (min, max int8) {
+	return minmaxFuncs.i8(v)
+}
+
+// GetMinMaxUint8 returns the min and max for a uint8 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint8(v []uint8) (min, max uint8) {
+	return minmaxFuncs.ui8(v)
+}
+
+// GetMinMaxInt16 returns the min and max for a int16 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt16(v []int16) (min, max int16) {
+	return minmaxFuncs.i16(v)
+}
+
+// GetMinMaxUint16 returns the min and max for a uint16 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint16(v []uint16) (min, max uint16) {
+	return minmaxFuncs.ui16(v)
+}
+
+// GetMinMaxInt32 returns the min and max for a int32 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt32(v []int32) (min, max int32) {
+	return minmaxFuncs.i32(v)
+}
+
+// GetMinMaxUint32 returns the min and max for a uint32 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint32(v []uint32) (min, max uint32) {
+	return minmaxFuncs.ui32(v)
+}
+
+// GetMinMaxInt64 returns the min and max for a int64 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt64(v []int64) (min, max int64) {
+	return minmaxFuncs.i64(v)
+}
+
+// GetMinMaxUint64 returns the min and max for a uint64 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint64(v []uint64) (min, max uint64) {
+	return minmaxFuncs.ui64(v)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go
new file mode 100644
index 000000000..5fccddbee
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import "golang.org/x/sys/cpu"
+
+// init selects the min/max kernels for amd64 based on the CPU features
+// reported by golang.org/x/sys/cpu, preferring AVX2, then SSE4 (gated on the
+// SSE4.2 flag), then the pure-Go fallbacks.
+func init() {
+	// if the CPU supports AVX2 or SSE4 then let's use those to benefit from SIMD
+	// to accelerate the performance for finding the min and max for an integral slice.
+	// otherwise fallback to a pure go implementation if the cpu doesn't have these features.
+	if cpu.X86.HasAVX2 {
+		minmaxFuncs.i8 = int8MaxMinAVX2
+		minmaxFuncs.ui8 = uint8MaxMinAVX2
+		minmaxFuncs.i16 = int16MaxMinAVX2
+		minmaxFuncs.ui16 = uint16MaxMinAVX2
+		minmaxFuncs.i32 = int32MaxMinAVX2
+		minmaxFuncs.ui32 = uint32MaxMinAVX2
+		minmaxFuncs.i64 = int64MaxMinAVX2
+		minmaxFuncs.ui64 = uint64MaxMinAVX2
+	} else if cpu.X86.HasSSE42 {
+		minmaxFuncs.i8 = int8MaxMinSSE4
+		minmaxFuncs.ui8 = uint8MaxMinSSE4
+		minmaxFuncs.i16 = int16MaxMinSSE4
+		minmaxFuncs.ui16 = uint16MaxMinSSE4
+		minmaxFuncs.i32 = int32MaxMinSSE4
+		minmaxFuncs.ui32 = uint32MaxMinSSE4
+		minmaxFuncs.i64 = int64MaxMinSSE4
+		minmaxFuncs.ui64 = uint64MaxMinSSE4
+	} else {
+		minmaxFuncs.i8 = int8MinMax
+		minmaxFuncs.ui8 = uint8MinMax
+		minmaxFuncs.i16 = int16MinMax
+		minmaxFuncs.ui16 = uint16MinMax
+		minmaxFuncs.i32 = int32MinMax
+		minmaxFuncs.ui32 = uint32MinMax
+		minmaxFuncs.i64 = int64MinMax
+		minmaxFuncs.ui64 = uint64MinMax
+	}
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go
new file mode 100644
index 000000000..7404e95d9
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+ "os"
+ "strings"
+)
+import "golang.org/x/sys/cpu"
+
+func init() {
+ // Added ability to enable extension via environment:
+ // ARM_ENABLE_EXT=NEON go test
+ if ext, ok := os.LookupEnv("ARM_ENABLE_EXT"); ok {
+ exts := strings.Split(ext, ",")
+
+ for _, x := range exts {
+ switch x {
+ case "NEON":
+ cpu.ARM64.HasASIMD = true
+ case "AES":
+ cpu.ARM64.HasAES = true
+ case "PMULL":
+ cpu.ARM64.HasPMULL = true
+ default:
+ cpu.ARM64.HasASIMD = false
+ cpu.ARM64.HasAES = false
+ cpu.ARM64.HasPMULL = false
+ }
+ }
+ }
+ if cpu.ARM64.HasASIMD {
+ minmaxFuncs.i32 = int32MaxMinNEON
+ minmaxFuncs.ui32 = uint32MaxMinNEON
+ minmaxFuncs.i64 = int64MaxMinNEON
+ minmaxFuncs.ui64 = uint64MaxMinNEON
+ } else {
+ minmaxFuncs.i32 = int32MinMax
+ minmaxFuncs.ui32 = uint32MinMax
+ minmaxFuncs.i64 = int64MinMax
+ minmaxFuncs.ui64 = uint64MinMax
+ }
+
+ // haven't yet generated the NEON arm64 for these
+ minmaxFuncs.i8 = int8MinMax
+ minmaxFuncs.ui8 = uint8MinMax
+ minmaxFuncs.i16 = int16MinMax
+ minmaxFuncs.ui16 = uint16MinMax
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go
new file mode 100644
index 000000000..af6726243
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+ "unsafe"
+)
+
+// This file contains convenience functions for utilizing AVX2 intrinsics to quickly
+// and efficiently get the min and max from an integral slice.
+
+//go:noescape
+func _int8_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int8MaxMinAVX2(values []int8) (min, max int8) {
+ _int8_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint8_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint8MaxMinAVX2(values []uint8) (min, max uint8) {
+ _uint8_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int16_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int16MaxMinAVX2(values []int16) (min, max int16) {
+ _int16_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint16_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint16MaxMinAVX2(values []uint16) (min, max uint16) {
+ _uint16_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int32_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int32MaxMinAVX2(values []int32) (min, max int32) {
+ _int32_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint32_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint32MaxMinAVX2(values []uint32) (min, max uint32) {
+ _uint32_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int64_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int64MaxMinAVX2(values []int64) (min, max int64) {
+ _int64_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint64_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint64MaxMinAVX2(values []uint64) (min, max uint64) {
+ _uint64_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s
new file mode 100644
index 000000000..fe0c36e0e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s
@@ -0,0 +1,927 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x010(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x018(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x020(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x028(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x030(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x038(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x040(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x048(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x050(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x058(SB)/8, $0x8080808080808080
+GLOBL LCDATA1<>(SB), 8, $96
+
+TEXT ·_int8_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA1<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB0_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x3f // cmp esi, 63
+ JA LBB0_4
+ WORD $0xb041; BYTE $0x80 // mov r8b, -128
+ WORD $0xb640; BYTE $0x7f // mov sil, 127
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ JMP LBB0_11
+
+LBB0_1:
+ WORD $0xb640; BYTE $0x7f // mov sil, 127
+ WORD $0xb041; BYTE $0x80 // mov r8b, -128
+ JMP LBB0_12
+
+LBB0_4:
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0xc0e28341 // and r10d, -64
+ LONG $0xc0428d49 // lea rax, [r10 - 64]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x06e8c149 // shr r8, 6
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_5
+ WORD $0x894c; BYTE $0xc6 // mov rsi, r8
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */
+ LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
+
+LBB0_7:
+ LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
+ LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
+ LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64]
+ LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96]
+ LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4
+ LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5
+ LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4
+ LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5
+ LONG $0x387de2c4; BYTE $0xc6 // vpminsb ymm0, ymm0, ymm6
+ LONG $0x386de2c4; BYTE $0xd7 // vpminsb ymm2, ymm2, ymm7
+ LONG $0x3c75e2c4; BYTE $0xce // vpmaxsb ymm1, ymm1, ymm6
+ LONG $0x3c65e2c4; BYTE $0xdf // vpmaxsb ymm3, ymm3, ymm7
+ LONG $0x80e88348 // sub rax, -128
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB0_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB0_10
+
+LBB0_9:
+ LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
+ LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
+ LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5
+ LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4
+ LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5
+ LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4
+
+LBB0_10:
+ LONG $0x3c75e2c4; BYTE $0xcb // vpmaxsb ymm1, ymm1, ymm3
+ LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1
+ LONG $0x3c71e2c4; BYTE $0xcb // vpmaxsb xmm1, xmm1, xmm3
+ LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI0_2] */
+ LONG $0x387de2c4; BYTE $0xc2 // vpminsb ymm0, ymm0, ymm2
+ LONG $0xd171e9c5; BYTE $0x08 // vpsrlw xmm2, xmm1, 8
+ LONG $0xcadaf1c5 // vpminub xmm1, xmm1, xmm2
+ LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1
+ LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1
+ LONG $0x7ff08041 // xor r8b, 127
+ LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
+ LONG $0x3879e2c4; BYTE $0xc1 // vpminsb xmm0, xmm0, xmm1
+ LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI0_3] */
+ LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8
+ LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0xc67ef9c5 // vmovd esi, xmm0
+ LONG $0x80f68040 // xor sil, -128
+ WORD $0x394d; BYTE $0xca // cmp r10, r9
+ JE LBB0_12
+
+LBB0_11:
+ LONG $0x04b60f42; BYTE $0x17 // movzx eax, byte [rdi + r10]
+ WORD $0x3840; BYTE $0xc6 // cmp sil, al
+ LONG $0xf6b60f40 // movzx esi, sil
+ WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
+ WORD $0x3841; BYTE $0xc0 // cmp r8b, al
+ LONG $0xc0b60f45 // movzx r8d, r8b
+ LONG $0xc04c0f44 // cmovl r8d, eax
+ LONG $0x01c28349 // add r10, 1
+ WORD $0x394d; BYTE $0xd1 // cmp r9, r10
+ JNE LBB0_11
+
+LBB0_12:
+ WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b
+ WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
+ VZEROUPPER
+ RET
+
+LBB0_5:
+ LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */
+ LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB0_9
+ JMP LBB0_10
+
+TEXT ·_uint8_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB1_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x3f // cmp esi, 63
+ JA LBB1_4
+ WORD $0xb640; BYTE $0xff // mov sil, -1
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ WORD $0xc031 // xor eax, eax
+ JMP LBB1_11
+
+LBB1_1:
+ WORD $0xb640; BYTE $0xff // mov sil, -1
+ WORD $0xc031 // xor eax, eax
+ JMP LBB1_12
+
+LBB1_4:
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0xc0e28341 // and r10d, -64
+ LONG $0xc0428d49 // lea rax, [r10 - 64]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x06e8c149 // shr r8, 6
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB1_5
+ WORD $0x894c; BYTE $0xc6 // mov rsi, r8
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
+
+LBB1_7:
+ LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
+ LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
+ LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64]
+ LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96]
+ LONG $0xccdaf5c5 // vpminub ymm1, ymm1, ymm4
+ LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5
+ LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4
+ LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5
+ LONG $0xcedaf5c5 // vpminub ymm1, ymm1, ymm6
+ LONG $0xd7daedc5 // vpminub ymm2, ymm2, ymm7
+ LONG $0xc6defdc5 // vpmaxub ymm0, ymm0, ymm6
+ LONG $0xdfdee5c5 // vpmaxub ymm3, ymm3, ymm7
+ LONG $0x80e88348 // sub rax, -128
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB1_10
+
+LBB1_9:
+ LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax]
+ LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32]
+ LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5
+ LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4
+ LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5
+ LONG $0xccdaf5c5 // vpminub ymm1, ymm1, ymm4
+
+LBB1_10:
+ LONG $0xcadaf5c5 // vpminub ymm1, ymm1, ymm2
+ LONG $0xc3defdc5 // vpmaxub ymm0, ymm0, ymm3
+ LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1
+ LONG $0xc2def9c5 // vpmaxub xmm0, xmm0, xmm2
+ LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2
+ LONG $0xc2eff9c5 // vpxor xmm0, xmm0, xmm2
+ LONG $0xd071e9c5; BYTE $0x08 // vpsrlw xmm2, xmm0, 8
+ LONG $0xc2daf9c5 // vpminub xmm0, xmm0, xmm2
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0xc07ef9c5 // vmovd eax, xmm0
+ WORD $0xd0f6 // not al
+ LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1
+ LONG $0xc0daf1c5 // vpminub xmm0, xmm1, xmm0
+ LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8
+ LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0xc67ef9c5 // vmovd esi, xmm0
+ WORD $0x394d; BYTE $0xca // cmp r10, r9
+ JE LBB1_12
+
+LBB1_11:
+ LONG $0x04b60f46; BYTE $0x17 // movzx r8d, byte [rdi + r10]
+ WORD $0x3844; BYTE $0xc6 // cmp sil, r8b
+ LONG $0xf6b60f40 // movzx esi, sil
+ LONG $0xf0430f41 // cmovae esi, r8d
+ WORD $0x3844; BYTE $0xc0 // cmp al, r8b
+ WORD $0xb60f; BYTE $0xc0 // movzx eax, al
+ LONG $0xc0460f41 // cmovbe eax, r8d
+ LONG $0x01c28349 // add r10, 1
+ WORD $0x394d; BYTE $0xd1 // cmp r9, r10
+ JNE LBB1_11
+
+LBB1_12:
+ WORD $0x0188 // mov byte [rcx], al
+ WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
+ VZEROUPPER
+ RET
+
+LBB1_5:
+ LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB1_9
+ JMP LBB1_10
+
+DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x010(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x018(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x020(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x028(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x030(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x038(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x040(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x048(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x050(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x058(SB)/8, $0x8000800080008000
+GLOBL LCDATA2<>(SB), 8, $96
+
+TEXT ·_int16_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA2<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB2_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB2_4
+ LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
+ LONG $0x7fffbe66 // mov si, 32767
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ JMP LBB2_11
+
+LBB2_1:
+ LONG $0x7fffbe66 // mov si, 32767
+ LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
+ JMP LBB2_12
+
+LBB2_4:
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0xe0e28341 // and r10d, -32
+ LONG $0xe0428d49 // lea rax, [r10 - 32]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x05e8c149 // shr r8, 5
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB2_5
+ WORD $0x894c; BYTE $0xc6 // mov rsi, r8
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */
+ LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
+
+LBB2_7:
+ LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
+ LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
+ LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64]
+ LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96]
+ LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4
+ LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5
+ LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4
+ LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5
+ LONG $0xc6eafdc5 // vpminsw ymm0, ymm0, ymm6
+ LONG $0xd7eaedc5 // vpminsw ymm2, ymm2, ymm7
+ LONG $0xceeef5c5 // vpmaxsw ymm1, ymm1, ymm6
+ LONG $0xdfeee5c5 // vpmaxsw ymm3, ymm3, ymm7
+ LONG $0x40c08348 // add rax, 64
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB2_10
+
+LBB2_9:
+ LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
+ LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
+ LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5
+ LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4
+ LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5
+ LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4
+
+LBB2_10:
+ LONG $0xcbeef5c5 // vpmaxsw ymm1, ymm1, ymm3
+ LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1
+ LONG $0xcbeef1c5 // vpmaxsw xmm1, xmm1, xmm3
+ LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI2_2] */
+ LONG $0xc2eafdc5 // vpminsw ymm0, ymm0, ymm2
+ LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1
+ LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1
+ LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // xor r8d, 32767
+ LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
+ LONG $0xc1eaf9c5 // vpminsw xmm0, xmm0, xmm1
+ LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI2_3] */
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0xc67ef9c5 // vmovd esi, xmm0
+ LONG $0x8000f681; WORD $0x0000 // xor esi, 32768
+ WORD $0x394d; BYTE $0xca // cmp r10, r9
+ JE LBB2_12
+
+LBB2_11:
+ LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10]
+ WORD $0x3966; BYTE $0xc6 // cmp si, ax
+ WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
+ LONG $0xc0394166 // cmp r8w, ax
+ LONG $0xc04c0f44 // cmovl r8d, eax
+ LONG $0x01c28349 // add r10, 1
+ WORD $0x394d; BYTE $0xd1 // cmp r9, r10
+ JNE LBB2_11
+
+LBB2_12:
+ LONG $0x01894466 // mov word [rcx], r8w
+ WORD $0x8966; BYTE $0x32 // mov word [rdx], si
+ VZEROUPPER
+ RET
+
+LBB2_5:
+ LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */
+ LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB2_9
+ JMP LBB2_10
+
+TEXT ·_uint16_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB3_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB3_4
+ LONG $0xffb84166; BYTE $0xff // mov r8w, -1
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ WORD $0xf631 // xor esi, esi
+ JMP LBB3_11
+
+LBB3_1:
+ LONG $0xffb84166; BYTE $0xff // mov r8w, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB3_12
+
+LBB3_4:
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0xe0e28341 // and r10d, -32
+ LONG $0xe0428d49 // lea rax, [r10 - 32]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x05e8c149 // shr r8, 5
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB3_5
+ WORD $0x894c; BYTE $0xc6 // mov rsi, r8
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
+
+LBB3_7:
+ LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
+ LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
+ LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64]
+ LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96]
+ LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4
+ LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5
+ LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4
+ LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5
+ LONG $0x3a75e2c4; BYTE $0xce // vpminuw ymm1, ymm1, ymm6
+ LONG $0x3a6de2c4; BYTE $0xd7 // vpminuw ymm2, ymm2, ymm7
+ LONG $0x3e7de2c4; BYTE $0xc6 // vpmaxuw ymm0, ymm0, ymm6
+ LONG $0x3e65e2c4; BYTE $0xdf // vpmaxuw ymm3, ymm3, ymm7
+ LONG $0x40c08348 // add rax, 64
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB3_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB3_10
+
+LBB3_9:
+ LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax]
+ LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32]
+ LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5
+ LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4
+ LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5
+ LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4
+
+LBB3_10:
+ LONG $0x3a75e2c4; BYTE $0xca // vpminuw ymm1, ymm1, ymm2
+ LONG $0x3e7de2c4; BYTE $0xc3 // vpmaxuw ymm0, ymm0, ymm3
+ LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1
+ LONG $0x3e79e2c4; BYTE $0xc2 // vpmaxuw xmm0, xmm0, xmm2
+ LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2
+ LONG $0xc2eff9c5 // vpxor xmm0, xmm0, xmm2
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0xc67ef9c5 // vmovd esi, xmm0
+ WORD $0xd6f7 // not esi
+ LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1
+ LONG $0x3a71e2c4; BYTE $0xc0 // vpminuw xmm0, xmm1, xmm0
+ LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0
+ LONG $0x7e79c1c4; BYTE $0xc0 // vmovd r8d, xmm0
+ WORD $0x394d; BYTE $0xca // cmp r10, r9
+ JE LBB3_12
+
+LBB3_11:
+ LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10]
+ LONG $0xc0394166 // cmp r8w, ax
+ LONG $0xc0430f44 // cmovae r8d, eax
+ WORD $0x3966; BYTE $0xc6 // cmp si, ax
+ WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax
+ LONG $0x01c28349 // add r10, 1
+ WORD $0x394d; BYTE $0xd1 // cmp r9, r10
+ JNE LBB3_11
+
+LBB3_12:
+ WORD $0x8966; BYTE $0x31 // mov word [rcx], si
+ LONG $0x02894466 // mov word [rdx], r8w
+ VZEROUPPER
+ RET
+
+LBB3_5:
+ LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB3_9
+ JMP LBB3_10
+
+DATA LCDATA3<>+0x000(SB)/8, $0x7fffffff80000000
+GLOBL LCDATA3<>(SB), 8, $8
+
+TEXT ·_int32_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA3<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB4_1
+ WORD $0x8941; BYTE $0xf0 // mov r8d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB4_4
+ LONG $0x0000ba41; WORD $0x8000 // mov r10d, -2147483648
+ LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647
+ WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
+ JMP LBB4_7
+
+LBB4_1:
+ LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647
+ LONG $0x000000be; BYTE $0x80 // mov esi, -2147483648
+ JMP LBB4_8
+
+LBB4_4:
+ WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
+ LONG $0x587de2c4; WORD $0x0065 // vpbroadcastd ymm4, dword 0[rbp] /* [rip + .LCPI4_0] */
+ LONG $0xe0e18341 // and r9d, -32
+ LONG $0x587de2c4; WORD $0x0445 // vpbroadcastd ymm0, dword 4[rbp] /* [rip + .LCPI4_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0
+ LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4
+ LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4
+ LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4
+
+LBB4_5:
+ LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax]
+ LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32]
+ LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64]
+ LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96]
+ LONG $0x397dc2c4; BYTE $0xc0 // vpminsd ymm0, ymm0, ymm8
+ LONG $0x3975c2c4; BYTE $0xc9 // vpminsd ymm1, ymm1, ymm9
+ LONG $0x396dc2c4; BYTE $0xd2 // vpminsd ymm2, ymm2, ymm10
+ LONG $0x3965c2c4; BYTE $0xdb // vpminsd ymm3, ymm3, ymm11
+ LONG $0x3d5dc2c4; BYTE $0xe0 // vpmaxsd ymm4, ymm4, ymm8
+ LONG $0x3d55c2c4; BYTE $0xe9 // vpmaxsd ymm5, ymm5, ymm9
+ LONG $0x3d4dc2c4; BYTE $0xf2 // vpmaxsd ymm6, ymm6, ymm10
+ LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11
+ LONG $0x20c08348 // add rax, 32
+ WORD $0x3949; BYTE $0xc1 // cmp r9, rax
+ JNE LBB4_5
+ LONG $0x3d5de2c4; BYTE $0xe5 // vpmaxsd ymm4, ymm4, ymm5
+ LONG $0x3d5de2c4; BYTE $0xe6 // vpmaxsd ymm4, ymm4, ymm6
+ LONG $0x3d5de2c4; BYTE $0xe7 // vpmaxsd ymm4, ymm4, ymm7
+ LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1
+ LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
+ LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78
+ LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
+ LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229
+ LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5
+ LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4
+ LONG $0x397de2c4; BYTE $0xc1 // vpminsd ymm0, ymm0, ymm1
+ LONG $0x397de2c4; BYTE $0xc2 // vpminsd ymm0, ymm0, ymm2
+ LONG $0x397de2c4; BYTE $0xc3 // vpminsd ymm0, ymm0, ymm3
+ LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
+ LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
+ LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78
+ LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
+ LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229
+ LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1
+ LONG $0xc07ef9c5 // vmovd eax, xmm0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0x394d; BYTE $0xc1 // cmp r9, r8
+ JE LBB4_8
+
+LBB4_7:
+ LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9]
+ WORD $0xf039 // cmp eax, esi
+ WORD $0x4f0f; BYTE $0xc6 // cmovg eax, esi
+ WORD $0x3941; BYTE $0xf2 // cmp r10d, esi
+ LONG $0xf24d0f41 // cmovge esi, r10d
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8941; BYTE $0xf2 // mov r10d, esi
+ WORD $0x394d; BYTE $0xc8 // cmp r8, r9
+ JNE LBB4_7
+
+LBB4_8:
+ WORD $0x3189 // mov dword [rcx], esi
+ WORD $0x0289 // mov dword [rdx], eax
+ VZEROUPPER
+ RET
+
+TEXT ·_uint32_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB5_1
+ WORD $0x8941; BYTE $0xf0 // mov r8d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB5_4
+ WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
+ LONG $0xffffffb8; BYTE $0xff // mov eax, -1
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ JMP LBB5_7
+
+LBB5_1:
+ LONG $0xffffffb8; BYTE $0xff // mov eax, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB5_8
+
+LBB5_4:
+ WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
+ LONG $0xe0e18341 // and r9d, -32
+ LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4
+ LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3
+ LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5
+ LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6
+ LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7
+
+LBB5_5:
+ LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax]
+ LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32]
+ LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64]
+ LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96]
+ LONG $0x3b7dc2c4; BYTE $0xc0 // vpminud ymm0, ymm0, ymm8
+ LONG $0x3b75c2c4; BYTE $0xc9 // vpminud ymm1, ymm1, ymm9
+ LONG $0x3b6dc2c4; BYTE $0xd2 // vpminud ymm2, ymm2, ymm10
+ LONG $0x3b65c2c4; BYTE $0xdb // vpminud ymm3, ymm3, ymm11
+ LONG $0x3f5dc2c4; BYTE $0xe0 // vpmaxud ymm4, ymm4, ymm8
+ LONG $0x3f55c2c4; BYTE $0xe9 // vpmaxud ymm5, ymm5, ymm9
+ LONG $0x3f4dc2c4; BYTE $0xf2 // vpmaxud ymm6, ymm6, ymm10
+ LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11
+ LONG $0x20c08348 // add rax, 32
+ WORD $0x3949; BYTE $0xc1 // cmp r9, rax
+ JNE LBB5_5
+ LONG $0x3f5de2c4; BYTE $0xe5 // vpmaxud ymm4, ymm4, ymm5
+ LONG $0x3f5de2c4; BYTE $0xe6 // vpmaxud ymm4, ymm4, ymm6
+ LONG $0x3f5de2c4; BYTE $0xe7 // vpmaxud ymm4, ymm4, ymm7
+ LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1
+ LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
+ LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78
+ LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
+ LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229
+ LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5
+ LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4
+ LONG $0x3b7de2c4; BYTE $0xc1 // vpminud ymm0, ymm0, ymm1
+ LONG $0x3b7de2c4; BYTE $0xc2 // vpminud ymm0, ymm0, ymm2
+ LONG $0x3b7de2c4; BYTE $0xc3 // vpminud ymm0, ymm0, ymm3
+ LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
+ LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
+ LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78
+ LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
+ LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229
+ LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1
+ LONG $0xc07ef9c5 // vmovd eax, xmm0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0x394d; BYTE $0xc1 // cmp r9, r8
+ JE LBB5_8
+
+LBB5_7:
+ LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9]
+ WORD $0xf039 // cmp eax, esi
+ WORD $0x430f; BYTE $0xc6 // cmovae eax, esi
+ WORD $0x3941; BYTE $0xf2 // cmp r10d, esi
+ LONG $0xf2470f41 // cmova esi, r10d
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8941; BYTE $0xf2 // mov r10d, esi
+ WORD $0x394d; BYTE $0xc8 // cmp r8, r9
+ JNE LBB5_7
+
+LBB5_8:
+ WORD $0x3189 // mov dword [rcx], esi
+ WORD $0x0289 // mov dword [rdx], eax
+ VZEROUPPER
+ RET
+
+DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000
+DATA LCDATA4<>+0x008(SB)/8, $0x7fffffffffffffff
+GLOBL LCDATA4<>(SB), 8, $16
+
+TEXT ·_int64_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA4<>(SB), BP
+
+ QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807
+ WORD $0xf685 // test esi, esi
+ JLE LBB6_1
+ WORD $0x8941; BYTE $0xf0 // mov r8d, esi
+ WORD $0xfe83; BYTE $0x0f // cmp esi, 15
+ JA LBB6_4
+ LONG $0x01508d4c // lea r10, [rax + 1]
+ WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
+ JMP LBB6_7
+
+LBB6_1:
+ LONG $0x01708d48 // lea rsi, [rax + 1]
+ JMP LBB6_8
+
+LBB6_4:
+ WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
+ LONG $0x597de2c4; WORD $0x0065 // vpbroadcastq ymm4, qword 0[rbp] /* [rip + .LCPI6_0] */
+ LONG $0xf0e18341 // and r9d, -16
+ LONG $0x597de2c4; WORD $0x0845 // vpbroadcastq ymm0, qword 8[rbp] /* [rip + .LCPI6_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0
+ LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0
+ LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0
+ LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4
+ LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4
+ LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4
+
+LBB6_5:
+ LONG $0x046f7ec5; BYTE $0xc7 // vmovdqu ymm8, yword [rdi + 8*rax]
+ LONG $0x373d62c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm8, ymm0
+ LONG $0x4b3de3c4; WORD $0x90c0 // vblendvpd ymm0, ymm8, ymm0, ymm9
+ LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32]
+ LONG $0x373562c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm9, ymm3
+ LONG $0x4b35e3c4; WORD $0xa0db // vblendvpd ymm3, ymm9, ymm3, ymm10
+ LONG $0x546f7ec5; WORD $0x40c7 // vmovdqu ymm10, yword [rdi + 8*rax + 64]
+ LONG $0x372d62c4; BYTE $0xda // vpcmpgtq ymm11, ymm10, ymm2
+ LONG $0x4b2de3c4; WORD $0xb0d2 // vblendvpd ymm2, ymm10, ymm2, ymm11
+ LONG $0x5c6f7ec5; WORD $0x60c7 // vmovdqu ymm11, yword [rdi + 8*rax + 96]
+ LONG $0x372562c4; BYTE $0xe1 // vpcmpgtq ymm12, ymm11, ymm1
+ LONG $0x4b25e3c4; WORD $0xc0c9 // vblendvpd ymm1, ymm11, ymm1, ymm12
+ LONG $0x375d42c4; BYTE $0xe0 // vpcmpgtq ymm12, ymm4, ymm8
+ LONG $0x4b3de3c4; WORD $0xc0e4 // vblendvpd ymm4, ymm8, ymm4, ymm12
+ LONG $0x374542c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm7, ymm9
+ LONG $0x4b35e3c4; WORD $0x80ff // vblendvpd ymm7, ymm9, ymm7, ymm8
+ LONG $0x374d42c4; BYTE $0xc2 // vpcmpgtq ymm8, ymm6, ymm10
+ LONG $0x4b2de3c4; WORD $0x80f6 // vblendvpd ymm6, ymm10, ymm6, ymm8
+ LONG $0x375542c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm5, ymm11
+ LONG $0x4b25e3c4; WORD $0x80ed // vblendvpd ymm5, ymm11, ymm5, ymm8
+ LONG $0x10c08348 // add rax, 16
+ WORD $0x3949; BYTE $0xc1 // cmp r9, rax
+ JNE LBB6_5
+ LONG $0x375d62c4; BYTE $0xc7 // vpcmpgtq ymm8, ymm4, ymm7
+ LONG $0x4b45e3c4; WORD $0x80e4 // vblendvpd ymm4, ymm7, ymm4, ymm8
+ LONG $0x375de2c4; BYTE $0xfe // vpcmpgtq ymm7, ymm4, ymm6
+ LONG $0x4b4de3c4; WORD $0x70e4 // vblendvpd ymm4, ymm6, ymm4, ymm7
+ LONG $0x375de2c4; BYTE $0xf5 // vpcmpgtq ymm6, ymm4, ymm5
+ LONG $0x4b55e3c4; WORD $0x60e4 // vblendvpd ymm4, ymm5, ymm4, ymm6
+ LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1
+ LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5
+ LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6
+ LONG $0x0479e3c4; WORD $0x4eec // vpermilps xmm5, xmm4, 78
+ LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5
+ LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6
+ LONG $0x7ef9c1c4; BYTE $0xe2 // vmovq r10, xmm4
+ LONG $0x3765e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm3, ymm0
+ LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4
+ LONG $0x376de2c4; BYTE $0xd8 // vpcmpgtq ymm3, ymm2, ymm0
+ LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3
+ LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0
+ LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2
+ LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
+ LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0
+ LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2
+ LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78
+ LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0
+ LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2
+ LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0
+ WORD $0x894c; BYTE $0xd6 // mov rsi, r10
+ WORD $0x394d; BYTE $0xc1 // cmp r9, r8
+ JE LBB6_8
+
+LBB6_7:
+ LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9]
+ WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
+ LONG $0xc64f0f48 // cmovg rax, rsi
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ LONG $0xf24d0f49 // cmovge rsi, r10
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8949; BYTE $0xf2 // mov r10, rsi
+ WORD $0x394d; BYTE $0xc8 // cmp r8, r9
+ JNE LBB6_7
+
+LBB6_8:
+ WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi
+ WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax
+ VZEROUPPER
+ RET
+
+DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000
+GLOBL LCDATA5<>(SB), 8, $8
+
+TEXT ·_uint64_max_min_avx2(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA5<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB7_1
+ WORD $0x8941; BYTE $0xf0 // mov r8d, esi
+ WORD $0xfe83; BYTE $0x0f // cmp esi, 15
+ JA LBB7_4
+ LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1
+ WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ JMP LBB7_7
+
+LBB7_1:
+ LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB7_8
+
+LBB7_4:
+ WORD $0x8945; BYTE $0xc1 // mov r9d, r8d
+ LONG $0xf0e18341 // and r9d, -16
+ LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5
+ LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1
+ WORD $0xc031 // xor eax, eax
+ LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI7_0] */
+ LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4
+ LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3
+ LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2
+ LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8
+ LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7
+ LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6
+
+LBB7_5:
+ LONG $0x0c6f7ec5; BYTE $0xc7 // vmovdqu ymm9, yword [rdi + 8*rax]
+ LONG $0xd0ef75c5 // vpxor ymm10, ymm1, ymm0
+ LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0
+ LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10
+ LONG $0x4b35e3c4; WORD $0xa0c9 // vblendvpd ymm1, ymm9, ymm1, ymm10
+ LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0
+ LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11
+ LONG $0x4b35e3c4; WORD $0xa0ed // vblendvpd ymm5, ymm9, ymm5, ymm10
+ LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32]
+ LONG $0xd0ef5dc5 // vpxor ymm10, ymm4, ymm0
+ LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0
+ LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10
+ LONG $0x4b35e3c4; WORD $0xa0e4 // vblendvpd ymm4, ymm9, ymm4, ymm10
+ LONG $0xd0ef3dc5 // vpxor ymm10, ymm8, ymm0
+ LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11
+ LONG $0x5c6f7ec5; WORD $0x40c7 // vmovdqu ymm11, yword [rdi + 8*rax + 64]
+ LONG $0x4b3543c4; WORD $0xa0c0 // vblendvpd ymm8, ymm9, ymm8, ymm10
+ LONG $0xc8ef65c5 // vpxor ymm9, ymm3, ymm0
+ LONG $0xd0ef25c5 // vpxor ymm10, ymm11, ymm0
+ LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9
+ LONG $0x4b25e3c4; WORD $0x90db // vblendvpd ymm3, ymm11, ymm3, ymm9
+ LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0
+ LONG $0x373542c4; BYTE $0xca // vpcmpgtq ymm9, ymm9, ymm10
+ LONG $0x4b25e3c4; WORD $0x90ff // vblendvpd ymm7, ymm11, ymm7, ymm9
+ LONG $0x4c6f7ec5; WORD $0x60c7 // vmovdqu ymm9, yword [rdi + 8*rax + 96]
+ LONG $0xd0ef6dc5 // vpxor ymm10, ymm2, ymm0
+ LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0
+ LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10
+ LONG $0x4b35e3c4; WORD $0xa0d2 // vblendvpd ymm2, ymm9, ymm2, ymm10
+ LONG $0xd0ef4dc5 // vpxor ymm10, ymm6, ymm0
+ LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11
+ LONG $0x4b35e3c4; WORD $0xa0f6 // vblendvpd ymm6, ymm9, ymm6, ymm10
+ LONG $0x10c08348 // add rax, 16
+ WORD $0x3949; BYTE $0xc1 // cmp r9, rax
+ JNE LBB7_5
+ LONG $0xc8ef3dc5 // vpxor ymm9, ymm8, ymm0
+ LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0
+ LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9
+ LONG $0x4b3de3c4; WORD $0x90ed // vblendvpd ymm5, ymm8, ymm5, ymm9
+ LONG $0xc05755c5 // vxorpd ymm8, ymm5, ymm0
+ LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0
+ LONG $0x373d42c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm8, ymm9
+ LONG $0x4b45e3c4; WORD $0x80ed // vblendvpd ymm5, ymm7, ymm5, ymm8
+ LONG $0xf857d5c5 // vxorpd ymm7, ymm5, ymm0
+ LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0
+ LONG $0x3745c2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm7, ymm8
+ LONG $0x4b4de3c4; WORD $0x70ed // vblendvpd ymm5, ymm6, ymm5, ymm7
+ LONG $0x197de3c4; WORD $0x01ee // vextractf128 xmm6, ymm5, 1
+ LONG $0xc05749c5 // vxorpd xmm8, xmm6, xmm0
+ LONG $0xf857d1c5 // vxorpd xmm7, xmm5, xmm0
+ LONG $0x3741c2c4; BYTE $0xf8 // vpcmpgtq xmm7, xmm7, xmm8
+ LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7
+ LONG $0x0479e3c4; WORD $0x4ef5 // vpermilps xmm6, xmm5, 78
+ LONG $0xc05751c5 // vxorpd xmm8, xmm5, xmm0
+ LONG $0xf857c9c5 // vxorpd xmm7, xmm6, xmm0
+ LONG $0x3739e2c4; BYTE $0xff // vpcmpgtq xmm7, xmm8, xmm7
+ LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7
+ LONG $0xf0eff5c5 // vpxor ymm6, ymm1, ymm0
+ LONG $0xf8efddc5 // vpxor ymm7, ymm4, ymm0
+ LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6
+ LONG $0x4b5de3c4; WORD $0x60c9 // vblendvpd ymm1, ymm4, ymm1, ymm6
+ LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0
+ LONG $0xf0efe5c5 // vpxor ymm6, ymm3, ymm0
+ LONG $0x374de2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm6, ymm4
+ LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4
+ LONG $0x7ef9c1c4; BYTE $0xea // vmovq r10, xmm5
+ LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0
+ LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0
+ LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3
+ LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3
+ LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1
+ LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0
+ LONG $0xe057e9c5 // vxorpd xmm4, xmm2, xmm0
+ LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3
+ LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3
+ LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78
+ LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0
+ LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0
+ LONG $0x3779e2c4; BYTE $0xc3 // vpcmpgtq xmm0, xmm0, xmm3
+ LONG $0x4b69e3c4; WORD $0x00c1 // vblendvpd xmm0, xmm2, xmm1, xmm0
+ LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0
+ WORD $0x894c; BYTE $0xd6 // mov rsi, r10
+ WORD $0x394d; BYTE $0xc1 // cmp r9, r8
+ JE LBB7_8
+
+LBB7_7:
+ LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9]
+ WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
+ LONG $0xc6430f48 // cmovae rax, rsi
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ LONG $0xf2470f49 // cmova rsi, r10
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8949; BYTE $0xf2 // mov r10, rsi
+ WORD $0x394d; BYTE $0xc8 // cmp r8, r9
+ JNE LBB7_7
+
+LBB7_8:
+ WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi
+ WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go
new file mode 100644
index 000000000..f9d3c44e3
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import "unsafe"
+
+// This file contains convenience functions for utilizing Arm64 Neon intrinsics to quickly
+// and efficiently get the min and max from an integral slice.
+
+//go:noescape
+func _int32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int32MaxMinNEON(values []int32) (min, max int32) {
+ _int32_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint32MaxMinNEON(values []uint32) (min, max uint32) {
+ _uint32_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int64MaxMinNEON(values []int64) (min, max int64) {
+ _int64_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint64MaxMinNEON(values []uint64) (min, max uint64) {
+ _uint64_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s
new file mode 100644
index 000000000..b679bb6e3
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s
@@ -0,0 +1,324 @@
+//+build !noasm !appengine
+
+// ARROW-15336
+// (C2GOASM doesn't work correctly for Arm64)
+// Partly GENERATED BY asm2plan9s.
+
+
+// func _int32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+TEXT ·_int32_max_min_neon(SB), $0-32
+
+ MOVD values+0(FP), R0
+ MOVD length+8(FP), R1
+ MOVD minout+16(FP), R2
+ MOVD maxout+24(FP), R3
+
+ WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
+ WORD $0x7100043f // cmp w1, #1
+ WORD $0x910003fd // mov x29, sp
+ BLT LBB0_3
+
+ WORD $0x71000c3f // cmp w1, #3
+ WORD $0x2a0103e8 // mov w8, w1
+ BHI LBB0_4
+
+ WORD $0xaa1f03e9 // mov x9, xzr
+ WORD $0x52b0000b // mov w11, #-2147483648
+ WORD $0x12b0000a // mov w10, #2147483647
+ JMP LBB0_7
+LBB0_3:
+ WORD $0x12b0000a // mov w10, #2147483647
+ WORD $0x52b0000b // mov w11, #-2147483648
+ WORD $0xb900006b // str w11, [x3]
+ WORD $0xb900004a // str w10, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+LBB0_4:
+ WORD $0x927e7509 // and x9, x8, #0xfffffffc
+ WORD $0x9100200a // add x10, x0, #8
+ WORD $0x0f046402 // movi v2.2s, #128, lsl #24
+ WORD $0x2f046400 // mvni v0.2s, #128, lsl #24
+ WORD $0x2f046401 // mvni v1.2s, #128, lsl #24
+ WORD $0xaa0903eb // mov x11, x9
+ WORD $0x0f046403 // movi v3.2s, #128, lsl #24
+LBB0_5:
+ WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8]
+ WORD $0xf100116b // subs x11, x11, #4
+ WORD $0x9100414a // add x10, x10, #16
+ WORD $0x0ea46c00 // smin v0.2s, v0.2s, v4.2s
+ WORD $0x0ea56c21 // smin v1.2s, v1.2s, v5.2s
+ WORD $0x0ea46442 // smax v2.2s, v2.2s, v4.2s
+ WORD $0x0ea56463 // smax v3.2s, v3.2s, v5.2s
+ BNE LBB0_5
+
+ WORD $0x0ea36442 // smax v2.2s, v2.2s, v3.2s
+ WORD $0x0ea16c00 // smin v0.2s, v0.2s, v1.2s
+ WORD $0x0e0c0441 // dup v1.2s, v2.s[1]
+ WORD $0x0e0c0403 // dup v3.2s, v0.s[1]
+ WORD $0x0ea16441 // smax v1.2s, v2.2s, v1.2s
+ WORD $0x0ea36c00 // smin v0.2s, v0.2s, v3.2s
+ WORD $0xeb08013f // cmp x9, x8
+ WORD $0x1e26002b // fmov w11, s1
+ WORD $0x1e26000a // fmov w10, s0
+ BEQ LBB0_9
+LBB0_7:
+ WORD $0x8b09080c // add x12, x0, x9, lsl #2
+ WORD $0xcb090108 // sub x8, x8, x9
+LBB0_8:
+ WORD $0xb8404589 // ldr w9, [x12], #4
+ WORD $0x6b09015f // cmp w10, w9
+ WORD $0x1a89b14a // csel w10, w10, w9, lt
+ WORD $0x6b09017f // cmp w11, w9
+ WORD $0x1a89c16b // csel w11, w11, w9, gt
+ WORD $0xf1000508 // subs x8, x8, #1
+ BNE LBB0_8
+LBB0_9:
+ WORD $0xb900006b // str w11, [x3]
+ WORD $0xb900004a // str w10, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+
+// func _uint32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+TEXT ·_uint32_max_min_neon(SB), $0-32
+
+ MOVD values+0(FP), R0
+ MOVD length+8(FP), R1
+ MOVD minout+16(FP), R2
+ MOVD maxout+24(FP), R3
+
+ WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
+ WORD $0x7100043f // cmp w1, #1
+ WORD $0x910003fd // mov x29, sp
+ BLT LBB1_3
+
+ WORD $0x71000c3f // cmp w1, #3
+ WORD $0x2a0103e8 // mov w8, w1
+ BHI LBB1_4
+
+ WORD $0xaa1f03e9 // mov x9, xzr
+ WORD $0x2a1f03ea // mov w10, wzr
+ WORD $0x1280000b // mov w11, #-1
+ JMP LBB1_7
+LBB1_3:
+ WORD $0x2a1f03ea // mov w10, wzr
+ WORD $0x1280000b // mov w11, #-1
+ WORD $0xb900006a // str w10, [x3]
+ WORD $0xb900004b // str w11, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+LBB1_4:
+ WORD $0x927e7509 // and x9, x8, #0xfffffffc
+ WORD $0x6f00e401 // movi v1.2d, #0000000000000000
+ WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff
+ WORD $0x9100200a // add x10, x0, #8
+ WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff
+ WORD $0xaa0903eb // mov x11, x9
+ WORD $0x6f00e403 // movi v3.2d, #0000000000000000
+LBB1_5:
+ WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8]
+ WORD $0xf100116b // subs x11, x11, #4
+ WORD $0x9100414a // add x10, x10, #16
+ WORD $0x2ea46c00 // umin v0.2s, v0.2s, v4.2s
+ WORD $0x2ea56c42 // umin v2.2s, v2.2s, v5.2s
+ WORD $0x2ea46421 // umax v1.2s, v1.2s, v4.2s
+ WORD $0x2ea56463 // umax v3.2s, v3.2s, v5.2s
+ BNE LBB1_5
+
+ WORD $0x2ea36421 // umax v1.2s, v1.2s, v3.2s
+ WORD $0x2ea26c00 // umin v0.2s, v0.2s, v2.2s
+ WORD $0x0e0c0422 // dup v2.2s, v1.s[1]
+ WORD $0x0e0c0403 // dup v3.2s, v0.s[1]
+ WORD $0x2ea26421 // umax v1.2s, v1.2s, v2.2s
+ WORD $0x2ea36c00 // umin v0.2s, v0.2s, v3.2s
+ WORD $0xeb08013f // cmp x9, x8
+ WORD $0x1e26002a // fmov w10, s1
+ WORD $0x1e26000b // fmov w11, s0
+ BEQ LBB1_9
+LBB1_7:
+ WORD $0x8b09080c // add x12, x0, x9, lsl #2
+ WORD $0xcb090108 // sub x8, x8, x9
+LBB1_8:
+ WORD $0xb8404589 // ldr w9, [x12], #4
+ WORD $0x6b09017f // cmp w11, w9
+ WORD $0x1a89316b // csel w11, w11, w9, lo
+ WORD $0x6b09015f // cmp w10, w9
+ WORD $0x1a89814a // csel w10, w10, w9, hi
+ WORD $0xf1000508 // subs x8, x8, #1
+ BNE LBB1_8
+LBB1_9:
+ WORD $0xb900006a // str w10, [x3]
+ WORD $0xb900004b // str w11, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+
+// func _int64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+TEXT ·_int64_max_min_neon(SB), $0-32
+
+ MOVD values+0(FP), R0
+ MOVD length+8(FP), R1
+ MOVD minout+16(FP), R2
+ MOVD maxout+24(FP), R3
+
+ WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
+ WORD $0x7100043f // cmp w1, #1
+ WORD $0x910003fd // mov x29, sp
+ BLT LBB2_3
+
+ WORD $0x2a0103e8 // mov w8, w1
+ WORD $0xd2f0000b // mov x11, #-9223372036854775808
+ WORD $0x71000c3f // cmp w1, #3
+ WORD $0x92f0000a // mov x10, #9223372036854775807
+ BHI LBB2_4
+
+ WORD $0xaa1f03e9 // mov x9, xzr
+ JMP LBB2_7
+LBB2_3:
+ WORD $0x92f0000a // mov x10, #9223372036854775807
+ WORD $0xd2f0000b // mov x11, #-9223372036854775808
+ WORD $0xf900006b // str x11, [x3]
+ WORD $0xf900004a // str x10, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+LBB2_4:
+ WORD $0x927e7509 // and x9, x8, #0xfffffffc
+ WORD $0x4e080d61 // dup v1.2d, x11
+ WORD $0x4e080d40 // dup v0.2d, x10
+ WORD $0x9100400a // add x10, x0, #16
+ WORD $0xaa0903eb // mov x11, x9
+ WORD $0x4ea01c02 // mov v2.16b, v0.16b
+ WORD $0x4ea11c23 // mov v3.16b, v1.16b
+LBB2_5:
+ WORD $0xad7f9544 // ldp q4, q5, [x10, #-16]
+ WORD $0x4ea31c66 // mov v6.16b, v3.16b
+ WORD $0x4ea11c27 // mov v7.16b, v1.16b
+ WORD $0x4ea21c43 // mov v3.16b, v2.16b
+ WORD $0x4ea01c01 // mov v1.16b, v0.16b
+ WORD $0x4ee03480 // cmgt v0.2d, v4.2d, v0.2d
+ WORD $0x4ee234a2 // cmgt v2.2d, v5.2d, v2.2d
+ WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b
+ WORD $0x4ee434e1 // cmgt v1.2d, v7.2d, v4.2d
+ WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b
+ WORD $0x4ee534c3 // cmgt v3.2d, v6.2d, v5.2d
+ WORD $0xf100116b // subs x11, x11, #4
+ WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b
+ WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b
+ WORD $0x9100814a // add x10, x10, #32
+ BNE LBB2_5
+
+ WORD $0x4ee33424 // cmgt v4.2d, v1.2d, v3.2d
+ WORD $0x4ee03445 // cmgt v5.2d, v2.2d, v0.2d
+ WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b
+ WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b
+ WORD $0x4e180480 // dup v0.2d, v4.d[1]
+ WORD $0x4e1804a1 // dup v1.2d, v5.d[1]
+ WORD $0x4ee03482 // cmgt v2.2d, v4.2d, v0.2d
+ WORD $0x4ee53423 // cmgt v3.2d, v1.2d, v5.2d
+ WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b
+ WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b
+ WORD $0xeb08013f // cmp x9, x8
+ WORD $0x9e66004b // fmov x11, d2
+ WORD $0x9e66006a // fmov x10, d3
+ BEQ LBB2_9
+LBB2_7:
+ WORD $0x8b090c0c // add x12, x0, x9, lsl #3
+ WORD $0xcb090108 // sub x8, x8, x9
+LBB2_8:
+ WORD $0xf8408589 // ldr x9, [x12], #8
+ WORD $0xeb09015f // cmp x10, x9
+ WORD $0x9a89b14a // csel x10, x10, x9, lt
+ WORD $0xeb09017f // cmp x11, x9
+ WORD $0x9a89c16b // csel x11, x11, x9, gt
+ WORD $0xf1000508 // subs x8, x8, #1
+ BNE LBB2_8
+LBB2_9:
+ WORD $0xf900006b // str x11, [x3]
+ WORD $0xf900004a // str x10, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+
+
+// func _uint64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+TEXT ·_uint64_max_min_neon(SB), $0-32
+
+ MOVD values+0(FP), R0
+ MOVD length+8(FP), R1
+ MOVD minout+16(FP), R2
+ MOVD maxout+24(FP), R3
+
+ WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
+ WORD $0x7100043f // cmp w1, #1
+ WORD $0x910003fd // mov x29, sp
+ BLT LBB3_3
+
+ WORD $0x71000c3f // cmp w1, #3
+ WORD $0x2a0103e8 // mov w8, w1
+ BHI LBB3_4
+
+ WORD $0xaa1f03e9 // mov x9, xzr
+ WORD $0xaa1f03ea // mov x10, xzr
+ WORD $0x9280000b // mov x11, #-1
+ JMP LBB3_7
+LBB3_3:
+ WORD $0xaa1f03ea // mov x10, xzr
+ WORD $0x9280000b // mov x11, #-1
+ WORD $0xf900006a // str x10, [x3]
+ WORD $0xf900004b // str x11, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+LBB3_4:
+ WORD $0x927e7509 // and x9, x8, #0xfffffffc
+ WORD $0x9100400a // add x10, x0, #16
+ WORD $0x6f00e401 // movi v1.2d, #0000000000000000
+ WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff
+ WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff
+ WORD $0xaa0903eb // mov x11, x9
+ WORD $0x6f00e403 // movi v3.2d, #0000000000000000
+LBB3_5:
+ WORD $0xad7f9544 // ldp q4, q5, [x10, #-16]
+ WORD $0x4ea31c66 // mov v6.16b, v3.16b
+ WORD $0x4ea11c27 // mov v7.16b, v1.16b
+ WORD $0x4ea21c43 // mov v3.16b, v2.16b
+ WORD $0x4ea01c01 // mov v1.16b, v0.16b
+ WORD $0x6ee03480 // cmhi v0.2d, v4.2d, v0.2d
+ WORD $0x6ee234a2 // cmhi v2.2d, v5.2d, v2.2d
+ WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b
+ WORD $0x6ee434e1 // cmhi v1.2d, v7.2d, v4.2d
+ WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b
+ WORD $0x6ee534c3 // cmhi v3.2d, v6.2d, v5.2d
+ WORD $0xf100116b // subs x11, x11, #4
+ WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b
+ WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b
+ WORD $0x9100814a // add x10, x10, #32
+ BNE LBB3_5
+
+ WORD $0x6ee33424 // cmhi v4.2d, v1.2d, v3.2d
+ WORD $0x6ee03445 // cmhi v5.2d, v2.2d, v0.2d
+ WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b
+ WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b
+ WORD $0x4e180480 // dup v0.2d, v4.d[1]
+ WORD $0x4e1804a1 // dup v1.2d, v5.d[1]
+ WORD $0x6ee03482 // cmhi v2.2d, v4.2d, v0.2d
+ WORD $0x6ee53423 // cmhi v3.2d, v1.2d, v5.2d
+ WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b
+ WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b
+ WORD $0xeb08013f // cmp x9, x8
+ WORD $0x9e66004a // fmov x10, d2
+ WORD $0x9e66006b // fmov x11, d3
+ BEQ LBB3_9
+LBB3_7:
+ WORD $0x8b090c0c // add x12, x0, x9, lsl #3
+ WORD $0xcb090108 // sub x8, x8, x9
+LBB3_8:
+ WORD $0xf8408589 // ldr x9, [x12], #8
+ WORD $0xeb09017f // cmp x11, x9
+ WORD $0x9a89316b // csel x11, x11, x9, lo
+ WORD $0xeb09015f // cmp x10, x9
+ WORD $0x9a89814a // csel x10, x10, x9, hi
+ WORD $0xf1000508 // subs x8, x8, #1
+ BNE LBB3_8
+LBB3_9:
+ WORD $0xf900006a // str x10, [x3]
+ WORD $0xf900004b // str x11, [x2]
+ WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
+ RET
+
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go
new file mode 100644
index 000000000..19c24b590
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+func init() {
+ minmaxFuncs.i8 = int8MinMax
+ minmaxFuncs.ui8 = uint8MinMax
+ minmaxFuncs.i16 = int16MinMax
+ minmaxFuncs.ui16 = uint16MinMax
+ minmaxFuncs.i32 = int32MinMax
+ minmaxFuncs.ui32 = uint32MinMax
+ minmaxFuncs.i64 = int64MinMax
+ minmaxFuncs.ui64 = uint64MinMax
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go
new file mode 100644
index 000000000..ffd2db006
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+func init() {
+ minmaxFuncs.i8 = int8MinMax
+ minmaxFuncs.ui8 = uint8MinMax
+ minmaxFuncs.i16 = int16MinMax
+ minmaxFuncs.ui16 = uint16MinMax
+ minmaxFuncs.i32 = int32MinMax
+ minmaxFuncs.ui32 = uint32MinMax
+ minmaxFuncs.i64 = int64MinMax
+ minmaxFuncs.ui64 = uint64MinMax
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go
new file mode 100644
index 000000000..ffd2db006
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+func init() {
+ minmaxFuncs.i8 = int8MinMax
+ minmaxFuncs.ui8 = uint8MinMax
+ minmaxFuncs.i16 = int16MinMax
+ minmaxFuncs.ui16 = uint16MinMax
+ minmaxFuncs.i32 = int32MinMax
+ minmaxFuncs.ui32 = uint32MinMax
+ minmaxFuncs.i64 = int64MinMax
+ minmaxFuncs.ui64 = uint64MinMax
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go
new file mode 100644
index 000000000..1e12a8d17
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import "unsafe"
+
+// This file contains convenience functions for utilizing SSE4 intrinsics to quickly
+// and efficiently get the min and max from an integral slice.
+
+//go:noescape
+func _int8_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int8MaxMinSSE4(values []int8) (min, max int8) {
+ _int8_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint8_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint8MaxMinSSE4(values []uint8) (min, max uint8) {
+ _uint8_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int16_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int16MaxMinSSE4(values []int16) (min, max int16) {
+ _int16_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint16_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint16MaxMinSSE4(values []uint16) (min, max uint16) {
+ _uint16_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int32_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int32MaxMinSSE4(values []int32) (min, max int32) {
+ _int32_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint32_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint32MaxMinSSE4(values []uint32) (min, max uint32) {
+ _uint32_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _int64_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int64MaxMinSSE4(values []int64) (min, max int64) {
+ _int64_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
+
+//go:noescape
+func _uint64_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint64MaxMinSSE4(values []uint64) (min, max uint64) {
+ _uint64_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+ return
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s
new file mode 100644
index 000000000..8f1eccf60
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s
@@ -0,0 +1,1044 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x010(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x018(SB)/8, $0x7f7f7f7f7f7f7f7f
+GLOBL LCDATA1<>(SB), 8, $32
+
+TEXT ·_int8_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA1<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB0_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB0_4
+ WORD $0xb041; BYTE $0x80 // mov r8b, -128
+ WORD $0xb640; BYTE $0x7f // mov sil, 127
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ JMP LBB0_11
+
+LBB0_1:
+ WORD $0xb640; BYTE $0x7f // mov sil, 127
+ WORD $0xb041; BYTE $0x80 // mov r8b, -128
+ JMP LBB0_12
+
+LBB0_4:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xe0e38341 // and r11d, -32
+ LONG $0xe0438d49 // lea rax, [r11 - 32]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x05e8c149 // shr r8, 5
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_5
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI0_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+
+LBB0_7:
+ LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
+ LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
+ LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48]
+ LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4
+ LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5
+ LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4
+ LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5
+ LONG $0x38380f66; BYTE $0xc6 // pminsb xmm0, xmm6
+ LONG $0x38380f66; BYTE $0xd7 // pminsb xmm2, xmm7
+ LONG $0x3c380f66; BYTE $0xce // pmaxsb xmm1, xmm6
+ LONG $0x3c380f66; BYTE $0xdf // pmaxsb xmm3, xmm7
+ LONG $0x40c08348 // add rax, 64
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB0_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB0_10
+
+LBB0_9:
+ LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
+ LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
+ LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5
+ LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4
+ LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5
+ LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4
+
+LBB0_10:
+ LONG $0x38380f66; BYTE $0xc2 // pminsb xmm0, xmm2
+ LONG $0x3c380f66; BYTE $0xcb // pmaxsb xmm1, xmm3
+ LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI0_1] */
+ LONG $0xd16f0f66 // movdqa xmm2, xmm1
+ LONG $0xd2710f66; BYTE $0x08 // psrlw xmm2, 8
+ LONG $0xd1da0f66 // pminub xmm2, xmm1
+ LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2
+ LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1
+ LONG $0x7ff08041 // xor r8b, 127
+ LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI0_0] */
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
+ LONG $0xc8da0f66 // pminub xmm1, xmm0
+ LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1
+ LONG $0xc67e0f66 // movd esi, xmm0
+ LONG $0x80f68040 // xor sil, -128
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB0_12
+
+LBB0_11:
+ LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte [rdi + r11]
+ WORD $0x3840; BYTE $0xc6 // cmp sil, al
+ LONG $0xf6b60f40 // movzx esi, sil
+ WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
+ WORD $0x3841; BYTE $0xc0 // cmp r8b, al
+ LONG $0xc0b60f45 // movzx r8d, r8b
+ LONG $0xc04c0f44 // cmovl r8d, eax
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB0_11
+
+LBB0_12:
+ WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b
+ WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
+ RET
+
+LBB0_5:
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI0_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB0_9
+ JMP LBB0_10
+
+TEXT ·_uint8_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB1_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x1f // cmp esi, 31
+ JA LBB1_4
+ WORD $0xb640; BYTE $0xff // mov sil, -1
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ WORD $0xc031 // xor eax, eax
+ JMP LBB1_11
+
+LBB1_1:
+ WORD $0xb640; BYTE $0xff // mov sil, -1
+ WORD $0xc031 // xor eax, eax
+ JMP LBB1_12
+
+LBB1_4:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xe0e38341 // and r11d, -32
+ LONG $0xe0438d49 // lea rax, [r11 - 32]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x05e8c149 // shr r8, 5
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB1_5
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+
+LBB1_7:
+ LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
+ LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
+ LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48]
+ LONG $0xc4da0f66 // pminub xmm0, xmm4
+ LONG $0xd5da0f66 // pminub xmm2, xmm5
+ LONG $0xccde0f66 // pmaxub xmm1, xmm4
+ LONG $0xddde0f66 // pmaxub xmm3, xmm5
+ LONG $0xc6da0f66 // pminub xmm0, xmm6
+ LONG $0xd7da0f66 // pminub xmm2, xmm7
+ LONG $0xcede0f66 // pmaxub xmm1, xmm6
+ LONG $0xdfde0f66 // pmaxub xmm3, xmm7
+ LONG $0x40c08348 // add rax, 64
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB1_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB1_10
+
+LBB1_9:
+ LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax]
+ LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16]
+ LONG $0xddde0f66 // pmaxub xmm3, xmm5
+ LONG $0xccde0f66 // pmaxub xmm1, xmm4
+ LONG $0xd5da0f66 // pminub xmm2, xmm5
+ LONG $0xc4da0f66 // pminub xmm0, xmm4
+
+LBB1_10:
+ LONG $0xc2da0f66 // pminub xmm0, xmm2
+ LONG $0xcbde0f66 // pmaxub xmm1, xmm3
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xd1ef0f66 // pxor xmm2, xmm1
+ LONG $0xca6f0f66 // movdqa xmm1, xmm2
+ LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
+ LONG $0xcada0f66 // pminub xmm1, xmm2
+ LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1
+ LONG $0xc87e0f66 // movd eax, xmm1
+ WORD $0xd0f6 // not al
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8
+ LONG $0xc8da0f66 // pminub xmm1, xmm0
+ LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1
+ LONG $0xc67e0f66 // movd esi, xmm0
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB1_12
+
+LBB1_11:
+ LONG $0x04b60f46; BYTE $0x1f // movzx r8d, byte [rdi + r11]
+ WORD $0x3844; BYTE $0xc6 // cmp sil, r8b
+ LONG $0xf6b60f40 // movzx esi, sil
+ LONG $0xf0430f41 // cmovae esi, r8d
+ WORD $0x3844; BYTE $0xc0 // cmp al, r8b
+ WORD $0xb60f; BYTE $0xc0 // movzx eax, al
+ LONG $0xc0460f41 // cmovbe eax, r8d
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB1_11
+
+LBB1_12:
+ WORD $0x0188 // mov byte [rcx], al
+ WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil
+ RET
+
+LBB1_5:
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB1_9
+ JMP LBB1_10
+
+DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000
+DATA LCDATA2<>+0x010(SB)/8, $0x7fff7fff7fff7fff
+DATA LCDATA2<>+0x018(SB)/8, $0x7fff7fff7fff7fff
+GLOBL LCDATA2<>(SB), 8, $32
+
+TEXT ·_int16_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA2<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB2_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x0f // cmp esi, 15
+ JA LBB2_4
+ LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
+ LONG $0x7fffbe66 // mov si, 32767
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ JMP LBB2_11
+
+LBB2_1:
+ LONG $0x7fffbe66 // mov si, 32767
+ LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768
+ JMP LBB2_12
+
+LBB2_4:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xf0e38341 // and r11d, -16
+ LONG $0xf0438d49 // lea rax, [r11 - 16]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x04e8c149 // shr r8, 4
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB2_5
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+
+LBB2_7:
+ LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
+ LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
+ LONG $0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48]
+ LONG $0xc4ea0f66 // pminsw xmm0, xmm4
+ LONG $0xd5ea0f66 // pminsw xmm2, xmm5
+ LONG $0xccee0f66 // pmaxsw xmm1, xmm4
+ LONG $0xddee0f66 // pmaxsw xmm3, xmm5
+ LONG $0xc6ea0f66 // pminsw xmm0, xmm6
+ LONG $0xd7ea0f66 // pminsw xmm2, xmm7
+ LONG $0xceee0f66 // pmaxsw xmm1, xmm6
+ LONG $0xdfee0f66 // pmaxsw xmm3, xmm7
+ LONG $0x20c08348 // add rax, 32
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB2_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB2_10
+
+LBB2_9:
+ LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
+ LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
+ LONG $0xddee0f66 // pmaxsw xmm3, xmm5
+ LONG $0xccee0f66 // pmaxsw xmm1, xmm4
+ LONG $0xd5ea0f66 // pminsw xmm2, xmm5
+ LONG $0xc4ea0f66 // pminsw xmm0, xmm4
+
+LBB2_10:
+ LONG $0xc2ea0f66 // pminsw xmm0, xmm2
+ LONG $0xcbee0f66 // pmaxsw xmm1, xmm3
+ LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI2_1] */
+ LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1
+ LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1
+ LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // xor r8d, 32767
+ LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI2_0] */
+ LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0
+ LONG $0xc67e0f66 // movd esi, xmm0
+ LONG $0x8000f681; WORD $0x0000 // xor esi, 32768
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB2_12
+
+LBB2_11:
+ LONG $0x04b70f42; BYTE $0x5f // movzx eax, word [rdi + 2*r11]
+ WORD $0x3966; BYTE $0xc6 // cmp si, ax
+ WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax
+ LONG $0xc0394166 // cmp r8w, ax
+ LONG $0xc04c0f44 // cmovl r8d, eax
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB2_11
+
+LBB2_12:
+ LONG $0x01894466 // mov word [rcx], r8w
+ WORD $0x8966; BYTE $0x32 // mov word [rdx], si
+ RET
+
+LBB2_5:
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB2_9
+ JMP LBB2_10
+
+TEXT ·_uint16_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB3_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x0f // cmp esi, 15
+ JA LBB3_4
+ LONG $0xffb84166; BYTE $0xff // mov r8w, -1
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ WORD $0xf631 // xor esi, esi
+ JMP LBB3_11
+
+LBB3_1:
+ LONG $0xffb84166; BYTE $0xff // mov r8w, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB3_12
+
+LBB3_4:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xf0e38341 // and r11d, -16
+ LONG $0xf0438d49 // lea rax, [r11 - 16]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x04e8c149 // shr r8, 4
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB3_5
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+
+LBB3_7:
+ LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
+ LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
+ LONG $0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48]
+ LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4
+ LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5
+ LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4
+ LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5
+ LONG $0x3a380f66; BYTE $0xc6 // pminuw xmm0, xmm6
+ LONG $0x3a380f66; BYTE $0xd7 // pminuw xmm2, xmm7
+ LONG $0x3e380f66; BYTE $0xce // pmaxuw xmm1, xmm6
+ LONG $0x3e380f66; BYTE $0xdf // pmaxuw xmm3, xmm7
+ LONG $0x20c08348 // add rax, 32
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB3_7
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB3_10
+
+LBB3_9:
+ LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax]
+ LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16]
+ LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5
+ LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4
+ LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5
+ LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4
+
+LBB3_10:
+ LONG $0x3a380f66; BYTE $0xc2 // pminuw xmm0, xmm2
+ LONG $0x3e380f66; BYTE $0xcb // pmaxuw xmm1, xmm3
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xd1ef0f66 // pxor xmm2, xmm1
+ LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2
+ LONG $0xce7e0f66 // movd esi, xmm1
+ WORD $0xd6f7 // not esi
+ LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0
+ LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB3_12
+
+LBB3_11:
+ LONG $0x04b70f42; BYTE $0x5f // movzx eax, word [rdi + 2*r11]
+ LONG $0xc0394166 // cmp r8w, ax
+ LONG $0xc0430f44 // cmovae r8d, eax
+ WORD $0x3966; BYTE $0xc6 // cmp si, ax
+ WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB3_11
+
+LBB3_12:
+ WORD $0x8966; BYTE $0x31 // mov word [rcx], si
+ LONG $0x02894466 // mov word [rdx], r8w
+ RET
+
+LBB3_5:
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB3_9
+ JMP LBB3_10
+
+DATA LCDATA3<>+0x000(SB)/8, $0x8000000080000000
+DATA LCDATA3<>+0x008(SB)/8, $0x8000000080000000
+DATA LCDATA3<>+0x010(SB)/8, $0x7fffffff7fffffff
+DATA LCDATA3<>+0x018(SB)/8, $0x7fffffff7fffffff
+GLOBL LCDATA3<>(SB), 8, $32
+
+TEXT ·_int32_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA3<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB4_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x07 // cmp esi, 7
+ JA LBB4_6
+ LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648
+ LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ JMP LBB4_4
+
+LBB4_1:
+ LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647
+ LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648
+ JMP LBB4_13
+
+LBB4_6:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xf8e38341 // and r11d, -8
+ LONG $0xf8438d49 // lea rax, [r11 - 8]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x03e8c149 // shr r8, 3
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB4_7
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+
+LBB4_9:
+ LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
+ LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
+ LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3087 // movdqu xmm7, oword [rdi + 4*rax + 48]
+ LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4
+ LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5
+ LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4
+ LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5
+ LONG $0x39380f66; BYTE $0xc6 // pminsd xmm0, xmm6
+ LONG $0x39380f66; BYTE $0xd7 // pminsd xmm2, xmm7
+ LONG $0x3d380f66; BYTE $0xce // pmaxsd xmm1, xmm6
+ LONG $0x3d380f66; BYTE $0xdf // pmaxsd xmm3, xmm7
+ LONG $0x10c08348 // add rax, 16
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB4_9
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB4_12
+
+LBB4_11:
+ LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
+ LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
+ LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5
+ LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4
+ LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5
+ LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4
+
+LBB4_12:
+ LONG $0x39380f66; BYTE $0xc2 // pminsd xmm0, xmm2
+ LONG $0x3d380f66; BYTE $0xcb // pmaxsd xmm1, xmm3
+ LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78
+ LONG $0x3d380f66; BYTE $0xd1 // pmaxsd xmm2, xmm1
+ LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229
+ LONG $0x3d380f66; BYTE $0xca // pmaxsd xmm1, xmm2
+ LONG $0xc87e0f66 // movd eax, xmm1
+ LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78
+ LONG $0x39380f66; BYTE $0xc8 // pminsd xmm1, xmm0
+ LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229
+ LONG $0x39380f66; BYTE $0xc1 // pminsd xmm0, xmm1
+ LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB4_13
+
+LBB4_4:
+ WORD $0xc689 // mov esi, eax
+
+LBB4_5:
+ LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11]
+ WORD $0x3941; BYTE $0xc0 // cmp r8d, eax
+ LONG $0xc04f0f44 // cmovg r8d, eax
+ WORD $0xc639 // cmp esi, eax
+ WORD $0x4d0f; BYTE $0xc6 // cmovge eax, esi
+ LONG $0x01c38349 // add r11, 1
+ WORD $0xc689 // mov esi, eax
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB4_5
+
+LBB4_13:
+ WORD $0x0189 // mov dword [rcx], eax
+ WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d
+ RET
+
+LBB4_7:
+ LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */
+ LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd06f0f66 // movdqa xmm2, xmm0
+ LONG $0xd96f0f66 // movdqa xmm3, xmm1
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB4_11
+ JMP LBB4_12
+
+TEXT ·_uint32_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB5_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x07 // cmp esi, 7
+ JA LBB5_6
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ LONG $0xffffb841; WORD $0xffff // mov r8d, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB5_4
+
+LBB5_1:
+ LONG $0xffffb841; WORD $0xffff // mov r8d, -1
+ WORD $0xf631 // xor esi, esi
+ JMP LBB5_13
+
+LBB5_6:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xf8e38341 // and r11d, -8
+ LONG $0xf8438d49 // lea rax, [r11 - 8]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x03e8c149 // shr r8, 3
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB5_7
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+
+LBB5_9:
+ LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
+ LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
+ LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32]
+ LONG $0x7c6f0ff3; WORD $0x3087 // movdqu xmm7, oword [rdi + 4*rax + 48]
+ LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4
+ LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5
+ LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4
+ LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5
+ LONG $0x3b380f66; BYTE $0xc6 // pminud xmm0, xmm6
+ LONG $0x3b380f66; BYTE $0xd7 // pminud xmm2, xmm7
+ LONG $0x3f380f66; BYTE $0xce // pmaxud xmm1, xmm6
+ LONG $0x3f380f66; BYTE $0xdf // pmaxud xmm3, xmm7
+ LONG $0x10c08348 // add rax, 16
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB5_9
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB5_12
+
+LBB5_11:
+ LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax]
+ LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16]
+ LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5
+ LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4
+ LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5
+ LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4
+
+LBB5_12:
+ LONG $0x3b380f66; BYTE $0xc2 // pminud xmm0, xmm2
+ LONG $0x3f380f66; BYTE $0xcb // pmaxud xmm1, xmm3
+ LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78
+ LONG $0x3f380f66; BYTE $0xd1 // pmaxud xmm2, xmm1
+ LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229
+ LONG $0x3f380f66; BYTE $0xca // pmaxud xmm1, xmm2
+ LONG $0xce7e0f66 // movd esi, xmm1
+ LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78
+ LONG $0x3b380f66; BYTE $0xc8 // pminud xmm1, xmm0
+ LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229
+ LONG $0x3b380f66; BYTE $0xc1 // pminud xmm0, xmm1
+ LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB5_13
+
+LBB5_4:
+ WORD $0xf089 // mov eax, esi
+
+LBB5_5:
+ LONG $0x9f348b42 // mov esi, dword [rdi + 4*r11]
+ WORD $0x3941; BYTE $0xf0 // cmp r8d, esi
+ LONG $0xc6430f44 // cmovae r8d, esi
+ WORD $0xf039 // cmp eax, esi
+ WORD $0x470f; BYTE $0xf0 // cmova esi, eax
+ LONG $0x01c38349 // add r11, 1
+ WORD $0xf089 // mov eax, esi
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB5_5
+
+LBB5_13:
+ WORD $0x3189 // mov dword [rcx], esi
+ WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d
+ RET
+
+LBB5_7:
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0
+ WORD $0xc031 // xor eax, eax
+ LONG $0xd2760f66 // pcmpeqd xmm2, xmm2
+ LONG $0xdbef0f66 // pxor xmm3, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB5_11
+ JMP LBB5_12
+
+DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000
+DATA LCDATA4<>+0x008(SB)/8, $0x8000000000000000
+DATA LCDATA4<>+0x010(SB)/8, $0x7fffffffffffffff
+DATA LCDATA4<>+0x018(SB)/8, $0x7fffffffffffffff
+GLOBL LCDATA4<>(SB), 8, $32
+
+TEXT ·_int64_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA4<>(SB), BP
+
+ QUAD $0xffffffffffffb849; WORD $0x7fff // mov r8, 9223372036854775807
+ WORD $0xf685 // test esi, esi
+ JLE LBB6_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x03 // cmp esi, 3
+ JA LBB6_6
+ LONG $0x01708d49 // lea rsi, [r8 + 1]
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ JMP LBB6_4
+
+LBB6_1:
+ LONG $0x01708d49 // lea rsi, [r8 + 1]
+ JMP LBB6_13
+
+LBB6_6:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xfce38341 // and r11d, -4
+ LONG $0xfc438d49 // lea rax, [r11 - 4]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x02e8c149 // shr r8, 2
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB6_7
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0x6f0f4466; WORD $0x004d // movdqa xmm9, oword 0[rbp] /* [rip + .LCPI6_0] */
+ LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8
+ LONG $0x6f0f4166; BYTE $0xf1 // movdqa xmm6, xmm9
+
+LBB6_9:
+ LONG $0x3c6f0ff3; BYTE $0xc7 // movdqu xmm7, oword [rdi + 8*rax]
+ LONG $0xc76f0f66 // movdqa xmm0, xmm7
+ LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
+ LONG $0xe76f0f66 // movdqa xmm4, xmm7
+ LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, xmm0
+ LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16]
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0xe96f0f66 // movdqa xmm5, xmm1
+ LONG $0x15380f66; BYTE $0xea // blendvpd xmm5, xmm2, xmm0
+ LONG $0x6f0f4166; BYTE $0xc1 // movdqa xmm0, xmm9
+ LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7
+ LONG $0x380f4166; WORD $0xf915 // blendvpd xmm7, xmm9, xmm0
+ LONG $0xc66f0f66 // movdqa xmm0, xmm6
+ LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
+ LONG $0x15380f66; BYTE $0xce // blendvpd xmm1, xmm6, xmm0
+ LONG $0x5c6f0ff3; WORD $0x20c7 // movdqu xmm3, oword [rdi + 8*rax + 32]
+ LONG $0xc36f0f66 // movdqa xmm0, xmm3
+ LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
+ LONG $0x6f0f4466; BYTE $0xc3 // movdqa xmm8, xmm3
+ LONG $0x380f4466; WORD $0xc415 // blendvpd xmm8, xmm4, xmm0
+ LONG $0x646f0ff3; WORD $0x30c7 // movdqu xmm4, oword [rdi + 8*rax + 48]
+ LONG $0xc46f0f66 // movdqa xmm0, xmm4
+ LONG $0x37380f66; BYTE $0xc5 // pcmpgtq xmm0, xmm5
+ LONG $0xd46f0f66 // movdqa xmm2, xmm4
+ LONG $0x15380f66; BYTE $0xd5 // blendvpd xmm2, xmm5, xmm0
+ LONG $0xc7280f66 // movapd xmm0, xmm7
+ LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3
+ LONG $0x15380f66; BYTE $0xdf // blendvpd xmm3, xmm7, xmm0
+ LONG $0xc1280f66 // movapd xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
+ LONG $0x15380f66; BYTE $0xe1 // blendvpd xmm4, xmm1, xmm0
+ LONG $0x08c08348 // add rax, 8
+ LONG $0x280f4466; BYTE $0xcb // movapd xmm9, xmm3
+ LONG $0xf4280f66 // movapd xmm6, xmm4
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB6_9
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB6_12
+
+LBB6_11:
+ LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16]
+ LONG $0xc4280f66 // movapd xmm0, xmm4
+ LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
+ LONG $0xe96f0f66 // movdqa xmm5, xmm1
+ LONG $0x15380f66; BYTE $0xec // blendvpd xmm5, xmm4, xmm0
+ LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword [rdi + 8*rax]
+ LONG $0xc3280f66 // movapd xmm0, xmm3
+ LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
+ LONG $0xf46f0f66 // movdqa xmm6, xmm4
+ LONG $0x15380f66; BYTE $0xf3 // blendvpd xmm6, xmm3, xmm0
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0
+ LONG $0xc46f0f66 // movdqa xmm0, xmm4
+ LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
+ LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, xmm0
+ LONG $0x280f4466; BYTE $0xc4 // movapd xmm8, xmm4
+ LONG $0xd1280f66 // movapd xmm2, xmm1
+ LONG $0xde280f66 // movapd xmm3, xmm6
+ LONG $0xe5280f66 // movapd xmm4, xmm5
+
+LBB6_12:
+ LONG $0xc3280f66 // movapd xmm0, xmm3
+ LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4
+ LONG $0x15380f66; BYTE $0xe3 // blendvpd xmm4, xmm3, xmm0
+ LONG $0xcc700f66; BYTE $0x4e // pshufd xmm1, xmm4, 78
+ LONG $0xc46f0f66 // movdqa xmm0, xmm4
+ LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
+ LONG $0x15380f66; BYTE $0xcc // blendvpd xmm1, xmm4, xmm0
+ LONG $0x7e0f4866; BYTE $0xce // movq rsi, xmm1
+ LONG $0xc26f0f66 // movdqa xmm0, xmm2
+ LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8
+ LONG $0x380f4166; WORD $0xd015 // blendvpd xmm2, xmm8, xmm0
+ LONG $0xca700f66; BYTE $0x4e // pshufd xmm1, xmm2, 78
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0
+ LONG $0x7e0f4966; BYTE $0xc8 // movq r8, xmm1
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB6_13
+
+LBB6_4:
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+
+LBB6_5:
+ LONG $0xdf348b4a // mov rsi, qword [rdi + 8*r11]
+ WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
+ LONG $0xc64f0f4c // cmovg r8, rsi
+ WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
+ LONG $0xf04d0f48 // cmovge rsi, rax
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB6_5
+
+LBB6_13:
+ WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi
+ WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8
+ RET
+
+LBB6_7:
+ LONG $0x5d280f66; BYTE $0x00 // movapd xmm3, oword 0[rbp] /* [rip + .LCPI6_0] */
+ LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */
+ WORD $0xc031 // xor eax, eax
+ LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8
+ LONG $0xe3280f66 // movapd xmm4, xmm3
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB6_11
+ JMP LBB6_12
+
+DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000
+DATA LCDATA5<>+0x008(SB)/8, $0x8000000000000000
+GLOBL LCDATA5<>(SB), 8, $16
+
+TEXT ·_uint64_max_min_sse4(SB), $0-32
+
+ MOVQ values+0(FP), DI
+ MOVQ length+8(FP), SI
+ MOVQ minout+16(FP), DX
+ MOVQ maxout+24(FP), CX
+ LEAQ LCDATA5<>(SB), BP
+
+ WORD $0xf685 // test esi, esi
+ JLE LBB7_1
+ WORD $0x8941; BYTE $0xf1 // mov r9d, esi
+ WORD $0xfe83; BYTE $0x03 // cmp esi, 3
+ JA LBB7_6
+ LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ WORD $0xc031 // xor eax, eax
+ JMP LBB7_4
+
+LBB7_1:
+ LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
+ WORD $0xc031 // xor eax, eax
+ JMP LBB7_13
+
+LBB7_6:
+ WORD $0x8945; BYTE $0xcb // mov r11d, r9d
+ LONG $0xfce38341 // and r11d, -4
+ LONG $0xfc438d49 // lea rax, [r11 - 4]
+ WORD $0x8949; BYTE $0xc0 // mov r8, rax
+ LONG $0x02e8c149 // shr r8, 2
+ LONG $0x01c08349 // add r8, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB7_7
+ WORD $0x894d; BYTE $0xc2 // mov r10, r8
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ LONG $0xef0f4566; BYTE $0xc9 // pxor xmm9, xmm9
+ LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10
+ WORD $0xc031 // xor eax, eax
+ LONG $0x6f0f4466; WORD $0x0045 // movdqa xmm8, oword 0[rbp] /* [rip + .LCPI7_0] */
+ LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11
+ LONG $0xef0f4566; BYTE $0xe4 // pxor xmm12, xmm12
+
+LBB7_9:
+ LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10
+ LONG $0xef0f4166; BYTE $0xd0 // pxor xmm2, xmm8
+ LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword [rdi + 8*rax]
+ LONG $0x6c6f0ff3; WORD $0x10c7 // movdqu xmm5, oword [rdi + 8*rax + 16]
+ LONG $0x6f0f44f3; WORD $0xc76c; BYTE $0x20 // movdqu xmm13, oword [rdi + 8*rax + 32]
+ LONG $0xc46f0f66 // movdqa xmm0, xmm4
+ LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8
+ LONG $0x6f0f4166; BYTE $0xc9 // movdqa xmm1, xmm9
+ LONG $0xef0f4166; BYTE $0xc8 // pxor xmm1, xmm8
+ LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0xdc6f0f66 // movdqa xmm3, xmm4
+ LONG $0x380f4166; WORD $0xda15 // blendvpd xmm3, xmm10, xmm0
+ LONG $0x746f0ff3; WORD $0x30c7 // movdqu xmm6, oword [rdi + 8*rax + 48]
+ LONG $0x6f0f4166; BYTE $0xfb // movdqa xmm7, xmm11
+ LONG $0xef0f4166; BYTE $0xf8 // pxor xmm7, xmm8
+ LONG $0xc56f0f66 // movdqa xmm0, xmm5
+ LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8
+ LONG $0x6f0f4166; BYTE $0xd4 // movdqa xmm2, xmm12
+ LONG $0xef0f4166; BYTE $0xd0 // pxor xmm2, xmm8
+ LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0
+ LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7
+ LONG $0xfd6f0f66 // movdqa xmm7, xmm5
+ LONG $0x380f4166; WORD $0xfb15 // blendvpd xmm7, xmm11, xmm0
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x380f4166; WORD $0xe115 // blendvpd xmm4, xmm9, xmm0
+ LONG $0xc26f0f66 // movdqa xmm0, xmm2
+ LONG $0x380f4166; WORD $0xec15 // blendvpd xmm5, xmm12, xmm0
+ LONG $0xd3280f66 // movapd xmm2, xmm3
+ LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8
+ LONG $0x6f0f4166; BYTE $0xc5 // movdqa xmm0, xmm13
+ LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8
+ LONG $0xcc280f66 // movapd xmm1, xmm4
+ LONG $0x570f4166; BYTE $0xc8 // xorpd xmm1, xmm8
+ LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0x6f0f4566; BYTE $0xd5 // movdqa xmm10, xmm13
+ LONG $0x380f4466; WORD $0xd315 // blendvpd xmm10, xmm3, xmm0
+ LONG $0xdf280f66 // movapd xmm3, xmm7
+ LONG $0x570f4166; BYTE $0xd8 // xorpd xmm3, xmm8
+ LONG $0xc66f0f66 // movdqa xmm0, xmm6
+ LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8
+ LONG $0xd5280f66 // movapd xmm2, xmm5
+ LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8
+ LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0
+ LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3
+ LONG $0x6f0f4466; BYTE $0xde // movdqa xmm11, xmm6
+ LONG $0x380f4466; WORD $0xdf15 // blendvpd xmm11, xmm7, xmm0
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x380f4466; WORD $0xec15 // blendvpd xmm13, xmm4, xmm0
+ LONG $0xc26f0f66 // movdqa xmm0, xmm2
+ LONG $0x15380f66; BYTE $0xf5 // blendvpd xmm6, xmm5, xmm0
+ LONG $0x08c08348 // add rax, 8
+ LONG $0x280f4566; BYTE $0xcd // movapd xmm9, xmm13
+ LONG $0x280f4466; BYTE $0xe6 // movapd xmm12, xmm6
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB7_9
+ LONG $0x01c0f641 // test r8b, 1
+ JE LBB7_12
+
+LBB7_11:
+ LONG $0x24100f66; BYTE $0xc7 // movupd xmm4, oword [rdi + 8*rax]
+ LONG $0x5c100f66; WORD $0x10c7 // movupd xmm3, oword [rdi + 8*rax + 16]
+ LONG $0x6d280f66; BYTE $0x00 // movapd xmm5, oword 0[rbp] /* [rip + .LCPI7_0] */
+ LONG $0xc6280f66 // movapd xmm0, xmm6
+ LONG $0xc5570f66 // xorpd xmm0, xmm5
+ LONG $0xcb280f66 // movapd xmm1, xmm3
+ LONG $0xcd570f66 // xorpd xmm1, xmm5
+ LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1
+ LONG $0xfb280f66 // movapd xmm7, xmm3
+ LONG $0x15380f66; BYTE $0xfe // blendvpd xmm7, xmm6, xmm0
+ LONG $0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13
+ LONG $0xc5570f66 // xorpd xmm0, xmm5
+ LONG $0xd4280f66 // movapd xmm2, xmm4
+ LONG $0xd5570f66 // xorpd xmm2, xmm5
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0xf4280f66 // movapd xmm6, xmm4
+ LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0
+ LONG $0x280f4166; BYTE $0xc3 // movapd xmm0, xmm11
+ LONG $0xc5570f66 // xorpd xmm0, xmm5
+ LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x380f4166; WORD $0xdb15 // blendvpd xmm3, xmm11, xmm0
+ LONG $0x570f4166; BYTE $0xea // xorpd xmm5, xmm10
+ LONG $0x37380f66; BYTE $0xd5 // pcmpgtq xmm2, xmm5
+ LONG $0xc26f0f66 // movdqa xmm0, xmm2
+ LONG $0x380f4166; WORD $0xe215 // blendvpd xmm4, xmm10, xmm0
+ LONG $0x280f4466; BYTE $0xd4 // movapd xmm10, xmm4
+ LONG $0x280f4466; BYTE $0xdb // movapd xmm11, xmm3
+ LONG $0x280f4466; BYTE $0xee // movapd xmm13, xmm6
+ LONG $0xf7280f66 // movapd xmm6, xmm7
+
+LBB7_12:
+ LONG $0x4d280f66; BYTE $0x00 // movapd xmm1, oword 0[rbp] /* [rip + .LCPI7_0] */
+ LONG $0xd6280f66 // movapd xmm2, xmm6
+ LONG $0xd1570f66 // xorpd xmm2, xmm1
+ LONG $0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13
+ LONG $0xc1570f66 // xorpd xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0
+ LONG $0xd6700f66; BYTE $0x4e // pshufd xmm2, xmm6, 78
+ LONG $0xc6280f66 // movapd xmm0, xmm6
+ LONG $0xc1570f66 // xorpd xmm0, xmm1
+ LONG $0xda6f0f66 // movdqa xmm3, xmm2
+ LONG $0xd9ef0f66 // pxor xmm3, xmm1
+ LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3
+ LONG $0x15380f66; BYTE $0xd6 // blendvpd xmm2, xmm6, xmm0
+ LONG $0x7e0f4866; BYTE $0xd0 // movq rax, xmm2
+ LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10
+ LONG $0xd1ef0f66 // pxor xmm2, xmm1
+ LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11
+ LONG $0xc1ef0f66 // pxor xmm0, xmm1
+ LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2
+ LONG $0x380f4566; WORD $0xda15 // blendvpd xmm11, xmm10, xmm0
+ LONG $0x700f4166; WORD $0x4ed3 // pshufd xmm2, xmm11, 78
+ LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11
+ LONG $0xc1ef0f66 // pxor xmm0, xmm1
+ LONG $0xcaef0f66 // pxor xmm1, xmm2
+ LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0
+ LONG $0xc16f0f66 // movdqa xmm0, xmm1
+ LONG $0x380f4166; WORD $0xd315 // blendvpd xmm2, xmm11, xmm0
+ LONG $0x7e0f4966; BYTE $0xd0 // movq r8, xmm2
+ WORD $0x394d; BYTE $0xcb // cmp r11, r9
+ JE LBB7_13
+
+LBB7_4:
+ WORD $0x8948; BYTE $0xc6 // mov rsi, rax
+
+LBB7_5:
+ LONG $0xdf048b4a // mov rax, qword [rdi + 8*r11]
+ WORD $0x3949; BYTE $0xc0 // cmp r8, rax
+ LONG $0xc0430f4c // cmovae r8, rax
+ WORD $0x3948; BYTE $0xc6 // cmp rsi, rax
+ LONG $0xc6470f48 // cmova rax, rsi
+ LONG $0x01c38349 // add r11, 1
+ WORD $0x8948; BYTE $0xc6 // mov rsi, rax
+ WORD $0x394d; BYTE $0xd9 // cmp r9, r11
+ JNE LBB7_5
+
+LBB7_13:
+ WORD $0x8948; BYTE $0x01 // mov qword [rcx], rax
+ WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8
+ RET
+
+LBB7_7:
+ LONG $0x570f4566; BYTE $0xed // xorpd xmm13, xmm13
+ LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10
+ WORD $0xc031 // xor eax, eax
+ LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11
+ LONG $0xf6570f66 // xorpd xmm6, xmm6
+ LONG $0x01c0f641 // test r8b, 1
+ JNE LBB7_11
+ JMP LBB7_12
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go
new file mode 100644
index 000000000..1666df129
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go
@@ -0,0 +1,407 @@
+// Code generated by transpose_ints.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+// when we upgrade to support go1.18, this can be massively simplified by using
+// Go Generics, but since we aren't supporting go1.18 yet, I didn't want to use
+// them here so we can maintain the backwards compatibility.
+
+func transposeInt8Int8(src []int8, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeInt8Uint8(src []int8, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeInt8Int16(src []int8, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeInt8Uint16(src []int8, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeInt8Int32(src []int8, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeInt8Uint32(src []int8, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeInt8Int64(src []int8, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeInt8Uint64(src []int8, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeUint8Int8(src []uint8, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeUint8Uint8(src []uint8, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeUint8Int16(src []uint8, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeUint8Uint16(src []uint8, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeUint8Int32(src []uint8, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeUint8Uint32(src []uint8, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeUint8Int64(src []uint8, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeUint8Uint64(src []uint8, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeInt16Int8(src []int16, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeInt16Uint8(src []int16, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeInt16Int16(src []int16, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeInt16Uint16(src []int16, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeInt16Int32(src []int16, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeInt16Uint32(src []int16, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeInt16Int64(src []int16, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeInt16Uint64(src []int16, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeUint16Int8(src []uint16, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeUint16Uint8(src []uint16, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeUint16Int16(src []uint16, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeUint16Uint16(src []uint16, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeUint16Int32(src []uint16, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeUint16Uint32(src []uint16, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeUint16Int64(src []uint16, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeUint16Uint64(src []uint16, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeInt32Int8(src []int32, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeInt32Uint8(src []int32, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeInt32Int16(src []int32, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeInt32Uint16(src []int32, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeInt32Int32(src []int32, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeInt32Uint32(src []int32, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeInt32Int64(src []int32, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeInt32Uint64(src []int32, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeUint32Int8(src []uint32, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeUint32Uint8(src []uint32, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeUint32Int16(src []uint32, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeUint32Uint16(src []uint32, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeUint32Int32(src []uint32, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeUint32Uint32(src []uint32, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeUint32Int64(src []uint32, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeUint32Uint64(src []uint32, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeInt64Int8(src []int64, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeInt64Uint8(src []int64, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeInt64Int16(src []int64, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeInt64Uint16(src []int64, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeInt64Int32(src []int64, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeInt64Uint32(src []int64, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeInt64Int64(src []int64, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeInt64Uint64(src []int64, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
+
+func transposeUint64Int8(src []uint64, dest []int8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int8(transposeMap[s])
+ }
+}
+
+func transposeUint64Uint8(src []uint64, dest []uint8, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint8(transposeMap[s])
+ }
+}
+
+func transposeUint64Int16(src []uint64, dest []int16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int16(transposeMap[s])
+ }
+}
+
+func transposeUint64Uint16(src []uint64, dest []uint16, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint16(transposeMap[s])
+ }
+}
+
+func transposeUint64Int32(src []uint64, dest []int32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int32(transposeMap[s])
+ }
+}
+
+func transposeUint64Uint32(src []uint64, dest []uint32, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint32(transposeMap[s])
+ }
+}
+
+func transposeUint64Int64(src []uint64, dest []int64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = int64(transposeMap[s])
+ }
+}
+
+func transposeUint64Uint64(src []uint64, dest []uint64, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = uint64(transposeMap[s])
+ }
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl
new file mode 100644
index 000000000..680ae1ee7
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type }}
+{{ $srcName := .Name }}
+{{ range $typelist }}
+{{ $dest := .Type }}
+{{ $destName := .Name }}
+
+func transpose{{ $srcName }}{{ $destName }}(src []{{$src}}, dest []{{$dest}}, transposeMap []int32) {
+ for i, s := range src {
+ dest[i] = {{ $dest }}(transposeMap[s])
+ }
+}
+
+{{ end }}
+{{ end }}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata
new file mode 100644
index 000000000..72eaf300c
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata
@@ -0,0 +1,34 @@
+[
+ {
+ "Name": "Int8",
+ "Type": "int8"
+ },
+ {
+ "Name": "Uint8",
+ "Type": "uint8"
+ },
+ {
+ "Name": "Int16",
+ "Type": "int16"
+ },
+ {
+ "Name": "Uint16",
+ "Type": "uint16"
+ },
+ {
+ "Name": "Int32",
+ "Type": "int32"
+ },
+ {
+ "Name": "Uint32",
+ "Type": "uint32"
+ },
+ {
+ "Name": "Int64",
+ "Type": "int64"
+ },
+ {
+ "Name": "Uint64",
+ "Type": "uint64"
+ }
+]
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go
new file mode 100644
index 000000000..d4433d368
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go
@@ -0,0 +1,325 @@
+// Code generated by transpose_ints_amd64.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+ "golang.org/x/sys/cpu"
+)
+
+var (
+ TransposeInt8Int8 func([]int8, []int8, []int32)
+ TransposeInt8Uint8 func([]int8, []uint8, []int32)
+ TransposeInt8Int16 func([]int8, []int16, []int32)
+ TransposeInt8Uint16 func([]int8, []uint16, []int32)
+ TransposeInt8Int32 func([]int8, []int32, []int32)
+ TransposeInt8Uint32 func([]int8, []uint32, []int32)
+ TransposeInt8Int64 func([]int8, []int64, []int32)
+ TransposeInt8Uint64 func([]int8, []uint64, []int32)
+
+ TransposeUint8Int8 func([]uint8, []int8, []int32)
+ TransposeUint8Uint8 func([]uint8, []uint8, []int32)
+ TransposeUint8Int16 func([]uint8, []int16, []int32)
+ TransposeUint8Uint16 func([]uint8, []uint16, []int32)
+ TransposeUint8Int32 func([]uint8, []int32, []int32)
+ TransposeUint8Uint32 func([]uint8, []uint32, []int32)
+ TransposeUint8Int64 func([]uint8, []int64, []int32)
+ TransposeUint8Uint64 func([]uint8, []uint64, []int32)
+
+ TransposeInt16Int8 func([]int16, []int8, []int32)
+ TransposeInt16Uint8 func([]int16, []uint8, []int32)
+ TransposeInt16Int16 func([]int16, []int16, []int32)
+ TransposeInt16Uint16 func([]int16, []uint16, []int32)
+ TransposeInt16Int32 func([]int16, []int32, []int32)
+ TransposeInt16Uint32 func([]int16, []uint32, []int32)
+ TransposeInt16Int64 func([]int16, []int64, []int32)
+ TransposeInt16Uint64 func([]int16, []uint64, []int32)
+
+ TransposeUint16Int8 func([]uint16, []int8, []int32)
+ TransposeUint16Uint8 func([]uint16, []uint8, []int32)
+ TransposeUint16Int16 func([]uint16, []int16, []int32)
+ TransposeUint16Uint16 func([]uint16, []uint16, []int32)
+ TransposeUint16Int32 func([]uint16, []int32, []int32)
+ TransposeUint16Uint32 func([]uint16, []uint32, []int32)
+ TransposeUint16Int64 func([]uint16, []int64, []int32)
+ TransposeUint16Uint64 func([]uint16, []uint64, []int32)
+
+ TransposeInt32Int8 func([]int32, []int8, []int32)
+ TransposeInt32Uint8 func([]int32, []uint8, []int32)
+ TransposeInt32Int16 func([]int32, []int16, []int32)
+ TransposeInt32Uint16 func([]int32, []uint16, []int32)
+ TransposeInt32Int32 func([]int32, []int32, []int32)
+ TransposeInt32Uint32 func([]int32, []uint32, []int32)
+ TransposeInt32Int64 func([]int32, []int64, []int32)
+ TransposeInt32Uint64 func([]int32, []uint64, []int32)
+
+ TransposeUint32Int8 func([]uint32, []int8, []int32)
+ TransposeUint32Uint8 func([]uint32, []uint8, []int32)
+ TransposeUint32Int16 func([]uint32, []int16, []int32)
+ TransposeUint32Uint16 func([]uint32, []uint16, []int32)
+ TransposeUint32Int32 func([]uint32, []int32, []int32)
+ TransposeUint32Uint32 func([]uint32, []uint32, []int32)
+ TransposeUint32Int64 func([]uint32, []int64, []int32)
+ TransposeUint32Uint64 func([]uint32, []uint64, []int32)
+
+ TransposeInt64Int8 func([]int64, []int8, []int32)
+ TransposeInt64Uint8 func([]int64, []uint8, []int32)
+ TransposeInt64Int16 func([]int64, []int16, []int32)
+ TransposeInt64Uint16 func([]int64, []uint16, []int32)
+ TransposeInt64Int32 func([]int64, []int32, []int32)
+ TransposeInt64Uint32 func([]int64, []uint32, []int32)
+ TransposeInt64Int64 func([]int64, []int64, []int32)
+ TransposeInt64Uint64 func([]int64, []uint64, []int32)
+
+ TransposeUint64Int8 func([]uint64, []int8, []int32)
+ TransposeUint64Uint8 func([]uint64, []uint8, []int32)
+ TransposeUint64Int16 func([]uint64, []int16, []int32)
+ TransposeUint64Uint16 func([]uint64, []uint16, []int32)
+ TransposeUint64Int32 func([]uint64, []int32, []int32)
+ TransposeUint64Uint32 func([]uint64, []uint32, []int32)
+ TransposeUint64Int64 func([]uint64, []int64, []int32)
+ TransposeUint64Uint64 func([]uint64, []uint64, []int32)
+)
+
+func init() {
+ if cpu.X86.HasAVX2 {
+
+ TransposeInt8Int8 = transposeInt8Int8avx2
+ TransposeInt8Uint8 = transposeInt8Uint8avx2
+ TransposeInt8Int16 = transposeInt8Int16avx2
+ TransposeInt8Uint16 = transposeInt8Uint16avx2
+ TransposeInt8Int32 = transposeInt8Int32avx2
+ TransposeInt8Uint32 = transposeInt8Uint32avx2
+ TransposeInt8Int64 = transposeInt8Int64avx2
+ TransposeInt8Uint64 = transposeInt8Uint64avx2
+
+ TransposeUint8Int8 = transposeUint8Int8avx2
+ TransposeUint8Uint8 = transposeUint8Uint8avx2
+ TransposeUint8Int16 = transposeUint8Int16avx2
+ TransposeUint8Uint16 = transposeUint8Uint16avx2
+ TransposeUint8Int32 = transposeUint8Int32avx2
+ TransposeUint8Uint32 = transposeUint8Uint32avx2
+ TransposeUint8Int64 = transposeUint8Int64avx2
+ TransposeUint8Uint64 = transposeUint8Uint64avx2
+
+ TransposeInt16Int8 = transposeInt16Int8avx2
+ TransposeInt16Uint8 = transposeInt16Uint8avx2
+ TransposeInt16Int16 = transposeInt16Int16avx2
+ TransposeInt16Uint16 = transposeInt16Uint16avx2
+ TransposeInt16Int32 = transposeInt16Int32avx2
+ TransposeInt16Uint32 = transposeInt16Uint32avx2
+ TransposeInt16Int64 = transposeInt16Int64avx2
+ TransposeInt16Uint64 = transposeInt16Uint64avx2
+
+ TransposeUint16Int8 = transposeUint16Int8avx2
+ TransposeUint16Uint8 = transposeUint16Uint8avx2
+ TransposeUint16Int16 = transposeUint16Int16avx2
+ TransposeUint16Uint16 = transposeUint16Uint16avx2
+ TransposeUint16Int32 = transposeUint16Int32avx2
+ TransposeUint16Uint32 = transposeUint16Uint32avx2
+ TransposeUint16Int64 = transposeUint16Int64avx2
+ TransposeUint16Uint64 = transposeUint16Uint64avx2
+
+ TransposeInt32Int8 = transposeInt32Int8avx2
+ TransposeInt32Uint8 = transposeInt32Uint8avx2
+ TransposeInt32Int16 = transposeInt32Int16avx2
+ TransposeInt32Uint16 = transposeInt32Uint16avx2
+ TransposeInt32Int32 = transposeInt32Int32avx2
+ TransposeInt32Uint32 = transposeInt32Uint32avx2
+ TransposeInt32Int64 = transposeInt32Int64avx2
+ TransposeInt32Uint64 = transposeInt32Uint64avx2
+
+ TransposeUint32Int8 = transposeUint32Int8avx2
+ TransposeUint32Uint8 = transposeUint32Uint8avx2
+ TransposeUint32Int16 = transposeUint32Int16avx2
+ TransposeUint32Uint16 = transposeUint32Uint16avx2
+ TransposeUint32Int32 = transposeUint32Int32avx2
+ TransposeUint32Uint32 = transposeUint32Uint32avx2
+ TransposeUint32Int64 = transposeUint32Int64avx2
+ TransposeUint32Uint64 = transposeUint32Uint64avx2
+
+ TransposeInt64Int8 = transposeInt64Int8avx2
+ TransposeInt64Uint8 = transposeInt64Uint8avx2
+ TransposeInt64Int16 = transposeInt64Int16avx2
+ TransposeInt64Uint16 = transposeInt64Uint16avx2
+ TransposeInt64Int32 = transposeInt64Int32avx2
+ TransposeInt64Uint32 = transposeInt64Uint32avx2
+ TransposeInt64Int64 = transposeInt64Int64avx2
+ TransposeInt64Uint64 = transposeInt64Uint64avx2
+
+ TransposeUint64Int8 = transposeUint64Int8avx2
+ TransposeUint64Uint8 = transposeUint64Uint8avx2
+ TransposeUint64Int16 = transposeUint64Int16avx2
+ TransposeUint64Uint16 = transposeUint64Uint16avx2
+ TransposeUint64Int32 = transposeUint64Int32avx2
+ TransposeUint64Uint32 = transposeUint64Uint32avx2
+ TransposeUint64Int64 = transposeUint64Int64avx2
+ TransposeUint64Uint64 = transposeUint64Uint64avx2
+
+ } else if cpu.X86.HasSSE42 {
+
+ TransposeInt8Int8 = transposeInt8Int8sse4
+ TransposeInt8Uint8 = transposeInt8Uint8sse4
+ TransposeInt8Int16 = transposeInt8Int16sse4
+ TransposeInt8Uint16 = transposeInt8Uint16sse4
+ TransposeInt8Int32 = transposeInt8Int32sse4
+ TransposeInt8Uint32 = transposeInt8Uint32sse4
+ TransposeInt8Int64 = transposeInt8Int64sse4
+ TransposeInt8Uint64 = transposeInt8Uint64sse4
+
+ TransposeUint8Int8 = transposeUint8Int8sse4
+ TransposeUint8Uint8 = transposeUint8Uint8sse4
+ TransposeUint8Int16 = transposeUint8Int16sse4
+ TransposeUint8Uint16 = transposeUint8Uint16sse4
+ TransposeUint8Int32 = transposeUint8Int32sse4
+ TransposeUint8Uint32 = transposeUint8Uint32sse4
+ TransposeUint8Int64 = transposeUint8Int64sse4
+ TransposeUint8Uint64 = transposeUint8Uint64sse4
+
+ TransposeInt16Int8 = transposeInt16Int8sse4
+ TransposeInt16Uint8 = transposeInt16Uint8sse4
+ TransposeInt16Int16 = transposeInt16Int16sse4
+ TransposeInt16Uint16 = transposeInt16Uint16sse4
+ TransposeInt16Int32 = transposeInt16Int32sse4
+ TransposeInt16Uint32 = transposeInt16Uint32sse4
+ TransposeInt16Int64 = transposeInt16Int64sse4
+ TransposeInt16Uint64 = transposeInt16Uint64sse4
+
+ TransposeUint16Int8 = transposeUint16Int8sse4
+ TransposeUint16Uint8 = transposeUint16Uint8sse4
+ TransposeUint16Int16 = transposeUint16Int16sse4
+ TransposeUint16Uint16 = transposeUint16Uint16sse4
+ TransposeUint16Int32 = transposeUint16Int32sse4
+ TransposeUint16Uint32 = transposeUint16Uint32sse4
+ TransposeUint16Int64 = transposeUint16Int64sse4
+ TransposeUint16Uint64 = transposeUint16Uint64sse4
+
+ TransposeInt32Int8 = transposeInt32Int8sse4
+ TransposeInt32Uint8 = transposeInt32Uint8sse4
+ TransposeInt32Int16 = transposeInt32Int16sse4
+ TransposeInt32Uint16 = transposeInt32Uint16sse4
+ TransposeInt32Int32 = transposeInt32Int32sse4
+ TransposeInt32Uint32 = transposeInt32Uint32sse4
+ TransposeInt32Int64 = transposeInt32Int64sse4
+ TransposeInt32Uint64 = transposeInt32Uint64sse4
+
+ TransposeUint32Int8 = transposeUint32Int8sse4
+ TransposeUint32Uint8 = transposeUint32Uint8sse4
+ TransposeUint32Int16 = transposeUint32Int16sse4
+ TransposeUint32Uint16 = transposeUint32Uint16sse4
+ TransposeUint32Int32 = transposeUint32Int32sse4
+ TransposeUint32Uint32 = transposeUint32Uint32sse4
+ TransposeUint32Int64 = transposeUint32Int64sse4
+ TransposeUint32Uint64 = transposeUint32Uint64sse4
+
+ TransposeInt64Int8 = transposeInt64Int8sse4
+ TransposeInt64Uint8 = transposeInt64Uint8sse4
+ TransposeInt64Int16 = transposeInt64Int16sse4
+ TransposeInt64Uint16 = transposeInt64Uint16sse4
+ TransposeInt64Int32 = transposeInt64Int32sse4
+ TransposeInt64Uint32 = transposeInt64Uint32sse4
+ TransposeInt64Int64 = transposeInt64Int64sse4
+ TransposeInt64Uint64 = transposeInt64Uint64sse4
+
+ TransposeUint64Int8 = transposeUint64Int8sse4
+ TransposeUint64Uint8 = transposeUint64Uint8sse4
+ TransposeUint64Int16 = transposeUint64Int16sse4
+ TransposeUint64Uint16 = transposeUint64Uint16sse4
+ TransposeUint64Int32 = transposeUint64Int32sse4
+ TransposeUint64Uint32 = transposeUint64Uint32sse4
+ TransposeUint64Int64 = transposeUint64Int64sse4
+ TransposeUint64Uint64 = transposeUint64Uint64sse4
+
+ } else {
+
+ TransposeInt8Int8 = transposeInt8Int8
+ TransposeInt8Uint8 = transposeInt8Uint8
+ TransposeInt8Int16 = transposeInt8Int16
+ TransposeInt8Uint16 = transposeInt8Uint16
+ TransposeInt8Int32 = transposeInt8Int32
+ TransposeInt8Uint32 = transposeInt8Uint32
+ TransposeInt8Int64 = transposeInt8Int64
+ TransposeInt8Uint64 = transposeInt8Uint64
+
+ TransposeUint8Int8 = transposeUint8Int8
+ TransposeUint8Uint8 = transposeUint8Uint8
+ TransposeUint8Int16 = transposeUint8Int16
+ TransposeUint8Uint16 = transposeUint8Uint16
+ TransposeUint8Int32 = transposeUint8Int32
+ TransposeUint8Uint32 = transposeUint8Uint32
+ TransposeUint8Int64 = transposeUint8Int64
+ TransposeUint8Uint64 = transposeUint8Uint64
+
+ TransposeInt16Int8 = transposeInt16Int8
+ TransposeInt16Uint8 = transposeInt16Uint8
+ TransposeInt16Int16 = transposeInt16Int16
+ TransposeInt16Uint16 = transposeInt16Uint16
+ TransposeInt16Int32 = transposeInt16Int32
+ TransposeInt16Uint32 = transposeInt16Uint32
+ TransposeInt16Int64 = transposeInt16Int64
+ TransposeInt16Uint64 = transposeInt16Uint64
+
+ TransposeUint16Int8 = transposeUint16Int8
+ TransposeUint16Uint8 = transposeUint16Uint8
+ TransposeUint16Int16 = transposeUint16Int16
+ TransposeUint16Uint16 = transposeUint16Uint16
+ TransposeUint16Int32 = transposeUint16Int32
+ TransposeUint16Uint32 = transposeUint16Uint32
+ TransposeUint16Int64 = transposeUint16Int64
+ TransposeUint16Uint64 = transposeUint16Uint64
+
+ TransposeInt32Int8 = transposeInt32Int8
+ TransposeInt32Uint8 = transposeInt32Uint8
+ TransposeInt32Int16 = transposeInt32Int16
+ TransposeInt32Uint16 = transposeInt32Uint16
+ TransposeInt32Int32 = transposeInt32Int32
+ TransposeInt32Uint32 = transposeInt32Uint32
+ TransposeInt32Int64 = transposeInt32Int64
+ TransposeInt32Uint64 = transposeInt32Uint64
+
+ TransposeUint32Int8 = transposeUint32Int8
+ TransposeUint32Uint8 = transposeUint32Uint8
+ TransposeUint32Int16 = transposeUint32Int16
+ TransposeUint32Uint16 = transposeUint32Uint16
+ TransposeUint32Int32 = transposeUint32Int32
+ TransposeUint32Uint32 = transposeUint32Uint32
+ TransposeUint32Int64 = transposeUint32Int64
+ TransposeUint32Uint64 = transposeUint32Uint64
+
+ TransposeInt64Int8 = transposeInt64Int8
+ TransposeInt64Uint8 = transposeInt64Uint8
+ TransposeInt64Int16 = transposeInt64Int16
+ TransposeInt64Uint16 = transposeInt64Uint16
+ TransposeInt64Int32 = transposeInt64Int32
+ TransposeInt64Uint32 = transposeInt64Uint32
+ TransposeInt64Int64 = transposeInt64Int64
+ TransposeInt64Uint64 = transposeInt64Uint64
+
+ TransposeUint64Int8 = transposeUint64Int8
+ TransposeUint64Uint8 = transposeUint64Uint8
+ TransposeUint64Int16 = transposeUint64Int16
+ TransposeUint64Uint16 = transposeUint64Uint16
+ TransposeUint64Int32 = transposeUint64Int32
+ TransposeUint64Uint32 = transposeUint64Uint32
+ TransposeUint64Int64 = transposeUint64Int64
+ TransposeUint64Uint64 = transposeUint64Uint64
+
+ }
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl
new file mode 100644
index 000000000..eac0208e5
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+// +build !noasm
+
+package utils
+
+import (
+ "golang.org/x/sys/cpu"
+)
+
+var (
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} func([]{{$src}}, []{{$dest}}, []int32)
+{{end}}
+{{end}}
+)
+
+
+func init() {
+ if cpu.X86.HasAVX2 {
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }}avx2
+{{end}}
+{{end}}
+ } else if cpu.X86.HasSSE42 {
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }}sse4
+{{end}}
+{{end}}
+ } else {
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }}
+{{end}}
+{{end}}
+ }
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go
new file mode 100644
index 000000000..cc957cdaa
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go
@@ -0,0 +1,96 @@
+// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+ TransposeInt8Int8 = transposeInt8Int8
+ TransposeInt8Uint8 = transposeInt8Uint8
+ TransposeInt8Int16 = transposeInt8Int16
+ TransposeInt8Uint16 = transposeInt8Uint16
+ TransposeInt8Int32 = transposeInt8Int32
+ TransposeInt8Uint32 = transposeInt8Uint32
+ TransposeInt8Int64 = transposeInt8Int64
+ TransposeInt8Uint64 = transposeInt8Uint64
+
+ TransposeUint8Int8 = transposeUint8Int8
+ TransposeUint8Uint8 = transposeUint8Uint8
+ TransposeUint8Int16 = transposeUint8Int16
+ TransposeUint8Uint16 = transposeUint8Uint16
+ TransposeUint8Int32 = transposeUint8Int32
+ TransposeUint8Uint32 = transposeUint8Uint32
+ TransposeUint8Int64 = transposeUint8Int64
+ TransposeUint8Uint64 = transposeUint8Uint64
+
+ TransposeInt16Int8 = transposeInt16Int8
+ TransposeInt16Uint8 = transposeInt16Uint8
+ TransposeInt16Int16 = transposeInt16Int16
+ TransposeInt16Uint16 = transposeInt16Uint16
+ TransposeInt16Int32 = transposeInt16Int32
+ TransposeInt16Uint32 = transposeInt16Uint32
+ TransposeInt16Int64 = transposeInt16Int64
+ TransposeInt16Uint64 = transposeInt16Uint64
+
+ TransposeUint16Int8 = transposeUint16Int8
+ TransposeUint16Uint8 = transposeUint16Uint8
+ TransposeUint16Int16 = transposeUint16Int16
+ TransposeUint16Uint16 = transposeUint16Uint16
+ TransposeUint16Int32 = transposeUint16Int32
+ TransposeUint16Uint32 = transposeUint16Uint32
+ TransposeUint16Int64 = transposeUint16Int64
+ TransposeUint16Uint64 = transposeUint16Uint64
+
+ TransposeInt32Int8 = transposeInt32Int8
+ TransposeInt32Uint8 = transposeInt32Uint8
+ TransposeInt32Int16 = transposeInt32Int16
+ TransposeInt32Uint16 = transposeInt32Uint16
+ TransposeInt32Int32 = transposeInt32Int32
+ TransposeInt32Uint32 = transposeInt32Uint32
+ TransposeInt32Int64 = transposeInt32Int64
+ TransposeInt32Uint64 = transposeInt32Uint64
+
+ TransposeUint32Int8 = transposeUint32Int8
+ TransposeUint32Uint8 = transposeUint32Uint8
+ TransposeUint32Int16 = transposeUint32Int16
+ TransposeUint32Uint16 = transposeUint32Uint16
+ TransposeUint32Int32 = transposeUint32Int32
+ TransposeUint32Uint32 = transposeUint32Uint32
+ TransposeUint32Int64 = transposeUint32Int64
+ TransposeUint32Uint64 = transposeUint32Uint64
+
+ TransposeInt64Int8 = transposeInt64Int8
+ TransposeInt64Uint8 = transposeInt64Uint8
+ TransposeInt64Int16 = transposeInt64Int16
+ TransposeInt64Uint16 = transposeInt64Uint16
+ TransposeInt64Int32 = transposeInt64Int32
+ TransposeInt64Uint32 = transposeInt64Uint32
+ TransposeInt64Int64 = transposeInt64Int64
+ TransposeInt64Uint64 = transposeInt64Uint64
+
+ TransposeUint64Int8 = transposeUint64Int8
+ TransposeUint64Uint8 = transposeUint64Uint8
+ TransposeUint64Int16 = transposeUint64Int16
+ TransposeUint64Uint16 = transposeUint64Uint16
+ TransposeUint64Int32 = transposeUint64Int32
+ TransposeUint64Uint32 = transposeUint64Uint32
+ TransposeUint64Int64 = transposeUint64Int64
+ TransposeUint64Uint64 = transposeUint64Uint64
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go
new file mode 100644
index 000000000..f1421ddf5
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go
@@ -0,0 +1,473 @@
+// Code generated by transpose_ints_simd.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+ "unsafe"
+)
+
+//go:noescape
+func _transpose_int8_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int8avx2(src []int8, dest []int8, transposeMap []int32) {
+ _transpose_int8_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint8avx2(src []int8, dest []uint8, transposeMap []int32) {
+ _transpose_int8_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int16avx2(src []int8, dest []int16, transposeMap []int32) {
+ _transpose_int8_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint16avx2(src []int8, dest []uint16, transposeMap []int32) {
+ _transpose_int8_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int32avx2(src []int8, dest []int32, transposeMap []int32) {
+ _transpose_int8_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint32avx2(src []int8, dest []uint32, transposeMap []int32) {
+ _transpose_int8_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int64avx2(src []int8, dest []int64, transposeMap []int32) {
+ _transpose_int8_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint64avx2(src []int8, dest []uint64, transposeMap []int32) {
+ _transpose_int8_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int8avx2(src []uint8, dest []int8, transposeMap []int32) {
+ _transpose_uint8_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint8avx2(src []uint8, dest []uint8, transposeMap []int32) {
+ _transpose_uint8_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int16avx2(src []uint8, dest []int16, transposeMap []int32) {
+ _transpose_uint8_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint16avx2(src []uint8, dest []uint16, transposeMap []int32) {
+ _transpose_uint8_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int32avx2(src []uint8, dest []int32, transposeMap []int32) {
+ _transpose_uint8_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint32avx2(src []uint8, dest []uint32, transposeMap []int32) {
+ _transpose_uint8_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int64avx2(src []uint8, dest []int64, transposeMap []int32) {
+ _transpose_uint8_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint64avx2(src []uint8, dest []uint64, transposeMap []int32) {
+ _transpose_uint8_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int8avx2(src []int16, dest []int8, transposeMap []int32) {
+ _transpose_int16_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint8avx2(src []int16, dest []uint8, transposeMap []int32) {
+ _transpose_int16_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int16avx2(src []int16, dest []int16, transposeMap []int32) {
+ _transpose_int16_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint16avx2(src []int16, dest []uint16, transposeMap []int32) {
+ _transpose_int16_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int32avx2(src []int16, dest []int32, transposeMap []int32) {
+ _transpose_int16_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint32avx2(src []int16, dest []uint32, transposeMap []int32) {
+ _transpose_int16_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int64avx2(src []int16, dest []int64, transposeMap []int32) {
+ _transpose_int16_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint64avx2(src []int16, dest []uint64, transposeMap []int32) {
+ _transpose_int16_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int8avx2(src []uint16, dest []int8, transposeMap []int32) {
+ _transpose_uint16_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint8avx2(src []uint16, dest []uint8, transposeMap []int32) {
+ _transpose_uint16_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int16avx2(src []uint16, dest []int16, transposeMap []int32) {
+ _transpose_uint16_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint16avx2(src []uint16, dest []uint16, transposeMap []int32) {
+ _transpose_uint16_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int32avx2(src []uint16, dest []int32, transposeMap []int32) {
+ _transpose_uint16_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint32avx2(src []uint16, dest []uint32, transposeMap []int32) {
+ _transpose_uint16_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int64avx2(src []uint16, dest []int64, transposeMap []int32) {
+ _transpose_uint16_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint64avx2(src []uint16, dest []uint64, transposeMap []int32) {
+ _transpose_uint16_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int8avx2(src []int32, dest []int8, transposeMap []int32) {
+ _transpose_int32_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint8avx2(src []int32, dest []uint8, transposeMap []int32) {
+ _transpose_int32_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int16avx2(src []int32, dest []int16, transposeMap []int32) {
+ _transpose_int32_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint16avx2(src []int32, dest []uint16, transposeMap []int32) {
+ _transpose_int32_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int32avx2(src []int32, dest []int32, transposeMap []int32) {
+ _transpose_int32_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint32avx2(src []int32, dest []uint32, transposeMap []int32) {
+ _transpose_int32_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int64avx2(src []int32, dest []int64, transposeMap []int32) {
+ _transpose_int32_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint64avx2(src []int32, dest []uint64, transposeMap []int32) {
+ _transpose_int32_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int8avx2(src []uint32, dest []int8, transposeMap []int32) {
+ _transpose_uint32_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint8avx2(src []uint32, dest []uint8, transposeMap []int32) {
+ _transpose_uint32_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int16avx2(src []uint32, dest []int16, transposeMap []int32) {
+ _transpose_uint32_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint16avx2(src []uint32, dest []uint16, transposeMap []int32) {
+ _transpose_uint32_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int32avx2(src []uint32, dest []int32, transposeMap []int32) {
+ _transpose_uint32_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint32avx2(src []uint32, dest []uint32, transposeMap []int32) {
+ _transpose_uint32_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int64avx2(src []uint32, dest []int64, transposeMap []int32) {
+ _transpose_uint32_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint64avx2(src []uint32, dest []uint64, transposeMap []int32) {
+ _transpose_uint32_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int8avx2(src []int64, dest []int8, transposeMap []int32) {
+ _transpose_int64_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint8avx2(src []int64, dest []uint8, transposeMap []int32) {
+ _transpose_int64_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int16avx2(src []int64, dest []int16, transposeMap []int32) {
+ _transpose_int64_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint16avx2(src []int64, dest []uint16, transposeMap []int32) {
+ _transpose_int64_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int32avx2(src []int64, dest []int32, transposeMap []int32) {
+ _transpose_int64_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint32avx2(src []int64, dest []uint32, transposeMap []int32) {
+ _transpose_int64_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int64avx2(src []int64, dest []int64, transposeMap []int32) {
+ _transpose_int64_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint64avx2(src []int64, dest []uint64, transposeMap []int32) {
+ _transpose_int64_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int8avx2(src []uint64, dest []int8, transposeMap []int32) {
+ _transpose_uint64_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint8avx2(src []uint64, dest []uint8, transposeMap []int32) {
+ _transpose_uint64_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int16avx2(src []uint64, dest []int16, transposeMap []int32) {
+ _transpose_uint64_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint16avx2(src []uint64, dest []uint16, transposeMap []int32) {
+ _transpose_uint64_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int32avx2(src []uint64, dest []int32, transposeMap []int32) {
+ _transpose_uint64_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint32avx2(src []uint64, dest []uint32, transposeMap []int32) {
+ _transpose_uint64_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int64avx2(src []uint64, dest []int64, transposeMap []int32) {
+ _transpose_uint64_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint64avx2(src []uint64, dest []uint64, transposeMap []int32) {
+ _transpose_uint64_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s
new file mode 100644
index 000000000..fbcc101eb
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s
@@ -0,0 +1,3074 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+TEXT ·_transpose_uint8_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB0_1
+
+LBB0_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB0_5
+
+LBB0_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB0_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB0_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB0_3
+
+LBB0_4:
+ RET
+
+TEXT ·_transpose_int8_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB1_1
+
+LBB1_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB1_5
+
+LBB1_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB1_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB1_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB1_3
+
+LBB1_4:
+ RET
+
+TEXT ·_transpose_uint16_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB2_1
+
+LBB2_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB2_5
+
+LBB2_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB2_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB2_3:
+ LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB2_3
+
+LBB2_4:
+ RET
+
+TEXT ·_transpose_int16_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB3_1
+
+LBB3_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB3_5
+
+LBB3_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB3_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB3_3:
+ LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB3_3
+
+LBB3_4:
+ RET
+
+TEXT ·_transpose_uint32_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB4_1
+
+LBB4_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB4_5
+
+LBB4_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB4_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB4_3:
+ LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB4_3
+
+LBB4_4:
+ RET
+
+TEXT ·_transpose_int32_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB5_1
+
+LBB5_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB5_5
+
+LBB5_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB5_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB5_3:
+ LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB5_3
+
+LBB5_4:
+ RET
+
+TEXT ·_transpose_uint64_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB6_1
+
+LBB6_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB6_5
+
+LBB6_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB6_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB6_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB6_3
+
+LBB6_4:
+ RET
+
+TEXT ·_transpose_int64_uint8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB7_1
+
+LBB7_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB7_5
+
+LBB7_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB7_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB7_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB7_3
+
+LBB7_4:
+ RET
+
+TEXT ·_transpose_uint8_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB8_1
+
+LBB8_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB8_5
+
+LBB8_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB8_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB8_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB8_3
+
+LBB8_4:
+ RET
+
+TEXT ·_transpose_int8_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB9_1
+
+LBB9_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB9_5
+
+LBB9_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB9_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB9_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB9_3
+
+LBB9_4:
+ RET
+
+TEXT ·_transpose_uint16_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB10_1
+
+LBB10_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB10_5
+
+LBB10_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB10_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB10_3:
+ LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB10_3
+
+LBB10_4:
+ RET
+
+TEXT ·_transpose_int16_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB11_1
+
+LBB11_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB11_5
+
+LBB11_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB11_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB11_3:
+ LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB11_3
+
+LBB11_4:
+ RET
+
+TEXT ·_transpose_uint32_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB12_1
+
+LBB12_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB12_5
+
+LBB12_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB12_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB12_3:
+ LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB12_3
+
+LBB12_4:
+ RET
+
+TEXT ·_transpose_int32_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB13_1
+
+LBB13_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB13_5
+
+LBB13_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB13_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB13_3:
+ LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB13_3
+
+LBB13_4:
+ RET
+
+TEXT ·_transpose_uint64_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB14_1
+
+LBB14_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB14_5
+
+LBB14_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB14_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB14_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB14_3
+
+LBB14_4:
+ RET
+
+TEXT ·_transpose_int64_int8_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB15_1
+
+LBB15_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB15_5
+
+LBB15_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB15_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB15_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB15_3
+
+LBB15_4:
+ RET
+
+TEXT ·_transpose_uint8_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB16_1
+
+LBB16_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB16_5
+
+LBB16_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB16_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB16_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB16_3
+
+LBB16_4:
+ RET
+
+TEXT ·_transpose_int8_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB17_1
+
+LBB17_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB17_5
+
+LBB17_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB17_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB17_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB17_3
+
+LBB17_4:
+ RET
+
+TEXT ·_transpose_uint16_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB18_1
+
+LBB18_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB18_5
+
+LBB18_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB18_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB18_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB18_3
+
+LBB18_4:
+ RET
+
+TEXT ·_transpose_int16_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB19_1
+
+LBB19_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB19_5
+
+LBB19_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB19_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB19_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB19_3
+
+LBB19_4:
+ RET
+
+TEXT ·_transpose_uint32_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB20_1
+
+LBB20_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB20_5
+
+LBB20_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB20_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB20_3:
+ LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB20_3
+
+LBB20_4:
+ RET
+
+// _transpose_int32_uint16_avx2: for each of the length int32 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int32_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB21_1
+
+LBB21_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB21_5
+
+LBB21_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB21_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB21_3:
+ LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB21_3
+
+LBB21_4:
+ RET
+
+// _transpose_uint64_uint16_avx2: for each of the length uint64 values v in
+// src, load the 4-byte table entry transposeMap[v] and store its low 16 bits
+// to dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint64_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB22_1
+
+LBB22_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB22_5
+
+LBB22_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB22_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB22_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB22_3
+
+LBB22_4:
+ RET
+
+// _transpose_int64_uint16_avx2: for each of the length int64 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int64_uint16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB23_1
+
+LBB23_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB23_5
+
+LBB23_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB23_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB23_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB23_3
+
+LBB23_4:
+ RET
+
+// _transpose_uint8_int16_avx2: for each of the length uint8 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint8_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB24_1
+
+LBB24_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB24_5
+
+LBB24_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB24_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB24_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB24_3
+
+LBB24_4:
+ RET
+
+// _transpose_int8_int16_avx2: for each of the length int8 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store its
+// low 16 bits to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int8_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB25_1
+
+LBB25_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB25_5
+
+LBB25_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB25_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB25_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB25_3
+
+LBB25_4:
+ RET
+
+// _transpose_uint16_int16_avx2: for each of the length uint16 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint16_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB26_1
+
+LBB26_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB26_5
+
+LBB26_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB26_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB26_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB26_3
+
+LBB26_4:
+ RET
+
+// _transpose_int16_int16_avx2: for each of the length int16 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store its
+// low 16 bits to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int16_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB27_1
+
+LBB27_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB27_5
+
+LBB27_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB27_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB27_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB27_3
+
+LBB27_4:
+ RET
+
+// _transpose_uint32_int16_avx2: for each of the length uint32 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint32_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB28_1
+
+LBB28_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB28_5
+
+LBB28_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB28_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB28_3:
+ LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB28_3
+
+LBB28_4:
+ RET
+
+// _transpose_int32_int16_avx2: for each of the length int32 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store its
+// low 16 bits to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int32_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB29_1
+
+LBB29_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB29_5
+
+LBB29_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB29_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB29_3:
+ LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB29_3
+
+LBB29_4:
+ RET
+
+// _transpose_uint64_int16_avx2: for each of the length uint64 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint64_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB30_1
+
+LBB30_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB30_5
+
+LBB30_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB30_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB30_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB30_3
+
+LBB30_4:
+ RET
+
+// _transpose_int64_int16_avx2: for each of the length int64 values v in src,
+// load the 4-byte table entry transposeMap[v] and store its low 16 bits to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int64_int16_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB31_1
+
+LBB31_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB31_5
+
+LBB31_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB31_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB31_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB31_3
+
+LBB31_4:
+ RET
+
+// _transpose_uint8_uint32_avx2: for each of the length uint8 values v in src,
+// load the 4-byte table entry transposeMap[v] and store the full dword to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint8_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB32_1
+
+LBB32_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB32_5
+
+LBB32_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB32_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB32_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB32_3
+
+LBB32_4:
+ RET
+
+// _transpose_int8_uint32_avx2: for each of the length int8 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store the
+// full dword to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int8_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB33_1
+
+LBB33_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB33_5
+
+LBB33_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB33_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB33_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB33_3
+
+LBB33_4:
+ RET
+
+// _transpose_uint16_uint32_avx2: for each of the length uint16 values v in
+// src, load the 4-byte table entry transposeMap[v] and store the full dword
+// to dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint16_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB34_1
+
+LBB34_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB34_5
+
+LBB34_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB34_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB34_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB34_3
+
+LBB34_4:
+ RET
+
+// _transpose_int16_uint32_avx2: for each of the length int16 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store the
+// full dword to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int16_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB35_1
+
+LBB35_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB35_5
+
+LBB35_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB35_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB35_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB35_3
+
+LBB35_4:
+ RET
+
+// _transpose_uint32_uint32_avx2: for each of the length uint32 values v in
+// src, load the 4-byte table entry transposeMap[v] and store the full dword
+// to dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint32_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB36_1
+
+LBB36_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB36_5
+
+LBB36_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB36_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB36_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB36_3
+
+LBB36_4:
+ RET
+
+// _transpose_int32_uint32_avx2: for each of the length int32 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store the
+// full dword to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int32_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB37_1
+
+LBB37_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB37_5
+
+LBB37_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB37_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB37_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB37_3
+
+LBB37_4:
+ RET
+
+// _transpose_uint64_uint32_avx2: for each of the length uint64 values v in
+// src, load the 4-byte table entry transposeMap[v] and store the full dword
+// to dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint64_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB38_1
+
+LBB38_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB38_5
+
+LBB38_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB38_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB38_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB38_3
+
+LBB38_4:
+ RET
+
+// _transpose_int64_uint32_avx2: for each of the length int64 values v in src,
+// load the 4-byte table entry transposeMap[v] and store the full dword to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int64_uint32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB39_1
+
+LBB39_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB39_5
+
+LBB39_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB39_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB39_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB39_3
+
+LBB39_4:
+ RET
+
+// _transpose_uint8_int32_avx2: for each of the length uint8 values v in src,
+// load the 4-byte table entry transposeMap[v] and store the full dword to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint8_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB40_1
+
+LBB40_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB40_5
+
+LBB40_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB40_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB40_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB40_3
+
+LBB40_4:
+ RET
+
+// _transpose_int8_int32_avx2: for each of the length int8 values v in src
+// (sign-extended), load the 4-byte table entry transposeMap[v] and store the
+// full dword to dest (4x-unrolled main loop + scalar tail). Machine-generated
+// opcode listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_int8_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB41_1
+
+LBB41_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB41_5
+
+LBB41_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB41_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB41_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB41_3
+
+LBB41_4:
+ RET
+
+// _transpose_uint16_int32_avx2: for each of the length uint16 values v in src,
+// load the 4-byte table entry transposeMap[v] and store the full dword to
+// dest (4x-unrolled main loop + scalar tail). Machine-generated opcode
+// listing (LLVM-style LBB labels) — regenerate rather than hand-editing.
+TEXT ·_transpose_uint16_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB42_1
+
+LBB42_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB42_5
+
+LBB42_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB42_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB42_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB42_3
+
+LBB42_4:
+ RET
+
+TEXT ·_transpose_int16_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB43_1
+
+LBB43_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB43_5
+
+LBB43_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB43_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB43_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB43_3
+
+LBB43_4:
+ RET
+
+TEXT ·_transpose_uint32_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB44_1
+
+LBB44_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB44_5
+
+LBB44_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB44_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB44_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB44_3
+
+LBB44_4:
+ RET
+
+TEXT ·_transpose_int32_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB45_1
+
+LBB45_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB45_5
+
+LBB45_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB45_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB45_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB45_3
+
+LBB45_4:
+ RET
+
+TEXT ·_transpose_uint64_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB46_1
+
+LBB46_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB46_5
+
+LBB46_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB46_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB46_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB46_3
+
+LBB46_4:
+ RET
+
+TEXT ·_transpose_int64_int32_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB47_1
+
+LBB47_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB47_5
+
+LBB47_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB47_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB47_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB47_3
+
+LBB47_4:
+ RET
+
+TEXT ·_transpose_uint8_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB48_1
+
+LBB48_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB48_5
+
+LBB48_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB48_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB48_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB48_3
+
+LBB48_4:
+ RET
+
+TEXT ·_transpose_int8_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB49_1
+
+LBB49_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB49_5
+
+LBB49_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB49_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB49_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB49_3
+
+LBB49_4:
+ RET
+
+TEXT ·_transpose_uint16_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB50_1
+
+LBB50_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB50_5
+
+LBB50_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB50_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB50_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB50_3
+
+LBB50_4:
+ RET
+
+TEXT ·_transpose_int16_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB51_1
+
+LBB51_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB51_5
+
+LBB51_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB51_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB51_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB51_3
+
+LBB51_4:
+ RET
+
+TEXT ·_transpose_uint32_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB52_1
+
+LBB52_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB52_5
+
+LBB52_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB52_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB52_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB52_3
+
+LBB52_4:
+ RET
+
+TEXT ·_transpose_int32_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB53_1
+
+LBB53_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB53_5
+
+LBB53_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB53_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB53_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB53_3
+
+LBB53_4:
+ RET
+
+TEXT ·_transpose_uint64_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB54_1
+
+LBB54_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB54_5
+
+LBB54_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB54_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB54_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB54_3
+
+LBB54_4:
+ RET
+
+TEXT ·_transpose_int64_uint64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB55_1
+
+LBB55_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB55_5
+
+LBB55_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB55_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB55_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB55_3
+
+LBB55_4:
+ RET
+
+TEXT ·_transpose_uint8_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB56_1
+
+LBB56_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB56_5
+
+LBB56_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB56_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB56_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB56_3
+
+LBB56_4:
+ RET
+
+TEXT ·_transpose_int8_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB57_1
+
+LBB57_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB57_5
+
+LBB57_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB57_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB57_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB57_3
+
+LBB57_4:
+ RET
+
+TEXT ·_transpose_uint16_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB58_1
+
+LBB58_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB58_5
+
+LBB58_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB58_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB58_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB58_3
+
+LBB58_4:
+ RET
+
+TEXT ·_transpose_int16_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB59_1
+
+LBB59_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB59_5
+
+LBB59_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB59_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB59_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB59_3
+
+LBB59_4:
+ RET
+
+TEXT ·_transpose_uint32_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB60_1
+
+LBB60_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB60_5
+
+LBB60_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB60_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB60_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB60_3
+
+LBB60_4:
+ RET
+
+TEXT ·_transpose_int32_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB61_1
+
+LBB61_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB61_5
+
+LBB61_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB61_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB61_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB61_3
+
+LBB61_4:
+ RET
+
+TEXT ·_transpose_uint64_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB62_1
+
+LBB62_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB62_5
+
+LBB62_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB62_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB62_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB62_3
+
+LBB62_4:
+ RET
+
+TEXT ·_transpose_int64_int64_avx2(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB63_1
+
+LBB63_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB63_5
+
+LBB63_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB63_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB63_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB63_3
+
+LBB63_4:
+ RET
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go
new file mode 100644
index 000000000..c52598d71
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go
@@ -0,0 +1,227 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "errors"
+
+ "github.com/apache/arrow/go/v15/arrow"
+)
+
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata -d arch=avx2 transpose_ints_simd.go.tmpl=transpose_ints_avx2_amd64.go
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata -d arch=sse4 transpose_ints_simd.go.tmpl=transpose_ints_sse4_amd64.go
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_s390x.go.tmpl=transpose_ints_s390x.go
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_s390x.go.tmpl=transpose_ints_arm64.go
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_noasm.go.tmpl=transpose_ints_noasm.go
+//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints.go.tmpl=transpose_ints.go
+
+func bufToTyped(typ arrow.DataType, buf []byte, offset, length int) (interface{}, error) {
+ switch typ.ID() {
+ case arrow.INT8:
+ return arrow.Int8Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.INT16:
+ return arrow.Int16Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.INT32:
+ return arrow.Int32Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.INT64:
+ return arrow.Int64Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.UINT8:
+ return arrow.Uint8Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.UINT16:
+ return arrow.Uint16Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.UINT32:
+ return arrow.Uint32Traits.CastFromBytes(buf)[offset : offset+length], nil
+ case arrow.UINT64:
+ return arrow.Uint64Traits.CastFromBytes(buf)[offset : offset+length], nil
+ }
+ return nil, errors.New("only accepts integral types")
+}
+
+// TransposeIntsBuffers takes the data-types, byte buffers, and offsets of a source and destination
+// buffer to perform TransposeInts on with the provided mapping data.
+func TransposeIntsBuffers(inType, outType arrow.DataType, indata, outdata []byte, inOffset, outOffset int, length int, transposeMap []int32) error {
+ src, err := bufToTyped(inType, indata, inOffset, length)
+ if err != nil {
+ return err
+ }
+ dest, err := bufToTyped(outType, outdata, outOffset, length)
+ if err != nil {
+ return err
+ }
+
+ return TransposeInts(src, dest, transposeMap)
+}
+
+// TransposeInts expects two integral slices and the values they map to. Returning
+// an error if either src or dest are not an integral type.
+func TransposeInts(src, dest interface{}, mapping []int32) error {
+ switch s := src.(type) {
+ case []int8:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeInt8Int8(s, d, mapping)
+ case []int16:
+ TransposeInt8Int16(s, d, mapping)
+ case []int32:
+ TransposeInt8Int32(s, d, mapping)
+ case []int64:
+ TransposeInt8Int64(s, d, mapping)
+ case []uint8:
+ TransposeInt8Uint8(s, d, mapping)
+ case []uint16:
+ TransposeInt8Uint16(s, d, mapping)
+ case []uint32:
+ TransposeInt8Uint32(s, d, mapping)
+ case []uint64:
+ TransposeInt8Uint64(s, d, mapping)
+ }
+ case []int16:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeInt16Int8(s, d, mapping)
+ case []int16:
+ TransposeInt16Int16(s, d, mapping)
+ case []int32:
+ TransposeInt16Int32(s, d, mapping)
+ case []int64:
+ TransposeInt16Int64(s, d, mapping)
+ case []uint8:
+ TransposeInt16Uint8(s, d, mapping)
+ case []uint16:
+ TransposeInt16Uint16(s, d, mapping)
+ case []uint32:
+ TransposeInt16Uint32(s, d, mapping)
+ case []uint64:
+ TransposeInt16Uint64(s, d, mapping)
+ }
+ case []int32:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeInt32Int8(s, d, mapping)
+ case []int16:
+ TransposeInt32Int16(s, d, mapping)
+ case []int32:
+ TransposeInt32Int32(s, d, mapping)
+ case []int64:
+ TransposeInt32Int64(s, d, mapping)
+ case []uint8:
+ TransposeInt32Uint8(s, d, mapping)
+ case []uint16:
+ TransposeInt32Uint16(s, d, mapping)
+ case []uint32:
+ TransposeInt32Uint32(s, d, mapping)
+ case []uint64:
+ TransposeInt32Uint64(s, d, mapping)
+ }
+ case []int64:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeInt64Int8(s, d, mapping)
+ case []int16:
+ TransposeInt64Int16(s, d, mapping)
+ case []int32:
+ TransposeInt64Int32(s, d, mapping)
+ case []int64:
+ TransposeInt64Int64(s, d, mapping)
+ case []uint8:
+ TransposeInt64Uint8(s, d, mapping)
+ case []uint16:
+ TransposeInt64Uint16(s, d, mapping)
+ case []uint32:
+ TransposeInt64Uint32(s, d, mapping)
+ case []uint64:
+ TransposeInt64Uint64(s, d, mapping)
+ }
+ case []uint8:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeUint8Int8(s, d, mapping)
+ case []int16:
+ TransposeUint8Int16(s, d, mapping)
+ case []int32:
+ TransposeUint8Int32(s, d, mapping)
+ case []int64:
+ TransposeUint8Int64(s, d, mapping)
+ case []uint8:
+ TransposeUint8Uint8(s, d, mapping)
+ case []uint16:
+ TransposeUint8Uint16(s, d, mapping)
+ case []uint32:
+ TransposeUint8Uint32(s, d, mapping)
+ case []uint64:
+ TransposeUint8Uint64(s, d, mapping)
+ }
+ case []uint16:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeUint16Int8(s, d, mapping)
+ case []int16:
+ TransposeUint16Int16(s, d, mapping)
+ case []int32:
+ TransposeUint16Int32(s, d, mapping)
+ case []int64:
+ TransposeUint16Int64(s, d, mapping)
+ case []uint8:
+ TransposeUint16Uint8(s, d, mapping)
+ case []uint16:
+ TransposeUint16Uint16(s, d, mapping)
+ case []uint32:
+ TransposeUint16Uint32(s, d, mapping)
+ case []uint64:
+ TransposeUint16Uint64(s, d, mapping)
+ }
+ case []uint32:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeUint32Int8(s, d, mapping)
+ case []int16:
+ TransposeUint32Int16(s, d, mapping)
+ case []int32:
+ TransposeUint32Int32(s, d, mapping)
+ case []int64:
+ TransposeUint32Int64(s, d, mapping)
+ case []uint8:
+ TransposeUint32Uint8(s, d, mapping)
+ case []uint16:
+ TransposeUint32Uint16(s, d, mapping)
+ case []uint32:
+ TransposeUint32Uint32(s, d, mapping)
+ case []uint64:
+ TransposeUint32Uint64(s, d, mapping)
+ }
+ case []uint64:
+ switch d := dest.(type) {
+ case []int8:
+ TransposeUint64Int8(s, d, mapping)
+ case []int16:
+ TransposeUint64Int16(s, d, mapping)
+ case []int32:
+ TransposeUint64Int32(s, d, mapping)
+ case []int64:
+ TransposeUint64Int64(s, d, mapping)
+ case []uint8:
+ TransposeUint64Uint8(s, d, mapping)
+ case []uint16:
+ TransposeUint64Uint16(s, d, mapping)
+ case []uint32:
+ TransposeUint64Uint32(s, d, mapping)
+ case []uint64:
+ TransposeUint64Uint64(s, d, mapping)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go
new file mode 100644
index 000000000..461aaf31f
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go
@@ -0,0 +1,96 @@
+// Code generated by transpose_ints_noasm.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build noasm || (!amd64 && !arm64 && !s390x && !ppc64le)
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+ TransposeInt8Int8 = transposeInt8Int8
+ TransposeInt8Uint8 = transposeInt8Uint8
+ TransposeInt8Int16 = transposeInt8Int16
+ TransposeInt8Uint16 = transposeInt8Uint16
+ TransposeInt8Int32 = transposeInt8Int32
+ TransposeInt8Uint32 = transposeInt8Uint32
+ TransposeInt8Int64 = transposeInt8Int64
+ TransposeInt8Uint64 = transposeInt8Uint64
+
+ TransposeUint8Int8 = transposeUint8Int8
+ TransposeUint8Uint8 = transposeUint8Uint8
+ TransposeUint8Int16 = transposeUint8Int16
+ TransposeUint8Uint16 = transposeUint8Uint16
+ TransposeUint8Int32 = transposeUint8Int32
+ TransposeUint8Uint32 = transposeUint8Uint32
+ TransposeUint8Int64 = transposeUint8Int64
+ TransposeUint8Uint64 = transposeUint8Uint64
+
+ TransposeInt16Int8 = transposeInt16Int8
+ TransposeInt16Uint8 = transposeInt16Uint8
+ TransposeInt16Int16 = transposeInt16Int16
+ TransposeInt16Uint16 = transposeInt16Uint16
+ TransposeInt16Int32 = transposeInt16Int32
+ TransposeInt16Uint32 = transposeInt16Uint32
+ TransposeInt16Int64 = transposeInt16Int64
+ TransposeInt16Uint64 = transposeInt16Uint64
+
+ TransposeUint16Int8 = transposeUint16Int8
+ TransposeUint16Uint8 = transposeUint16Uint8
+ TransposeUint16Int16 = transposeUint16Int16
+ TransposeUint16Uint16 = transposeUint16Uint16
+ TransposeUint16Int32 = transposeUint16Int32
+ TransposeUint16Uint32 = transposeUint16Uint32
+ TransposeUint16Int64 = transposeUint16Int64
+ TransposeUint16Uint64 = transposeUint16Uint64
+
+ TransposeInt32Int8 = transposeInt32Int8
+ TransposeInt32Uint8 = transposeInt32Uint8
+ TransposeInt32Int16 = transposeInt32Int16
+ TransposeInt32Uint16 = transposeInt32Uint16
+ TransposeInt32Int32 = transposeInt32Int32
+ TransposeInt32Uint32 = transposeInt32Uint32
+ TransposeInt32Int64 = transposeInt32Int64
+ TransposeInt32Uint64 = transposeInt32Uint64
+
+ TransposeUint32Int8 = transposeUint32Int8
+ TransposeUint32Uint8 = transposeUint32Uint8
+ TransposeUint32Int16 = transposeUint32Int16
+ TransposeUint32Uint16 = transposeUint32Uint16
+ TransposeUint32Int32 = transposeUint32Int32
+ TransposeUint32Uint32 = transposeUint32Uint32
+ TransposeUint32Int64 = transposeUint32Int64
+ TransposeUint32Uint64 = transposeUint32Uint64
+
+ TransposeInt64Int8 = transposeInt64Int8
+ TransposeInt64Uint8 = transposeInt64Uint8
+ TransposeInt64Int16 = transposeInt64Int16
+ TransposeInt64Uint16 = transposeInt64Uint16
+ TransposeInt64Int32 = transposeInt64Int32
+ TransposeInt64Uint32 = transposeInt64Uint32
+ TransposeInt64Int64 = transposeInt64Int64
+ TransposeInt64Uint64 = transposeInt64Uint64
+
+ TransposeUint64Int8 = transposeUint64Int8
+ TransposeUint64Uint8 = transposeUint64Uint8
+ TransposeUint64Int16 = transposeUint64Int16
+ TransposeUint64Uint16 = transposeUint64Uint16
+ TransposeUint64Int32 = transposeUint64Int32
+ TransposeUint64Uint32 = transposeUint64Uint32
+ TransposeUint64Int64 = transposeUint64Int64
+ TransposeUint64Uint64 = transposeUint64Uint64
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl
new file mode 100644
index 000000000..faffdce35
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build noasm
+// +build noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} = transpose{{$srcName}}{{$destName}}
+{{end}}
+{{end}}
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go
new file mode 100644
index 000000000..cc957cdaa
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go
@@ -0,0 +1,96 @@
+// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+ TransposeInt8Int8 = transposeInt8Int8
+ TransposeInt8Uint8 = transposeInt8Uint8
+ TransposeInt8Int16 = transposeInt8Int16
+ TransposeInt8Uint16 = transposeInt8Uint16
+ TransposeInt8Int32 = transposeInt8Int32
+ TransposeInt8Uint32 = transposeInt8Uint32
+ TransposeInt8Int64 = transposeInt8Int64
+ TransposeInt8Uint64 = transposeInt8Uint64
+
+ TransposeUint8Int8 = transposeUint8Int8
+ TransposeUint8Uint8 = transposeUint8Uint8
+ TransposeUint8Int16 = transposeUint8Int16
+ TransposeUint8Uint16 = transposeUint8Uint16
+ TransposeUint8Int32 = transposeUint8Int32
+ TransposeUint8Uint32 = transposeUint8Uint32
+ TransposeUint8Int64 = transposeUint8Int64
+ TransposeUint8Uint64 = transposeUint8Uint64
+
+ TransposeInt16Int8 = transposeInt16Int8
+ TransposeInt16Uint8 = transposeInt16Uint8
+ TransposeInt16Int16 = transposeInt16Int16
+ TransposeInt16Uint16 = transposeInt16Uint16
+ TransposeInt16Int32 = transposeInt16Int32
+ TransposeInt16Uint32 = transposeInt16Uint32
+ TransposeInt16Int64 = transposeInt16Int64
+ TransposeInt16Uint64 = transposeInt16Uint64
+
+ TransposeUint16Int8 = transposeUint16Int8
+ TransposeUint16Uint8 = transposeUint16Uint8
+ TransposeUint16Int16 = transposeUint16Int16
+ TransposeUint16Uint16 = transposeUint16Uint16
+ TransposeUint16Int32 = transposeUint16Int32
+ TransposeUint16Uint32 = transposeUint16Uint32
+ TransposeUint16Int64 = transposeUint16Int64
+ TransposeUint16Uint64 = transposeUint16Uint64
+
+ TransposeInt32Int8 = transposeInt32Int8
+ TransposeInt32Uint8 = transposeInt32Uint8
+ TransposeInt32Int16 = transposeInt32Int16
+ TransposeInt32Uint16 = transposeInt32Uint16
+ TransposeInt32Int32 = transposeInt32Int32
+ TransposeInt32Uint32 = transposeInt32Uint32
+ TransposeInt32Int64 = transposeInt32Int64
+ TransposeInt32Uint64 = transposeInt32Uint64
+
+ TransposeUint32Int8 = transposeUint32Int8
+ TransposeUint32Uint8 = transposeUint32Uint8
+ TransposeUint32Int16 = transposeUint32Int16
+ TransposeUint32Uint16 = transposeUint32Uint16
+ TransposeUint32Int32 = transposeUint32Int32
+ TransposeUint32Uint32 = transposeUint32Uint32
+ TransposeUint32Int64 = transposeUint32Int64
+ TransposeUint32Uint64 = transposeUint32Uint64
+
+ TransposeInt64Int8 = transposeInt64Int8
+ TransposeInt64Uint8 = transposeInt64Uint8
+ TransposeInt64Int16 = transposeInt64Int16
+ TransposeInt64Uint16 = transposeInt64Uint16
+ TransposeInt64Int32 = transposeInt64Int32
+ TransposeInt64Uint32 = transposeInt64Uint32
+ TransposeInt64Int64 = transposeInt64Int64
+ TransposeInt64Uint64 = transposeInt64Uint64
+
+ TransposeUint64Int8 = transposeUint64Int8
+ TransposeUint64Uint8 = transposeUint64Uint8
+ TransposeUint64Int16 = transposeUint64Int16
+ TransposeUint64Uint16 = transposeUint64Uint16
+ TransposeUint64Int32 = transposeUint64Int32
+ TransposeUint64Uint32 = transposeUint64Uint32
+ TransposeUint64Int64 = transposeUint64Int64
+ TransposeUint64Uint64 = transposeUint64Uint64
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go
new file mode 100644
index 000000000..cc957cdaa
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go
@@ -0,0 +1,96 @@
+// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+ TransposeInt8Int8 = transposeInt8Int8
+ TransposeInt8Uint8 = transposeInt8Uint8
+ TransposeInt8Int16 = transposeInt8Int16
+ TransposeInt8Uint16 = transposeInt8Uint16
+ TransposeInt8Int32 = transposeInt8Int32
+ TransposeInt8Uint32 = transposeInt8Uint32
+ TransposeInt8Int64 = transposeInt8Int64
+ TransposeInt8Uint64 = transposeInt8Uint64
+
+ TransposeUint8Int8 = transposeUint8Int8
+ TransposeUint8Uint8 = transposeUint8Uint8
+ TransposeUint8Int16 = transposeUint8Int16
+ TransposeUint8Uint16 = transposeUint8Uint16
+ TransposeUint8Int32 = transposeUint8Int32
+ TransposeUint8Uint32 = transposeUint8Uint32
+ TransposeUint8Int64 = transposeUint8Int64
+ TransposeUint8Uint64 = transposeUint8Uint64
+
+ TransposeInt16Int8 = transposeInt16Int8
+ TransposeInt16Uint8 = transposeInt16Uint8
+ TransposeInt16Int16 = transposeInt16Int16
+ TransposeInt16Uint16 = transposeInt16Uint16
+ TransposeInt16Int32 = transposeInt16Int32
+ TransposeInt16Uint32 = transposeInt16Uint32
+ TransposeInt16Int64 = transposeInt16Int64
+ TransposeInt16Uint64 = transposeInt16Uint64
+
+ TransposeUint16Int8 = transposeUint16Int8
+ TransposeUint16Uint8 = transposeUint16Uint8
+ TransposeUint16Int16 = transposeUint16Int16
+ TransposeUint16Uint16 = transposeUint16Uint16
+ TransposeUint16Int32 = transposeUint16Int32
+ TransposeUint16Uint32 = transposeUint16Uint32
+ TransposeUint16Int64 = transposeUint16Int64
+ TransposeUint16Uint64 = transposeUint16Uint64
+
+ TransposeInt32Int8 = transposeInt32Int8
+ TransposeInt32Uint8 = transposeInt32Uint8
+ TransposeInt32Int16 = transposeInt32Int16
+ TransposeInt32Uint16 = transposeInt32Uint16
+ TransposeInt32Int32 = transposeInt32Int32
+ TransposeInt32Uint32 = transposeInt32Uint32
+ TransposeInt32Int64 = transposeInt32Int64
+ TransposeInt32Uint64 = transposeInt32Uint64
+
+ TransposeUint32Int8 = transposeUint32Int8
+ TransposeUint32Uint8 = transposeUint32Uint8
+ TransposeUint32Int16 = transposeUint32Int16
+ TransposeUint32Uint16 = transposeUint32Uint16
+ TransposeUint32Int32 = transposeUint32Int32
+ TransposeUint32Uint32 = transposeUint32Uint32
+ TransposeUint32Int64 = transposeUint32Int64
+ TransposeUint32Uint64 = transposeUint32Uint64
+
+ TransposeInt64Int8 = transposeInt64Int8
+ TransposeInt64Uint8 = transposeInt64Uint8
+ TransposeInt64Int16 = transposeInt64Int16
+ TransposeInt64Uint16 = transposeInt64Uint16
+ TransposeInt64Int32 = transposeInt64Int32
+ TransposeInt64Uint32 = transposeInt64Uint32
+ TransposeInt64Int64 = transposeInt64Int64
+ TransposeInt64Uint64 = transposeInt64Uint64
+
+ TransposeUint64Int8 = transposeUint64Int8
+ TransposeUint64Uint8 = transposeUint64Uint8
+ TransposeUint64Int16 = transposeUint64Int16
+ TransposeUint64Uint16 = transposeUint64Uint16
+ TransposeUint64Int32 = transposeUint64Int32
+ TransposeUint64Uint32 = transposeUint64Uint32
+ TransposeUint64Int64 = transposeUint64Int64
+ TransposeUint64Uint64 = transposeUint64Uint64
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl
new file mode 100644
index 000000000..d93c8779c
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+// +build !noasm
+
+package utils
+
+// if building with the 'noasm' tag, then point to the pure go implementations
+var (
+{{ $typelist := .In }}
+{{range .In}}
+{{ $src := .Type -}}
+{{ $srcName := .Name -}}
+{{ range $typelist -}}
+{{ $dest := .Type -}}
+{{ $destName := .Name -}}
+ Transpose{{$srcName}}{{$destName}} = transpose{{$srcName}}{{$destName}}
+{{end}}
+{{end}}
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl
new file mode 100644
index 000000000..034d0e9d2
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+// +build !noasm
+
+package utils
+
+import (
+ "unsafe"
+)
+
+{{ $arch := .D.arch}}
+{{ $typelist := .In}}
+{{range .In}}
+{{ $src := .Type }}
+{{ $srcName := .Name }}
+{{ range $typelist}}
+{{ $dest := .Type }}
+{{ $destName := .Name }}
+
+//go:noescape
+func _transpose_{{printf "%s_%s_%s" $src $dest $arch}}(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transpose{{ $srcName }}{{ $destName }}{{ $arch }}(src []{{$src}}, dest []{{$dest}}, transposeMap []int32) {
+ _transpose_{{printf "%s_%s_%s" $src $dest $arch}}(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+{{ end }}
+{{ end }}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go
new file mode 100644
index 000000000..241ca74a7
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go
@@ -0,0 +1,473 @@
+// Code generated by transpose_ints_simd.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+ "unsafe"
+)
+
+//go:noescape
+func _transpose_int8_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int8sse4(src []int8, dest []int8, transposeMap []int32) {
+ _transpose_int8_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint8sse4(src []int8, dest []uint8, transposeMap []int32) {
+ _transpose_int8_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int16sse4(src []int8, dest []int16, transposeMap []int32) {
+ _transpose_int8_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint16sse4(src []int8, dest []uint16, transposeMap []int32) {
+ _transpose_int8_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int32sse4(src []int8, dest []int32, transposeMap []int32) {
+ _transpose_int8_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint32sse4(src []int8, dest []uint32, transposeMap []int32) {
+ _transpose_int8_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Int64sse4(src []int8, dest []int64, transposeMap []int32) {
+ _transpose_int8_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int8_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt8Uint64sse4(src []int8, dest []uint64, transposeMap []int32) {
+ _transpose_int8_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int8sse4(src []uint8, dest []int8, transposeMap []int32) {
+ _transpose_uint8_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint8sse4(src []uint8, dest []uint8, transposeMap []int32) {
+ _transpose_uint8_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int16sse4(src []uint8, dest []int16, transposeMap []int32) {
+ _transpose_uint8_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint16sse4(src []uint8, dest []uint16, transposeMap []int32) {
+ _transpose_uint8_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int32sse4(src []uint8, dest []int32, transposeMap []int32) {
+ _transpose_uint8_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint32sse4(src []uint8, dest []uint32, transposeMap []int32) {
+ _transpose_uint8_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Int64sse4(src []uint8, dest []int64, transposeMap []int32) {
+ _transpose_uint8_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint8_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint8Uint64sse4(src []uint8, dest []uint64, transposeMap []int32) {
+ _transpose_uint8_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int8sse4(src []int16, dest []int8, transposeMap []int32) {
+ _transpose_int16_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint8sse4(src []int16, dest []uint8, transposeMap []int32) {
+ _transpose_int16_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int16sse4(src []int16, dest []int16, transposeMap []int32) {
+ _transpose_int16_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint16sse4(src []int16, dest []uint16, transposeMap []int32) {
+ _transpose_int16_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int32sse4(src []int16, dest []int32, transposeMap []int32) {
+ _transpose_int16_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint32sse4(src []int16, dest []uint32, transposeMap []int32) {
+ _transpose_int16_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Int64sse4(src []int16, dest []int64, transposeMap []int32) {
+ _transpose_int16_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int16_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt16Uint64sse4(src []int16, dest []uint64, transposeMap []int32) {
+ _transpose_int16_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int8sse4(src []uint16, dest []int8, transposeMap []int32) {
+ _transpose_uint16_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint8sse4(src []uint16, dest []uint8, transposeMap []int32) {
+ _transpose_uint16_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int16sse4(src []uint16, dest []int16, transposeMap []int32) {
+ _transpose_uint16_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint16sse4(src []uint16, dest []uint16, transposeMap []int32) {
+ _transpose_uint16_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int32sse4(src []uint16, dest []int32, transposeMap []int32) {
+ _transpose_uint16_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint32sse4(src []uint16, dest []uint32, transposeMap []int32) {
+ _transpose_uint16_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Int64sse4(src []uint16, dest []int64, transposeMap []int32) {
+ _transpose_uint16_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint16_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint16Uint64sse4(src []uint16, dest []uint64, transposeMap []int32) {
+ _transpose_uint16_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int8sse4(src []int32, dest []int8, transposeMap []int32) {
+ _transpose_int32_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint8sse4(src []int32, dest []uint8, transposeMap []int32) {
+ _transpose_int32_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int16sse4(src []int32, dest []int16, transposeMap []int32) {
+ _transpose_int32_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint16sse4(src []int32, dest []uint16, transposeMap []int32) {
+ _transpose_int32_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int32sse4(src []int32, dest []int32, transposeMap []int32) {
+ _transpose_int32_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint32sse4(src []int32, dest []uint32, transposeMap []int32) {
+ _transpose_int32_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Int64sse4(src []int32, dest []int64, transposeMap []int32) {
+ _transpose_int32_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int32_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt32Uint64sse4(src []int32, dest []uint64, transposeMap []int32) {
+ _transpose_int32_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int8sse4(src []uint32, dest []int8, transposeMap []int32) {
+ _transpose_uint32_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint8sse4(src []uint32, dest []uint8, transposeMap []int32) {
+ _transpose_uint32_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int16sse4(src []uint32, dest []int16, transposeMap []int32) {
+ _transpose_uint32_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint16sse4(src []uint32, dest []uint16, transposeMap []int32) {
+ _transpose_uint32_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int32sse4(src []uint32, dest []int32, transposeMap []int32) {
+ _transpose_uint32_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint32sse4(src []uint32, dest []uint32, transposeMap []int32) {
+ _transpose_uint32_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Int64sse4(src []uint32, dest []int64, transposeMap []int32) {
+ _transpose_uint32_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint32_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint32Uint64sse4(src []uint32, dest []uint64, transposeMap []int32) {
+ _transpose_uint32_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int8sse4(src []int64, dest []int8, transposeMap []int32) {
+ _transpose_int64_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint8sse4(src []int64, dest []uint8, transposeMap []int32) {
+ _transpose_int64_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int16sse4(src []int64, dest []int16, transposeMap []int32) {
+ _transpose_int64_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint16sse4(src []int64, dest []uint16, transposeMap []int32) {
+ _transpose_int64_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int32sse4(src []int64, dest []int32, transposeMap []int32) {
+ _transpose_int64_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint32sse4(src []int64, dest []uint32, transposeMap []int32) {
+ _transpose_int64_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Int64sse4(src []int64, dest []int64, transposeMap []int32) {
+ _transpose_int64_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_int64_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeInt64Uint64sse4(src []int64, dest []uint64, transposeMap []int32) {
+ _transpose_int64_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int8sse4(src []uint64, dest []int8, transposeMap []int32) {
+ _transpose_uint64_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint8sse4(src []uint64, dest []uint8, transposeMap []int32) {
+ _transpose_uint64_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int16sse4(src []uint64, dest []int16, transposeMap []int32) {
+ _transpose_uint64_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint16sse4(src []uint64, dest []uint16, transposeMap []int32) {
+ _transpose_uint64_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int32sse4(src []uint64, dest []int32, transposeMap []int32) {
+ _transpose_uint64_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint32sse4(src []uint64, dest []uint32, transposeMap []int32) {
+ _transpose_uint64_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Int64sse4(src []uint64, dest []int64, transposeMap []int32) {
+ _transpose_uint64_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
+
+//go:noescape
+func _transpose_uint64_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer)
+
+func transposeUint64Uint64sse4(src []uint64, dest []uint64, transposeMap []int32) {
+ _transpose_uint64_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0]))
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s
new file mode 100644
index 000000000..ee5199a5a
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s
@@ -0,0 +1,3074 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB0_1
+
+LBB0_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB0_5
+
+LBB0_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB0_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB0_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB0_3
+
+LBB0_4:
+ RET
+
+TEXT ·_transpose_int8_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB1_1
+
+LBB1_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB1_5
+
+LBB1_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB1_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB1_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB1_3
+
+LBB1_4:
+ RET
+
+TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB2_1
+
+LBB2_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB2_5
+
+LBB2_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB2_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB2_3:
+ LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB2_3
+
+LBB2_4:
+ RET
+
+TEXT ·_transpose_int16_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB3_1
+
+LBB3_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB3_5
+
+LBB3_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB3_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB3_3:
+ LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB3_3
+
+LBB3_4:
+ RET
+
+TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB4_1
+
+LBB4_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB4_5
+
+LBB4_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB4_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB4_3:
+ LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB4_3
+
+LBB4_4:
+ RET
+
+TEXT ·_transpose_int32_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB5_1
+
+LBB5_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB5_5
+
+LBB5_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB5_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB5_3:
+ LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB5_3
+
+LBB5_4:
+ RET
+
+TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB6_1
+
+LBB6_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB6_5
+
+LBB6_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB6_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB6_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB6_3
+
+LBB6_4:
+ RET
+
+TEXT ·_transpose_int64_uint8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB7_1
+
+LBB7_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB7_5
+
+LBB7_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB7_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB7_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB7_3
+
+LBB7_4:
+ RET
+
+TEXT ·_transpose_uint8_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB8_1
+
+LBB8_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB8_5
+
+LBB8_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB8_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB8_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB8_3
+
+LBB8_4:
+ RET
+
+TEXT ·_transpose_int8_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB9_1
+
+LBB9_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB9_5
+
+LBB9_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB9_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB9_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB9_3
+
+LBB9_4:
+ RET
+
+TEXT ·_transpose_uint16_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB10_1
+
+LBB10_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB10_5
+
+LBB10_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB10_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB10_3:
+ LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB10_3
+
+LBB10_4:
+ RET
+
+TEXT ·_transpose_int16_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB11_1
+
+LBB11_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB11_5
+
+LBB11_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB11_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB11_3:
+ LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB11_3
+
+LBB11_4:
+ RET
+
+TEXT ·_transpose_uint32_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB12_1
+
+LBB12_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB12_5
+
+LBB12_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB12_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB12_3:
+ LONG $0x87048b42 // mov eax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB12_3
+
+LBB12_4:
+ RET
+
+TEXT ·_transpose_int32_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB13_1
+
+LBB13_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB13_5
+
+LBB13_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB13_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB13_3:
+ LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB13_3
+
+LBB13_4:
+ RET
+
+TEXT ·_transpose_uint64_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB14_1
+
+LBB14_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB14_5
+
+LBB14_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB14_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB14_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB14_3
+
+LBB14_4:
+ RET
+
+TEXT ·_transpose_int64_int8_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB15_1
+
+LBB15_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x1688 // mov byte [rsi], dl
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx]
+ WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB15_5
+
+LBB15_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB15_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB15_3:
+ LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
+ LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax]
+ LONG $0x06048842 // mov byte [rsi + r8], al
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB15_3
+
+LBB15_4:
+ RET
+
+TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB16_1
+
+LBB16_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB16_5
+
+LBB16_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB16_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB16_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB16_3
+
+LBB16_4:
+ RET
+
+TEXT ·_transpose_int8_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB17_1
+
+LBB17_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB17_5
+
+LBB17_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB17_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB17_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB17_3
+
+LBB17_4:
+ RET
+
+TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB18_1
+
+LBB18_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB18_5
+
+LBB18_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB18_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB18_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB18_3
+
+LBB18_4:
+ RET
+
+TEXT ·_transpose_int16_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB19_1
+
+LBB19_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB19_5
+
+LBB19_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB19_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB19_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB19_3
+
+LBB19_4:
+ RET
+
+TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB20_1
+
+LBB20_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB20_5
+
+LBB20_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB20_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB20_3:
+ LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB20_3
+
+LBB20_4:
+ RET
+
+TEXT ·_transpose_int32_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB21_1
+
+LBB21_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB21_5
+
+LBB21_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB21_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB21_3:
+ LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB21_3
+
+LBB21_4:
+ RET
+
+TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB22_1
+
+LBB22_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB22_5
+
+LBB22_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB22_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB22_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB22_3
+
+LBB22_4:
+ RET
+
+TEXT ·_transpose_int64_uint16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB23_1
+
+LBB23_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB23_5
+
+LBB23_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB23_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB23_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB23_3
+
+LBB23_4:
+ RET
+
+TEXT ·_transpose_uint8_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB24_1
+
+LBB24_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB24_5
+
+LBB24_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB24_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB24_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB24_3
+
+LBB24_4:
+ RET
+
+TEXT ·_transpose_int8_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB25_1
+
+LBB25_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB25_5
+
+LBB25_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB25_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB25_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB25_3
+
+LBB25_4:
+ RET
+
+TEXT ·_transpose_uint16_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB26_1
+
+LBB26_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB26_5
+
+LBB26_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB26_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB26_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB26_3
+
+LBB26_4:
+ RET
+
+TEXT ·_transpose_int16_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB27_1
+
+LBB27_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB27_5
+
+LBB27_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB27_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB27_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB27_3
+
+LBB27_4:
+ RET
+
+TEXT ·_transpose_uint32_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB28_1
+
+LBB28_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB28_5
+
+LBB28_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB28_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB28_3:
+ LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB28_3
+
+LBB28_4:
+ RET
+
+TEXT ·_transpose_int32_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB29_1
+
+LBB29_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB29_5
+
+LBB29_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB29_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB29_3:
+ LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB29_3
+
+LBB29_4:
+ RET
+
+TEXT ·_transpose_uint64_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB30_1
+
+LBB30_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB30_5
+
+LBB30_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB30_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB30_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB30_3
+
+LBB30_4:
+ RET
+
+TEXT ·_transpose_int64_int16_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB31_1
+
+LBB31_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x02568966 // mov word [rsi + 2], dx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x04568966 // mov word [rsi + 4], dx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
+ LONG $0x06568966 // mov word [rsi + 6], dx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x08c68348 // add rsi, 8
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB31_5
+
+LBB31_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB31_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB31_3:
+ LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
+ LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
+ LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB31_3
+
+LBB31_4:
+ RET
+
+TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB32_1
+
+LBB32_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB32_5
+
+LBB32_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB32_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB32_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB32_3
+
+LBB32_4:
+ RET
+
+TEXT ·_transpose_int8_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB33_1
+
+LBB33_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB33_5
+
+LBB33_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB33_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB33_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB33_3
+
+LBB33_4:
+ RET
+
+TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB34_1
+
+LBB34_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB34_5
+
+LBB34_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB34_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB34_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB34_3
+
+LBB34_4:
+ RET
+
+TEXT ·_transpose_int16_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB35_1
+
+LBB35_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB35_5
+
+LBB35_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB35_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB35_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB35_3
+
+LBB35_4:
+ RET
+
+TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB36_1
+
+LBB36_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB36_5
+
+LBB36_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB36_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB36_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB36_3
+
+LBB36_4:
+ RET
+
+TEXT ·_transpose_int32_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB37_1
+
+LBB37_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB37_5
+
+LBB37_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB37_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB37_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB37_3
+
+LBB37_4:
+ RET
+
+TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB38_1
+
+LBB38_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB38_5
+
+LBB38_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB38_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB38_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB38_3
+
+LBB38_4:
+ RET
+
+TEXT ·_transpose_int64_uint32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB39_1
+
+LBB39_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB39_5
+
+LBB39_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB39_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB39_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB39_3
+
+LBB39_4:
+ RET
+
+TEXT ·_transpose_uint8_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB40_1
+
+LBB40_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB40_5
+
+LBB40_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB40_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB40_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB40_3
+
+LBB40_4:
+ RET
+
+TEXT ·_transpose_int8_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB41_1
+
+LBB41_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB41_5
+
+LBB41_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB41_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB41_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x86048942 // mov dword [rsi + 4*r8], eax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB41_3
+
+LBB41_4:
+ RET
+
+TEXT ·_transpose_uint16_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB42_1
+
+LBB42_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB42_5
+
+LBB42_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB42_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB42_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB42_3
+
+LBB42_4:
+ RET
+
+TEXT ·_transpose_int16_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB43_1
+
+LBB43_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB43_5
+
+LBB43_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB43_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB43_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x46048942 // mov dword [rsi + 2*r8], eax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB43_3
+
+LBB43_4:
+ RET
+
+TEXT ·_transpose_uint32_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB44_1
+
+LBB44_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB44_5
+
+LBB44_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB44_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB44_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB44_3
+
+LBB44_4:
+ RET
+
+TEXT ·_transpose_int32_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB45_1
+
+LBB45_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB45_5
+
+LBB45_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB45_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB45_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB45_3
+
+LBB45_4:
+ RET
+
+TEXT ·_transpose_uint64_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB46_1
+
+LBB46_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB46_5
+
+LBB46_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB46_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB46_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB46_3
+
+LBB46_4:
+ RET
+
+TEXT ·_transpose_int64_int32_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB47_1
+
+LBB47_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x1689 // mov dword [rsi], edx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
+ WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x10c68348 // add rsi, 16
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB47_5
+
+LBB47_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB47_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB47_3:
+ LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
+ WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
+ LONG $0x06048942 // mov dword [rsi + r8], eax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB47_3
+
+LBB47_4:
+ RET
+
+TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB48_1
+
+LBB48_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB48_5
+
+LBB48_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB48_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB48_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB48_3
+
+LBB48_4:
+ RET
+
+TEXT ·_transpose_int8_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB49_1
+
+LBB49_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB49_5
+
+LBB49_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB49_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB49_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB49_3
+
+LBB49_4:
+ RET
+
+TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB50_1
+
+LBB50_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB50_5
+
+LBB50_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB50_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB50_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB50_3
+
+LBB50_4:
+ RET
+
+TEXT ·_transpose_int16_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB51_1
+
+LBB51_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB51_5
+
+LBB51_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB51_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB51_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB51_3
+
+LBB51_4:
+ RET
+
+TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB52_1
+
+LBB52_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB52_5
+
+LBB52_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB52_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB52_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB52_3
+
+LBB52_4:
+ RET
+
+TEXT ·_transpose_int32_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB53_1
+
+LBB53_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB53_5
+
+LBB53_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB53_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB53_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB53_3
+
+LBB53_4:
+ RET
+
+TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB54_1
+
+LBB54_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB54_5
+
+LBB54_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB54_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB54_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB54_3
+
+LBB54_4:
+ RET
+
+TEXT ·_transpose_int64_uint64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB55_1
+
+LBB55_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB55_5
+
+LBB55_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB55_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB55_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB55_3
+
+LBB55_4:
+ RET
+
+TEXT ·_transpose_uint8_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB56_1
+
+LBB56_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0157b60f // movzx edx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0257b60f // movzx edx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0357b60f // movzx edx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB56_5
+
+LBB56_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB56_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB56_3:
+ LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB56_3
+
+LBB56_4:
+ RET
+
+TEXT ·_transpose_int8_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB57_1
+
+LBB57_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17be0f48 // movsx rdx, byte [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x04c78348 // add rdi, 4
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB57_5
+
+LBB57_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB57_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB57_3:
+ LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0xc604894a // mov qword [rsi + 8*r8], rax
+ LONG $0x01c08349 // add r8, 1
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB57_3
+
+LBB57_4:
+ RET
+
+TEXT ·_transpose_uint16_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB58_1
+
+LBB58_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x0257b70f // movzx edx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x0457b70f // movzx edx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0657b70f // movzx edx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB58_5
+
+LBB58_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB58_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB58_3:
+ LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB58_3
+
+LBB58_4:
+ RET
+
+TEXT ·_transpose_int16_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB59_1
+
+LBB59_5:
+ WORD $0xd089 // mov eax, edx
+ LONG $0x17bf0f48 // movsx rdx, word [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB59_5
+
+LBB59_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB59_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB59_3:
+ LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x8604894a // mov qword [rsi + 4*r8], rax
+ LONG $0x02c08349 // add r8, 2
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB59_3
+
+LBB59_4:
+ RET
+
+TEXT ·_transpose_uint32_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB60_1
+
+LBB60_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x178b // mov edx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB60_5
+
+LBB60_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB60_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB60_3:
+ LONG $0x07048b42 // mov eax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB60_3
+
+LBB60_4:
+ RET
+
+TEXT ·_transpose_int32_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB61_1
+
+LBB61_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB61_5
+
+LBB61_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB61_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB61_3:
+ LONG $0x0704634a // movsxd rax, dword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x4604894a // mov qword [rsi + 2*r8], rax
+ LONG $0x04c08349 // add r8, 4
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB61_3
+
+LBB61_4:
+ RET
+
+TEXT ·_transpose_uint64_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB62_1
+
+LBB62_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB62_5
+
+LBB62_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB62_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB62_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB62_3
+
+LBB62_4:
+ RET
+
+TEXT ·_transpose_int64_int64_sse4(SB), $0-32
+
+ MOVQ src+0(FP), DI
+ MOVQ dest+8(FP), SI
+ MOVQ length+16(FP), DX
+ MOVQ transposeMap+24(FP), CX
+
+ WORD $0xfa83; BYTE $0x04 // cmp edx, 4
+ JL LBB63_1
+
+LBB63_5:
+ WORD $0xd089 // mov eax, edx
+ WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
+ LONG $0x08578b48 // mov rdx, qword [rdi + 8]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x08568948 // mov qword [rsi + 8], rdx
+ LONG $0x10578b48 // mov rdx, qword [rdi + 16]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x10568948 // mov qword [rsi + 16], rdx
+ LONG $0x18578b48 // mov rdx, qword [rdi + 24]
+ LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
+ LONG $0x18568948 // mov qword [rsi + 24], rdx
+ WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x20c68348 // add rsi, 32
+ WORD $0xf883; BYTE $0x07 // cmp eax, 7
+ JG LBB63_5
+
+LBB63_1:
+ WORD $0xd285 // test edx, edx
+ JLE LBB63_4
+ WORD $0xc283; BYTE $0x01 // add edx, 1
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+LBB63_3:
+ LONG $0x07048b4a // mov rax, qword [rdi + r8]
+ LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
+ LONG $0x0604894a // mov qword [rsi + r8], rax
+ LONG $0x08c08349 // add r8, 8
+ WORD $0xc283; BYTE $0xff // add edx, -1
+ WORD $0xfa83; BYTE $0x01 // cmp edx, 1
+ JG LBB63_3
+
+LBB63_4:
+ RET