diff options
| author | Taras Madan <tarasmadan@google.com> | 2024-09-10 12:16:33 +0200 |
|---|---|---|
| committer | Taras Madan <tarasmadan@google.com> | 2024-09-10 14:05:26 +0000 |
| commit | c97c816133b42257d0bcf1ee4bd178bb2a7a2b9e (patch) | |
| tree | 0bcbc2e540bbf8f62f6c17887cdd53b8c2cee637 /vendor/github.com/apache/arrow/go/v15/internal | |
| parent | 54e657429ab892ad06c90cd7c1a4eb33ba93a3dc (diff) | |
vendor: update
Diffstat (limited to 'vendor/github.com/apache/arrow/go/v15/internal')
47 files changed, 17025 insertions, 0 deletions
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go new file mode 100644 index 000000000..50996b10e --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_block_counter.go @@ -0,0 +1,452 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bitutils + +import ( + "math" + "math/bits" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow/bitutil" + "github.com/apache/arrow/go/v15/internal/utils" +) + +func loadWord(byt []byte) uint64 { + return utils.ToLEUint64(*(*uint64)(unsafe.Pointer(&byt[0]))) +} + +func shiftWord(current, next uint64, shift int64) uint64 { + if shift == 0 { + return current + } + return (current >> shift) | (next << (64 - shift)) +} + +// BitBlockCount is returned by the various bit block counter utilities +// in order to return a length of bits and the population count of that +// slice of bits. 
+type BitBlockCount struct { + Len int16 + Popcnt int16 +} + +// NoneSet returns true if ALL the bits were 0 in this set, ie: Popcnt == 0 +func (b BitBlockCount) NoneSet() bool { + return b.Popcnt == 0 +} + +// AllSet returns true if ALL the bits were 1 in this set, ie: Popcnt == Len +func (b BitBlockCount) AllSet() bool { + return b.Len == b.Popcnt +} + +// BitBlockCounter is a utility for grabbing chunks of a bitmap at a time and efficiently +// counting the number of bits which are 1. +type BitBlockCounter struct { + bitmap []byte + bitsRemaining int64 + bitOffset int8 +} + +const ( + wordBits int64 = 64 + fourWordsBits int64 = wordBits * 4 +) + +// NewBitBlockCounter returns a BitBlockCounter for the passed bitmap starting at startOffset +// of length nbits. +func NewBitBlockCounter(bitmap []byte, startOffset, nbits int64) *BitBlockCounter { + return &BitBlockCounter{ + bitmap: bitmap[startOffset/8:], + bitsRemaining: nbits, + bitOffset: int8(startOffset % 8), + } +} + +// getBlockSlow is for returning a block of the requested size when there aren't +// enough bits remaining to do a full word computation. +func (b *BitBlockCounter) getBlockSlow(blockSize int64) BitBlockCount { + runlen := int16(utils.Min(b.bitsRemaining, blockSize)) + popcnt := int16(bitutil.CountSetBits(b.bitmap, int(b.bitOffset), int(runlen))) + b.bitsRemaining -= int64(runlen) + b.bitmap = b.bitmap[runlen/8:] + return BitBlockCount{runlen, popcnt} +} + +// NextFourWords returns the next run of available bits, usually 256. The +// returned pair contains the size of run and the number of true values. +// The last block will have a length less than 256 if the bitmap length +// is not a multiple of 256, and will return 0-length blocks in subsequent +// invocations. 
+func (b *BitBlockCounter) NextFourWords() BitBlockCount { + if b.bitsRemaining == 0 { + return BitBlockCount{0, 0} + } + + totalPopcnt := 0 + if b.bitOffset == 0 { + // if we're aligned at 0 bitoffset, then we can easily just jump from + // word to word nice and easy. + if b.bitsRemaining < fourWordsBits { + return b.getBlockSlow(fourWordsBits) + } + totalPopcnt += bits.OnesCount64(loadWord(b.bitmap)) + totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[8:])) + totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[16:])) + totalPopcnt += bits.OnesCount64(loadWord(b.bitmap[24:])) + } else { + // When the offset is > 0, we need there to be a word beyond the last + // aligned word in the bitmap for the bit shifting logic. + if b.bitsRemaining < 5*fourWordsBits-int64(b.bitOffset) { + return b.getBlockSlow(fourWordsBits) + } + + current := loadWord(b.bitmap) + next := loadWord(b.bitmap[8:]) + totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset))) + + current = next + next = loadWord(b.bitmap[16:]) + totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset))) + + current = next + next = loadWord(b.bitmap[24:]) + totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset))) + + current = next + next = loadWord(b.bitmap[32:]) + totalPopcnt += bits.OnesCount64(shiftWord(current, next, int64(b.bitOffset))) + } + b.bitmap = b.bitmap[bitutil.BytesForBits(fourWordsBits):] + b.bitsRemaining -= fourWordsBits + return BitBlockCount{256, int16(totalPopcnt)} +} + +// NextWord returns the next run of available bits, usually 64. The returned +// pair contains the size of run and the number of true values. The last +// block will have a length less than 64 if the bitmap length is not a +// multiple of 64, and will return 0-length blocks in subsequent +// invocations. 
+func (b *BitBlockCounter) NextWord() BitBlockCount { + if b.bitsRemaining == 0 { + return BitBlockCount{0, 0} + } + popcnt := 0 + if b.bitOffset == 0 { + if b.bitsRemaining < wordBits { + return b.getBlockSlow(wordBits) + } + popcnt = bits.OnesCount64(loadWord(b.bitmap)) + } else { + // When the offset is > 0, we need there to be a word beyond the last + // aligned word in the bitmap for the bit shifting logic. + if b.bitsRemaining < (2*wordBits - int64(b.bitOffset)) { + return b.getBlockSlow(wordBits) + } + popcnt = bits.OnesCount64(shiftWord(loadWord(b.bitmap), loadWord(b.bitmap[8:]), int64(b.bitOffset))) + } + b.bitmap = b.bitmap[wordBits/8:] + b.bitsRemaining -= wordBits + return BitBlockCount{64, int16(popcnt)} +} + +// OptionalBitBlockCounter is a useful counter to iterate through a possibly +// nonexistent validity bitmap to allow us to write one code path for both +// the with-nulls and no-nulls cases without giving up a lot of performance. +type OptionalBitBlockCounter struct { + hasBitmap bool + pos int64 + len int64 + counter *BitBlockCounter +} + +// NewOptionalBitBlockCounter constructs and returns a new bit block counter that +// can properly handle the case when a bitmap is null, if it is guaranteed that the +// the bitmap is not nil, then prefer NewBitBlockCounter here. +func NewOptionalBitBlockCounter(bitmap []byte, offset, length int64) *OptionalBitBlockCounter { + var counter *BitBlockCounter + if bitmap != nil { + counter = NewBitBlockCounter(bitmap, offset, length) + } + return &OptionalBitBlockCounter{ + hasBitmap: bitmap != nil, + pos: 0, + len: length, + counter: counter, + } +} + +// NextBlock returns block count for next word when the bitmap is available otherwise +// return a block with length up to INT16_MAX when there is no validity +// bitmap (so all the referenced values are not null). 
+func (obc *OptionalBitBlockCounter) NextBlock() BitBlockCount { + const maxBlockSize = math.MaxInt16 + if obc.hasBitmap { + block := obc.counter.NextWord() + obc.pos += int64(block.Len) + return block + } + + blockSize := int16(utils.Min(maxBlockSize, obc.len-obc.pos)) + obc.pos += int64(blockSize) + // all values are non-null + return BitBlockCount{blockSize, blockSize} +} + +// NextWord is like NextBlock, but returns a word-sized block even when there is no +// validity bitmap +func (obc *OptionalBitBlockCounter) NextWord() BitBlockCount { + const wordsize = 64 + if obc.hasBitmap { + block := obc.counter.NextWord() + obc.pos += int64(block.Len) + return block + } + blockSize := int16(utils.Min(wordsize, obc.len-obc.pos)) + obc.pos += int64(blockSize) + // all values are non-null + return BitBlockCount{blockSize, blockSize} +} + +// VisitBitBlocks is a utility for easily iterating through the blocks of bits in a bitmap, +// calling the appropriate visitValid/visitInvalid function as we iterate through the bits. +// visitValid is called with the bitoffset of the valid bit. Don't use this inside a tight +// loop when performance is needed and instead prefer manually constructing these loops +// in that scenario. 
+func VisitBitBlocks(bitmap []byte, offset, length int64, visitValid func(pos int64), visitInvalid func()) { + counter := NewOptionalBitBlockCounter(bitmap, offset, length) + pos := int64(0) + for pos < length { + block := counter.NextBlock() + if block.AllSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + visitValid(pos) + } + } else if block.NoneSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + visitInvalid() + } + } else { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + if bitutil.BitIsSet(bitmap, int(offset+pos)) { + visitValid(pos) + } else { + visitInvalid() + } + } + } + } +} + +// VisitBitBlocks is a utility for easily iterating through the blocks of bits in a bitmap, +// calling the appropriate visitValid/visitInvalid function as we iterate through the bits. +// visitValid is called with the bitoffset of the valid bit. Don't use this inside a tight +// loop when performance is needed and instead prefer manually constructing these loops +// in that scenario. 
+func VisitBitBlocksShort(bitmap []byte, offset, length int64, visitValid func(pos int64) error, visitInvalid func() error) error { + counter := NewOptionalBitBlockCounter(bitmap, offset, length) + pos := int64(0) + for pos < length { + block := counter.NextBlock() + if block.AllSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + if err := visitValid(pos); err != nil { + return err + } + } + } else if block.NoneSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + if err := visitInvalid(); err != nil { + return err + } + } + } else { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + if bitutil.BitIsSet(bitmap, int(offset+pos)) { + if err := visitValid(pos); err != nil { + return err + } + } else { + if err := visitInvalid(); err != nil { + return err + } + } + } + } + } + return nil +} + +func VisitTwoBitBlocks(leftBitmap, rightBitmap []byte, leftOffset, rightOffset int64, len int64, visitValid func(pos int64), visitNull func()) { + if leftBitmap == nil || rightBitmap == nil { + // at most one is present + if leftBitmap == nil { + VisitBitBlocks(rightBitmap, rightOffset, len, visitValid, visitNull) + } else { + VisitBitBlocks(leftBitmap, leftOffset, len, visitValid, visitNull) + } + return + } + + bitCounter := NewBinaryBitBlockCounter(leftBitmap, rightBitmap, leftOffset, rightOffset, len) + var pos int64 + for pos < len { + block := bitCounter.NextAndWord() + if block.AllSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + visitValid(pos) + } + } else if block.NoneSet() { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + visitNull() + } + } else { + for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { + if bitutil.BitIsSet(leftBitmap, int(leftOffset+pos)) && bitutil.BitIsSet(rightBitmap, int(rightOffset+pos)) { + visitValid(pos) + } else { + visitNull() + } + } + } + } +} + +type bitOp struct { + bit func(bool, bool) bool + word func(uint64, uint64) uint64 +} + +var ( + bitBlockAnd = bitOp{ + bit: func(a, b 
bool) bool { return a && b }, + word: func(a, b uint64) uint64 { return a & b }, + } + bitBlockAndNot = bitOp{ + bit: func(a, b bool) bool { return a && !b }, + word: func(a, b uint64) uint64 { return a &^ b }, + } + bitBlockOr = bitOp{ + bit: func(a, b bool) bool { return a || b }, + word: func(a, b uint64) uint64 { return a | b }, + } + bitBlockOrNot = bitOp{ + bit: func(a, b bool) bool { return a || !b }, + word: func(a, b uint64) uint64 { return a | ^b }, + } +) + +// BinaryBitBlockCounter computes popcounts on the result of bitwise +// operations between two bitmaps, 64 bits at a time. A 64-bit word +// is loaded from each bitmap, then the popcount is computed on +// e.g. the bitwise-and of the two words +type BinaryBitBlockCounter struct { + left []byte + right []byte + bitsRemaining int64 + leftOffset, rightOffset int64 + + bitsRequiredForWords int64 +} + +// NewBinaryBitBlockCounter constructs a binary bit block counter for +// computing the popcounts on the results of operations between +// the passed in bitmaps, with their respective offsets. +func NewBinaryBitBlockCounter(left, right []byte, leftOffset, rightOffset int64, length int64) *BinaryBitBlockCounter { + ret := &BinaryBitBlockCounter{ + left: left[leftOffset/8:], + right: right[rightOffset/8:], + leftOffset: leftOffset % 8, + rightOffset: rightOffset % 8, + bitsRemaining: length, + } + + leftBitsReq := int64(64) + if ret.leftOffset != 0 { + leftBitsReq = 64 + (64 - ret.leftOffset) + } + rightBitsReq := int64(64) + if ret.rightOffset != 0 { + rightBitsReq = 64 + (64 - ret.rightOffset) + } + + if leftBitsReq > rightBitsReq { + ret.bitsRequiredForWords = leftBitsReq + } else { + ret.bitsRequiredForWords = rightBitsReq + } + + return ret +} + +// NextAndWord returns the popcount of the bitwise-and of the next run +// of available bits, up to 64. The returned pair contains the size of +// the run and the number of true values. 
the last block will have a +// length less than 64 if the bitmap length is not a multiple of 64, +// and will return 0-length blocks in subsequent invocations +func (b *BinaryBitBlockCounter) NextAndWord() BitBlockCount { return b.nextWord(bitBlockAnd) } + +// NextAndNotWord is like NextAndWord but performs x &^ y on each run +func (b *BinaryBitBlockCounter) NextAndNotWord() BitBlockCount { return b.nextWord(bitBlockAndNot) } + +// NextOrWord is like NextAndWord but performs x | y on each run +func (b *BinaryBitBlockCounter) NextOrWord() BitBlockCount { return b.nextWord(bitBlockOr) } + +// NextOrWord is like NextAndWord but performs x | ^y on each run +func (b *BinaryBitBlockCounter) NextOrNotWord() BitBlockCount { return b.nextWord(bitBlockOrNot) } + +func (b *BinaryBitBlockCounter) nextWord(op bitOp) BitBlockCount { + if b.bitsRemaining == 0 { + return BitBlockCount{} + } + + // when offset is >0, we need there to be a word beyond the last + // aligned word in the bitmap for the bit shifting logic + if b.bitsRemaining < b.bitsRequiredForWords { + runLength := int16(b.bitsRemaining) + if runLength > int16(wordBits) { + runLength = int16(wordBits) + } + + var popcount int16 + for i := int16(0); i < runLength; i++ { + if op.bit(bitutil.BitIsSet(b.left, int(b.leftOffset)+int(i)), + bitutil.BitIsSet(b.right, int(b.rightOffset)+int(i))) { + popcount++ + } + } + // this code path should trigger _at most_ 2 times. in the "two times" + // case, the first time the run length will be a multiple of 8. 
+ b.left = b.left[runLength/8:] + b.right = b.right[runLength/8:] + b.bitsRemaining -= int64(runLength) + return BitBlockCount{Len: runLength, Popcnt: popcount} + } + + var popcount int + if b.leftOffset == 0 && b.rightOffset == 0 { + popcount = bits.OnesCount64(op.word(loadWord(b.left), loadWord(b.right))) + } else { + leftWord := shiftWord(loadWord(b.left), loadWord(b.left[8:]), b.leftOffset) + rightWord := shiftWord(loadWord(b.right), loadWord(b.right[8:]), b.rightOffset) + popcount = bits.OnesCount64(op.word(leftWord, rightWord)) + } + b.left = b.left[wordBits/8:] + b.right = b.right[wordBits/8:] + b.bitsRemaining -= wordBits + return BitBlockCount{Len: int16(wordBits), Popcnt: int16(popcount)} +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go new file mode 100644 index 000000000..f09149d7e --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_run_reader.go @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package bitutils + +import ( + "encoding/binary" + "fmt" + "math/bits" + "unsafe" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/bitutil" + "github.com/apache/arrow/go/v15/internal/utils" +) + +// BitRun represents a run of bits with the same value of length Len +// with Set representing if the group of bits were 1 or 0. +type BitRun struct { + Len int64 + Set bool +} + +// BitRunReader is an interface that is usable by multiple callers to provide +// multiple types of bit run readers such as a reverse reader and so on. +// +// It's a convenience interface for counting contiguous set/unset bits in a bitmap. +// In places where BitBlockCounter can be used, then it would be preferred to use that +// as it would be faster than using BitRunReader. +type BitRunReader interface { + NextRun() BitRun +} + +func (b BitRun) String() string { + return fmt.Sprintf("{Length: %d, set=%t}", b.Len, b.Set) +} + +type bitRunReader struct { + bitmap []byte + pos int64 + length int64 + word uint64 + curRunBitSet bool +} + +// NewBitRunReader returns a reader for the given bitmap, offset and length that +// grabs runs of the same value bit at a time for easy iteration. +func NewBitRunReader(bitmap []byte, offset int64, length int64) BitRunReader { + ret := &bitRunReader{ + bitmap: bitmap[offset/8:], + pos: offset % 8, + length: (offset % 8) + length, + } + + if length == 0 { + return ret + } + + ret.curRunBitSet = bitutil.BitIsNotSet(bitmap, int(offset)) + bitsRemaining := length + ret.pos + ret.loadWord(bitsRemaining) + ret.word = ret.word &^ LeastSignificantBitMask(ret.pos) + return ret +} + +// NextRun returns a new BitRun containing the number of contiguous bits with the +// same value. Len == 0 indicates the end of the bitmap. 
+func (b *bitRunReader) NextRun() BitRun { + if b.pos >= b.length { + return BitRun{0, false} + } + + // This implementation relies on a efficient implementations of + // CountTrailingZeros and assumes that runs are more often then + // not. The logic is to incrementally find the next bit change + // from the current position. This is done by zeroing all + // bits in word_ up to position_ and using the TrailingZeroCount + // to find the index of the next set bit. + + // The runs alternate on each call, so flip the bit. + b.curRunBitSet = !b.curRunBitSet + + start := b.pos + startOffset := start & 63 + + // Invert the word for proper use of CountTrailingZeros and + // clear bits so CountTrailingZeros can do it magic. + b.word = ^b.word &^ LeastSignificantBitMask(startOffset) + + // Go forward until the next change from unset to set. + newbits := int64(bits.TrailingZeros64(b.word)) - startOffset + b.pos += newbits + + if IsMultipleOf64(b.pos) && b.pos < b.length { + b.advanceUntilChange() + } + return BitRun{b.pos - start, b.curRunBitSet} +} + +func (b *bitRunReader) advanceUntilChange() { + newbits := int64(0) + for { + b.bitmap = b.bitmap[arrow.Uint64SizeBytes:] + b.loadNextWord() + newbits = int64(bits.TrailingZeros64(b.word)) + b.pos += newbits + if !IsMultipleOf64(b.pos) || b.pos >= b.length || newbits <= 0 { + break + } + } +} + +func (b *bitRunReader) loadNextWord() { + b.loadWord(b.length - b.pos) +} + +func (b *bitRunReader) loadWord(bitsRemaining int64) { + b.word = 0 + if bitsRemaining >= 64 { + b.word = binary.LittleEndian.Uint64(b.bitmap) + } else { + nbytes := bitutil.BytesForBits(bitsRemaining) + wordptr := (*(*[8]byte)(unsafe.Pointer(&b.word)))[:] + copy(wordptr, b.bitmap[:nbytes]) + + bitutil.SetBitTo(wordptr, int(bitsRemaining), bitutil.BitIsNotSet(wordptr, int(bitsRemaining-1))) + // reset the value to little endian for big endian architectures + b.word = utils.ToLEUint64(b.word) + } + + // Two cases: + // 1. 
For unset, CountTrailingZeros works naturally so we don't + // invert the word. + // 2. Otherwise invert so we can use CountTrailingZeros. + if b.curRunBitSet { + b.word = ^b.word + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go new file mode 100644 index 000000000..374b8d4aa --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bit_set_run_reader.go @@ -0,0 +1,361 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bitutils + +import ( + "encoding/binary" + "math/bits" + + "github.com/apache/arrow/go/v15/arrow/bitutil" + "github.com/apache/arrow/go/v15/internal/utils" +) + +// IsMultipleOf64 returns whether v is a multiple of 64. +func IsMultipleOf64(v int64) bool { return v&63 == 0 } + +// LeastSignificantBitMask returns a bit mask to return the least significant +// bits for a value starting from the bit index passed in. 
ie: if you want a +// mask for the 4 least significant bits, you call LeastSignificantBitMask(4) +func LeastSignificantBitMask(index int64) uint64 { + return (uint64(1) << index) - 1 +} + +// SetBitRun describes a run of contiguous set bits in a bitmap with Pos being +// the starting position of the run and Length being the number of bits. +type SetBitRun struct { + Pos int64 + Length int64 +} + +// AtEnd returns true if this bit run is the end of the set by checking +// that the length is 0. +func (s SetBitRun) AtEnd() bool { + return s.Length == 0 +} + +// Equal returns whether rhs is the same run as s +func (s SetBitRun) Equal(rhs SetBitRun) bool { + return s.Pos == rhs.Pos && s.Length == rhs.Length +} + +// SetBitRunReader is an interface for reading groups of contiguous set bits +// from a bitmap. The interface allows us to create different reader implementations +// that share the same interface easily such as a reverse set reader. +type SetBitRunReader interface { + // NextRun will return the next run of contiguous set bits in the bitmap + NextRun() SetBitRun + // Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments + // match the New function for the reader being used. + Reset([]byte, int64, int64) + // VisitSetBitRuns calls visitFn for each set in a loop starting from the current position + // it's roughly equivalent to simply looping, calling NextRun and calling visitFn on the run + // for each run. + VisitSetBitRuns(visitFn VisitFn) error +} + +type baseSetBitRunReader struct { + bitmap []byte + pos int64 + length int64 + remaining int64 + curWord uint64 + curNumBits int32 + reversed bool + + firstBit uint64 +} + +// NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read +// numvalues bits. 
+func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader { + return newBaseSetBitRunReader(validBits, startOffset, numValues, false) +} + +// NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will +// return runs starting from the end of the bitmap until it reaches startOffset rather than starting +// at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos +// will still be the position of the "left-most" bit of the run or the "start" of the run. It +// just returns runs starting from the end instead of starting from the beginning. +func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader { + return newBaseSetBitRunReader(validBits, startOffset, numValues, true) +} + +func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader { + ret := &baseSetBitRunReader{reversed: reverse} + ret.Reset(bitmap, startOffset, length) + return ret +} + +func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) { + br.bitmap = bitmap + br.length = length + br.remaining = length + br.curNumBits = 0 + br.curWord = 0 + + if !br.reversed { + br.pos = startOffset / 8 + br.firstBit = 1 + + bitOffset := int8(startOffset % 8) + if length > 0 && bitOffset != 0 { + br.curNumBits = int32(utils.Min(int(length), int(8-bitOffset))) + br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits)) + } + return + } + + br.pos = (startOffset + length) / 8 + br.firstBit = uint64(0x8000000000000000) + endBitOffset := int8((startOffset + length) % 8) + if length > 0 && endBitOffset != 0 { + br.pos++ + br.curNumBits = int32(utils.Min(int(length), int(endBitOffset))) + br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits)) + } +} + +func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 { + if br.reversed { + return word << nbits + } + return word >> nbits +} + +func (br 
*baseSetBitRunReader) countFirstZeros(word uint64) int32 { + if br.reversed { + return int32(bits.LeadingZeros64(word)) + } + return int32(bits.TrailingZeros64(word)) +} + +func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 { + var word [8]byte + nbytes := bitutil.BytesForBits(numBits) + if br.reversed { + br.pos -= nbytes + copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes]) + return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits) + } + + copy(word[:], br.bitmap[br.pos:br.pos+nbytes]) + br.pos += nbytes + return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits) +} + +func (br *baseSetBitRunReader) findCurrentRun() SetBitRun { + nzeros := br.countFirstZeros(br.curWord) + if nzeros >= br.curNumBits { + br.remaining -= int64(br.curNumBits) + br.curWord = 0 + br.curNumBits = 0 + return SetBitRun{0, 0} + } + + br.curWord = br.consumeBits(br.curWord, nzeros) + br.curNumBits -= nzeros + br.remaining -= int64(nzeros) + pos := br.position() + + numOnes := br.countFirstZeros(^br.curWord) + br.curWord = br.consumeBits(br.curWord, numOnes) + br.curNumBits -= numOnes + br.remaining -= int64(numOnes) + return SetBitRun{pos, int64(numOnes)} +} + +func (br *baseSetBitRunReader) position() int64 { + if br.reversed { + return br.remaining + } + return br.length - br.remaining +} + +func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun { + if br.reversed { + run.Pos -= run.Length + } + return run +} + +func (br *baseSetBitRunReader) loadFull() (ret uint64) { + if br.reversed { + br.pos -= 8 + } + ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8]) + if !br.reversed { + br.pos += 8 + } + return +} + +func (br *baseSetBitRunReader) skipNextZeros() { + for br.remaining >= 64 { + br.curWord = br.loadFull() + nzeros := br.countFirstZeros(br.curWord) + if nzeros < 64 { + br.curWord = br.consumeBits(br.curWord, nzeros) + br.curNumBits = 64 - nzeros + 
br.remaining -= int64(nzeros) + return + } + br.remaining -= 64 + } + // run of zeros continues in last bitmap word + if br.remaining > 0 { + br.curWord = br.loadPartial(0, br.remaining) + br.curNumBits = int32(br.remaining) + nzeros := int32(utils.Min(int(br.curNumBits), int(br.countFirstZeros(br.curWord)))) + br.curWord = br.consumeBits(br.curWord, nzeros) + br.curNumBits -= nzeros + br.remaining -= int64(nzeros) + } +} + +func (br *baseSetBitRunReader) countNextOnes() int64 { + var length int64 + if ^br.curWord != 0 { + numOnes := br.countFirstZeros(^br.curWord) + br.remaining -= int64(numOnes) + br.curWord = br.consumeBits(br.curWord, numOnes) + br.curNumBits -= numOnes + if br.curNumBits != 0 { + return int64(numOnes) + } + length = int64(numOnes) + } else { + br.remaining -= 64 + br.curNumBits = 0 + length = 64 + } + + for br.remaining >= 64 { + br.curWord = br.loadFull() + numOnes := br.countFirstZeros(^br.curWord) + length += int64(numOnes) + br.remaining -= int64(numOnes) + if numOnes < 64 { + br.curWord = br.consumeBits(br.curWord, numOnes) + br.curNumBits = 64 - numOnes + return length + } + } + + if br.remaining > 0 { + br.curWord = br.loadPartial(0, br.remaining) + br.curNumBits = int32(br.remaining) + numOnes := br.countFirstZeros(^br.curWord) + br.curWord = br.consumeBits(br.curWord, numOnes) + br.curNumBits -= numOnes + br.remaining -= int64(numOnes) + length += int64(numOnes) + } + return length +} + +func (br *baseSetBitRunReader) NextRun() SetBitRun { + var ( + pos int64 = 0 + length int64 = 0 + ) + + if br.curNumBits != 0 { + run := br.findCurrentRun() + if run.Length != 0 && br.curNumBits != 0 { + return br.adjustRun(run) + } + pos = run.Pos + length = run.Length + } + + if length == 0 { + // we didn't get any ones in curWord, so we can skip any zeros + // in the following words + br.skipNextZeros() + if br.remaining == 0 { + return SetBitRun{0, 0} + } + pos = br.position() + } else if br.curNumBits == 0 { + if br.remaining >= 64 { + br.curWord 
= br.loadFull() + br.curNumBits = 64 + } else if br.remaining > 0 { + br.curWord = br.loadPartial(0, br.remaining) + br.curNumBits = int32(br.remaining) + } else { + return br.adjustRun(SetBitRun{pos, length}) + } + if (br.curWord & br.firstBit) == 0 { + return br.adjustRun(SetBitRun{pos, length}) + } + } + + length += br.countNextOnes() + return br.adjustRun(SetBitRun{pos, length}) +} + +// VisitFn is a callback function for visiting runs of contiguous bits +type VisitFn func(pos int64, length int64) error + +func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error { + for { + run := br.NextRun() + if run.Length == 0 { + break + } + + if err := visitFn(run.Pos, run.Length); err != nil { + return err + } + } + return nil +} + +// VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns +func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error { + if bitmap == nil { + return visitFn(0, length) + } + rdr := NewSetBitRunReader(bitmap, bitmapOffset, length) + for { + run := rdr.NextRun() + if run.Length == 0 { + break + } + + if err := visitFn(run.Pos, run.Length); err != nil { + return err + } + } + return nil +} + +func VisitSetBitRunsNoErr(bitmap []byte, bitmapOffset int64, length int64, visitFn func(pos, length int64)) { + if bitmap == nil { + visitFn(0, length) + return + } + rdr := NewSetBitRunReader(bitmap, bitmapOffset, length) + for { + run := rdr.NextRun() + if run.Length == 0 { + break + } + visitFn(run.Pos, run.Length) + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go new file mode 100644 index 000000000..08b5fceab --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/bitutils/bitmap_generate.go @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bitutils + +import "github.com/apache/arrow/go/v15/arrow/bitutil" + +// GenerateBits writes sequential bits to a bitmap. Bits preceding the +// initial start offset are preserved, bits following the bitmap may +// get clobbered. +func GenerateBits(bitmap []byte, start, length int64, g func() bool) { + if length == 0 { + return + } + + cur := bitmap[start/8:] + mask := bitutil.BitMask[start%8] + curbyte := cur[0] & bitutil.PrecedingBitmask[start%8] + + for i := int64(0); i < length; i++ { + bit := g() + if bit { + curbyte = curbyte | mask + } + mask <<= 1 + if mask == 0 { + mask = 1 + cur[0] = curbyte + cur = cur[1:] + curbyte = 0 + } + } + + if mask != 1 { + cur[0] = curbyte + } +} + +// GenerateBitsUnrolled is like GenerateBits but unrolls its main loop for +// higher performance. +// +// See the benchmarks for evidence. 
+func GenerateBitsUnrolled(bitmap []byte, start, length int64, g func() bool) { + if length == 0 { + return + } + + var ( + curbyte byte + cur = bitmap[start/8:] + startBitOffset uint64 = uint64(start % 8) + mask = bitutil.BitMask[startBitOffset] + remaining = length + ) + + if mask != 0x01 { + curbyte = cur[0] & bitutil.PrecedingBitmask[startBitOffset] + for mask != 0 && remaining > 0 { + if g() { + curbyte |= mask + } + mask <<= 1 + remaining-- + } + cur[0] = curbyte + cur = cur[1:] + } + + var outResults [8]byte + for remainingBytes := remaining / 8; remainingBytes > 0; remainingBytes-- { + for i := 0; i < 8; i++ { + if g() { + outResults[i] = 1 + } else { + outResults[i] = 0 + } + } + cur[0] = (outResults[0] | outResults[1]<<1 | outResults[2]<<2 | + outResults[3]<<3 | outResults[4]<<4 | outResults[5]<<5 | + outResults[6]<<6 | outResults[7]<<7) + cur = cur[1:] + } + + remainingBits := remaining % 8 + if remainingBits > 0 { + curbyte = 0 + mask = 0x01 + for ; remainingBits > 0; remainingBits-- { + if g() { + curbyte |= mask + } + mask <<= 1 + } + cur[0] = curbyte + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go new file mode 100644 index 000000000..c1bdfeb6d --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_funcs.go @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math/bits" + "unsafe" + + "github.com/zeebo/xxh3" +) + +func hashInt(val uint64, alg uint64) uint64 { + // Two of xxhash's prime multipliers (which are chosen for their + // bit dispersion properties) + var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} + // Multiplying by the prime number mixes the low bits into the high bits, + // then byte-swapping (which is a single CPU instruction) allows the + // combined high and low bits to participate in the initial hash table index. + return bits.ReverseBytes64(multipliers[alg] * val) +} + +func hashFloat32(val float32, alg uint64) uint64 { + // grab the raw byte pattern of the + bt := *(*[4]byte)(unsafe.Pointer(&val)) + x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) + hx := hashInt(x, alg) + hy := hashInt(x, alg^1) + return 4 ^ hx ^ hy +} + +func hashFloat64(val float64, alg uint64) uint64 { + bt := *(*[8]byte)(unsafe.Pointer(&val)) + hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) + hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) + return 8 ^ hx ^ hy +} + +// prime constants used for slightly increasing the hash quality further +var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} + +// for smaller amounts of bytes this is faster than even calling into +// xxh3 to do the Hash, so we specialize in order to get the benefits +// of that performance. 
+func Hash(b []byte, alg uint64) uint64 { + n := uint32(len(b)) + if n <= 16 { + switch { + case n > 8: + // 8 < length <= 16 + // apply same principle as above, but as two 64-bit ints + x := *(*uint64)(unsafe.Pointer(&b[n-8])) + y := *(*uint64)(unsafe.Pointer(&b[0])) + hx := hashInt(x, alg) + hy := hashInt(y, alg^1) + return uint64(n) ^ hx ^ hy + case n >= 4: + // 4 < length <= 8 + // we can read the bytes as two overlapping 32-bit ints, apply different + // hash functions to each in parallel + // then xor the results + x := *(*uint32)(unsafe.Pointer(&b[n-4])) + y := *(*uint32)(unsafe.Pointer(&b[0])) + hx := hashInt(uint64(x), alg) + hy := hashInt(uint64(y), alg^1) + return uint64(n) ^ hx ^ hy + case n > 0: + x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1])) + return hashInt(uint64(x), alg) + case n == 0: + return 1 + } + } + + // increase differentiation enough to improve hash quality + return xxh3.Hash(b) + exprimes[alg] +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go new file mode 100644 index 000000000..b772c7d7f --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string.go @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 || tinygo + +package hashing + +import "unsafe" + +func hashString(val string, alg uint64) uint64 { + buf := unsafe.Slice(unsafe.StringData(val), len(val)) + return Hash(buf, alg) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go new file mode 100644 index 000000000..f38eb5c52 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/hash_string_go1.19.go @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !go1.20 && !tinygo + +package hashing + +import ( + "reflect" + "unsafe" +) + +func hashString(val string, alg uint64) uint64 { + if val == "" { + return Hash([]byte{}, alg) + } + // highly efficient way to get byte slice without copy before + // the introduction of unsafe.StringData in go1.20 + // (https://stackoverflow.com/questions/59209493/how-to-use-unsafe-get-a-byte-slice-from-a-string-without-memory-copy) + const MaxInt32 = 1<<31 - 1 + buf := (*[MaxInt32]byte)(unsafe.Pointer((*reflect.StringHeader)( + unsafe.Pointer(&val)).Data))[: len(val)&MaxInt32 : len(val)&MaxInt32] + return Hash(buf, alg) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata b/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata new file mode 100644 index 000000000..0ba6f765d --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/types.tmpldata @@ -0,0 +1,42 @@ +[ + { + "Name": "Int8", + "name": "int8" + }, + { + "Name": "Uint8", + "name": "uint8" + }, + { + "Name": "Int16", + "name": "int16" + }, + { + "Name": "Uint16", + "name": "uint16" + }, + { + "Name": "Int32", + "name": "int32" + }, + { + "Name": "Int64", + "name": "int64" + }, + { + "Name": "Uint32", + "name": "uint32" + }, + { + "Name": "Uint64", + "name": "uint64" + }, + { + "Name": "Float32", + "name": "float32" + }, + { + "Name": "Float64", + "name": "float64" + } +] diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go new file mode 100644 index 000000000..39b82cdef --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go @@ -0,0 +1,2833 @@ +// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/bitutil" + "github.com/apache/arrow/go/v15/internal/utils" +) + +type payloadInt8 struct { + val int8 + memoIdx int32 +} + +type entryInt8 struct { + h uint64 + payload payloadInt8 +} + +func (e entryInt8) Valid() bool { return e.h != sentinel } + +// Int8HashTable is a hashtable specifically for int8 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int8HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt8 +} + +// NewInt8HashTable returns a new hash table for int8 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt8HashTable(cap uint64) *Int8HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int8HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt8, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Int8HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt8, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int8HashTable) CopyValues(out []int8) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Int8HashTable) CopyValuesSubset(start int, out []int8) { + h.VisitEntries(func(e *entryInt8) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int8HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int8HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int8Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt8) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = e.payload.val + } + }) +} + +func (h *Int8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int8HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Int8HashTable) Lookup(v uint64, cmp func(int8) bool) (*entryInt8, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int8HashTable) lookup(v uint64, szMask uint64, cmp func(int8) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt8 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int8HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt8, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int8) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int8HashTable) Insert(e *entryInt8, v uint64, val int8, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Int8HashTable) VisitEntries(visit func(*entryInt8)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int8MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int8MemoTable struct { + tbl *Int8HashTable + nullIdx int32 +} + +// NewInt8MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewInt8MemoTable(num int64) *Int8MemoTable { + return &Int8MemoTable{tbl: NewInt8HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Int8MemoTable) TypeTraits() TypeTraits { + return arrow.Int8Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int8MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int8MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int8MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int8MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Int8MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int8MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int8)) +} + +func (s *Int8MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Int8Traits.CastFromBytes(out)) +} + +func (s *Int8MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Int8Traits.CastFromBytes(out)) +} + +func (s *Int8MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Int8MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int8MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int8)), 0) + if e, ok := s.tbl.Lookup(h, func(v int8) bool { return val.(int8) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int8)), 0) + e, ok := s.tbl.Lookup(h, func(v int8) bool { + return val.(int8) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int8), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + +type payloadUint8 struct { + val uint8 + memoIdx int32 +} + +type entryUint8 struct { + h uint64 + payload payloadUint8 +} + +func (e entryUint8) Valid() bool { return e.h != sentinel } + +// Uint8HashTable is a hashtable specifically for uint8 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Uint8HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryUint8 +} + +// NewUint8HashTable returns a new hash table for uint8 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewUint8HashTable(cap uint64) *Uint8HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Uint8HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryUint8, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Uint8HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryUint8, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Uint8HashTable) CopyValues(out []uint8) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Uint8HashTable) CopyValuesSubset(start int, out []uint8) { + h.VisitEntries(func(e *entryUint8) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Uint8HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Uint8HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Uint8Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryUint8) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = e.payload.val + } + }) +} + +func (h *Uint8HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Uint8HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Uint8HashTable) Lookup(v uint64, cmp func(uint8) bool) (*entryUint8, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Uint8HashTable) lookup(v uint64, szMask uint64, cmp func(uint8) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryUint8 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Uint8HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryUint8, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(uint8) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Uint8HashTable) Insert(e *entryUint8, v uint64, val uint8, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Uint8HashTable) VisitEntries(visit func(*entryUint8)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Uint8MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Uint8MemoTable struct { + tbl *Uint8HashTable + nullIdx int32 +} + +// NewUint8MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewUint8MemoTable(num int64) *Uint8MemoTable { + return &Uint8MemoTable{tbl: NewUint8HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Uint8MemoTable) TypeTraits() TypeTraits { + return arrow.Uint8Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Uint8MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Uint8MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Uint8MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Uint8MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Uint8MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Uint8MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]uint8)) +} + +func (s *Uint8MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Uint8Traits.CastFromBytes(out)) +} + +func (s *Uint8MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Uint8Traits.CastFromBytes(out)) +} + +func (s *Uint8MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Uint8MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Uint8MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(uint8)), 0) + if e, ok := s.tbl.Lookup(h, func(v uint8) bool { return val.(uint8) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(uint8)), 0) + e, ok := s.tbl.Lookup(h, func(v uint8) bool { + return val.(uint8) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(uint8), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + +type payloadInt16 struct { + val int16 + memoIdx int32 +} + +type entryInt16 struct { + h uint64 + payload payloadInt16 +} + +func (e entryInt16) Valid() bool { return e.h != sentinel } + +// Int16HashTable is a hashtable specifically for int16 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int16HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt16 +} + +// NewInt16HashTable returns a new hash table for int16 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt16HashTable(cap uint64) *Int16HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int16HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt16, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Int16HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt16, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int16HashTable) CopyValues(out []int16) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Int16HashTable) CopyValuesSubset(start int, out []int16) { + h.VisitEntries(func(e *entryInt16) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int16HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int16HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int16Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt16) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEInt16(e.payload.val) + } + }) +} + +func (h *Int16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int16HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Int16HashTable) Lookup(v uint64, cmp func(int16) bool) (*entryInt16, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int16HashTable) lookup(v uint64, szMask uint64, cmp func(int16) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt16 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int16HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt16, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int16) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int16HashTable) Insert(e *entryInt16, v uint64, val int16, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
func (h *Int16HashTable) VisitEntries(visit func(*entryInt16)) {
	for _, e := range h.entries {
		if e.Valid() {
			visit(&e)
		}
	}
}

// Int16MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int16MemoTable struct {
	tbl     *Int16HashTable
	nullIdx int32 // index of the inserted null, or KeyNotFound if none
}

// NewInt16MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt16MemoTable(num int64) *Int16MemoTable {
	return &Int16MemoTable{tbl: NewInt16HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// TypeTraits returns the Arrow type traits for the underlying int16 values.
func (Int16MemoTable) TypeTraits() TypeTraits {
	return arrow.Int16Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int16MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int16MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int16MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int16MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int16MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int16MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]int16))
}

// WriteOut writes the values to out as native-endian int16 bytes, in insertion order.
func (s *Int16MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.Int16Traits.CastFromBytes(out))
}

// WriteOutSubset is like WriteOut but only writes values with index >= start.
func (s *Int16MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.Int16Traits.CastFromBytes(out))
}

// WriteOutLE writes the values to out as little-endian bytes, in insertion order.
func (s *Int16MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubsetLE is like WriteOutLE but only writes values with index >= start.
func (s *Int16MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int16MemoTable) Get(val interface{}) (int, bool) {

	h := hashInt(uint64(val.(int16)), 0)
	if e, ok := s.tbl.Lookup(h, func(v int16) bool { return val.(int16) == v }); ok {
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {

	h := hashInt(uint64(val.(int16)), 0)
	e, ok := s.tbl.Lookup(h, func(v int16) bool {
		return val.(int16) == v
	})

	if ok {
		idx = int(e.payload.memoIdx)
		found = true
	} else {
		idx = s.Size()
		s.tbl.Insert(e, h, val.(int16), int32(idx))
	}
	return
}

// GetOrInsertBytes is unimplemented
func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}

// payloadUint16 couples a stored uint16 value with the memo index at which it was inserted.
type payloadUint16 struct {
	val     uint16
	memoIdx int32
}

// entryUint16 is a single hash-table slot: the (fixed) hash plus its payload.
type entryUint16 struct {
	h       uint64
	payload payloadUint16
}

// Valid reports whether the entry holds an inserted value (the sentinel hash marks empty slots).
func (e entryUint16) Valid() bool { return e.h != sentinel }

// Uint16HashTable is a hashtable specifically for uint16 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint16HashTable struct {
	cap     uint64
	capMask uint64
	size    uint64

	entries []entryUint16
}

// NewUint16HashTable returns a new hash table for uint16 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint16HashTable(cap uint64) *Uint16HashTable {
	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	ret := &Uint16HashTable{cap: initCap, capMask: initCap - 1, size: 0}
	ret.entries = make([]entryUint16, initCap)
	return ret
}

// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint16HashTable) Reset(cap uint64) {
	// capacity is always a power of two so capMask can be used for modulo
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entryUint16, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint16HashTable) CopyValues(out []uint16) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint16HashTable) CopyValuesSubset(start int, out []uint16) {
	h.VisitEntries(func(e *entryUint16) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

// WriteOut serializes all values to out as little-endian bytes, in insertion order.
func (h *Uint16HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

// WriteOutSubset serializes the values whose memo index is >= start to out as
// little-endian bytes, in insertion order.
func (h *Uint16HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.Uint16Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entryUint16) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			data[idx] = utils.ToLEUint16(e.payload.val)
		}
	})
}

// needUpsize reports whether the table has exceeded its load factor and must grow.
func (h *Uint16HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }

// fixHash remaps the sentinel hash value (reserved to mark empty slots) to an
// arbitrary non-sentinel value so a real entry is never mistaken for an empty slot.
func (Uint16HashTable) fixHash(v uint64) uint64 {
	if v == sentinel {
		return 42
	}
	return v
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint16HashTable) Lookup(v uint64, cmp func(uint16) bool) (*entryUint16, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup is the internal open-addressing probe: it returns the index of the
// matching entry, or of the first empty slot where the value may be inserted,
// along with whether a match was found.
func (h *Uint16HashTable) lookup(v uint64, szMask uint64, cmp func(uint16) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryUint16
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

// upsize grows the table to newcap (assumed to be a power of two) and
// re-probes every valid entry into the new slot array.
func (h *Uint16HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryUint16, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// cmp always returns false: we only want the first empty slot for e.h
			idx, _ := h.lookup(e.h, newMask, func(uint16) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint16HashTable) Insert(e *entryUint16, v uint64, val uint16, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Uint16HashTable) VisitEntries(visit func(*entryUint16)) {
	for _, e := range h.entries {
		if e.Valid() {
			visit(&e)
		}
	}
}

// Uint16MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Uint16MemoTable struct {
	tbl     *Uint16HashTable
	nullIdx int32 // index of the inserted null, or KeyNotFound if none
}

// NewUint16MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewUint16MemoTable(num int64) *Uint16MemoTable {
	return &Uint16MemoTable{tbl: NewUint16HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// TypeTraits returns the Arrow type traits for the underlying uint16 values.
func (Uint16MemoTable) TypeTraits() TypeTraits {
	return arrow.Uint16Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Uint16MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Uint16MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Uint16MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Uint16MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Uint16MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Uint16MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]uint16))
}

// WriteOut writes the values to out as native-endian uint16 bytes, in insertion order.
func (s *Uint16MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.Uint16Traits.CastFromBytes(out))
}

// WriteOutSubset is like WriteOut but only writes values with index >= start.
func (s *Uint16MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.Uint16Traits.CastFromBytes(out))
}

// WriteOutLE writes the values to out as little-endian bytes, in insertion order.
func (s *Uint16MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubsetLE is like WriteOutLE but only writes values with index >= start.
func (s *Uint16MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Uint16MemoTable) Get(val interface{}) (int, bool) {

	h := hashInt(uint64(val.(uint16)), 0)
	if e, ok := s.tbl.Lookup(h, func(v uint16) bool { return val.(uint16) == v }); ok {
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Uint16MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {

	h := hashInt(uint64(val.(uint16)), 0)
	e, ok := s.tbl.Lookup(h, func(v uint16) bool {
		return val.(uint16) == v
	})

	if ok {
		idx = int(e.payload.memoIdx)
		found = true
	} else {
		idx = s.Size()
		s.tbl.Insert(e, h, val.(uint16), int32(idx))
	}
	return
}

// GetOrInsertBytes is unimplemented
func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}

// payloadInt32 couples a stored int32 value with the memo index at which it was inserted.
type payloadInt32 struct {
	val     int32
	memoIdx int32
}

// entryInt32 is a single hash-table slot: the (fixed) hash plus its payload.
type entryInt32 struct {
	h       uint64
	payload payloadInt32
}

// Valid reports whether the entry holds an inserted value (the sentinel hash marks empty slots).
func (e entryInt32) Valid() bool { return e.h != sentinel }

// Int32HashTable is a hashtable specifically for int32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int32HashTable struct {
	cap     uint64
	capMask uint64
	size    uint64

	entries []entryInt32
}

// NewInt32HashTable returns a new hash table for int32 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt32HashTable(cap uint64) *Int32HashTable {
	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
	ret.entries = make([]entryInt32, initCap)
	return ret
}

// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int32HashTable) Reset(cap uint64) {
	// capacity is always a power of two so capMask can be used for modulo
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entryInt32, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int32HashTable) CopyValues(out []int32) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) {
	h.VisitEntries(func(e *entryInt32) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

// WriteOut serializes all values to out as little-endian bytes, in insertion order.
func (h *Int32HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

// WriteOutSubset serializes the values whose memo index is >= start to out as
// little-endian bytes, in insertion order.
func (h *Int32HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.Int32Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entryInt32) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			data[idx] = utils.ToLEInt32(e.payload.val)
		}
	})
}

// needUpsize reports whether the table has exceeded its load factor and must grow.
func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }

// fixHash remaps the sentinel hash value (reserved to mark empty slots) to an
// arbitrary non-sentinel value so a real entry is never mistaken for an empty slot.
func (Int32HashTable) fixHash(v uint64) uint64 {
	if v == sentinel {
		return 42
	}
	return v
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup is the internal open-addressing probe: it returns the index of the
// matching entry, or of the first empty slot where the value may be inserted,
// along with whether a match was found.
func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryInt32
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

// upsize grows the table to newcap (assumed to be a power of two) and
// re-probes every valid entry into the new slot array.
func (h *Int32HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryInt32, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// cmp always returns false: we only want the first empty slot for e.h
			idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) {
	for _, e := range h.entries {
		if e.Valid() {
			visit(&e)
		}
	}
}

// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int32MemoTable struct {
	tbl     *Int32HashTable
	nullIdx int32 // index of the inserted null, or KeyNotFound if none
}

// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt32MemoTable(num int64) *Int32MemoTable {
	return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// TypeTraits returns the Arrow type traits for the underlying int32 values.
func (Int32MemoTable) TypeTraits() TypeTraits {
	return arrow.Int32Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int32MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int32MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int32MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int32MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]int32))
}

// WriteOut writes the values to out as native-endian int32 bytes, in insertion order.
func (s *Int32MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.Int32Traits.CastFromBytes(out))
}

// WriteOutSubset is like WriteOut but only writes values with index >= start.
func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.Int32Traits.CastFromBytes(out))
}

// WriteOutLE writes the values to out as little-endian bytes, in insertion order.
func (s *Int32MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubsetLE is like WriteOutLE but only writes values with index >= start.
func (s *Int32MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int32MemoTable) Get(val interface{}) (int, bool) {

	h := hashInt(uint64(val.(int32)), 0)
	if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok {
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {

	h := hashInt(uint64(val.(int32)), 0)
	e, ok := s.tbl.Lookup(h, func(v int32) bool {
		return val.(int32) == v
	})

	if ok {
		idx = int(e.payload.memoIdx)
		found = true
	} else {
		idx = s.Size()
		s.tbl.Insert(e, h, val.(int32), int32(idx))
	}
	return
}

// GetOrInsertBytes is unimplemented
func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}

// payloadInt64 couples a stored int64 value with the memo index at which it was inserted.
type payloadInt64 struct {
	val     int64
	memoIdx int32
}

// entryInt64 is a single hash-table slot: the (fixed) hash plus its payload.
type entryInt64 struct {
	h       uint64
	payload payloadInt64
}

// Valid reports whether the entry holds an inserted value (the sentinel hash marks empty slots).
func (e entryInt64) Valid() bool { return e.h != sentinel }

// Int64HashTable is a hashtable specifically for int64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Int64HashTable struct {
	cap     uint64
	capMask uint64
	size    uint64

	entries []entryInt64
}

// NewInt64HashTable returns a new hash table for int64 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewInt64HashTable(cap uint64) *Int64HashTable {
	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
	ret.entries = make([]entryInt64, initCap)
	return ret
}

// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Int64HashTable) Reset(cap uint64) {
	// capacity is always a power of two so capMask can be used for modulo
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entryInt64, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Int64HashTable) CopyValues(out []int64) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) {
	h.VisitEntries(func(e *entryInt64) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

// WriteOut serializes all values to out as little-endian bytes, in insertion order.
func (h *Int64HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

// WriteOutSubset serializes the values whose memo index is >= start to out as
// little-endian bytes, in insertion order.
func (h *Int64HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.Int64Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entryInt64) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			data[idx] = utils.ToLEInt64(e.payload.val)
		}
	})
}

// needUpsize reports whether the table has exceeded its load factor and must grow.
func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }

// fixHash remaps the sentinel hash value (reserved to mark empty slots) to an
// arbitrary non-sentinel value so a real entry is never mistaken for an empty slot.
func (Int64HashTable) fixHash(v uint64) uint64 {
	if v == sentinel {
		return 42
	}
	return v
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup is the internal open-addressing probe: it returns the index of the
// matching entry, or of the first empty slot where the value may be inserted,
// along with whether a match was found.
func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryInt64
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

// upsize grows the table to newcap (assumed to be a power of two) and
// re-probes every valid entry into the new slot array.
func (h *Int64HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryInt64, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// cmp always returns false: we only want the first empty slot for e.h
			idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) {
	for _, e := range h.entries {
		if e.Valid() {
			visit(&e)
		}
	}
}

// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int64MemoTable struct {
	tbl     *Int64HashTable
	nullIdx int32 // index of the inserted null, or KeyNotFound if none
}

// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further
// allocations when inserting.
func NewInt64MemoTable(num int64) *Int64MemoTable {
	return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound}
}

// TypeTraits returns the Arrow type traits for the underlying int64 values.
func (Int64MemoTable) TypeTraits() TypeTraits {
	return arrow.Int64Traits
}

// Reset allows this table to be re-used by dumping all the data currently in the table.
func (s *Int64MemoTable) Reset() {
	s.tbl.Reset(32)
	s.nullIdx = KeyNotFound
}

// Size returns the current number of inserted elements into the table including if a null
// has been inserted.
func (s *Int64MemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// GetNull returns the index of an inserted null or KeyNotFound along with a bool
// that will be true if found and false if not.
func (s *Int64MemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// GetOrInsertNull will return the index of the null entry or insert a null entry
// if one currently doesn't exist. The found value will be true if there was already
// a null in the table, and false if it inserted one.
func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = s.GetNull()
	if !found {
		idx = s.Size()
		s.nullIdx = int32(idx)
	}
	return
}

// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type.
func (s *Int64MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]int64))
}

// WriteOut writes the values to out as native-endian int64 bytes, in insertion order.
func (s *Int64MemoTable) WriteOut(out []byte) {
	s.tbl.CopyValues(arrow.Int64Traits.CastFromBytes(out))
}

// WriteOutSubset is like WriteOut but only writes values with index >= start.
func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.CopyValuesSubset(start, arrow.Int64Traits.CastFromBytes(out))
}

// WriteOutLE writes the values to out as little-endian bytes, in insertion order.
func (s *Int64MemoTable) WriteOutLE(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubsetLE is like WriteOutLE but only writes values with index >= start.
func (s *Int64MemoTable) WriteOutSubsetLE(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}

// Get returns the index of the requested value in the hash table or KeyNotFound
// along with a boolean indicating if it was found or not.
func (s *Int64MemoTable) Get(val interface{}) (int, bool) {

	h := hashInt(uint64(val.(int64)), 0)
	if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok {
		return int(e.payload.memoIdx), ok
	}
	return KeyNotFound, false
}

// GetOrInsert will return the index of the specified value in the table, or insert the
// value into the table and return the new index. found indicates whether or not it already
// existed in the table (true) or was inserted by this call (false).
func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {

	h := hashInt(uint64(val.(int64)), 0)
	e, ok := s.tbl.Lookup(h, func(v int64) bool {
		return val.(int64) == v
	})

	if ok {
		idx = int(e.payload.memoIdx)
		found = true
	} else {
		idx = s.Size()
		s.tbl.Insert(e, h, val.(int64), int32(idx))
	}
	return
}

// GetOrInsertBytes is unimplemented
func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
	panic("unimplemented")
}

// payloadUint32 couples a stored uint32 value with the memo index at which it was inserted.
type payloadUint32 struct {
	val     uint32
	memoIdx int32
}

// entryUint32 is a single hash-table slot: the (fixed) hash plus its payload.
type entryUint32 struct {
	h       uint64
	payload payloadUint32
}

// Valid reports whether the entry holds an inserted value (the sentinel hash marks empty slots).
func (e entryUint32) Valid() bool { return e.h != sentinel }

// Uint32HashTable is a hashtable specifically for uint32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Uint32HashTable struct {
	cap     uint64
	capMask uint64
	size    uint64

	entries []entryUint32
}

// NewUint32HashTable returns a new hash table for uint32 values
// initialized with the passed in capacity or 32 whichever is larger.
func NewUint32HashTable(cap uint64) *Uint32HashTable {
	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	ret := &Uint32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
	ret.entries = make([]entryUint32, initCap)
	return ret
}

// Reset drops all of the values in this hash table and re-initializes it
// with the specified initial capacity as if by calling New, but without having
// to reallocate the object.
func (h *Uint32HashTable) Reset(cap uint64) {
	// capacity is always a power of two so capMask can be used for modulo
	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
	h.capMask = h.cap - 1
	h.size = 0
	h.entries = make([]entryUint32, h.cap)
}

// CopyValues is used for copying the values out of the hash table into the
// passed in slice, in the order that they were first inserted
func (h *Uint32HashTable) CopyValues(out []uint32) {
	h.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies a subset of the values in the hashtable out, starting
// with the value at start, in the order that they were inserted.
func (h *Uint32HashTable) CopyValuesSubset(start int, out []uint32) {
	h.VisitEntries(func(e *entryUint32) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			out[idx] = e.payload.val
		}
	})
}

// WriteOut serializes all values to out as little-endian bytes, in insertion order.
func (h *Uint32HashTable) WriteOut(out []byte) {
	h.WriteOutSubset(0, out)
}

// WriteOutSubset serializes the values whose memo index is >= start to out as
// little-endian bytes, in insertion order.
func (h *Uint32HashTable) WriteOutSubset(start int, out []byte) {
	data := arrow.Uint32Traits.CastFromBytes(out)
	h.VisitEntries(func(e *entryUint32) {
		idx := e.payload.memoIdx - int32(start)
		if idx >= 0 {
			data[idx] = utils.ToLEUint32(e.payload.val)
		}
	})
}

// needUpsize reports whether the table has exceeded its load factor and must grow.
func (h *Uint32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }

// fixHash remaps the sentinel hash value (reserved to mark empty slots) to an
// arbitrary non-sentinel value so a real entry is never mistaken for an empty slot.
func (Uint32HashTable) fixHash(v uint64) uint64 {
	if v == sentinel {
		return 42
	}
	return v
}

// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
func (h *Uint32HashTable) Lookup(v uint64, cmp func(uint32) bool) (*entryUint32, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}

// lookup is the internal open-addressing probe: it returns the index of the
// matching entry, or of the first empty slot where the value may be inserted,
// along with whether a match was found.
func (h *Uint32HashTable) lookup(v uint64, szMask uint64, cmp func(uint32) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryUint32
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			return idx, true
		}

		if e.h == sentinel {
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object
		// the goal is that all 64 bits of unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}

// upsize grows the table to newcap (assumed to be a power of two) and
// re-probes every valid entry into the new slot array.
func (h *Uint32HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryUint32, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// cmp always returns false: we only want the first empty slot for e.h
			idx, _ := h.lookup(e.h, newMask, func(uint32) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	return nil
}

// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Uint32HashTable) Insert(e *entryUint32, v uint64, val uint32, memoIdx int32) error {
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}

// VisitEntries will call the passed in function on each *valid* entry in the hash table,
// a valid entry being one which has had a value inserted into it.
+func (h *Uint32HashTable) VisitEntries(visit func(*entryUint32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Uint32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Uint32MemoTable struct { + tbl *Uint32HashTable + nullIdx int32 +} + +// NewUint32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewUint32MemoTable(num int64) *Uint32MemoTable { + return &Uint32MemoTable{tbl: NewUint32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Uint32MemoTable) TypeTraits() TypeTraits { + return arrow.Uint32Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Uint32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Uint32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Uint32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Uint32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Uint32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Uint32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]uint32)) +} + +func (s *Uint32MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Uint32Traits.CastFromBytes(out)) +} + +func (s *Uint32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Uint32Traits.CastFromBytes(out)) +} + +func (s *Uint32MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Uint32MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Uint32MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(uint32)), 0) + if e, ok := s.tbl.Lookup(h, func(v uint32) bool { return val.(uint32) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Uint32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(uint32)), 0) + e, ok := s.tbl.Lookup(h, func(v uint32) bool { + return val.(uint32) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(uint32), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + +type payloadUint64 struct { + val uint64 + memoIdx int32 +} + +type entryUint64 struct { + h uint64 + payload payloadUint64 +} + +func (e entryUint64) Valid() bool { return e.h != sentinel } + +// Uint64HashTable is a hashtable specifically for uint64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Uint64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryUint64 +} + +// NewUint64HashTable returns a new hash table for uint64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewUint64HashTable(cap uint64) *Uint64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Uint64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryUint64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Uint64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryUint64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Uint64HashTable) CopyValues(out []uint64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Uint64HashTable) CopyValuesSubset(start int, out []uint64) { + h.VisitEntries(func(e *entryUint64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Uint64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Uint64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Uint64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryUint64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEUint64(e.payload.val) + } + }) +} + +func (h *Uint64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Uint64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Uint64HashTable) Lookup(v uint64, cmp func(uint64) bool) (*entryUint64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Uint64HashTable) lookup(v uint64, szMask uint64, cmp func(uint64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryUint64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Uint64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryUint64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(uint64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Uint64HashTable) Insert(e *entryUint64, v uint64, val uint64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Uint64HashTable) VisitEntries(visit func(*entryUint64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Uint64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Uint64MemoTable struct { + tbl *Uint64HashTable + nullIdx int32 +} + +// NewUint64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewUint64MemoTable(num int64) *Uint64MemoTable { + return &Uint64MemoTable{tbl: NewUint64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Uint64MemoTable) TypeTraits() TypeTraits { + return arrow.Uint64Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Uint64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Uint64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Uint64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Uint64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Uint64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Uint64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]uint64)) +} + +func (s *Uint64MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Uint64Traits.CastFromBytes(out)) +} + +func (s *Uint64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Uint64Traits.CastFromBytes(out)) +} + +func (s *Uint64MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Uint64MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Uint64MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(uint64)), 0) + if e, ok := s.tbl.Lookup(h, func(v uint64) bool { return val.(uint64) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Uint64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(uint64)), 0) + e, ok := s.tbl.Lookup(h, func(v uint64) bool { + return val.(uint64) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(uint64), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + +type payloadFloat32 struct { + val float32 + memoIdx int32 +} + +type entryFloat32 struct { + h uint64 + payload payloadFloat32 +} + +func (e entryFloat32) Valid() bool { return e.h != sentinel } + +// Float32HashTable is a hashtable specifically for float32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat32 +} + +// NewFloat32HashTable returns a new hash table for float32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat32HashTable(cap uint64) *Float32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float32HashTable) CopyValues(out []float32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float32HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float32HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float32Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat32(e.payload.val) + } + }) +} + +func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float32MemoTable struct { + tbl *Float32HashTable + nullIdx int32 +} + +// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat32MemoTable(num int64) *Float32MemoTable { + return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Float32MemoTable) TypeTraits() TypeTraits { + return arrow.Float32Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Float32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float32)) +} + +func (s *Float32MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Float32Traits.CastFromBytes(out)) +} + +func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Float32Traits.CastFromBytes(out)) +} + +func (s *Float32MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float32MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float32MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float32), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} + +type payloadFloat64 struct { + val float64 + memoIdx int32 +} + +type entryFloat64 struct { + h uint64 + payload payloadFloat64 +} + +func (e entryFloat64) Valid() bool { return e.h != sentinel } + +// Float64HashTable is a hashtable specifically for float64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat64 +} + +// NewFloat64HashTable returns a new hash table for float64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat64HashTable(cap uint64) *Float64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float64HashTable) CopyValues(out []float64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat64(e.payload.val) + } + }) +} + +func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float64MemoTable struct { + tbl *Float64HashTable + nullIdx int32 +} + +// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat64MemoTable(num int64) *Float64MemoTable { + return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func (Float64MemoTable) TypeTraits() TypeTraits { + return arrow.Float64Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. 
+func (s *Float64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float64)) +} + +func (s *Float64MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.Float64Traits.CastFromBytes(out)) +} + +func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out)) +} + +func (s *Float64MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float64MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float64MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float64), int32(idx)) + } + return +} + +// GetOrInsertBytes is unimplemented +func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl new file mode 100644 index 000000000..527008ad6 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -0,0 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package hashing
+
+import (
+	"github.com/apache/arrow/go/v15/arrow/bitutil"
+	"github.com/apache/arrow/go/v15/internal/utils"
+)
+
+// One specialization of the hash table and memo table below is generated for
+// each numeric type listed in .In.
+{{range .In}}
+// payload{{.Name}} pairs a stored {{.name}} value with its insertion (memo) index.
+type payload{{.Name}} struct {
+	val {{.name}}
+	memoIdx int32
+}
+
+// entry{{.Name}} is one hash-table slot: the (fixed) hash plus its payload.
+type entry{{.Name}} struct {
+	h       uint64
+	payload payload{{.Name}}
+}
+
+// Valid reports whether the slot is populated; sentinel marks empty slots.
+func (e entry{{.Name}}) Valid() bool { return e.h != sentinel }
+
+// {{.Name}}HashTable is a hashtable specifically for {{.name}} that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type {{.Name}}HashTable struct {
+	cap     uint64
+	capMask uint64
+	size    uint64
+
+	entries []entry{{.Name}}
+}
+
+// New{{.Name}}HashTable returns a new hash table for {{.name}} values
+// initialized with the passed in capacity or 32 whichever is larger.
+func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable {
+	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+	ret.entries = make([]entry{{.Name}}, initCap)
+	return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *{{.Name}}HashTable) Reset(cap uint64) {
+	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+	h.capMask = h.cap - 1
+	h.size = 0
+	h.entries = make([]entry{{.Name}}, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) {
+	h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *{{.Name}}HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.{{.Name}}Traits.CastFromBytes(out) + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { +{{if and (ne .Name "Int8") (ne .Name "Uint8") -}} + data[idx] = utils.ToLE{{.Name}}(e.payload.val) +{{else -}} + data[idx] = e.payload.val +{{end -}} + } + }) +} + +func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func ({{.Name}}HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entry{{.Name}} + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate in the probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *{{.Name}}HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entry{{.Name}}, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type {{.Name}}MemoTable struct { + tbl *{{.Name}}HashTable + nullIdx int32 +} + +// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { + return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +func ({{.Name}}MemoTable) TypeTraits() TypeTraits { + return arrow.{{.Name}}Traits +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *{{.Name}}MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *{{.Name}}MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *{{.Name}}MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. 
+func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]{{.name}})) +} + +func (s *{{.Name}}MemoTable) WriteOut(out []byte) { + s.tbl.CopyValues(arrow.{{.Name}}Traits.CastFromBytes(out)) +} + +func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.CopyValuesSubset(start, arrow.{{.Name}}Traits.CastFromBytes(out)) +} + +func (s *{{.Name}}MemoTable) WriteOutLE(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *{{.Name}}MemoTable) WriteOutSubsetLE(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { +{{if and (ne .Name "Float32") (ne .Name "Float64") }} + h := hashInt(uint64(val.({{.name}})), 0) + if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { +{{ else -}} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { +{{ end -}} + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + {{if and (ne .Name "Float32") (ne .Name "Float64") }} + h := hashInt(uint64(val.({{.name}})), 0) + e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { + return val.({{.name}}) == v + }) +{{ else }} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + e, ok := s.tbl.Lookup(h, cmp) +{{ end }} + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) + } + return +} + + +// GetOrInsertBytes is unimplemented +func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + panic("unimplemented") +} +{{end}} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go new file mode 100644 index 000000000..283bc1a95 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/hashing/xxh3_memo_table.go @@ -0,0 +1,443 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hashing provides utilities for and an implementation of a hash +// table which is more performant than the default go map implementation +// by leveraging xxh3 and some custom hash functions. 
+package hashing + +import ( + "bytes" + "math" + "reflect" + "unsafe" +) + +//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl + +type TypeTraits interface { + BytesRequired(n int) int +} + +type ByteSlice interface { + Bytes() []byte +} + +// MemoTable interface for hash tables and dictionary encoding. +// +// Values will remember the order they are inserted to generate a valid +// dictionary. +type MemoTable interface { + TypeTraits() TypeTraits + // Reset drops everything in the table allowing it to be reused + Reset() + // Size returns the current number of unique values stored in + // the table, including whether or not a null value has been + // inserted via GetOrInsertNull. + Size() int + // GetOrInsert returns the index of the table the specified value is, + // and a boolean indicating whether or not the value was found in + // the table (if false, the value was inserted). An error is returned + // if val is not the appropriate type for the table. + GetOrInsert(val interface{}) (idx int, existed bool, err error) + // GetOrInsertBytes returns the index of the table the specified value is, + // and a boolean indicating whether or not the value was found in + // the table (if false, the value was inserted). An error is returned + // if val is not the appropriate type for the table. This function is intended to be used by + // the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}. + GetOrInsertBytes(val []byte) (idx int, existed bool, err error) + // GetOrInsertNull returns the index of the null value in the table, + // inserting one if it hasn't already been inserted. It returns a boolean + // indicating if the null value already existed or not in the table. + GetOrInsertNull() (idx int, existed bool) + // GetNull returns the index of the null value in the table, but does not + // insert one if it doesn't already exist. 
Will return -1 if it doesn't exist + // indicated by a false value for the boolean. + GetNull() (idx int, exists bool) + // WriteOut copies the unique values of the memotable out to the byte slice + // provided. Must have allocated enough bytes for all the values. + WriteOut(out []byte) + // WriteOutSubset is like WriteOut, but only writes a subset of values + // starting with the index offset. + WriteOutSubset(offset int, out []byte) +} + +type NumericMemoTable interface { + MemoTable + WriteOutLE(out []byte) + WriteOutSubsetLE(offset int, out []byte) +} + +const ( + sentinel uint64 = 0 + loadFactor int64 = 2 +) + +func max(a, b uint64) uint64 { + if a > b { + return a + } + return b +} + +var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } + +// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table +const KeyNotFound = -1 + +type BinaryBuilderIFace interface { + Reserve(int) + ReserveData(int) + Retain() + Resize(int) + ResizeData(int) + Release() + DataLen() int + Value(int) []byte + Len() int + AppendNull() + AppendString(string) + Append([]byte) +} + +// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder +// to construct the actual data in an easy to pass around way with minimal copies +// while using a hash table to keep track of the indexes into the dictionary that +// is created as we go. +type BinaryMemoTable struct { + tbl *Int32HashTable + builder BinaryBuilderIFace + nullIdx int +} + +// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will +// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. +// initial and valuesize can be used to pre-allocate the table to reduce allocations. With +// initial being the initial number of entries to allocate for and valuesize being the starting +// amount of space allocated for writing the actual binary data. 
+func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable { + bldr.Reserve(int(initial)) + datasize := valuesize + if datasize <= 0 { + datasize = initial * 4 + } + bldr.ReserveData(datasize) + return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} +} + +type unimplementedtraits struct{} + +func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") } + +func (BinaryMemoTable) TypeTraits() TypeTraits { + return unimplementedtraits{} +} + +// Reset dumps all of the data in the table allowing it to be reutilized. +func (s *BinaryMemoTable) Reset() { + s.tbl.Reset(32) + s.builder.Resize(0) + s.builder.ResizeData(0) + s.builder.Reserve(int(32)) + s.builder.ReserveData(int(32) * 4) + s.nullIdx = KeyNotFound +} + +// GetNull returns the index of a null that has been inserted into the table or +// KeyNotFound. The bool returned will be true if there was a null inserted into +// the table, and false otherwise. +func (s *BinaryMemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// Size returns the current size of the memo table including the null value +// if one has been inserted. +func (s *BinaryMemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// helper function to easily return a byte slice for any given value +// regardless of the type if it's a []byte, string, or fulfills the +// ByteSlice interface. 
+func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { + switch v := val.(type) { + case []byte: + return v + case ByteSlice: + return v.Bytes() + case string: + var out []byte + h := (*reflect.StringHeader)(unsafe.Pointer(&v)) + s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) + s.Data = h.Data + s.Len = h.Len + s.Cap = h.Len + return out + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to get the hash value regardless of the underlying binary type +func (BinaryMemoTable) getHash(val interface{}) uint64 { + switch v := val.(type) { + case string: + return hashString(v, 0) + case []byte: + return Hash(v, 0) + case ByteSlice: + return Hash(v.Bytes(), 0) + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to append the given value to the builder regardless +// of the underlying binary type. +func (b *BinaryMemoTable) appendVal(val interface{}) { + switch v := val.(type) { + case string: + b.builder.AppendString(v) + case []byte: + b.builder.Append(v) + case ByteSlice: + b.builder.Append(v.Bytes()) + } +} + +func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { + return b.tbl.Lookup(h, func(i int32) bool { + return bytes.Equal(val, b.builder.Value(int(i))) + }) +} + +// Get returns the index of the specified value in the table or KeyNotFound, +// and a boolean indicating whether it was found in the table. +func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { + if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { + return int(p.payload.val), ok + } + return KeyNotFound, false +} + +// GetOrInsertBytes returns the index of the given value in the table, if not found +// it is inserted into the table. The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. 
+func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { + h := Hash(val, 0) + p, found := b.lookup(h, val) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.builder.Append(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + +// GetOrInsert returns the index of the given value in the table, if not found +// it is inserted into the table. The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. +func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + h := b.getHash(val) + p, found := b.lookup(h, b.valAsByteSlice(val)) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.appendVal(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + +// GetOrInsertNull retrieves the index of a null in the table or inserts +// null into the table, returning the index and a boolean indicating if it was +// found in the table (true) or was inserted (false). +func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = b.GetNull() + if !found { + idx = b.Size() + b.nullIdx = idx + b.builder.AppendNull() + } + return +} + +func (b *BinaryMemoTable) Value(i int) []byte { + return b.builder.Value(i) +} + +// helper function to get the offset into the builder data for a given +// index value. 
+func (b *BinaryMemoTable) findOffset(idx int) uintptr { + if b.builder.DataLen() == 0 { + // only empty strings, short circuit + return 0 + } + + val := b.builder.Value(idx) + for len(val) == 0 { + idx++ + if idx >= b.builder.Len() { + break + } + val = b.builder.Value(idx) + } + if len(val) != 0 { + return uintptr(unsafe.Pointer(&val[0])) + } + return uintptr(b.builder.DataLen()) + b.findOffset(0) +} + +// CopyOffsets copies the list of offsets into the passed in slice, the offsets +// being the start and end values of the underlying allocated bytes in the builder +// for the individual values of the table. out should be at least sized to Size()+1 +func (b *BinaryMemoTable) CopyOffsets(out []int32) { + b.CopyOffsetsSubset(0, out) +} + +// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, +// it gets a subset of the offsets in the table starting at the index provided by "start". +func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) { + if b.builder.Len() <= start { + return + } + + first := b.findOffset(0) + delta := b.findOffset(start) + sz := b.Size() + for i := start; i < sz; i++ { + offset := int32(b.findOffset(i) - delta) + out[i-start] = offset + } + + out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first))) +} + +// CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets +// being the start and end values of the underlying allocated bytes in the builder +// for the individual values of the table. out should be at least sized to Size()+1 +func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) { + b.CopyLargeOffsetsSubset(0, out) +} + +// CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, +// it gets a subset of the offsets in the table starting at the index provided by "start". 
+func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) { + if b.builder.Len() <= start { + return + } + + first := b.findOffset(0) + delta := b.findOffset(start) + sz := b.Size() + for i := start; i < sz; i++ { + offset := int64(b.findOffset(i) - delta) + out[i-start] = offset + } + + out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first))) +} + +// CopyValues copies the raw binary data bytes out, out should be a []byte +// with at least ValuesSize bytes allocated to copy into. +func (b *BinaryMemoTable) CopyValues(out interface{}) { + b.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies the raw binary data bytes out starting with the value +// at the index start, out should be a []byte with at least ValuesSize bytes allocated +func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { + if b.builder.Len() <= start { + return + } + + var ( + first = b.findOffset(0) + offset = b.findOffset(int(start)) + length = b.builder.DataLen() - int(offset-first) + ) + + outval := out.([]byte) + copy(outval, b.builder.Value(start)[0:length]) +} + +func (b *BinaryMemoTable) WriteOut(out []byte) { + b.CopyValues(out) +} + +func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { + b.CopyValuesSubset(start, out) +} + +// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep +// track of the fixed width when inserting the null value the databuffer holds a +// zero length byte slice for the null value (if found) +func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { + if start >= b.Size() { + return + } + + null, exists := b.GetNull() + if !exists || null < start { + // nothing to skip, proceed as usual + b.CopyValuesSubset(start, out) + return + } + + var ( + leftOffset = b.findOffset(start) + nullOffset = b.findOffset(null) + leftSize = nullOffset - leftOffset + rightOffset = leftOffset + uintptr(b.ValuesSize()) + ) + + if leftSize > 0 { + copy(out, 
b.builder.Value(start)[0:leftSize]) + } + + rightSize := rightOffset - nullOffset + if rightSize > 0 { + // skip the null fixed size value + copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize]) + } +} + +// VisitValues exists to run the visitFn on each value currently in the hash table. +func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { + for i := int(start); i < b.Size(); i++ { + visitFn(b.builder.Value(i)) + } +} + +// Release is used to tell the underlying builder that it can release the memory allocated +// when the reference count reaches 0, this is safe to be called from multiple goroutines +// simultaneously +func (b *BinaryMemoTable) Release() { b.builder.Release() } + +// Retain increases the ref count, it is safe to call it from multiple goroutines +// simultaneously. +func (b *BinaryMemoTable) Retain() { b.builder.Retain() } + +// ValuesSize returns the current total size of all the raw bytes that have been inserted +// into the memotable so far. +func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() } diff --git a/vendor/github.com/apache/arrow/go/v15/internal/json/json.go b/vendor/github.com/apache/arrow/go/v15/internal/json/json.go new file mode 100644 index 000000000..319b12c55 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/json/json.go @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !tinygo +// +build !tinygo + +package json + +import ( + "io" + + "github.com/goccy/go-json" +) + +type Decoder = json.Decoder +type Encoder = json.Encoder +type Marshaler = json.Marshaler +type Delim = json.Delim +type UnmarshalTypeError = json.UnmarshalTypeError +type Number = json.Number +type Unmarshaler = json.Unmarshaler +type RawMessage = json.RawMessage + +func Marshal(v interface{}) ([]byte, error) { + return json.Marshal(v) +} + +func Unmarshal(data []byte, v interface{}) error { + return json.Unmarshal(data, v) +} + +func NewDecoder(r io.Reader) *Decoder { + return json.NewDecoder(r) +} + +func NewEncoder(w io.Writer) *Encoder { + return json.NewEncoder(w) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go b/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go new file mode 100644 index 000000000..8e4f447b3 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/json/json_tinygo.go @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build tinygo +// +build tinygo + +package json + +import ( + "io" + + "encoding/json" +) + +type Decoder = json.Decoder +type Encoder = json.Encoder +type Marshaler = json.Marshaler +type Delim = json.Delim +type UnmarshalTypeError = json.UnmarshalTypeError +type Number = json.Number +type Unmarshaler = json.Unmarshaler +type RawMessage = json.RawMessage + +func Marshal(v interface{}) ([]byte, error) { + return json.Marshal(v) +} + +func Unmarshal(data []byte, v interface{}) error { + return json.Unmarshal(data, v) +} + +func NewDecoder(r io.Reader) *Decoder { + return json.NewDecoder(r) +} + +func NewEncoder(w io.Writer) *Encoder { + return json.NewEncoder(w) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile b/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile new file mode 100644 index 000000000..fded9d1d5 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/Makefile @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this converts rotate instructions from "ro[lr] <reg>" -> "ro[lr] <reg>, 1" for yasm compatibility +PERL_FIXUP_ROTATE=perl -i -pe 's/(ro[rl]\s+\w{2,3})$$/\1, 1/' + +C2GOASM=c2goasm +CC=clang-11 +C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 \ + -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib +ASM_FLAGS_AVX2=-mavx2 -mfma +ASM_FLAGS_SSE4=-msse4 +ASM_FLAGS_BMI2=-mbmi2 +ASM_FLAGS_POPCNT=-mpopcnt + +C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unwind-tables -mno-red-zone -mstackrealign -fno-exceptions \ + -fno-rtti -fno-builtin -ffast-math -fno-jump-tables -I_lib + +GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go') +ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go') + +.PHONEY: assembly + +INTEL_SOURCES := \ + min_max_avx2_amd64.s min_max_sse4_amd64.s transpose_ints_avx2_amd64.s transpose_ints_sse4_amd64.s + +# +# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support. +# min_max_neon_arm64.s was generated by asm2plan9s. +# And manually formatted it as the Arm64 Plan9. 
+# + +assembly: $(INTEL_SOURCES) + +_lib/min_max_avx2_amd64.s: _lib/min_max.c + $(CC) -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +_lib/min_max_sse4_amd64.s: _lib/min_max.c + $(CC) -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +_lib/min_max_neon.s: _lib/min_max.c + $(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +_lib/transpose_ints_avx2_amd64.s: _lib/transpose_ints.c + $(CC) -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +_lib/transpose_ints_sse4_amd64.s: _lib/transpose_ints.c + $(CC) -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +_lib/transpose_ints_neon.s: _lib/transpose_ints.c + $(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@ + +min_max_avx2_amd64.s: _lib/min_max_avx2_amd64.s + $(C2GOASM) -a -f $^ $@ + +min_max_sse4_amd64.s: _lib/min_max_sse4_amd64.s + $(C2GOASM) -a -f $^ $@ + +transpose_ints_avx2_amd64.s: _lib/transpose_ints_avx2_amd64.s + $(C2GOASM) -a -f $^ $@ + +transpose_ints_sse4_amd64.s: _lib/transpose_ints_sse4_amd64.s + $(C2GOASM) -a -f $^ $@ + +clean: + rm -f $(INTEL_SOURCES) + rm -f $(addprefix _lib/,$(INTEL_SOURCES)) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go new file mode 100644 index 000000000..0b2381da1 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/buf_reader.go @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+)
+
+// bufferedReader is similar to bufio.Reader except
+// it will expand the buffer if necessary when asked to Peek
+// more bytes than are in the buffer
+type bufferedReader struct {
+	// bufferSz is the desired buffer size; resetBuffer grows or reslices
+	// buf to match it.
+	bufferSz int
+	buf      []byte
+	// r and w are the read and write offsets into buf: buffered data
+	// lives in buf[r:w].
+	r, w     int
+	rd       io.Reader
+	// err holds a deferred error from the underlying reader; it is
+	// surfaced (and cleared) by readErr.
+	err      error
+}
+
+// NewBufferedReader returns a buffered reader with similar semantics to bufio.Reader
+// except Peek will expand the internal buffer if needed rather than return
+// an error.
+func NewBufferedReader(rd io.Reader, sz int) *bufferedReader {
+	// if rd is already a buffered reader whose buffer is >= the requested size
+	// then just return it as is. no need to make a new object.
+	b, ok := rd.(*bufferedReader)
+	if ok && len(b.buf) >= sz {
+		return b
+	}
+
+	r := &bufferedReader{
+		rd: rd,
+	}
+	r.resizeBuffer(sz)
+	return r
+}
+
+// resetBuffer allocates, grows, or reslices buf so that len(buf) == bufferSz,
+// copying the existing contents when the buffer must grow.
+func (b *bufferedReader) resetBuffer() {
+	if b.buf == nil {
+		b.buf = make([]byte, b.bufferSz)
+	} else if b.bufferSz > cap(b.buf) {
+		buf := b.buf
+		b.buf = make([]byte, b.bufferSz)
+		copy(b.buf, buf)
+	} else {
+		b.buf = b.buf[:b.bufferSz]
+	}
+}
+
+// resizeBuffer records the new target size and applies it via resetBuffer.
+func (b *bufferedReader) resizeBuffer(newSize int) {
+	b.bufferSz = newSize
+	b.resetBuffer()
+}
+
+// fill reads at least one byte from the underlying reader into the free
+// space of buf. A read error is stashed in b.err rather than returned;
+// the only immediate error is an already-full buffer.
+func (b *bufferedReader) fill() error {
+	// slide existing data to the beginning
+	if b.r > 0 {
+		copy(b.buf, b.buf[b.r:b.w])
+		b.w -= b.r
+		b.r = 0
+	}
+
+	if b.w >= len(b.buf) {
+		return fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrBufferFull)
+	}
+
+	n, err := io.ReadAtLeast(b.rd, b.buf[b.w:], 1)
+	if n < 0 {
+		return fmt.Errorf("arrow/bufferedreader: filling buffer: %w", bufio.ErrNegativeCount)
+	}
+
+	b.w += n
+	b.err = err
+	return nil
+}
+
+// readErr returns and clears the deferred error from the last fill/read.
+func (b *bufferedReader) readErr() error {
+	err := b.err
+	b.err = nil
+	return err
+}
+
+// Buffered returns the number of bytes currently buffered
+func (b *bufferedReader) Buffered() int { return b.w - b.r }
+
+// SetBufferSize resets the size of the internal buffer to the desired size.
+// Will return an error if newSize is <= 0 or if newSize is less than the size
+// of the buffered data.
+func (b *bufferedReader) SetBufferSize(newSize int) error {
+	if newSize <= 0 {
+		return errors.New("buffer size should be positive")
+	}
+
+	// NOTE(review): this compares the write offset b.w rather than
+	// b.Buffered(); when b.r > 0 it overstates the live data and may reject
+	// a shrink that would actually fit — confirm this is intended.
+	if b.w >= newSize {
+		return errors.New("cannot shrink read buffer if buffered data remains")
+	}
+
+	b.resizeBuffer(newSize)
+	return nil
+}
+
+// Peek will buffer and return n bytes from the underlying reader without advancing
+// the reader itself. If n is larger than the current buffer size, the buffer will
+// be expanded to accommodate the extra bytes rather than error.
+func (b *bufferedReader) Peek(n int) ([]byte, error) {
+	if n < 0 {
+		return nil, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+	}
+
+	if n > len(b.buf) {
+		if err := b.SetBufferSize(n); err != nil {
+			return nil, err
+		}
+	}
+
+	// fill until n bytes are buffered or the underlying reader errors;
+	// fill's own (buffer-full) error cannot occur while b.w-b.r < len(b.buf).
+	for b.w-b.r < n && b.w-b.r < len(b.buf) && b.err == nil {
+		b.fill() // b.w-b.r < len(b.buf) => buffer is not full
+	}
+
+	// NOTE(review): the returned slice always has length n; if the loop
+	// stopped early on an error, bytes past b.w are stale — callers must
+	// check the error before trusting the slice contents.
+	return b.buf[b.r : b.r+n], b.readErr()
+}
+
+// Discard skips the next n bytes either by advancing the internal buffer
+// or by reading that many bytes in and throwing them away.
+func (b *bufferedReader) Discard(n int) (discarded int, err error) {
+	if n < 0 {
+		return 0, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	remain := n
+	for {
+		skip := b.Buffered()
+		if skip == 0 {
+			// fill's return is ignored: with nothing buffered the buffer
+			// cannot be full, and read errors land in b.err (checked below).
+			b.fill()
+			skip = b.Buffered()
+		}
+		if skip > remain {
+			skip = remain
+		}
+		b.r += skip
+		remain -= skip
+		if remain == 0 {
+			return n, nil
+		}
+		if b.err != nil {
+			return n - remain, b.readErr()
+		}
+	}
+}
+
+// Read implements io.Reader, draining the internal buffer first and
+// bypassing it entirely for reads at least as large as the buffer.
+func (b *bufferedReader) Read(p []byte) (n int, err error) {
+	n = len(p)
+	if n == 0 {
+		if b.Buffered() > 0 {
+			return 0, nil
+		}
+		return 0, b.readErr()
+	}
+
+	if b.r == b.w {
+		if b.err != nil {
+			return 0, b.readErr()
+		}
+		if len(p) >= len(b.buf) {
+			// large read, empty buffer
+			// read directly into p to avoid extra copy
+			n, b.err = b.rd.Read(p)
+			if n < 0 {
+				return n, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+			}
+			return n, b.readErr()
+		}
+
+		// one read
+		// don't use b.fill
+		b.r, b.w = 0, 0
+		n, b.err = b.rd.Read(b.buf)
+		if n < 0 {
+			return n, fmt.Errorf("arrow/bufferedreader: %w", bufio.ErrNegativeCount)
+		}
+		if n == 0 {
+			return 0, b.readErr()
+		}
+		b.w += n
+	}
+
+	// copy as much as we can
+	n = copy(p, b.buf[b.r:b.w])
+	b.r += n
+	return n, nil
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go
new file mode 
100644
index 000000000..5fd257f52
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_default.go
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !s390x
+
+package utils
+
+// ToLE* convert native-endian values to little-endian. On every target
+// except s390x (see the build constraint) the code presumes a little-endian
+// machine, so each converter is the identity function.
+var (
+	ToLEInt16   = func(x int16) int16 { return x }
+	ToLEUint16  = func(x uint16) uint16 { return x }
+	ToLEUint32  = func(x uint32) uint32 { return x }
+	ToLEUint64  = func(x uint64) uint64 { return x }
+	ToLEInt32   = func(x int32) int32 { return x }
+	ToLEInt64   = func(x int64) int64 { return x }
+	ToLEFloat32 = func(x float32) float32 { return x }
+	ToLEFloat64 = func(x float64) float64 { return x }
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go
new file mode 100644
index 000000000..7bb27cd81
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/endians_s390x.go
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+	"math"
+	"math/bits"
+)
+
+// s390x is big-endian, so converting to little-endian values requires a
+// byte swap via bits.ReverseBytes*; the float variants round-trip through
+// their IEEE-754 bit patterns.
+var (
+	ToLEInt16   = func(x int16) int16 { return int16(bits.ReverseBytes16(uint16(x))) }
+	ToLEUint16  = bits.ReverseBytes16
+	ToLEUint32  = bits.ReverseBytes32
+	ToLEUint64  = bits.ReverseBytes64
+	ToLEInt32   = func(x int32) int32 { return int32(bits.ReverseBytes32(uint32(x))) }
+	ToLEInt64   = func(x int64) int64 { return int64(bits.ReverseBytes64(uint64(x))) }
+	ToLEFloat32 = func(x float32) float32 { return math.Float32frombits(bits.ReverseBytes32(math.Float32bits(x))) }
+	ToLEFloat64 = func(x float64) float64 { return math.Float64frombits(bits.ReverseBytes64(math.Float64bits(x))) }
+)
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go
new file mode 100644
index 000000000..c8311750e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/math.go
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import "golang.org/x/exp/constraints"
+
+// Min returns the smaller of a and b.
+func Min[T constraints.Ordered](a, b T) T {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// Max returns the larger of a and b.
+func Max[T constraints.Ordered](a, b T) T {
+	if a > b {
+		return a
+	}
+	return b
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go
new file mode 100644
index 000000000..3d7b0024a
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max.go
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+	"math"
+)
+
+// this file contains pure go implementations of the min_max functions that are
+// SIMD accelerated so that we can fallback to these if the cpu doesn't support
+// AVX2 or SSE4 instructions.
+
+// Each fallback below scans the slice once, tracking the running min and
+// max. For an empty slice the initial sentinels are returned unchanged
+// (e.g. min == MaxInt8 and max == MinInt8 for int8MinMax).
+func int8MinMax(values []int8) (min, max int8) {
+	min = math.MaxInt8
+	max = math.MinInt8
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func uint8MinMax(values []uint8) (min, max uint8) {
+	min = math.MaxUint8
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func int16MinMax(values []int16) (min, max int16) {
+	min = math.MaxInt16
+	max = math.MinInt16
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func uint16MinMax(values []uint16) (min, max uint16) {
+	min = math.MaxUint16
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func int32MinMax(values []int32) (min, max int32) {
+	min = math.MaxInt32
+	max = math.MinInt32
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func uint32MinMax(values []uint32) (min, max uint32) {
+	min = math.MaxUint32
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func int64MinMax(values []int64) (min, max int64) {
+	min = math.MaxInt64
+	max = math.MinInt64
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+func uint64MinMax(values []uint64) (min, max uint64) {
+	min = math.MaxUint64
+	max = 0
+
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+		if max < v {
+			max = v
+		}
+	}
+	return
+}
+
+// minmaxFuncs is the per-type dispatch table; the architecture-specific
+// init functions (see min_max_amd64.go / min_max_arm64.go) populate it
+// with SIMD kernels or the pure-Go fallbacks above.
+var minmaxFuncs = struct {
+	i8   func([]int8) (int8, int8)
+	ui8  func([]uint8) (uint8, uint8)
+	i16  func([]int16) (int16, int16)
+	ui16 func([]uint16) (uint16, uint16)
+	i32  func([]int32) (int32, int32)
+	ui32 func([]uint32) (uint32, uint32)
+	i64  func([]int64) (int64, int64)
+	ui64 func([]uint64) (uint64, uint64)
+}{}
+
+// GetMinMaxInt8 returns the min and max for a int8 slice, using AVX2 or
+// SSE4 
cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+//
+// NOTE(review): these wrappers dispatch through minmaxFuncs, which is only
+// populated by an arch-specific init (min_max_amd64.go / min_max_arm64.go
+// in this chunk); verify an init exists for every supported GOARCH, since
+// an unpopulated entry would be a nil-func call.
+func GetMinMaxInt8(v []int8) (min, max int8) {
+	return minmaxFuncs.i8(v)
+}
+
+// GetMinMaxUint8 returns the min and max for a uint8 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint8(v []uint8) (min, max uint8) {
+	return minmaxFuncs.ui8(v)
+}
+
+// GetMinMaxInt16 returns the min and max for a int16 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt16(v []int16) (min, max int16) {
+	return minmaxFuncs.i16(v)
+}
+
+// GetMinMaxUint16 returns the min and max for a uint16 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint16(v []uint16) (min, max uint16) {
+	return minmaxFuncs.ui16(v)
+}
+
+// GetMinMaxInt32 returns the min and max for a int32 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt32(v []int32) (min, max int32) {
+	return minmaxFuncs.i32(v)
+}
+
+// GetMinMaxUint32 returns the min and max for a uint32 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint32(v []uint32) (min, max uint32) {
+	return minmaxFuncs.ui32(v)
+}
+
+// GetMinMaxInt64 returns the min and max for a int64 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxInt64(v []int64) (min, max int64) {
+	return minmaxFuncs.i64(v)
+}
+
+// GetMinMaxUint64 returns the min and max for a uint64 slice, using AVX2 or
+// SSE4 cpu extensions if available, falling back to a pure go implementation
+// if they are unavailable or built with the noasm tag.
+func GetMinMaxUint64(v []uint64) (min, max uint64) {
+	return minmaxFuncs.ui64(v)
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go
new file mode 100644
index 000000000..5fccddbee
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_amd64.go
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import "golang.org/x/sys/cpu"
+
+func init() {
+	// if the CPU supports AVX2 or SSE4 then let's use those to benefit from SIMD
+	// to accelerate the performance for finding the min and max for an integral slice.
+	// otherwise fallback to a pure go implementation if the cpu doesn't have these features.
+	// NOTE(review): the SSE4 kernels are gated on HasSSE42 — presumably
+	// SSE4.2 support implies every instruction the generated SSE4 assembly
+	// uses; confirm against the generator flags in the Makefile.
+	if cpu.X86.HasAVX2 {
+		minmaxFuncs.i8 = int8MaxMinAVX2
+		minmaxFuncs.ui8 = uint8MaxMinAVX2
+		minmaxFuncs.i16 = int16MaxMinAVX2
+		minmaxFuncs.ui16 = uint16MaxMinAVX2
+		minmaxFuncs.i32 = int32MaxMinAVX2
+		minmaxFuncs.ui32 = uint32MaxMinAVX2
+		minmaxFuncs.i64 = int64MaxMinAVX2
+		minmaxFuncs.ui64 = uint64MaxMinAVX2
+	} else if cpu.X86.HasSSE42 {
+		minmaxFuncs.i8 = int8MaxMinSSE4
+		minmaxFuncs.ui8 = uint8MaxMinSSE4
+		minmaxFuncs.i16 = int16MaxMinSSE4
+		minmaxFuncs.ui16 = uint16MaxMinSSE4
+		minmaxFuncs.i32 = int32MaxMinSSE4
+		minmaxFuncs.ui32 = uint32MaxMinSSE4
+		minmaxFuncs.i64 = int64MaxMinSSE4
+		minmaxFuncs.ui64 = uint64MaxMinSSE4
+	} else {
+		minmaxFuncs.i8 = int8MinMax
+		minmaxFuncs.ui8 = uint8MinMax
+		minmaxFuncs.i16 = int16MinMax
+		minmaxFuncs.ui16 = uint16MinMax
+		minmaxFuncs.i32 = int32MinMax
+		minmaxFuncs.ui32 = uint32MinMax
+		minmaxFuncs.i64 = int64MinMax
+		minmaxFuncs.ui64 = uint64MinMax
+	}
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go
new file mode 100644
index 000000000..7404e95d9
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_arm64.go
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+	"os"
+	"strings"
+)
+import "golang.org/x/sys/cpu"
+
+func init() {
+	// Added ability to enable extension via environment:
+	// ARM_ENABLE_EXT=NEON go test
+	if ext, ok := os.LookupEnv("ARM_ENABLE_EXT"); ok {
+		exts := strings.Split(ext, ",")
+
+		for _, x := range exts {
+			switch x {
+			case "NEON":
+				cpu.ARM64.HasASIMD = true
+			case "AES":
+				cpu.ARM64.HasAES = true
+			case "PMULL":
+				cpu.ARM64.HasPMULL = true
+			default:
+				// NOTE(review): any unrecognized token clears all three
+				// feature flags, including ones enabled by earlier tokens
+				// in the same list — confirm this is the intended
+				// semantics rather than ignoring the bad token.
+				cpu.ARM64.HasASIMD = false
+				cpu.ARM64.HasAES = false
+				cpu.ARM64.HasPMULL = false
+			}
+		}
+	}
+	if cpu.ARM64.HasASIMD {
+		minmaxFuncs.i32 = int32MaxMinNEON
+		minmaxFuncs.ui32 = uint32MaxMinNEON
+		minmaxFuncs.i64 = int64MaxMinNEON
+		minmaxFuncs.ui64 = uint64MaxMinNEON
+	} else {
+		minmaxFuncs.i32 = int32MinMax
+		minmaxFuncs.ui32 = uint32MinMax
+		minmaxFuncs.i64 = int64MinMax
+		minmaxFuncs.ui64 = uint64MinMax
+	}
+
+	// haven't yet generated the NEON arm64 for these
+	minmaxFuncs.i8 = int8MinMax
+	minmaxFuncs.ui8 = uint8MinMax
+	minmaxFuncs.i16 = int16MinMax
+	minmaxFuncs.ui16 = uint16MinMax
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go
new file mode 100644
index 000000000..af6726243
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.go
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package utils
+
+import (
+	"unsafe"
+)
+
+// This file contains convenience functions for utilizing AVX2 intrinsics to quickly
+// and efficiently get the min and max from an integral slice.
+//
+// NOTE(review): every wrapper below takes &values[0], so calling any of
+// these with an empty slice panics; callers must guarantee len(values) > 0.
+
+//go:noescape
+func _int8_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int8MaxMinAVX2(values []int8) (min, max int8) {
+	_int8_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _uint8_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint8MaxMinAVX2(values []uint8) (min, max uint8) {
+	_uint8_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _int16_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int16MaxMinAVX2(values []int16) (min, max int16) {
+	_int16_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _uint16_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint16MaxMinAVX2(values []uint16) (min, max uint16) {
+	_uint16_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _int32_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int32MaxMinAVX2(values []int32) (min, max int32) {
+	_int32_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _uint32_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint32MaxMinAVX2(values []uint32) (min, max uint32) {
+	_uint32_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _int64_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func int64MaxMinAVX2(values []int64) (min, max int64) {
+	_int64_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
+
+//go:noescape
+func _uint64_max_min_avx2(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer)
+
+func uint64MaxMinAVX2(values []uint64) (min, max uint64) {
+	_uint64_max_min_avx2(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max))
+	return
+}
diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s
new file mode 100644
index 000000000..fe0c36e0e
--- /dev/null
+++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_avx2_amd64.s
@@ -0,0 +1,927 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x010(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x018(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x020(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x028(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x030(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x038(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x040(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x048(SB)/8, $0x7f7f7f7f7f7f7f7f
+DATA LCDATA1<>+0x050(SB)/8, $0x8080808080808080
+DATA LCDATA1<>+0x058(SB)/8, $0x8080808080808080
+GLOBL LCDATA1<>(SB), 8, 
$96 + +TEXT ·_int8_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA1<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB0_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x3f // cmp esi, 63 + JA LBB0_4 + WORD $0xb041; BYTE $0x80 // mov r8b, -128 + WORD $0xb640; BYTE $0x7f // mov sil, 127 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB0_11 + +LBB0_1: + WORD $0xb640; BYTE $0x7f // mov sil, 127 + WORD $0xb041; BYTE $0x80 // mov r8b, -128 + JMP LBB0_12 + +LBB0_4: + WORD $0x8945; BYTE $0xca // mov r10d, r9d + LONG $0xc0e28341 // and r10d, -64 + LONG $0xc0428d49 // lea rax, [r10 - 64] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x06e8c149 // shr r8, 6 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB0_5 + WORD $0x894c; BYTE $0xc6 // mov rsi, r8 + LONG $0xfee68348 // and rsi, -2 + WORD $0xf748; BYTE $0xde // neg rsi + LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */ + LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1 + +LBB0_7: + LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax] + LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32] + LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64] + LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96] + LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4 + LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5 + LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4 + LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5 + LONG $0x387de2c4; BYTE $0xc6 // vpminsb ymm0, ymm0, ymm6 + LONG $0x386de2c4; BYTE $0xd7 // vpminsb ymm2, ymm2, ymm7 + LONG $0x3c75e2c4; BYTE $0xce // vpmaxsb ymm1, ymm1, ymm6 + LONG $0x3c65e2c4; BYTE 
$0xdf // vpmaxsb ymm3, ymm3, ymm7 + LONG $0x80e88348 // sub rax, -128 + LONG $0x02c68348 // add rsi, 2 + JNE LBB0_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB0_10 + +LBB0_9: + LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax] + LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32] + LONG $0x3c65e2c4; BYTE $0xdd // vpmaxsb ymm3, ymm3, ymm5 + LONG $0x3c75e2c4; BYTE $0xcc // vpmaxsb ymm1, ymm1, ymm4 + LONG $0x386de2c4; BYTE $0xd5 // vpminsb ymm2, ymm2, ymm5 + LONG $0x387de2c4; BYTE $0xc4 // vpminsb ymm0, ymm0, ymm4 + +LBB0_10: + LONG $0x3c75e2c4; BYTE $0xcb // vpmaxsb ymm1, ymm1, ymm3 + LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1 + LONG $0x3c71e2c4; BYTE $0xcb // vpmaxsb xmm1, xmm1, xmm3 + LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI0_2] */ + LONG $0x387de2c4; BYTE $0xc2 // vpminsb ymm0, ymm0, ymm2 + LONG $0xd171e9c5; BYTE $0x08 // vpsrlw xmm2, xmm1, 8 + LONG $0xcadaf1c5 // vpminub xmm1, xmm1, xmm2 + LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1 + LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1 + LONG $0x7ff08041 // xor r8b, 127 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0x3879e2c4; BYTE $0xc1 // vpminsb xmm0, xmm0, xmm1 + LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI0_3] */ + LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 + LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0xc67ef9c5 // vmovd esi, xmm0 + LONG $0x80f68040 // xor sil, -128 + WORD $0x394d; BYTE $0xca // cmp r10, r9 + JE LBB0_12 + +LBB0_11: + LONG $0x04b60f42; BYTE $0x17 // movzx eax, byte [rdi + r10] + WORD $0x3840; BYTE $0xc6 // cmp sil, al + LONG $0xf6b60f40 // movzx esi, sil + WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax + WORD $0x3841; BYTE $0xc0 // cmp r8b, al + LONG $0xc0b60f45 // movzx r8d, r8b + LONG $0xc04c0f44 // cmovl r8d, eax + LONG $0x01c28349 // add r10, 1 + WORD $0x394d; 
BYTE $0xd1 // cmp r9, r10 + JNE LBB0_11 + +LBB0_12: + WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b + WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil + VZEROUPPER + RET + +LBB0_5: + LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI0_0] */ + LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI0_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB0_9 + JMP LBB0_10 + +TEXT ·_uint8_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB1_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x3f // cmp esi, 63 + JA LBB1_4 + WORD $0xb640; BYTE $0xff // mov sil, -1 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + WORD $0xc031 // xor eax, eax + JMP LBB1_11 + +LBB1_1: + WORD $0xb640; BYTE $0xff // mov sil, -1 + WORD $0xc031 // xor eax, eax + JMP LBB1_12 + +LBB1_4: + WORD $0x8945; BYTE $0xca // mov r10d, r9d + LONG $0xc0e28341 // and r10d, -64 + LONG $0xc0428d49 // lea rax, [r10 - 64] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x06e8c149 // shr r8, 6 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB1_5 + WORD $0x894c; BYTE $0xc6 // mov rsi, r8 + LONG $0xfee68348 // and rsi, -2 + WORD $0xf748; BYTE $0xde // neg rsi + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + WORD $0xc031 // xor eax, eax + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + +LBB1_7: + LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax] + LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32] + LONG $0x746ffec5; WORD $0x4007 // vmovdqu ymm6, yword [rdi + rax + 64] + LONG $0x7c6ffec5; WORD $0x6007 // vmovdqu ymm7, yword [rdi + rax + 96] + LONG $0xccdaf5c5 // vpminub ymm1, 
ymm1, ymm4 + LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5 + LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4 + LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5 + LONG $0xcedaf5c5 // vpminub ymm1, ymm1, ymm6 + LONG $0xd7daedc5 // vpminub ymm2, ymm2, ymm7 + LONG $0xc6defdc5 // vpmaxub ymm0, ymm0, ymm6 + LONG $0xdfdee5c5 // vpmaxub ymm3, ymm3, ymm7 + LONG $0x80e88348 // sub rax, -128 + LONG $0x02c68348 // add rsi, 2 + JNE LBB1_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB1_10 + +LBB1_9: + LONG $0x246ffec5; BYTE $0x07 // vmovdqu ymm4, yword [rdi + rax] + LONG $0x6c6ffec5; WORD $0x2007 // vmovdqu ymm5, yword [rdi + rax + 32] + LONG $0xdddee5c5 // vpmaxub ymm3, ymm3, ymm5 + LONG $0xc4defdc5 // vpmaxub ymm0, ymm0, ymm4 + LONG $0xd5daedc5 // vpminub ymm2, ymm2, ymm5 + LONG $0xccdaf5c5 // vpminub ymm1, ymm1, ymm4 + +LBB1_10: + LONG $0xcadaf5c5 // vpminub ymm1, ymm1, ymm2 + LONG $0xc3defdc5 // vpmaxub ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1 + LONG $0xc2def9c5 // vpmaxub xmm0, xmm0, xmm2 + LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2 + LONG $0xc2eff9c5 // vpxor xmm0, xmm0, xmm2 + LONG $0xd071e9c5; BYTE $0x08 // vpsrlw xmm2, xmm0, 8 + LONG $0xc2daf9c5 // vpminub xmm0, xmm0, xmm2 + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0xc07ef9c5 // vmovd eax, xmm0 + WORD $0xd0f6 // not al + LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1 + LONG $0xc0daf1c5 // vpminub xmm0, xmm1, xmm0 + LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 + LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0xc67ef9c5 // vmovd esi, xmm0 + WORD $0x394d; BYTE $0xca // cmp r10, r9 + JE LBB1_12 + +LBB1_11: + LONG $0x04b60f46; BYTE $0x17 // movzx r8d, byte [rdi + r10] + WORD $0x3844; BYTE $0xc6 // cmp sil, r8b + LONG $0xf6b60f40 // movzx esi, sil + LONG $0xf0430f41 // cmovae esi, r8d + WORD $0x3844; BYTE $0xc0 // cmp al, r8b + WORD $0xb60f; BYTE $0xc0 // movzx eax, al + LONG $0xc0460f41 // 
cmovbe eax, r8d + LONG $0x01c28349 // add r10, 1 + WORD $0x394d; BYTE $0xd1 // cmp r9, r10 + JNE LBB1_11 + +LBB1_12: + WORD $0x0188 // mov byte [rcx], al + WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil + VZEROUPPER + RET + +LBB1_5: + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + WORD $0xc031 // xor eax, eax + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB1_9 + JMP LBB1_10 + +DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x010(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x018(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x020(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x028(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x030(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x038(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x040(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x048(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x050(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x058(SB)/8, $0x8000800080008000 +GLOBL LCDATA2<>(SB), 8, $96 + +TEXT ·_int16_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA2<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB2_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB2_4 + LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768 + LONG $0x7fffbe66 // mov si, 32767 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB2_11 + +LBB2_1: + LONG $0x7fffbe66 // mov si, 32767 + LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768 + JMP LBB2_12 + +LBB2_4: + WORD $0x8945; BYTE $0xca // mov r10d, r9d + LONG $0xe0e28341 // and r10d, -32 + LONG $0xe0428d49 // lea rax, [r10 - 32] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x05e8c149 // shr r8, 5 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB2_5 + WORD 
$0x894c; BYTE $0xc6 // mov rsi, r8 + LONG $0xfee68348 // and rsi, -2 + WORD $0xf748; BYTE $0xde // neg rsi + LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1 + +LBB2_7: + LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax] + LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32] + LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64] + LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96] + LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4 + LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5 + LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4 + LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5 + LONG $0xc6eafdc5 // vpminsw ymm0, ymm0, ymm6 + LONG $0xd7eaedc5 // vpminsw ymm2, ymm2, ymm7 + LONG $0xceeef5c5 // vpmaxsw ymm1, ymm1, ymm6 + LONG $0xdfeee5c5 // vpmaxsw ymm3, ymm3, ymm7 + LONG $0x40c08348 // add rax, 64 + LONG $0x02c68348 // add rsi, 2 + JNE LBB2_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB2_10 + +LBB2_9: + LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax] + LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32] + LONG $0xddeee5c5 // vpmaxsw ymm3, ymm3, ymm5 + LONG $0xcceef5c5 // vpmaxsw ymm1, ymm1, ymm4 + LONG $0xd5eaedc5 // vpminsw ymm2, ymm2, ymm5 + LONG $0xc4eafdc5 // vpminsw ymm0, ymm0, ymm4 + +LBB2_10: + LONG $0xcbeef5c5 // vpmaxsw ymm1, ymm1, ymm3 + LONG $0x397de3c4; WORD $0x01cb // vextracti128 xmm3, ymm1, 1 + LONG $0xcbeef1c5 // vpmaxsw xmm1, xmm1, xmm3 + LONG $0x4deff1c5; BYTE $0x40 // vpxor xmm1, xmm1, oword 64[rbp] /* [rip + .LCPI2_2] */ + LONG $0xc2eafdc5 // vpminsw ymm0, ymm0, ymm2 + LONG $0x4179e2c4; BYTE $0xc9 // vphminposuw xmm1, xmm1 + LONG $0x7e79c1c4; BYTE $0xc8 // vmovd r8d, xmm1 + LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // 
xor r8d, 32767 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0xc1eaf9c5 // vpminsw xmm0, xmm0, xmm1 + LONG $0x45eff9c5; BYTE $0x50 // vpxor xmm0, xmm0, oword 80[rbp] /* [rip + .LCPI2_3] */ + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0xc67ef9c5 // vmovd esi, xmm0 + LONG $0x8000f681; WORD $0x0000 // xor esi, 32768 + WORD $0x394d; BYTE $0xca // cmp r10, r9 + JE LBB2_12 + +LBB2_11: + LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10] + WORD $0x3966; BYTE $0xc6 // cmp si, ax + WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax + LONG $0xc0394166 // cmp r8w, ax + LONG $0xc04c0f44 // cmovl r8d, eax + LONG $0x01c28349 // add r10, 1 + WORD $0x394d; BYTE $0xd1 // cmp r9, r10 + JNE LBB2_11 + +LBB2_12: + LONG $0x01894466 // mov word [rcx], r8w + WORD $0x8966; BYTE $0x32 // mov word [rdx], si + VZEROUPPER + RET + +LBB2_5: + LONG $0x4d6ffdc5; BYTE $0x00 // vmovdqa ymm1, yword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x456ffdc5; BYTE $0x20 // vmovdqa ymm0, yword 32[rbp] /* [rip + .LCPI2_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB2_9 + JMP LBB2_10 + +TEXT ·_uint16_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB3_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB3_4 + LONG $0xffb84166; BYTE $0xff // mov r8w, -1 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + WORD $0xf631 // xor esi, esi + JMP LBB3_11 + +LBB3_1: + LONG $0xffb84166; BYTE $0xff // mov r8w, -1 + WORD $0xf631 // xor esi, esi + JMP LBB3_12 + +LBB3_4: + WORD $0x8945; BYTE $0xca // mov r10d, r9d + LONG $0xe0e28341 // and r10d, -32 + LONG $0xe0428d49 // lea rax, [r10 - 32] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x05e8c149 // shr r8, 5 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE 
$0xc0 // test rax, rax + JE LBB3_5 + WORD $0x894c; BYTE $0xc6 // mov rsi, r8 + LONG $0xfee68348 // and rsi, -2 + WORD $0xf748; BYTE $0xde // neg rsi + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + WORD $0xc031 // xor eax, eax + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + +LBB3_7: + LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax] + LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32] + LONG $0x746ffec5; WORD $0x4047 // vmovdqu ymm6, yword [rdi + 2*rax + 64] + LONG $0x7c6ffec5; WORD $0x6047 // vmovdqu ymm7, yword [rdi + 2*rax + 96] + LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4 + LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5 + LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4 + LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5 + LONG $0x3a75e2c4; BYTE $0xce // vpminuw ymm1, ymm1, ymm6 + LONG $0x3a6de2c4; BYTE $0xd7 // vpminuw ymm2, ymm2, ymm7 + LONG $0x3e7de2c4; BYTE $0xc6 // vpmaxuw ymm0, ymm0, ymm6 + LONG $0x3e65e2c4; BYTE $0xdf // vpmaxuw ymm3, ymm3, ymm7 + LONG $0x40c08348 // add rax, 64 + LONG $0x02c68348 // add rsi, 2 + JNE LBB3_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB3_10 + +LBB3_9: + LONG $0x246ffec5; BYTE $0x47 // vmovdqu ymm4, yword [rdi + 2*rax] + LONG $0x6c6ffec5; WORD $0x2047 // vmovdqu ymm5, yword [rdi + 2*rax + 32] + LONG $0x3e65e2c4; BYTE $0xdd // vpmaxuw ymm3, ymm3, ymm5 + LONG $0x3e7de2c4; BYTE $0xc4 // vpmaxuw ymm0, ymm0, ymm4 + LONG $0x3a6de2c4; BYTE $0xd5 // vpminuw ymm2, ymm2, ymm5 + LONG $0x3a75e2c4; BYTE $0xcc // vpminuw ymm1, ymm1, ymm4 + +LBB3_10: + LONG $0x3a75e2c4; BYTE $0xca // vpminuw ymm1, ymm1, ymm2 + LONG $0x3e7de2c4; BYTE $0xc3 // vpmaxuw ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c2 // vextracti128 xmm2, ymm0, 1 + LONG $0x3e79e2c4; BYTE $0xc2 // vpmaxuw xmm0, xmm0, xmm2 + LONG $0xd276e9c5 // vpcmpeqd xmm2, xmm2, xmm2 + LONG $0xc2eff9c5 // vpxor xmm0, xmm0, 
xmm2 + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0xc67ef9c5 // vmovd esi, xmm0 + WORD $0xd6f7 // not esi + LONG $0x397de3c4; WORD $0x01c8 // vextracti128 xmm0, ymm1, 1 + LONG $0x3a71e2c4; BYTE $0xc0 // vpminuw xmm0, xmm1, xmm0 + LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 + LONG $0x7e79c1c4; BYTE $0xc0 // vmovd r8d, xmm0 + WORD $0x394d; BYTE $0xca // cmp r10, r9 + JE LBB3_12 + +LBB3_11: + LONG $0x04b70f42; BYTE $0x57 // movzx eax, word [rdi + 2*r10] + LONG $0xc0394166 // cmp r8w, ax + LONG $0xc0430f44 // cmovae r8d, eax + WORD $0x3966; BYTE $0xc6 // cmp si, ax + WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax + LONG $0x01c28349 // add r10, 1 + WORD $0x394d; BYTE $0xd1 // cmp r9, r10 + JNE LBB3_11 + +LBB3_12: + WORD $0x8966; BYTE $0x31 // mov word [rcx], si + LONG $0x02894466 // mov word [rdx], r8w + VZEROUPPER + RET + +LBB3_5: + LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + WORD $0xc031 // xor eax, eax + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB3_9 + JMP LBB3_10 + +DATA LCDATA3<>+0x000(SB)/8, $0x7fffffff80000000 +GLOBL LCDATA3<>(SB), 8, $8 + +TEXT ·_int32_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA3<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB4_1 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB4_4 + LONG $0x0000ba41; WORD $0x8000 // mov r10d, -2147483648 + LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + JMP LBB4_7 + +LBB4_1: + LONG $0xffffffb8; BYTE $0x7f // mov eax, 2147483647 + LONG $0x000000be; BYTE $0x80 // mov esi, -2147483648 + JMP LBB4_8 + +LBB4_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0x587de2c4; WORD $0x0065 // vpbroadcastd ymm4, dword 0[rbp] /* [rip + .LCPI4_0] */ + LONG $0xe0e18341 
// and r9d, -32 + LONG $0x587de2c4; WORD $0x0445 // vpbroadcastd ymm0, dword 4[rbp] /* [rip + .LCPI4_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4 + LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4 + LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4 + +LBB4_5: + LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] + LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] + LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] + LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] + LONG $0x397dc2c4; BYTE $0xc0 // vpminsd ymm0, ymm0, ymm8 + LONG $0x3975c2c4; BYTE $0xc9 // vpminsd ymm1, ymm1, ymm9 + LONG $0x396dc2c4; BYTE $0xd2 // vpminsd ymm2, ymm2, ymm10 + LONG $0x3965c2c4; BYTE $0xdb // vpminsd ymm3, ymm3, ymm11 + LONG $0x3d5dc2c4; BYTE $0xe0 // vpmaxsd ymm4, ymm4, ymm8 + LONG $0x3d55c2c4; BYTE $0xe9 // vpmaxsd ymm5, ymm5, ymm9 + LONG $0x3d4dc2c4; BYTE $0xf2 // vpmaxsd ymm6, ymm6, ymm10 + LONG $0x3d45c2c4; BYTE $0xfb // vpmaxsd ymm7, ymm7, ymm11 + LONG $0x20c08348 // add rax, 32 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB4_5 + LONG $0x3d5de2c4; BYTE $0xe5 // vpmaxsd ymm4, ymm4, ymm5 + LONG $0x3d5de2c4; BYTE $0xe6 // vpmaxsd ymm4, ymm4, ymm6 + LONG $0x3d5de2c4; BYTE $0xe7 // vpmaxsd ymm4, ymm4, ymm7 + LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229 + LONG $0x3d59e2c4; BYTE $0xe5 // vpmaxsd xmm4, xmm4, xmm5 + LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4 + LONG $0x397de2c4; BYTE $0xc1 // vpminsd ymm0, ymm0, ymm1 + LONG $0x397de2c4; BYTE $0xc2 // vpminsd ymm0, ymm0, ymm2 + LONG $0x397de2c4; BYTE $0xc3 // vpminsd 
ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 + LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229 + LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 + LONG $0xc07ef9c5 // vmovd eax, xmm0 + WORD $0x8944; BYTE $0xd6 // mov esi, r10d + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB4_8 + +LBB4_7: + LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9] + WORD $0xf039 // cmp eax, esi + WORD $0x4f0f; BYTE $0xc6 // cmovg eax, esi + WORD $0x3941; BYTE $0xf2 // cmp r10d, esi + LONG $0xf24d0f41 // cmovge esi, r10d + LONG $0x01c18349 // add r9, 1 + WORD $0x8941; BYTE $0xf2 // mov r10d, esi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB4_7 + +LBB4_8: + WORD $0x3189 // mov dword [rcx], esi + WORD $0x0289 // mov dword [rdx], eax + VZEROUPPER + RET + +TEXT ·_uint32_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB5_1 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB5_4 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + LONG $0xffffffb8; BYTE $0xff // mov eax, -1 + WORD $0x3145; BYTE $0xd2 // xor r10d, r10d + JMP LBB5_7 + +LBB5_1: + LONG $0xffffffb8; BYTE $0xff // mov eax, -1 + WORD $0xf631 // xor esi, esi + JMP LBB5_8 + +LBB5_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0xe0e18341 // and r9d, -32 + LONG $0xe4efd9c5 // vpxor xmm4, xmm4, xmm4 + LONG $0xc076fdc5 // vpcmpeqd ymm0, ymm0, ymm0 + WORD $0xc031 // xor eax, eax + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3 + LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 + LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 + LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 + 
+LBB5_5: + LONG $0x046f7ec5; BYTE $0x87 // vmovdqu ymm8, yword [rdi + 4*rax] + LONG $0x4c6f7ec5; WORD $0x2087 // vmovdqu ymm9, yword [rdi + 4*rax + 32] + LONG $0x546f7ec5; WORD $0x4087 // vmovdqu ymm10, yword [rdi + 4*rax + 64] + LONG $0x5c6f7ec5; WORD $0x6087 // vmovdqu ymm11, yword [rdi + 4*rax + 96] + LONG $0x3b7dc2c4; BYTE $0xc0 // vpminud ymm0, ymm0, ymm8 + LONG $0x3b75c2c4; BYTE $0xc9 // vpminud ymm1, ymm1, ymm9 + LONG $0x3b6dc2c4; BYTE $0xd2 // vpminud ymm2, ymm2, ymm10 + LONG $0x3b65c2c4; BYTE $0xdb // vpminud ymm3, ymm3, ymm11 + LONG $0x3f5dc2c4; BYTE $0xe0 // vpmaxud ymm4, ymm4, ymm8 + LONG $0x3f55c2c4; BYTE $0xe9 // vpmaxud ymm5, ymm5, ymm9 + LONG $0x3f4dc2c4; BYTE $0xf2 // vpmaxud ymm6, ymm6, ymm10 + LONG $0x3f45c2c4; BYTE $0xfb // vpmaxud ymm7, ymm7, ymm11 + LONG $0x20c08348 // add rax, 32 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB5_5 + LONG $0x3f5de2c4; BYTE $0xe5 // vpmaxud ymm4, ymm4, ymm5 + LONG $0x3f5de2c4; BYTE $0xe6 // vpmaxud ymm4, ymm4, ymm6 + LONG $0x3f5de2c4; BYTE $0xe7 // vpmaxud ymm4, ymm4, ymm7 + LONG $0x397de3c4; WORD $0x01e5 // vextracti128 xmm5, ymm4, 1 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0x4e // vpshufd xmm5, xmm4, 78 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0xec70f9c5; BYTE $0xe5 // vpshufd xmm5, xmm4, 229 + LONG $0x3f59e2c4; BYTE $0xe5 // vpmaxud xmm4, xmm4, xmm5 + LONG $0x7e79c1c4; BYTE $0xe2 // vmovd r10d, xmm4 + LONG $0x3b7de2c4; BYTE $0xc1 // vpminud ymm0, ymm0, ymm1 + LONG $0x3b7de2c4; BYTE $0xc2 // vpminud ymm0, ymm0, ymm2 + LONG $0x3b7de2c4; BYTE $0xc3 // vpminud ymm0, ymm0, ymm3 + LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 + LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 + LONG $0xc870f9c5; BYTE $0x4e // vpshufd xmm1, xmm0, 78 + LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 + LONG $0xc870f9c5; BYTE $0xe5 // vpshufd xmm1, xmm0, 229 + LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 + LONG 
$0xc07ef9c5 // vmovd eax, xmm0 + WORD $0x8944; BYTE $0xd6 // mov esi, r10d + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB5_8 + +LBB5_7: + LONG $0x8f348b42 // mov esi, dword [rdi + 4*r9] + WORD $0xf039 // cmp eax, esi + WORD $0x430f; BYTE $0xc6 // cmovae eax, esi + WORD $0x3941; BYTE $0xf2 // cmp r10d, esi + LONG $0xf2470f41 // cmova esi, r10d + LONG $0x01c18349 // add r9, 1 + WORD $0x8941; BYTE $0xf2 // mov r10d, esi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB5_7 + +LBB5_8: + WORD $0x3189 // mov dword [rcx], esi + WORD $0x0289 // mov dword [rdx], eax + VZEROUPPER + RET + +DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000 +DATA LCDATA4<>+0x008(SB)/8, $0x7fffffffffffffff +GLOBL LCDATA4<>(SB), 8, $16 + +TEXT ·_int64_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA4<>(SB), BP + + QUAD $0xffffffffffffb848; WORD $0x7fff // mov rax, 9223372036854775807 + WORD $0xf685 // test esi, esi + JLE LBB6_1 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB6_4 + LONG $0x01508d4c // lea r10, [rax + 1] + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + JMP LBB6_7 + +LBB6_1: + LONG $0x01708d48 // lea rsi, [rax + 1] + JMP LBB6_8 + +LBB6_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0x597de2c4; WORD $0x0065 // vpbroadcastq ymm4, qword 0[rbp] /* [rip + .LCPI6_0] */ + LONG $0xf0e18341 // and r9d, -16 + LONG $0x597de2c4; WORD $0x0845 // vpbroadcastq ymm0, qword 8[rbp] /* [rip + .LCPI6_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 + LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 + LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 + LONG $0xfc6ffdc5 // vmovdqa ymm7, ymm4 + LONG $0xf46ffdc5 // vmovdqa ymm6, ymm4 + LONG $0xec6ffdc5 // vmovdqa ymm5, ymm4 + +LBB6_5: + LONG $0x046f7ec5; BYTE $0xc7 // vmovdqu ymm8, yword [rdi + 8*rax] + LONG $0x373d62c4; BYTE $0xc8 // vpcmpgtq ymm9, ymm8, ymm0 + LONG $0x4b3de3c4; WORD $0x90c0 // vblendvpd ymm0, 
ymm8, ymm0, ymm9 + LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32] + LONG $0x373562c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm9, ymm3 + LONG $0x4b35e3c4; WORD $0xa0db // vblendvpd ymm3, ymm9, ymm3, ymm10 + LONG $0x546f7ec5; WORD $0x40c7 // vmovdqu ymm10, yword [rdi + 8*rax + 64] + LONG $0x372d62c4; BYTE $0xda // vpcmpgtq ymm11, ymm10, ymm2 + LONG $0x4b2de3c4; WORD $0xb0d2 // vblendvpd ymm2, ymm10, ymm2, ymm11 + LONG $0x5c6f7ec5; WORD $0x60c7 // vmovdqu ymm11, yword [rdi + 8*rax + 96] + LONG $0x372562c4; BYTE $0xe1 // vpcmpgtq ymm12, ymm11, ymm1 + LONG $0x4b25e3c4; WORD $0xc0c9 // vblendvpd ymm1, ymm11, ymm1, ymm12 + LONG $0x375d42c4; BYTE $0xe0 // vpcmpgtq ymm12, ymm4, ymm8 + LONG $0x4b3de3c4; WORD $0xc0e4 // vblendvpd ymm4, ymm8, ymm4, ymm12 + LONG $0x374542c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm7, ymm9 + LONG $0x4b35e3c4; WORD $0x80ff // vblendvpd ymm7, ymm9, ymm7, ymm8 + LONG $0x374d42c4; BYTE $0xc2 // vpcmpgtq ymm8, ymm6, ymm10 + LONG $0x4b2de3c4; WORD $0x80f6 // vblendvpd ymm6, ymm10, ymm6, ymm8 + LONG $0x375542c4; BYTE $0xc3 // vpcmpgtq ymm8, ymm5, ymm11 + LONG $0x4b25e3c4; WORD $0x80ed // vblendvpd ymm5, ymm11, ymm5, ymm8 + LONG $0x10c08348 // add rax, 16 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB6_5 + LONG $0x375d62c4; BYTE $0xc7 // vpcmpgtq ymm8, ymm4, ymm7 + LONG $0x4b45e3c4; WORD $0x80e4 // vblendvpd ymm4, ymm7, ymm4, ymm8 + LONG $0x375de2c4; BYTE $0xfe // vpcmpgtq ymm7, ymm4, ymm6 + LONG $0x4b4de3c4; WORD $0x70e4 // vblendvpd ymm4, ymm6, ymm4, ymm7 + LONG $0x375de2c4; BYTE $0xf5 // vpcmpgtq ymm6, ymm4, ymm5 + LONG $0x4b55e3c4; WORD $0x60e4 // vblendvpd ymm4, ymm5, ymm4, ymm6 + LONG $0x197de3c4; WORD $0x01e5 // vextractf128 xmm5, ymm4, 1 + LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5 + LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd xmm4, xmm5, xmm4, xmm6 + LONG $0x0479e3c4; WORD $0x4eec // vpermilps xmm5, xmm4, 78 + LONG $0x3759e2c4; BYTE $0xf5 // vpcmpgtq xmm6, xmm4, xmm5 + LONG $0x4b51e3c4; WORD $0x60e4 // vblendvpd 
xmm4, xmm5, xmm4, xmm6 + LONG $0x7ef9c1c4; BYTE $0xe2 // vmovq r10, xmm4 + LONG $0x3765e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm3, ymm0 + LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4 + LONG $0x376de2c4; BYTE $0xd8 // vpcmpgtq ymm3, ymm2, ymm0 + LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3 + LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0 + LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 + LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 + LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 + LONG $0x0479e3c4; WORD $0x4ec8 // vpermilps xmm1, xmm0, 78 + LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 + LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x894c; BYTE $0xd6 // mov rsi, r10 + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB6_8 + +LBB6_7: + LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9] + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + LONG $0xc64f0f48 // cmovg rax, rsi + WORD $0x3949; BYTE $0xf2 // cmp r10, rsi + LONG $0xf24d0f49 // cmovge rsi, r10 + LONG $0x01c18349 // add r9, 1 + WORD $0x8949; BYTE $0xf2 // mov r10, rsi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB6_7 + +LBB6_8: + WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi + WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax + VZEROUPPER + RET + +DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000 +GLOBL LCDATA5<>(SB), 8, $8 + +TEXT ·_uint64_max_min_avx2(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA5<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB7_1 + WORD $0x8941; BYTE $0xf0 // mov r8d, esi + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB7_4 + LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 + WORD $0x3145; BYTE $0xc9 // xor r9d, r9d + WORD $0x3145; BYTE $0xd2 // xor 
r10d, r10d + JMP LBB7_7 + +LBB7_1: + LONG $0xffc0c748; WORD $0xffff; BYTE $0xff // mov rax, -1 + WORD $0xf631 // xor esi, esi + JMP LBB7_8 + +LBB7_4: + WORD $0x8945; BYTE $0xc1 // mov r9d, r8d + LONG $0xf0e18341 // and r9d, -16 + LONG $0xedefd1c5 // vpxor xmm5, xmm5, xmm5 + LONG $0xc976f5c5 // vpcmpeqd ymm1, ymm1, ymm1 + WORD $0xc031 // xor eax, eax + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI7_0] */ + LONG $0xe476ddc5 // vpcmpeqd ymm4, ymm4, ymm4 + LONG $0xdb76e5c5 // vpcmpeqd ymm3, ymm3, ymm3 + LONG $0xd276edc5 // vpcmpeqd ymm2, ymm2, ymm2 + LONG $0xef3941c4; BYTE $0xc0 // vpxor xmm8, xmm8, xmm8 + LONG $0xffefc1c5 // vpxor xmm7, xmm7, xmm7 + LONG $0xf6efc9c5 // vpxor xmm6, xmm6, xmm6 + +LBB7_5: + LONG $0x0c6f7ec5; BYTE $0xc7 // vmovdqu ymm9, yword [rdi + 8*rax] + LONG $0xd0ef75c5 // vpxor ymm10, ymm1, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0c9 // vblendvpd ymm1, ymm9, ymm1, ymm10 + LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x4b35e3c4; WORD $0xa0ed // vblendvpd ymm5, ymm9, ymm5, ymm10 + LONG $0x4c6f7ec5; WORD $0x20c7 // vmovdqu ymm9, yword [rdi + 8*rax + 32] + LONG $0xd0ef5dc5 // vpxor ymm10, ymm4, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0e4 // vblendvpd ymm4, ymm9, ymm4, ymm10 + LONG $0xd0ef3dc5 // vpxor ymm10, ymm8, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x5c6f7ec5; WORD $0x40c7 // vmovdqu ymm11, yword [rdi + 8*rax + 64] + LONG $0x4b3543c4; WORD $0xa0c0 // vblendvpd ymm8, ymm9, ymm8, ymm10 + LONG $0xc8ef65c5 // vpxor ymm9, ymm3, ymm0 + LONG $0xd0ef25c5 // vpxor ymm10, ymm11, ymm0 + LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 + LONG $0x4b25e3c4; WORD $0x90db // vblendvpd ymm3, ymm11, ymm3, ymm9 + 
LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 + LONG $0x373542c4; BYTE $0xca // vpcmpgtq ymm9, ymm9, ymm10 + LONG $0x4b25e3c4; WORD $0x90ff // vblendvpd ymm7, ymm11, ymm7, ymm9 + LONG $0x4c6f7ec5; WORD $0x60c7 // vmovdqu ymm9, yword [rdi + 8*rax + 96] + LONG $0xd0ef6dc5 // vpxor ymm10, ymm2, ymm0 + LONG $0xd8ef35c5 // vpxor ymm11, ymm9, ymm0 + LONG $0x372542c4; BYTE $0xd2 // vpcmpgtq ymm10, ymm11, ymm10 + LONG $0x4b35e3c4; WORD $0xa0d2 // vblendvpd ymm2, ymm9, ymm2, ymm10 + LONG $0xd0ef4dc5 // vpxor ymm10, ymm6, ymm0 + LONG $0x372d42c4; BYTE $0xd3 // vpcmpgtq ymm10, ymm10, ymm11 + LONG $0x4b35e3c4; WORD $0xa0f6 // vblendvpd ymm6, ymm9, ymm6, ymm10 + LONG $0x10c08348 // add rax, 16 + WORD $0x3949; BYTE $0xc1 // cmp r9, rax + JNE LBB7_5 + LONG $0xc8ef3dc5 // vpxor ymm9, ymm8, ymm0 + LONG $0xd0ef55c5 // vpxor ymm10, ymm5, ymm0 + LONG $0x372d42c4; BYTE $0xc9 // vpcmpgtq ymm9, ymm10, ymm9 + LONG $0x4b3de3c4; WORD $0x90ed // vblendvpd ymm5, ymm8, ymm5, ymm9 + LONG $0xc05755c5 // vxorpd ymm8, ymm5, ymm0 + LONG $0xc8ef45c5 // vpxor ymm9, ymm7, ymm0 + LONG $0x373d42c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm8, ymm9 + LONG $0x4b45e3c4; WORD $0x80ed // vblendvpd ymm5, ymm7, ymm5, ymm8 + LONG $0xf857d5c5 // vxorpd ymm7, ymm5, ymm0 + LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0 + LONG $0x3745c2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm7, ymm8 + LONG $0x4b4de3c4; WORD $0x70ed // vblendvpd ymm5, ymm6, ymm5, ymm7 + LONG $0x197de3c4; WORD $0x01ee // vextractf128 xmm6, ymm5, 1 + LONG $0xc05749c5 // vxorpd xmm8, xmm6, xmm0 + LONG $0xf857d1c5 // vxorpd xmm7, xmm5, xmm0 + LONG $0x3741c2c4; BYTE $0xf8 // vpcmpgtq xmm7, xmm7, xmm8 + LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 + LONG $0x0479e3c4; WORD $0x4ef5 // vpermilps xmm6, xmm5, 78 + LONG $0xc05751c5 // vxorpd xmm8, xmm5, xmm0 + LONG $0xf857c9c5 // vxorpd xmm7, xmm6, xmm0 + LONG $0x3739e2c4; BYTE $0xff // vpcmpgtq xmm7, xmm8, xmm7 + LONG $0x4b49e3c4; WORD $0x70ed // vblendvpd xmm5, xmm6, xmm5, xmm7 + LONG $0xf0eff5c5 // vpxor 
ymm6, ymm1, ymm0 + LONG $0xf8efddc5 // vpxor ymm7, ymm4, ymm0 + LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6 + LONG $0x4b5de3c4; WORD $0x60c9 // vblendvpd ymm1, ymm4, ymm1, ymm6 + LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0 + LONG $0xf0efe5c5 // vpxor ymm6, ymm3, ymm0 + LONG $0x374de2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm6, ymm4 + LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4 + LONG $0x7ef9c1c4; BYTE $0xea // vmovq r10, xmm5 + LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0 + LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0 + LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3 + LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3 + LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 + LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 + LONG $0xe057e9c5 // vxorpd xmm4, xmm2, xmm0 + LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 + LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 + LONG $0x0479e3c4; WORD $0x4ed1 // vpermilps xmm2, xmm1, 78 + LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 + LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0 + LONG $0x3779e2c4; BYTE $0xc3 // vpcmpgtq xmm0, xmm0, xmm3 + LONG $0x4b69e3c4; WORD $0x00c1 // vblendvpd xmm0, xmm2, xmm1, xmm0 + LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 + WORD $0x894c; BYTE $0xd6 // mov rsi, r10 + WORD $0x394d; BYTE $0xc1 // cmp r9, r8 + JE LBB7_8 + +LBB7_7: + LONG $0xcf348b4a // mov rsi, qword [rdi + 8*r9] + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + LONG $0xc6430f48 // cmovae rax, rsi + WORD $0x3949; BYTE $0xf2 // cmp r10, rsi + LONG $0xf2470f49 // cmova rsi, r10 + LONG $0x01c18349 // add r9, 1 + WORD $0x8949; BYTE $0xf2 // mov r10, rsi + WORD $0x394d; BYTE $0xc8 // cmp r8, r9 + JNE LBB7_7 + +LBB7_8: + WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi + WORD $0x8948; BYTE $0x02 // mov qword [rdx], rax + VZEROUPPER + RET diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go 
b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go new file mode 100644 index 000000000..f9d3c44e3 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.go @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +import "unsafe" + +// This file contains convenience functions for utilizing Arm64 Neon intrinsics to quickly +// and efficiently get the min and max from an integral slice. 
+ +//go:noescape +func _int32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int32MaxMinNEON(values []int32) (min, max int32) { + _int32_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint32MaxMinNEON(values []uint32) (min, max uint32) { + _uint32_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _int64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int64MaxMinNEON(values []int64) (min, max int64) { + _int64_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint64MaxMinNEON(values []uint64) (min, max uint64) { + _uint64_max_min_neon(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s new file mode 100644 index 000000000..b679bb6e3 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_neon_arm64.s @@ -0,0 +1,324 @@ +//+build !noasm !appengine + +// ARROW-15336 +// (C2GOASM doesn't work correctly for Arm64) +// Partly GENERATED BY asm2plan9s. + + +// func _int32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) +TEXT ·_int32_max_min_neon(SB), $0-32 + + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 + + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! 
+ WORD $0x7100043f // cmp w1, #1 + WORD $0x910003fd // mov x29, sp + BLT LBB0_3 + + WORD $0x71000c3f // cmp w1, #3 + WORD $0x2a0103e8 // mov w8, w1 + BHI LBB0_4 + + WORD $0xaa1f03e9 // mov x9, xzr + WORD $0x52b0000b // mov w11, #-2147483648 + WORD $0x12b0000a // mov w10, #2147483647 + JMP LBB0_7 +LBB0_3: + WORD $0x12b0000a // mov w10, #2147483647 + WORD $0x52b0000b // mov w11, #-2147483648 + WORD $0xb900006b // str w11, [x3] + WORD $0xb900004a // str w10, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET +LBB0_4: + WORD $0x927e7509 // and x9, x8, #0xfffffffc + WORD $0x9100200a // add x10, x0, #8 + WORD $0x0f046402 // movi v2.2s, #128, lsl #24 + WORD $0x2f046400 // mvni v0.2s, #128, lsl #24 + WORD $0x2f046401 // mvni v1.2s, #128, lsl #24 + WORD $0xaa0903eb // mov x11, x9 + WORD $0x0f046403 // movi v3.2s, #128, lsl #24 +LBB0_5: + WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8] + WORD $0xf100116b // subs x11, x11, #4 + WORD $0x9100414a // add x10, x10, #16 + WORD $0x0ea46c00 // smin v0.2s, v0.2s, v4.2s + WORD $0x0ea56c21 // smin v1.2s, v1.2s, v5.2s + WORD $0x0ea46442 // smax v2.2s, v2.2s, v4.2s + WORD $0x0ea56463 // smax v3.2s, v3.2s, v5.2s + BNE LBB0_5 + + WORD $0x0ea36442 // smax v2.2s, v2.2s, v3.2s + WORD $0x0ea16c00 // smin v0.2s, v0.2s, v1.2s + WORD $0x0e0c0441 // dup v1.2s, v2.s[1] + WORD $0x0e0c0403 // dup v3.2s, v0.s[1] + WORD $0x0ea16441 // smax v1.2s, v2.2s, v1.2s + WORD $0x0ea36c00 // smin v0.2s, v0.2s, v3.2s + WORD $0xeb08013f // cmp x9, x8 + WORD $0x1e26002b // fmov w11, s1 + WORD $0x1e26000a // fmov w10, s0 + BEQ LBB0_9 +LBB0_7: + WORD $0x8b09080c // add x12, x0, x9, lsl #2 + WORD $0xcb090108 // sub x8, x8, x9 +LBB0_8: + WORD $0xb8404589 // ldr w9, [x12], #4 + WORD $0x6b09015f // cmp w10, w9 + WORD $0x1a89b14a // csel w10, w10, w9, lt + WORD $0x6b09017f // cmp w11, w9 + WORD $0x1a89c16b // csel w11, w11, w9, gt + WORD $0xf1000508 // subs x8, x8, #1 + BNE LBB0_8 +LBB0_9: + WORD $0xb900006b // str w11, [x3] + WORD $0xb900004a // str w10, [x2] + WORD 
$0xa8c17bfd // ldp x29, x30, [sp], #16 + RET + +// func _uint32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) +TEXT ·_uint32_max_min_neon(SB), $0-32 + + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 + + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! + WORD $0x7100043f // cmp w1, #1 + WORD $0x910003fd // mov x29, sp + BLT LBB1_3 + + WORD $0x71000c3f // cmp w1, #3 + WORD $0x2a0103e8 // mov w8, w1 + BHI LBB1_4 + + WORD $0xaa1f03e9 // mov x9, xzr + WORD $0x2a1f03ea // mov w10, wzr + WORD $0x1280000b // mov w11, #-1 + JMP LBB1_7 +LBB1_3: + WORD $0x2a1f03ea // mov w10, wzr + WORD $0x1280000b // mov w11, #-1 + WORD $0xb900006a // str w10, [x3] + WORD $0xb900004b // str w11, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET +LBB1_4: + WORD $0x927e7509 // and x9, x8, #0xfffffffc + WORD $0x6f00e401 // movi v1.2d, #0000000000000000 + WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff + WORD $0x9100200a // add x10, x0, #8 + WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff + WORD $0xaa0903eb // mov x11, x9 + WORD $0x6f00e403 // movi v3.2d, #0000000000000000 +LBB1_5: + WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8] + WORD $0xf100116b // subs x11, x11, #4 + WORD $0x9100414a // add x10, x10, #16 + WORD $0x2ea46c00 // umin v0.2s, v0.2s, v4.2s + WORD $0x2ea56c42 // umin v2.2s, v2.2s, v5.2s + WORD $0x2ea46421 // umax v1.2s, v1.2s, v4.2s + WORD $0x2ea56463 // umax v3.2s, v3.2s, v5.2s + BNE LBB1_5 + + WORD $0x2ea36421 // umax v1.2s, v1.2s, v3.2s + WORD $0x2ea26c00 // umin v0.2s, v0.2s, v2.2s + WORD $0x0e0c0422 // dup v2.2s, v1.s[1] + WORD $0x0e0c0403 // dup v3.2s, v0.s[1] + WORD $0x2ea26421 // umax v1.2s, v1.2s, v2.2s + WORD $0x2ea36c00 // umin v0.2s, v0.2s, v3.2s + WORD $0xeb08013f // cmp x9, x8 + WORD $0x1e26002a // fmov w10, s1 + WORD $0x1e26000b // fmov w11, s0 + BEQ LBB1_9 +LBB1_7: + WORD $0x8b09080c // add x12, x0, x9, lsl #2 + WORD $0xcb090108 // sub x8, x8, x9 +LBB1_8: + WORD $0xb8404589 // 
ldr w9, [x12], #4 + WORD $0x6b09017f // cmp w11, w9 + WORD $0x1a89316b // csel w11, w11, w9, lo + WORD $0x6b09015f // cmp w10, w9 + WORD $0x1a89814a // csel w10, w10, w9, hi + WORD $0xf1000508 // subs x8, x8, #1 + BNE LBB1_8 +LBB1_9: + WORD $0xb900006a // str w10, [x3] + WORD $0xb900004b // str w11, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET + +// func _int64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) +TEXT ·_int64_max_min_neon(SB), $0-32 + + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 + + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! + WORD $0x7100043f // cmp w1, #1 + WORD $0x910003fd // mov x29, sp + BLT LBB2_3 + + WORD $0x2a0103e8 // mov w8, w1 + WORD $0xd2f0000b // mov x11, #-9223372036854775808 + WORD $0x71000c3f // cmp w1, #3 + WORD $0x92f0000a // mov x10, #9223372036854775807 + BHI LBB2_4 + + WORD $0xaa1f03e9 // mov x9, xzr + JMP LBB2_7 +LBB2_3: + WORD $0x92f0000a // mov x10, #9223372036854775807 + WORD $0xd2f0000b // mov x11, #-9223372036854775808 + WORD $0xf900006b // str x11, [x3] + WORD $0xf900004a // str x10, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET +LBB2_4: + WORD $0x927e7509 // and x9, x8, #0xfffffffc + WORD $0x4e080d61 // dup v1.2d, x11 + WORD $0x4e080d40 // dup v0.2d, x10 + WORD $0x9100400a // add x10, x0, #16 + WORD $0xaa0903eb // mov x11, x9 + WORD $0x4ea01c02 // mov v2.16b, v0.16b + WORD $0x4ea11c23 // mov v3.16b, v1.16b +LBB2_5: + WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] + WORD $0x4ea31c66 // mov v6.16b, v3.16b + WORD $0x4ea11c27 // mov v7.16b, v1.16b + WORD $0x4ea21c43 // mov v3.16b, v2.16b + WORD $0x4ea01c01 // mov v1.16b, v0.16b + WORD $0x4ee03480 // cmgt v0.2d, v4.2d, v0.2d + WORD $0x4ee234a2 // cmgt v2.2d, v5.2d, v2.2d + WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b + WORD $0x4ee434e1 // cmgt v1.2d, v7.2d, v4.2d + WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b + WORD $0x4ee534c3 // cmgt v3.2d, v6.2d, v5.2d + WORD 
$0xf100116b // subs x11, x11, #4 + WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b + WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b + WORD $0x9100814a // add x10, x10, #32 + BNE LBB2_5 + + WORD $0x4ee33424 // cmgt v4.2d, v1.2d, v3.2d + WORD $0x4ee03445 // cmgt v5.2d, v2.2d, v0.2d + WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b + WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b + WORD $0x4e180480 // dup v0.2d, v4.d[1] + WORD $0x4e1804a1 // dup v1.2d, v5.d[1] + WORD $0x4ee03482 // cmgt v2.2d, v4.2d, v0.2d + WORD $0x4ee53423 // cmgt v3.2d, v1.2d, v5.2d + WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b + WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b + WORD $0xeb08013f // cmp x9, x8 + WORD $0x9e66004b // fmov x11, d2 + WORD $0x9e66006a // fmov x10, d3 + BEQ LBB2_9 +LBB2_7: + WORD $0x8b090c0c // add x12, x0, x9, lsl #3 + WORD $0xcb090108 // sub x8, x8, x9 +LBB2_8: + WORD $0xf8408589 // ldr x9, [x12], #8 + WORD $0xeb09015f // cmp x10, x9 + WORD $0x9a89b14a // csel x10, x10, x9, lt + WORD $0xeb09017f // cmp x11, x9 + WORD $0x9a89c16b // csel x11, x11, x9, gt + WORD $0xf1000508 // subs x8, x8, #1 + BNE LBB2_8 +LBB2_9: + WORD $0xf900006b // str x11, [x3] + WORD $0xf900004a // str x10, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET + + +// func _uint64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) +TEXT ·_uint64_max_min_neon(SB), $0-32 + + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 + + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! 
+ WORD $0x7100043f // cmp w1, #1 + WORD $0x910003fd // mov x29, sp + BLT LBB3_3 + + WORD $0x71000c3f // cmp w1, #3 + WORD $0x2a0103e8 // mov w8, w1 + BHI LBB3_4 + + WORD $0xaa1f03e9 // mov x9, xzr + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x9280000b // mov x11, #-1 + JMP LBB3_7 +LBB3_3: + WORD $0xaa1f03ea // mov x10, xzr + WORD $0x9280000b // mov x11, #-1 + WORD $0xf900006a // str x10, [x3] + WORD $0xf900004b // str x11, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET +LBB3_4: + WORD $0x927e7509 // and x9, x8, #0xfffffffc + WORD $0x9100400a // add x10, x0, #16 + WORD $0x6f00e401 // movi v1.2d, #0000000000000000 + WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff + WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff + WORD $0xaa0903eb // mov x11, x9 + WORD $0x6f00e403 // movi v3.2d, #0000000000000000 +LBB3_5: + WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] + WORD $0x4ea31c66 // mov v6.16b, v3.16b + WORD $0x4ea11c27 // mov v7.16b, v1.16b + WORD $0x4ea21c43 // mov v3.16b, v2.16b + WORD $0x4ea01c01 // mov v1.16b, v0.16b + WORD $0x6ee03480 // cmhi v0.2d, v4.2d, v0.2d + WORD $0x6ee234a2 // cmhi v2.2d, v5.2d, v2.2d + WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b + WORD $0x6ee434e1 // cmhi v1.2d, v7.2d, v4.2d + WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b + WORD $0x6ee534c3 // cmhi v3.2d, v6.2d, v5.2d + WORD $0xf100116b // subs x11, x11, #4 + WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b + WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b + WORD $0x9100814a // add x10, x10, #32 + BNE LBB3_5 + + WORD $0x6ee33424 // cmhi v4.2d, v1.2d, v3.2d + WORD $0x6ee03445 // cmhi v5.2d, v2.2d, v0.2d + WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b + WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b + WORD $0x4e180480 // dup v0.2d, v4.d[1] + WORD $0x4e1804a1 // dup v1.2d, v5.d[1] + WORD $0x6ee03482 // cmhi v2.2d, v4.2d, v0.2d + WORD $0x6ee53423 // cmhi v3.2d, v1.2d, v5.2d + WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b + WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b + WORD 
$0xeb08013f // cmp x9, x8 + WORD $0x9e66004a // fmov x10, d2 + WORD $0x9e66006b // fmov x11, d3 + BEQ LBB3_9 +LBB3_7: + WORD $0x8b090c0c // add x12, x0, x9, lsl #3 + WORD $0xcb090108 // sub x8, x8, x9 +LBB3_8: + WORD $0xf8408589 // ldr x9, [x12], #8 + WORD $0xeb09017f // cmp x11, x9 + WORD $0x9a89316b // csel x11, x11, x9, lo + WORD $0xeb09015f // cmp x10, x9 + WORD $0x9a89814a // csel x10, x10, x9, hi + WORD $0xf1000508 // subs x8, x8, #1 + BNE LBB3_8 +LBB3_9: + WORD $0xf900006a // str x10, [x3] + WORD $0xf900004b // str x11, [x2] + WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 + RET + diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go new file mode 100644 index 000000000..19c24b590 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_noasm.go @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +func init() { + minmaxFuncs.i8 = int8MinMax + minmaxFuncs.ui8 = uint8MinMax + minmaxFuncs.i16 = int16MinMax + minmaxFuncs.ui16 = uint16MinMax + minmaxFuncs.i32 = int32MinMax + minmaxFuncs.ui32 = uint32MinMax + minmaxFuncs.i64 = int64MinMax + minmaxFuncs.ui64 = uint64MinMax +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go new file mode 100644 index 000000000..ffd2db006 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_ppc64le.go @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !noasm + +package utils + +func init() { + minmaxFuncs.i8 = int8MinMax + minmaxFuncs.ui8 = uint8MinMax + minmaxFuncs.i16 = int16MinMax + minmaxFuncs.ui16 = uint16MinMax + minmaxFuncs.i32 = int32MinMax + minmaxFuncs.ui32 = uint32MinMax + minmaxFuncs.i64 = int64MinMax + minmaxFuncs.ui64 = uint64MinMax +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go new file mode 100644 index 000000000..ffd2db006 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_s390x.go @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !noasm + +package utils + +func init() { + minmaxFuncs.i8 = int8MinMax + minmaxFuncs.ui8 = uint8MinMax + minmaxFuncs.i16 = int16MinMax + minmaxFuncs.ui16 = uint16MinMax + minmaxFuncs.i32 = int32MinMax + minmaxFuncs.ui32 = uint32MinMax + minmaxFuncs.i64 = int64MinMax + minmaxFuncs.ui64 = uint64MinMax +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go new file mode 100644 index 000000000..1e12a8d17 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.go @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +import "unsafe" + +// This file contains convenience functions for utilizing SSE4 intrinsics to quickly +// and efficiently get the min and max from an integral slice. 
+ +//go:noescape +func _int8_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int8MaxMinSSE4(values []int8) (min, max int8) { + _int8_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint8_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint8MaxMinSSE4(values []uint8) (min, max uint8) { + _uint8_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _int16_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int16MaxMinSSE4(values []int16) (min, max int16) { + _int16_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint16_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint16MaxMinSSE4(values []uint16) (min, max uint16) { + _uint16_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _int32_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int32MaxMinSSE4(values []int32) (min, max int32) { + _int32_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint32_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint32MaxMinSSE4(values []uint32) (min, max uint32) { + _uint32_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} + +//go:noescape +func _int64_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func int64MaxMinSSE4(values []int64) (min, max int64) { + _int64_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), 
unsafe.Pointer(&max)) + return +} + +//go:noescape +func _uint64_max_min_sse4(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) + +func uint64MaxMinSSE4(values []uint64) (min, max uint64) { + _uint64_max_min_sse4(unsafe.Pointer(&values[0]), len(values), unsafe.Pointer(&min), unsafe.Pointer(&max)) + return +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s new file mode 100644 index 000000000..8f1eccf60 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/min_max_sse4_amd64.s @@ -0,0 +1,1044 @@ +//+build !noasm !appengine +// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT + +DATA LCDATA1<>+0x000(SB)/8, $0x8080808080808080 +DATA LCDATA1<>+0x008(SB)/8, $0x8080808080808080 +DATA LCDATA1<>+0x010(SB)/8, $0x7f7f7f7f7f7f7f7f +DATA LCDATA1<>+0x018(SB)/8, $0x7f7f7f7f7f7f7f7f +GLOBL LCDATA1<>(SB), 8, $32 + +TEXT ·_int8_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA1<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB0_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB0_4 + WORD $0xb041; BYTE $0x80 // mov r8b, -128 + WORD $0xb640; BYTE $0x7f // mov sil, 127 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + JMP LBB0_11 + +LBB0_1: + WORD $0xb640; BYTE $0x7f // mov sil, 127 + WORD $0xb041; BYTE $0x80 // mov r8b, -128 + JMP LBB0_12 + +LBB0_4: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xe0e38341 // and r11d, -32 + LONG $0xe0438d49 // lea rax, [r11 - 32] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x05e8c149 // shr r8, 5 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB0_5 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + 
.LCPI0_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + +LBB0_7: + LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax] + LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16] + LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32] + LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48] + LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4 + LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5 + LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4 + LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5 + LONG $0x38380f66; BYTE $0xc6 // pminsb xmm0, xmm6 + LONG $0x38380f66; BYTE $0xd7 // pminsb xmm2, xmm7 + LONG $0x3c380f66; BYTE $0xce // pmaxsb xmm1, xmm6 + LONG $0x3c380f66; BYTE $0xdf // pmaxsb xmm3, xmm7 + LONG $0x40c08348 // add rax, 64 + LONG $0x02c28349 // add r10, 2 + JNE LBB0_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB0_10 + +LBB0_9: + LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax] + LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16] + LONG $0x3c380f66; BYTE $0xdd // pmaxsb xmm3, xmm5 + LONG $0x3c380f66; BYTE $0xcc // pmaxsb xmm1, xmm4 + LONG $0x38380f66; BYTE $0xd5 // pminsb xmm2, xmm5 + LONG $0x38380f66; BYTE $0xc4 // pminsb xmm0, xmm4 + +LBB0_10: + LONG $0x38380f66; BYTE $0xc2 // pminsb xmm0, xmm2 + LONG $0x3c380f66; BYTE $0xcb // pmaxsb xmm1, xmm3 + LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI0_1] */ + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2710f66; BYTE $0x08 // psrlw xmm2, 8 + LONG $0xd1da0f66 // pminub xmm2, xmm1 + LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2 + LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1 + LONG $0x7ff08041 // xor r8b, 127 + LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI0_0] */ + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG 
$0xd1710f66; BYTE $0x08 // psrlw xmm1, 8 + LONG $0xc8da0f66 // pminub xmm1, xmm0 + LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1 + LONG $0xc67e0f66 // movd esi, xmm0 + LONG $0x80f68040 // xor sil, -128 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB0_12 + +LBB0_11: + LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte [rdi + r11] + WORD $0x3840; BYTE $0xc6 // cmp sil, al + LONG $0xf6b60f40 // movzx esi, sil + WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax + WORD $0x3841; BYTE $0xc0 // cmp r8b, al + LONG $0xc0b60f45 // movzx r8d, r8b + LONG $0xc04c0f44 // cmovl r8d, eax + LONG $0x01c38349 // add r11, 1 + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB0_11 + +LBB0_12: + WORD $0x8844; BYTE $0x01 // mov byte [rcx], r8b + WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil + RET + +LBB0_5: + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI0_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI0_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB0_9 + JMP LBB0_10 + +TEXT ·_uint8_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB1_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x1f // cmp esi, 31 + JA LBB1_4 + WORD $0xb640; BYTE $0xff // mov sil, -1 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + WORD $0xc031 // xor eax, eax + JMP LBB1_11 + +LBB1_1: + WORD $0xb640; BYTE $0xff // mov sil, -1 + WORD $0xc031 // xor eax, eax + JMP LBB1_12 + +LBB1_4: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xe0e38341 // and r11d, -32 + LONG $0xe0438d49 // lea rax, [r11 - 32] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x05e8c149 // shr r8, 5 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB1_5 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG 
$0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + +LBB1_7: + LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax] + LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16] + LONG $0x746f0ff3; WORD $0x2007 // movdqu xmm6, oword [rdi + rax + 32] + LONG $0x7c6f0ff3; WORD $0x3007 // movdqu xmm7, oword [rdi + rax + 48] + LONG $0xc4da0f66 // pminub xmm0, xmm4 + LONG $0xd5da0f66 // pminub xmm2, xmm5 + LONG $0xccde0f66 // pmaxub xmm1, xmm4 + LONG $0xddde0f66 // pmaxub xmm3, xmm5 + LONG $0xc6da0f66 // pminub xmm0, xmm6 + LONG $0xd7da0f66 // pminub xmm2, xmm7 + LONG $0xcede0f66 // pmaxub xmm1, xmm6 + LONG $0xdfde0f66 // pmaxub xmm3, xmm7 + LONG $0x40c08348 // add rax, 64 + LONG $0x02c28349 // add r10, 2 + JNE LBB1_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB1_10 + +LBB1_9: + LONG $0x246f0ff3; BYTE $0x07 // movdqu xmm4, oword [rdi + rax] + LONG $0x6c6f0ff3; WORD $0x1007 // movdqu xmm5, oword [rdi + rax + 16] + LONG $0xddde0f66 // pmaxub xmm3, xmm5 + LONG $0xccde0f66 // pmaxub xmm1, xmm4 + LONG $0xd5da0f66 // pminub xmm2, xmm5 + LONG $0xc4da0f66 // pminub xmm0, xmm4 + +LBB1_10: + LONG $0xc2da0f66 // pminub xmm0, xmm2 + LONG $0xcbde0f66 // pmaxub xmm1, xmm3 + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xca6f0f66 // movdqa xmm1, xmm2 + LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8 + LONG $0xcada0f66 // pminub xmm1, xmm2 + LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1 + LONG $0xc87e0f66 // movd eax, xmm1 + WORD $0xd0f6 // not al + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1710f66; BYTE $0x08 // psrlw xmm1, 8 + LONG $0xc8da0f66 // pminub xmm1, xmm0 + LONG $0x41380f66; BYTE $0xc1 // phminposuw xmm0, xmm1 + LONG $0xc67e0f66 // movd esi, xmm0 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB1_12 + 
+LBB1_11: + LONG $0x04b60f46; BYTE $0x1f // movzx r8d, byte [rdi + r11] + WORD $0x3844; BYTE $0xc6 // cmp sil, r8b + LONG $0xf6b60f40 // movzx esi, sil + LONG $0xf0430f41 // cmovae esi, r8d + WORD $0x3844; BYTE $0xc0 // cmp al, r8b + WORD $0xb60f; BYTE $0xc0 // movzx eax, al + LONG $0xc0460f41 // cmovbe eax, r8d + LONG $0x01c38349 // add r11, 1 + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB1_11 + +LBB1_12: + WORD $0x0188 // mov byte [rcx], al + WORD $0x8840; BYTE $0x32 // mov byte [rdx], sil + RET + +LBB1_5: + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB1_9 + JMP LBB1_10 + +DATA LCDATA2<>+0x000(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x008(SB)/8, $0x8000800080008000 +DATA LCDATA2<>+0x010(SB)/8, $0x7fff7fff7fff7fff +DATA LCDATA2<>+0x018(SB)/8, $0x7fff7fff7fff7fff +GLOBL LCDATA2<>(SB), 8, $32 + +TEXT ·_int16_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA2<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB2_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB2_4 + LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768 + LONG $0x7fffbe66 // mov si, 32767 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + JMP LBB2_11 + +LBB2_1: + LONG $0x7fffbe66 // mov si, 32767 + LONG $0x00b84166; BYTE $0x80 // mov r8w, -32768 + JMP LBB2_12 + +LBB2_4: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xf0e38341 // and r11d, -16 + LONG $0xf0438d49 // lea rax, [r11 - 16] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x04e8c149 // shr r8, 4 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB2_5 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0x4d6f0f66; BYTE 
$0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + +LBB2_7: + LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax] + LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16] + LONG $0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32] + LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48] + LONG $0xc4ea0f66 // pminsw xmm0, xmm4 + LONG $0xd5ea0f66 // pminsw xmm2, xmm5 + LONG $0xccee0f66 // pmaxsw xmm1, xmm4 + LONG $0xddee0f66 // pmaxsw xmm3, xmm5 + LONG $0xc6ea0f66 // pminsw xmm0, xmm6 + LONG $0xd7ea0f66 // pminsw xmm2, xmm7 + LONG $0xceee0f66 // pmaxsw xmm1, xmm6 + LONG $0xdfee0f66 // pmaxsw xmm3, xmm7 + LONG $0x20c08348 // add rax, 32 + LONG $0x02c28349 // add r10, 2 + JNE LBB2_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB2_10 + +LBB2_9: + LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax] + LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16] + LONG $0xddee0f66 // pmaxsw xmm3, xmm5 + LONG $0xccee0f66 // pmaxsw xmm1, xmm4 + LONG $0xd5ea0f66 // pminsw xmm2, xmm5 + LONG $0xc4ea0f66 // pminsw xmm0, xmm4 + +LBB2_10: + LONG $0xc2ea0f66 // pminsw xmm0, xmm2 + LONG $0xcbee0f66 // pmaxsw xmm1, xmm3 + LONG $0x4def0f66; BYTE $0x10 // pxor xmm1, oword 16[rbp] /* [rip + .LCPI2_1] */ + LONG $0x41380f66; BYTE $0xc9 // phminposuw xmm1, xmm1 + LONG $0x7e0f4166; BYTE $0xc8 // movd r8d, xmm1 + LONG $0xfff08141; WORD $0x007f; BYTE $0x00 // xor r8d, 32767 + LONG $0x45ef0f66; BYTE $0x00 // pxor xmm0, oword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0 + LONG $0xc67e0f66 // movd esi, xmm0 + LONG $0x8000f681; WORD $0x0000 // xor esi, 32768 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB2_12 + +LBB2_11: + LONG $0x04b70f42; BYTE $0x5f // movzx eax, word 
[rdi + 2*r11] + WORD $0x3966; BYTE $0xc6 // cmp si, ax + WORD $0x4f0f; BYTE $0xf0 // cmovg esi, eax + LONG $0xc0394166 // cmp r8w, ax + LONG $0xc04c0f44 // cmovl r8d, eax + LONG $0x01c38349 // add r11, 1 + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB2_11 + +LBB2_12: + LONG $0x01894466 // mov word [rcx], r8w + WORD $0x8966; BYTE $0x32 // mov word [rdx], si + RET + +LBB2_5: + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI2_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB2_9 + JMP LBB2_10 + +TEXT ·_uint16_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB3_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JA LBB3_4 + LONG $0xffb84166; BYTE $0xff // mov r8w, -1 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + WORD $0xf631 // xor esi, esi + JMP LBB3_11 + +LBB3_1: + LONG $0xffb84166; BYTE $0xff // mov r8w, -1 + WORD $0xf631 // xor esi, esi + JMP LBB3_12 + +LBB3_4: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xf0e38341 // and r11d, -16 + LONG $0xf0438d49 // lea rax, [r11 - 16] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x04e8c149 // shr r8, 4 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB3_5 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + +LBB3_7: + LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax] + LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16] + LONG 
$0x746f0ff3; WORD $0x2047 // movdqu xmm6, oword [rdi + 2*rax + 32] + LONG $0x7c6f0ff3; WORD $0x3047 // movdqu xmm7, oword [rdi + 2*rax + 48] + LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4 + LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5 + LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4 + LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5 + LONG $0x3a380f66; BYTE $0xc6 // pminuw xmm0, xmm6 + LONG $0x3a380f66; BYTE $0xd7 // pminuw xmm2, xmm7 + LONG $0x3e380f66; BYTE $0xce // pmaxuw xmm1, xmm6 + LONG $0x3e380f66; BYTE $0xdf // pmaxuw xmm3, xmm7 + LONG $0x20c08348 // add rax, 32 + LONG $0x02c28349 // add r10, 2 + JNE LBB3_7 + LONG $0x01c0f641 // test r8b, 1 + JE LBB3_10 + +LBB3_9: + LONG $0x246f0ff3; BYTE $0x47 // movdqu xmm4, oword [rdi + 2*rax] + LONG $0x6c6f0ff3; WORD $0x1047 // movdqu xmm5, oword [rdi + 2*rax + 16] + LONG $0x3e380f66; BYTE $0xdd // pmaxuw xmm3, xmm5 + LONG $0x3e380f66; BYTE $0xcc // pmaxuw xmm1, xmm4 + LONG $0x3a380f66; BYTE $0xd5 // pminuw xmm2, xmm5 + LONG $0x3a380f66; BYTE $0xc4 // pminuw xmm0, xmm4 + +LBB3_10: + LONG $0x3a380f66; BYTE $0xc2 // pminuw xmm0, xmm2 + LONG $0x3e380f66; BYTE $0xcb // pmaxuw xmm1, xmm3 + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0x41380f66; BYTE $0xca // phminposuw xmm1, xmm2 + LONG $0xce7e0f66 // movd esi, xmm1 + WORD $0xd6f7 // not esi + LONG $0x41380f66; BYTE $0xc0 // phminposuw xmm0, xmm0 + LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB3_12 + +LBB3_11: + LONG $0x04b70f42; BYTE $0x5f // movzx eax, word [rdi + 2*r11] + LONG $0xc0394166 // cmp r8w, ax + LONG $0xc0430f44 // cmovae r8d, eax + WORD $0x3966; BYTE $0xc6 // cmp si, ax + WORD $0x460f; BYTE $0xf0 // cmovbe esi, eax + LONG $0x01c38349 // add r11, 1 + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB3_11 + +LBB3_12: + WORD $0x8966; BYTE $0x31 // mov word [rcx], si + LONG $0x02894466 // mov word [rdx], r8w + RET + +LBB3_5: + LONG $0xc9ef0f66 // pxor xmm1, xmm1 
+ LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB3_9 + JMP LBB3_10 + +DATA LCDATA3<>+0x000(SB)/8, $0x8000000080000000 +DATA LCDATA3<>+0x008(SB)/8, $0x8000000080000000 +DATA LCDATA3<>+0x010(SB)/8, $0x7fffffff7fffffff +DATA LCDATA3<>+0x018(SB)/8, $0x7fffffff7fffffff +GLOBL LCDATA3<>(SB), 8, $32 + +TEXT ·_int32_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA3<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB4_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x07 // cmp esi, 7 + JA LBB4_6 + LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648 + LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + JMP LBB4_4 + +LBB4_1: + LONG $0xffffb841; WORD $0x7fff // mov r8d, 2147483647 + LONG $0x000000b8; BYTE $0x80 // mov eax, -2147483648 + JMP LBB4_13 + +LBB4_6: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xf8e38341 // and r11d, -8 + LONG $0xf8438d49 // lea rax, [r11 - 8] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x03e8c149 // shr r8, 3 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB4_7 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + +LBB4_9: + LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax] + LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16] + LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32] + LONG $0x7c6f0ff3; WORD $0x3087 // 
movdqu xmm7, oword [rdi + 4*rax + 48] + LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4 + LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5 + LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4 + LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5 + LONG $0x39380f66; BYTE $0xc6 // pminsd xmm0, xmm6 + LONG $0x39380f66; BYTE $0xd7 // pminsd xmm2, xmm7 + LONG $0x3d380f66; BYTE $0xce // pmaxsd xmm1, xmm6 + LONG $0x3d380f66; BYTE $0xdf // pmaxsd xmm3, xmm7 + LONG $0x10c08348 // add rax, 16 + LONG $0x02c28349 // add r10, 2 + JNE LBB4_9 + LONG $0x01c0f641 // test r8b, 1 + JE LBB4_12 + +LBB4_11: + LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax] + LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16] + LONG $0x3d380f66; BYTE $0xdd // pmaxsd xmm3, xmm5 + LONG $0x3d380f66; BYTE $0xcc // pmaxsd xmm1, xmm4 + LONG $0x39380f66; BYTE $0xd5 // pminsd xmm2, xmm5 + LONG $0x39380f66; BYTE $0xc4 // pminsd xmm0, xmm4 + +LBB4_12: + LONG $0x39380f66; BYTE $0xc2 // pminsd xmm0, xmm2 + LONG $0x3d380f66; BYTE $0xcb // pmaxsd xmm1, xmm3 + LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78 + LONG $0x3d380f66; BYTE $0xd1 // pmaxsd xmm2, xmm1 + LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229 + LONG $0x3d380f66; BYTE $0xca // pmaxsd xmm1, xmm2 + LONG $0xc87e0f66 // movd eax, xmm1 + LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78 + LONG $0x39380f66; BYTE $0xc8 // pminsd xmm1, xmm0 + LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229 + LONG $0x39380f66; BYTE $0xc1 // pminsd xmm0, xmm1 + LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB4_13 + +LBB4_4: + WORD $0xc689 // mov esi, eax + +LBB4_5: + LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11] + WORD $0x3941; BYTE $0xc0 // cmp r8d, eax + LONG $0xc04f0f44 // cmovg r8d, eax + WORD $0xc639 // cmp esi, eax + WORD $0x4d0f; BYTE $0xc6 // cmovge eax, esi + LONG $0x01c38349 // add r11, 1 + WORD $0xc689 // mov esi, eax + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 
+ JNE LBB4_5 + +LBB4_13: + WORD $0x0189 // mov dword [rcx], eax + WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d + RET + +LBB4_7: + LONG $0x4d6f0f66; BYTE $0x00 // movdqa xmm1, oword 0[rbp] /* [rip + .LCPI4_0] */ + LONG $0x456f0f66; BYTE $0x10 // movdqa xmm0, oword 16[rbp] /* [rip + .LCPI4_1] */ + WORD $0xc031 // xor eax, eax + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB4_11 + JMP LBB4_12 + +TEXT ·_uint32_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + + WORD $0xf685 // test esi, esi + JLE LBB5_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x07 // cmp esi, 7 + JA LBB5_6 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + LONG $0xffffb841; WORD $0xffff // mov r8d, -1 + WORD $0xf631 // xor esi, esi + JMP LBB5_4 + +LBB5_1: + LONG $0xffffb841; WORD $0xffff // mov r8d, -1 + WORD $0xf631 // xor esi, esi + JMP LBB5_13 + +LBB5_6: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xf8e38341 // and r11d, -8 + LONG $0xf8438d49 // lea rax, [r11 - 8] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x03e8c149 // shr r8, 3 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB5_7 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0xc9ef0f66 // pxor xmm1, xmm1 + LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + +LBB5_9: + LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax] + LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16] + LONG $0x746f0ff3; WORD $0x2087 // movdqu xmm6, oword [rdi + 4*rax + 32] + LONG $0x7c6f0ff3; WORD $0x3087 // movdqu xmm7, oword [rdi + 4*rax + 48] + LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4 + LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5 + 
LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4 + LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5 + LONG $0x3b380f66; BYTE $0xc6 // pminud xmm0, xmm6 + LONG $0x3b380f66; BYTE $0xd7 // pminud xmm2, xmm7 + LONG $0x3f380f66; BYTE $0xce // pmaxud xmm1, xmm6 + LONG $0x3f380f66; BYTE $0xdf // pmaxud xmm3, xmm7 + LONG $0x10c08348 // add rax, 16 + LONG $0x02c28349 // add r10, 2 + JNE LBB5_9 + LONG $0x01c0f641 // test r8b, 1 + JE LBB5_12 + +LBB5_11: + LONG $0x246f0ff3; BYTE $0x87 // movdqu xmm4, oword [rdi + 4*rax] + LONG $0x6c6f0ff3; WORD $0x1087 // movdqu xmm5, oword [rdi + 4*rax + 16] + LONG $0x3f380f66; BYTE $0xdd // pmaxud xmm3, xmm5 + LONG $0x3f380f66; BYTE $0xcc // pmaxud xmm1, xmm4 + LONG $0x3b380f66; BYTE $0xd5 // pminud xmm2, xmm5 + LONG $0x3b380f66; BYTE $0xc4 // pminud xmm0, xmm4 + +LBB5_12: + LONG $0x3b380f66; BYTE $0xc2 // pminud xmm0, xmm2 + LONG $0x3f380f66; BYTE $0xcb // pmaxud xmm1, xmm3 + LONG $0xd1700f66; BYTE $0x4e // pshufd xmm2, xmm1, 78 + LONG $0x3f380f66; BYTE $0xd1 // pmaxud xmm2, xmm1 + LONG $0xca700f66; BYTE $0xe5 // pshufd xmm1, xmm2, 229 + LONG $0x3f380f66; BYTE $0xca // pmaxud xmm1, xmm2 + LONG $0xce7e0f66 // movd esi, xmm1 + LONG $0xc8700f66; BYTE $0x4e // pshufd xmm1, xmm0, 78 + LONG $0x3b380f66; BYTE $0xc8 // pminud xmm1, xmm0 + LONG $0xc1700f66; BYTE $0xe5 // pshufd xmm0, xmm1, 229 + LONG $0x3b380f66; BYTE $0xc1 // pminud xmm0, xmm1 + LONG $0x7e0f4166; BYTE $0xc0 // movd r8d, xmm0 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB5_13 + +LBB5_4: + WORD $0xf089 // mov eax, esi + +LBB5_5: + LONG $0x9f348b42 // mov esi, dword [rdi + 4*r11] + WORD $0x3941; BYTE $0xf0 // cmp r8d, esi + LONG $0xc6430f44 // cmovae r8d, esi + WORD $0xf039 // cmp eax, esi + WORD $0x470f; BYTE $0xf0 // cmova esi, eax + LONG $0x01c38349 // add r11, 1 + WORD $0xf089 // mov eax, esi + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB5_5 + +LBB5_13: + WORD $0x3189 // mov dword [rcx], esi + WORD $0x8944; BYTE $0x02 // mov dword [rdx], r8d + RET + +LBB5_7: + LONG 
$0xc9ef0f66 // pxor xmm1, xmm1 + LONG $0xc0760f66 // pcmpeqd xmm0, xmm0 + WORD $0xc031 // xor eax, eax + LONG $0xd2760f66 // pcmpeqd xmm2, xmm2 + LONG $0xdbef0f66 // pxor xmm3, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB5_11 + JMP LBB5_12 + +DATA LCDATA4<>+0x000(SB)/8, $0x8000000000000000 +DATA LCDATA4<>+0x008(SB)/8, $0x8000000000000000 +DATA LCDATA4<>+0x010(SB)/8, $0x7fffffffffffffff +DATA LCDATA4<>+0x018(SB)/8, $0x7fffffffffffffff +GLOBL LCDATA4<>(SB), 8, $32 + +TEXT ·_int64_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA4<>(SB), BP + + QUAD $0xffffffffffffb849; WORD $0x7fff // mov r8, 9223372036854775807 + WORD $0xf685 // test esi, esi + JLE LBB6_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x03 // cmp esi, 3 + JA LBB6_6 + LONG $0x01708d49 // lea rsi, [r8 + 1] + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + JMP LBB6_4 + +LBB6_1: + LONG $0x01708d49 // lea rsi, [r8 + 1] + JMP LBB6_13 + +LBB6_6: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xfce38341 // and r11d, -4 + LONG $0xfc438d49 // lea rax, [r11 - 4] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x02e8c149 // shr r8, 2 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB6_7 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0x6f0f4466; WORD $0x004d // movdqa xmm9, oword 0[rbp] /* [rip + .LCPI6_0] */ + LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8 + LONG $0x6f0f4166; BYTE $0xf1 // movdqa xmm6, xmm9 + +LBB6_9: + LONG $0x3c6f0ff3; BYTE $0xc7 // movdqu xmm7, oword [rdi + 8*rax] + LONG $0xc76f0f66 // movdqa xmm0, xmm7 + LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8 + LONG $0xe76f0f66 // movdqa xmm4, xmm7 + LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, 
xmm0 + LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16] + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0xe96f0f66 // movdqa xmm5, xmm1 + LONG $0x15380f66; BYTE $0xea // blendvpd xmm5, xmm2, xmm0 + LONG $0x6f0f4166; BYTE $0xc1 // movdqa xmm0, xmm9 + LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7 + LONG $0x380f4166; WORD $0xf915 // blendvpd xmm7, xmm9, xmm0 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xce // blendvpd xmm1, xmm6, xmm0 + LONG $0x5c6f0ff3; WORD $0x20c7 // movdqu xmm3, oword [rdi + 8*rax + 32] + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4 + LONG $0x6f0f4466; BYTE $0xc3 // movdqa xmm8, xmm3 + LONG $0x380f4466; WORD $0xc415 // blendvpd xmm8, xmm4, xmm0 + LONG $0x646f0ff3; WORD $0x30c7 // movdqu xmm4, oword [rdi + 8*rax + 48] + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0x37380f66; BYTE $0xc5 // pcmpgtq xmm0, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0x15380f66; BYTE $0xd5 // blendvpd xmm2, xmm5, xmm0 + LONG $0xc7280f66 // movapd xmm0, xmm7 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xdf // blendvpd xmm3, xmm7, xmm0 + LONG $0xc1280f66 // movapd xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4 + LONG $0x15380f66; BYTE $0xe1 // blendvpd xmm4, xmm1, xmm0 + LONG $0x08c08348 // add rax, 8 + LONG $0x280f4466; BYTE $0xcb // movapd xmm9, xmm3 + LONG $0xf4280f66 // movapd xmm6, xmm4 + LONG $0x02c28349 // add r10, 2 + JNE LBB6_9 + LONG $0x01c0f641 // test r8b, 1 + JE LBB6_12 + +LBB6_11: + LONG $0x4c6f0ff3; WORD $0x10c7 // movdqu xmm1, oword [rdi + 8*rax + 16] + LONG $0xc4280f66 // movapd xmm0, xmm4 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0xe96f0f66 // movdqa xmm5, xmm1 + LONG $0x15380f66; BYTE $0xec // blendvpd xmm5, xmm4, xmm0 + LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword 
[rdi + 8*rax] + LONG $0xc3280f66 // movapd xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4 + LONG $0xf46f0f66 // movdqa xmm6, xmm4 + LONG $0x15380f66; BYTE $0xf3 // blendvpd xmm6, xmm3, xmm0 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8 + LONG $0x380f4166; WORD $0xe015 // blendvpd xmm4, xmm8, xmm0 + LONG $0x280f4466; BYTE $0xc4 // movapd xmm8, xmm4 + LONG $0xd1280f66 // movapd xmm2, xmm1 + LONG $0xde280f66 // movapd xmm3, xmm6 + LONG $0xe5280f66 // movapd xmm4, xmm5 + +LBB6_12: + LONG $0xc3280f66 // movapd xmm0, xmm3 + LONG $0x37380f66; BYTE $0xc4 // pcmpgtq xmm0, xmm4 + LONG $0x15380f66; BYTE $0xe3 // blendvpd xmm4, xmm3, xmm0 + LONG $0xcc700f66; BYTE $0x4e // pshufd xmm1, xmm4, 78 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0x15380f66; BYTE $0xcc // blendvpd xmm1, xmm4, xmm0 + LONG $0x7e0f4866; BYTE $0xce // movq rsi, xmm1 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x380f4166; WORD $0xc037 // pcmpgtq xmm0, xmm8 + LONG $0x380f4166; WORD $0xd015 // blendvpd xmm2, xmm8, xmm0 + LONG $0xca700f66; BYTE $0x4e // pshufd xmm1, xmm2, 78 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x15380f66; BYTE $0xca // blendvpd xmm1, xmm2, xmm0 + LONG $0x7e0f4966; BYTE $0xc8 // movq r8, xmm1 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB6_13 + +LBB6_4: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + +LBB6_5: + LONG $0xdf348b4a // mov rsi, qword [rdi + 8*r11] + WORD $0x3949; BYTE $0xf0 // cmp r8, rsi + LONG $0xc64f0f4c // cmovg r8, rsi + WORD $0x3948; BYTE $0xf0 // cmp rax, rsi + LONG $0xf04d0f48 // cmovge rsi, rax + LONG $0x01c38349 // add r11, 1 + WORD $0x8948; BYTE $0xf0 // mov rax, rsi + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB6_5 + +LBB6_13: 
+ WORD $0x8948; BYTE $0x31 // mov qword [rcx], rsi + WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8 + RET + +LBB6_7: + LONG $0x5d280f66; BYTE $0x00 // movapd xmm3, oword 0[rbp] /* [rip + .LCPI6_0] */ + LONG $0x6f0f4466; WORD $0x1045 // movdqa xmm8, oword 16[rbp] /* [rip + .LCPI6_1] */ + WORD $0xc031 // xor eax, eax + LONG $0x6f0f4166; BYTE $0xd0 // movdqa xmm2, xmm8 + LONG $0xe3280f66 // movapd xmm4, xmm3 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB6_11 + JMP LBB6_12 + +DATA LCDATA5<>+0x000(SB)/8, $0x8000000000000000 +DATA LCDATA5<>+0x008(SB)/8, $0x8000000000000000 +GLOBL LCDATA5<>(SB), 8, $16 + +TEXT ·_uint64_max_min_sse4(SB), $0-32 + + MOVQ values+0(FP), DI + MOVQ length+8(FP), SI + MOVQ minout+16(FP), DX + MOVQ maxout+24(FP), CX + LEAQ LCDATA5<>(SB), BP + + WORD $0xf685 // test esi, esi + JLE LBB7_1 + WORD $0x8941; BYTE $0xf1 // mov r9d, esi + WORD $0xfe83; BYTE $0x03 // cmp esi, 3 + JA LBB7_6 + LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1 + WORD $0x3145; BYTE $0xdb // xor r11d, r11d + WORD $0xc031 // xor eax, eax + JMP LBB7_4 + +LBB7_1: + LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1 + WORD $0xc031 // xor eax, eax + JMP LBB7_13 + +LBB7_6: + WORD $0x8945; BYTE $0xcb // mov r11d, r9d + LONG $0xfce38341 // and r11d, -4 + LONG $0xfc438d49 // lea rax, [r11 - 4] + WORD $0x8949; BYTE $0xc0 // mov r8, rax + LONG $0x02e8c149 // shr r8, 2 + LONG $0x01c08349 // add r8, 1 + WORD $0x8548; BYTE $0xc0 // test rax, rax + JE LBB7_7 + WORD $0x894d; BYTE $0xc2 // mov r10, r8 + LONG $0xfee28349 // and r10, -2 + WORD $0xf749; BYTE $0xda // neg r10 + LONG $0xef0f4566; BYTE $0xc9 // pxor xmm9, xmm9 + LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10 + WORD $0xc031 // xor eax, eax + LONG $0x6f0f4466; WORD $0x0045 // movdqa xmm8, oword 0[rbp] /* [rip + .LCPI7_0] */ + LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11 + LONG $0xef0f4566; BYTE $0xe4 // pxor xmm12, xmm12 + +LBB7_9: + LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10 + LONG $0xef0f4166; BYTE 
$0xd0 // pxor xmm2, xmm8 + LONG $0x246f0ff3; BYTE $0xc7 // movdqu xmm4, oword [rdi + 8*rax] + LONG $0x6c6f0ff3; WORD $0x10c7 // movdqu xmm5, oword [rdi + 8*rax + 16] + LONG $0x6f0f44f3; WORD $0xc76c; BYTE $0x20 // movdqu xmm13, oword [rdi + 8*rax + 32] + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 + LONG $0x6f0f4166; BYTE $0xc9 // movdqa xmm1, xmm9 + LONG $0xef0f4166; BYTE $0xc8 // pxor xmm1, xmm8 + LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0xdc6f0f66 // movdqa xmm3, xmm4 + LONG $0x380f4166; WORD $0xda15 // blendvpd xmm3, xmm10, xmm0 + LONG $0x746f0ff3; WORD $0x30c7 // movdqu xmm6, oword [rdi + 8*rax + 48] + LONG $0x6f0f4166; BYTE $0xfb // movdqa xmm7, xmm11 + LONG $0xef0f4166; BYTE $0xf8 // pxor xmm7, xmm8 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 + LONG $0x6f0f4166; BYTE $0xd4 // movdqa xmm2, xmm12 + LONG $0xef0f4166; BYTE $0xd0 // pxor xmm2, xmm8 + LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0 + LONG $0x37380f66; BYTE $0xc7 // pcmpgtq xmm0, xmm7 + LONG $0xfd6f0f66 // movdqa xmm7, xmm5 + LONG $0x380f4166; WORD $0xfb15 // blendvpd xmm7, xmm11, xmm0 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x380f4166; WORD $0xe115 // blendvpd xmm4, xmm9, xmm0 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x380f4166; WORD $0xec15 // blendvpd xmm5, xmm12, xmm0 + LONG $0xd3280f66 // movapd xmm2, xmm3 + LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8 + LONG $0x6f0f4166; BYTE $0xc5 // movdqa xmm0, xmm13 + LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 + LONG $0xcc280f66 // movapd xmm1, xmm4 + LONG $0x570f4166; BYTE $0xc8 // xorpd xmm1, xmm8 + LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x6f0f4566; BYTE $0xd5 // movdqa xmm10, xmm13 + LONG $0x380f4466; WORD $0xd315 // blendvpd xmm10, xmm3, xmm0 + LONG $0xdf280f66 // movapd xmm3, xmm7 + LONG 
$0x570f4166; BYTE $0xd8 // xorpd xmm3, xmm8 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0xef0f4166; BYTE $0xc0 // pxor xmm0, xmm8 + LONG $0xd5280f66 // movapd xmm2, xmm5 + LONG $0x570f4166; BYTE $0xd0 // xorpd xmm2, xmm8 + LONG $0x37380f66; BYTE $0xd0 // pcmpgtq xmm2, xmm0 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x6f0f4466; BYTE $0xde // movdqa xmm11, xmm6 + LONG $0x380f4466; WORD $0xdf15 // blendvpd xmm11, xmm7, xmm0 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x380f4466; WORD $0xec15 // blendvpd xmm13, xmm4, xmm0 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x15380f66; BYTE $0xf5 // blendvpd xmm6, xmm5, xmm0 + LONG $0x08c08348 // add rax, 8 + LONG $0x280f4566; BYTE $0xcd // movapd xmm9, xmm13 + LONG $0x280f4466; BYTE $0xe6 // movapd xmm12, xmm6 + LONG $0x02c28349 // add r10, 2 + JNE LBB7_9 + LONG $0x01c0f641 // test r8b, 1 + JE LBB7_12 + +LBB7_11: + LONG $0x24100f66; BYTE $0xc7 // movupd xmm4, oword [rdi + 8*rax] + LONG $0x5c100f66; WORD $0x10c7 // movupd xmm3, oword [rdi + 8*rax + 16] + LONG $0x6d280f66; BYTE $0x00 // movapd xmm5, oword 0[rbp] /* [rip + .LCPI7_0] */ + LONG $0xc6280f66 // movapd xmm0, xmm6 + LONG $0xc5570f66 // xorpd xmm0, xmm5 + LONG $0xcb280f66 // movapd xmm1, xmm3 + LONG $0xcd570f66 // xorpd xmm1, xmm5 + LONG $0x37380f66; BYTE $0xc1 // pcmpgtq xmm0, xmm1 + LONG $0xfb280f66 // movapd xmm7, xmm3 + LONG $0x15380f66; BYTE $0xfe // blendvpd xmm7, xmm6, xmm0 + LONG $0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13 + LONG $0xc5570f66 // xorpd xmm0, xmm5 + LONG $0xd4280f66 // movapd xmm2, xmm4 + LONG $0xd5570f66 // xorpd xmm2, xmm5 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0xf4280f66 // movapd xmm6, xmm4 + LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0 + LONG $0x280f4166; BYTE $0xc3 // movapd xmm0, xmm11 + LONG $0xc5570f66 // xorpd xmm0, xmm5 + LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x380f4166; WORD $0xdb15 // blendvpd xmm3, xmm11, 
xmm0 + LONG $0x570f4166; BYTE $0xea // xorpd xmm5, xmm10 + LONG $0x37380f66; BYTE $0xd5 // pcmpgtq xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0x380f4166; WORD $0xe215 // blendvpd xmm4, xmm10, xmm0 + LONG $0x280f4466; BYTE $0xd4 // movapd xmm10, xmm4 + LONG $0x280f4466; BYTE $0xdb // movapd xmm11, xmm3 + LONG $0x280f4466; BYTE $0xee // movapd xmm13, xmm6 + LONG $0xf7280f66 // movapd xmm6, xmm7 + +LBB7_12: + LONG $0x4d280f66; BYTE $0x00 // movapd xmm1, oword 0[rbp] /* [rip + .LCPI7_0] */ + LONG $0xd6280f66 // movapd xmm2, xmm6 + LONG $0xd1570f66 // xorpd xmm2, xmm1 + LONG $0x280f4166; BYTE $0xc5 // movapd xmm0, xmm13 + LONG $0xc1570f66 // xorpd xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x380f4166; WORD $0xf515 // blendvpd xmm6, xmm13, xmm0 + LONG $0xd6700f66; BYTE $0x4e // pshufd xmm2, xmm6, 78 + LONG $0xc6280f66 // movapd xmm0, xmm6 + LONG $0xc1570f66 // xorpd xmm0, xmm1 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd9ef0f66 // pxor xmm3, xmm1 + LONG $0x37380f66; BYTE $0xc3 // pcmpgtq xmm0, xmm3 + LONG $0x15380f66; BYTE $0xd6 // blendvpd xmm2, xmm6, xmm0 + LONG $0x7e0f4866; BYTE $0xd0 // movq rax, xmm2 + LONG $0x6f0f4166; BYTE $0xd2 // movdqa xmm2, xmm10 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0x37380f66; BYTE $0xc2 // pcmpgtq xmm0, xmm2 + LONG $0x380f4566; WORD $0xda15 // blendvpd xmm11, xmm10, xmm0 + LONG $0x700f4166; WORD $0x4ed3 // pshufd xmm2, xmm11, 78 + LONG $0x6f0f4166; BYTE $0xc3 // movdqa xmm0, xmm11 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0x37380f66; BYTE $0xc8 // pcmpgtq xmm1, xmm0 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0x380f4166; WORD $0xd315 // blendvpd xmm2, xmm11, xmm0 + LONG $0x7e0f4966; BYTE $0xd0 // movq r8, xmm2 + WORD $0x394d; BYTE $0xcb // cmp r11, r9 + JE LBB7_13 + +LBB7_4: + WORD $0x8948; BYTE $0xc6 // mov rsi, rax + +LBB7_5: + LONG $0xdf048b4a 
// mov rax, qword [rdi + 8*r11] + WORD $0x3949; BYTE $0xc0 // cmp r8, rax + LONG $0xc0430f4c // cmovae r8, rax + WORD $0x3948; BYTE $0xc6 // cmp rsi, rax + LONG $0xc6470f48 // cmova rax, rsi + LONG $0x01c38349 // add r11, 1 + WORD $0x8948; BYTE $0xc6 // mov rsi, rax + WORD $0x394d; BYTE $0xd9 // cmp r9, r11 + JNE LBB7_5 + +LBB7_13: + WORD $0x8948; BYTE $0x01 // mov qword [rcx], rax + WORD $0x894c; BYTE $0x02 // mov qword [rdx], r8 + RET + +LBB7_7: + LONG $0x570f4566; BYTE $0xed // xorpd xmm13, xmm13 + LONG $0x760f4566; BYTE $0xd2 // pcmpeqd xmm10, xmm10 + WORD $0xc031 // xor eax, eax + LONG $0x760f4566; BYTE $0xdb // pcmpeqd xmm11, xmm11 + LONG $0xf6570f66 // xorpd xmm6, xmm6 + LONG $0x01c0f641 // test r8b, 1 + JNE LBB7_11 + JMP LBB7_12 diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go new file mode 100644 index 000000000..1666df129 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go @@ -0,0 +1,407 @@ +// Code generated by transpose_ints.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package utils + +// when we upgrade to support go1.18, this can be massively simplified by using +// Go Generics, but since we aren't supporting go1.18 yet, I didn't want to use +// them here so we can maintain the backwards compatibility. + +func transposeInt8Int8(src []int8, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeInt8Uint8(src []int8, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeInt8Int16(src []int8, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeInt8Uint16(src []int8, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeInt8Int32(src []int8, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeInt8Uint32(src []int8, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeInt8Int64(src []int8, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeInt8Uint64(src []int8, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeUint8Int8(src []uint8, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeUint8Uint8(src []uint8, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeUint8Int16(src []uint8, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeUint8Uint16(src []uint8, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + 
+func transposeUint8Int32(src []uint8, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeUint8Uint32(src []uint8, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeUint8Int64(src []uint8, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeUint8Uint64(src []uint8, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeInt16Int8(src []int16, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeInt16Uint8(src []int16, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeInt16Int16(src []int16, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeInt16Uint16(src []int16, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeInt16Int32(src []int16, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeInt16Uint32(src []int16, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeInt16Int64(src []int16, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeInt16Uint64(src []int16, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeUint16Int8(src []uint16, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeUint16Uint8(src []uint16, dest []uint8, transposeMap []int32) 
{ + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeUint16Int16(src []uint16, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeUint16Uint16(src []uint16, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeUint16Int32(src []uint16, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeUint16Uint32(src []uint16, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeUint16Int64(src []uint16, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeUint16Uint64(src []uint16, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeInt32Int8(src []int32, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeInt32Uint8(src []int32, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeInt32Int16(src []int32, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeInt32Uint16(src []int32, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeInt32Int32(src []int32, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeInt32Uint32(src []int32, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeInt32Int64(src []int32, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + 
} +} + +func transposeInt32Uint64(src []int32, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeUint32Int8(src []uint32, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeUint32Uint8(src []uint32, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeUint32Int16(src []uint32, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeUint32Uint16(src []uint32, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeUint32Int32(src []uint32, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeUint32Uint32(src []uint32, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeUint32Int64(src []uint32, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeUint32Uint64(src []uint32, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeInt64Int8(src []int64, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeInt64Uint8(src []int64, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeInt64Int16(src []int64, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeInt64Uint16(src []int64, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeInt64Int32(src []int64, dest []int32, 
transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeInt64Uint32(src []int64, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeInt64Int64(src []int64, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeInt64Uint64(src []int64, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} + +func transposeUint64Int8(src []uint64, dest []int8, transposeMap []int32) { + for i, s := range src { + dest[i] = int8(transposeMap[s]) + } +} + +func transposeUint64Uint8(src []uint64, dest []uint8, transposeMap []int32) { + for i, s := range src { + dest[i] = uint8(transposeMap[s]) + } +} + +func transposeUint64Int16(src []uint64, dest []int16, transposeMap []int32) { + for i, s := range src { + dest[i] = int16(transposeMap[s]) + } +} + +func transposeUint64Uint16(src []uint64, dest []uint16, transposeMap []int32) { + for i, s := range src { + dest[i] = uint16(transposeMap[s]) + } +} + +func transposeUint64Int32(src []uint64, dest []int32, transposeMap []int32) { + for i, s := range src { + dest[i] = int32(transposeMap[s]) + } +} + +func transposeUint64Uint32(src []uint64, dest []uint32, transposeMap []int32) { + for i, s := range src { + dest[i] = uint32(transposeMap[s]) + } +} + +func transposeUint64Int64(src []uint64, dest []int64, transposeMap []int32) { + for i, s := range src { + dest[i] = int64(transposeMap[s]) + } +} + +func transposeUint64Uint64(src []uint64, dest []uint64, transposeMap []int32) { + for i, s := range src { + dest[i] = uint64(transposeMap[s]) + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl new file mode 100644 index 000000000..680ae1ee7 --- /dev/null +++ 
b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.go.tmpl @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type }} +{{ $srcName := .Name }} +{{ range $typelist }} +{{ $dest := .Type }} +{{ $destName := .Name }} + +func transpose{{ $srcName }}{{ $destName }}(src []{{$src}}, dest []{{$dest}}, transposeMap []int32) { + for i, s := range src { + dest[i] = {{ $dest }}(transposeMap[s]) + } +} + +{{ end }} +{{ end }} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata new file mode 100644 index 000000000..72eaf300c --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints.tmpldata @@ -0,0 +1,34 @@ +[ + { + "Name": "Int8", + "Type": "int8" + }, + { + "Name": "Uint8", + "Type": "uint8" + }, + { + "Name": "Int16", + "Type": "int16" + }, + { + "Name": "Uint16", + "Type": "uint16" + }, + { + "Name": "Int32", + "Type": "int32" + }, + { + "Name": "Uint32", + "Type": "uint32" + }, + { + "Name": "Int64", + "Type": "int64" + }, + { + "Name": "Uint64", + "Type": "uint64" + } +] diff --git 
a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go new file mode 100644 index 000000000..d4433d368 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go @@ -0,0 +1,325 @@ +// Code generated by transpose_ints_amd64.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !noasm + +package utils + +import ( + "golang.org/x/sys/cpu" +) + +var ( + TransposeInt8Int8 func([]int8, []int8, []int32) + TransposeInt8Uint8 func([]int8, []uint8, []int32) + TransposeInt8Int16 func([]int8, []int16, []int32) + TransposeInt8Uint16 func([]int8, []uint16, []int32) + TransposeInt8Int32 func([]int8, []int32, []int32) + TransposeInt8Uint32 func([]int8, []uint32, []int32) + TransposeInt8Int64 func([]int8, []int64, []int32) + TransposeInt8Uint64 func([]int8, []uint64, []int32) + + TransposeUint8Int8 func([]uint8, []int8, []int32) + TransposeUint8Uint8 func([]uint8, []uint8, []int32) + TransposeUint8Int16 func([]uint8, []int16, []int32) + TransposeUint8Uint16 func([]uint8, []uint16, []int32) + TransposeUint8Int32 func([]uint8, []int32, []int32) + TransposeUint8Uint32 func([]uint8, []uint32, []int32) + TransposeUint8Int64 func([]uint8, []int64, []int32) + TransposeUint8Uint64 func([]uint8, []uint64, []int32) + + TransposeInt16Int8 func([]int16, []int8, []int32) + TransposeInt16Uint8 func([]int16, []uint8, []int32) + TransposeInt16Int16 func([]int16, []int16, []int32) + TransposeInt16Uint16 func([]int16, []uint16, []int32) + TransposeInt16Int32 func([]int16, []int32, []int32) + TransposeInt16Uint32 func([]int16, []uint32, []int32) + TransposeInt16Int64 func([]int16, []int64, []int32) + TransposeInt16Uint64 func([]int16, []uint64, []int32) + + TransposeUint16Int8 func([]uint16, []int8, []int32) + TransposeUint16Uint8 func([]uint16, []uint8, []int32) + TransposeUint16Int16 func([]uint16, []int16, []int32) + TransposeUint16Uint16 func([]uint16, []uint16, []int32) + TransposeUint16Int32 func([]uint16, []int32, []int32) + TransposeUint16Uint32 func([]uint16, []uint32, []int32) + TransposeUint16Int64 func([]uint16, []int64, []int32) + TransposeUint16Uint64 func([]uint16, []uint64, []int32) + + TransposeInt32Int8 func([]int32, []int8, []int32) + TransposeInt32Uint8 func([]int32, []uint8, []int32) + TransposeInt32Int16 func([]int32, []int16, []int32) 
+ TransposeInt32Uint16 func([]int32, []uint16, []int32) + TransposeInt32Int32 func([]int32, []int32, []int32) + TransposeInt32Uint32 func([]int32, []uint32, []int32) + TransposeInt32Int64 func([]int32, []int64, []int32) + TransposeInt32Uint64 func([]int32, []uint64, []int32) + + TransposeUint32Int8 func([]uint32, []int8, []int32) + TransposeUint32Uint8 func([]uint32, []uint8, []int32) + TransposeUint32Int16 func([]uint32, []int16, []int32) + TransposeUint32Uint16 func([]uint32, []uint16, []int32) + TransposeUint32Int32 func([]uint32, []int32, []int32) + TransposeUint32Uint32 func([]uint32, []uint32, []int32) + TransposeUint32Int64 func([]uint32, []int64, []int32) + TransposeUint32Uint64 func([]uint32, []uint64, []int32) + + TransposeInt64Int8 func([]int64, []int8, []int32) + TransposeInt64Uint8 func([]int64, []uint8, []int32) + TransposeInt64Int16 func([]int64, []int16, []int32) + TransposeInt64Uint16 func([]int64, []uint16, []int32) + TransposeInt64Int32 func([]int64, []int32, []int32) + TransposeInt64Uint32 func([]int64, []uint32, []int32) + TransposeInt64Int64 func([]int64, []int64, []int32) + TransposeInt64Uint64 func([]int64, []uint64, []int32) + + TransposeUint64Int8 func([]uint64, []int8, []int32) + TransposeUint64Uint8 func([]uint64, []uint8, []int32) + TransposeUint64Int16 func([]uint64, []int16, []int32) + TransposeUint64Uint16 func([]uint64, []uint16, []int32) + TransposeUint64Int32 func([]uint64, []int32, []int32) + TransposeUint64Uint32 func([]uint64, []uint32, []int32) + TransposeUint64Int64 func([]uint64, []int64, []int32) + TransposeUint64Uint64 func([]uint64, []uint64, []int32) +) + +func init() { + if cpu.X86.HasAVX2 { + + TransposeInt8Int8 = transposeInt8Int8avx2 + TransposeInt8Uint8 = transposeInt8Uint8avx2 + TransposeInt8Int16 = transposeInt8Int16avx2 + TransposeInt8Uint16 = transposeInt8Uint16avx2 + TransposeInt8Int32 = transposeInt8Int32avx2 + TransposeInt8Uint32 = transposeInt8Uint32avx2 + TransposeInt8Int64 = transposeInt8Int64avx2 + 
TransposeInt8Uint64 = transposeInt8Uint64avx2 + + TransposeUint8Int8 = transposeUint8Int8avx2 + TransposeUint8Uint8 = transposeUint8Uint8avx2 + TransposeUint8Int16 = transposeUint8Int16avx2 + TransposeUint8Uint16 = transposeUint8Uint16avx2 + TransposeUint8Int32 = transposeUint8Int32avx2 + TransposeUint8Uint32 = transposeUint8Uint32avx2 + TransposeUint8Int64 = transposeUint8Int64avx2 + TransposeUint8Uint64 = transposeUint8Uint64avx2 + + TransposeInt16Int8 = transposeInt16Int8avx2 + TransposeInt16Uint8 = transposeInt16Uint8avx2 + TransposeInt16Int16 = transposeInt16Int16avx2 + TransposeInt16Uint16 = transposeInt16Uint16avx2 + TransposeInt16Int32 = transposeInt16Int32avx2 + TransposeInt16Uint32 = transposeInt16Uint32avx2 + TransposeInt16Int64 = transposeInt16Int64avx2 + TransposeInt16Uint64 = transposeInt16Uint64avx2 + + TransposeUint16Int8 = transposeUint16Int8avx2 + TransposeUint16Uint8 = transposeUint16Uint8avx2 + TransposeUint16Int16 = transposeUint16Int16avx2 + TransposeUint16Uint16 = transposeUint16Uint16avx2 + TransposeUint16Int32 = transposeUint16Int32avx2 + TransposeUint16Uint32 = transposeUint16Uint32avx2 + TransposeUint16Int64 = transposeUint16Int64avx2 + TransposeUint16Uint64 = transposeUint16Uint64avx2 + + TransposeInt32Int8 = transposeInt32Int8avx2 + TransposeInt32Uint8 = transposeInt32Uint8avx2 + TransposeInt32Int16 = transposeInt32Int16avx2 + TransposeInt32Uint16 = transposeInt32Uint16avx2 + TransposeInt32Int32 = transposeInt32Int32avx2 + TransposeInt32Uint32 = transposeInt32Uint32avx2 + TransposeInt32Int64 = transposeInt32Int64avx2 + TransposeInt32Uint64 = transposeInt32Uint64avx2 + + TransposeUint32Int8 = transposeUint32Int8avx2 + TransposeUint32Uint8 = transposeUint32Uint8avx2 + TransposeUint32Int16 = transposeUint32Int16avx2 + TransposeUint32Uint16 = transposeUint32Uint16avx2 + TransposeUint32Int32 = transposeUint32Int32avx2 + TransposeUint32Uint32 = transposeUint32Uint32avx2 + TransposeUint32Int64 = transposeUint32Int64avx2 + TransposeUint32Uint64 
= transposeUint32Uint64avx2 + + TransposeInt64Int8 = transposeInt64Int8avx2 + TransposeInt64Uint8 = transposeInt64Uint8avx2 + TransposeInt64Int16 = transposeInt64Int16avx2 + TransposeInt64Uint16 = transposeInt64Uint16avx2 + TransposeInt64Int32 = transposeInt64Int32avx2 + TransposeInt64Uint32 = transposeInt64Uint32avx2 + TransposeInt64Int64 = transposeInt64Int64avx2 + TransposeInt64Uint64 = transposeInt64Uint64avx2 + + TransposeUint64Int8 = transposeUint64Int8avx2 + TransposeUint64Uint8 = transposeUint64Uint8avx2 + TransposeUint64Int16 = transposeUint64Int16avx2 + TransposeUint64Uint16 = transposeUint64Uint16avx2 + TransposeUint64Int32 = transposeUint64Int32avx2 + TransposeUint64Uint32 = transposeUint64Uint32avx2 + TransposeUint64Int64 = transposeUint64Int64avx2 + TransposeUint64Uint64 = transposeUint64Uint64avx2 + + } else if cpu.X86.HasSSE42 { + + TransposeInt8Int8 = transposeInt8Int8sse4 + TransposeInt8Uint8 = transposeInt8Uint8sse4 + TransposeInt8Int16 = transposeInt8Int16sse4 + TransposeInt8Uint16 = transposeInt8Uint16sse4 + TransposeInt8Int32 = transposeInt8Int32sse4 + TransposeInt8Uint32 = transposeInt8Uint32sse4 + TransposeInt8Int64 = transposeInt8Int64sse4 + TransposeInt8Uint64 = transposeInt8Uint64sse4 + + TransposeUint8Int8 = transposeUint8Int8sse4 + TransposeUint8Uint8 = transposeUint8Uint8sse4 + TransposeUint8Int16 = transposeUint8Int16sse4 + TransposeUint8Uint16 = transposeUint8Uint16sse4 + TransposeUint8Int32 = transposeUint8Int32sse4 + TransposeUint8Uint32 = transposeUint8Uint32sse4 + TransposeUint8Int64 = transposeUint8Int64sse4 + TransposeUint8Uint64 = transposeUint8Uint64sse4 + + TransposeInt16Int8 = transposeInt16Int8sse4 + TransposeInt16Uint8 = transposeInt16Uint8sse4 + TransposeInt16Int16 = transposeInt16Int16sse4 + TransposeInt16Uint16 = transposeInt16Uint16sse4 + TransposeInt16Int32 = transposeInt16Int32sse4 + TransposeInt16Uint32 = transposeInt16Uint32sse4 + TransposeInt16Int64 = transposeInt16Int64sse4 + TransposeInt16Uint64 = 
transposeInt16Uint64sse4 + + TransposeUint16Int8 = transposeUint16Int8sse4 + TransposeUint16Uint8 = transposeUint16Uint8sse4 + TransposeUint16Int16 = transposeUint16Int16sse4 + TransposeUint16Uint16 = transposeUint16Uint16sse4 + TransposeUint16Int32 = transposeUint16Int32sse4 + TransposeUint16Uint32 = transposeUint16Uint32sse4 + TransposeUint16Int64 = transposeUint16Int64sse4 + TransposeUint16Uint64 = transposeUint16Uint64sse4 + + TransposeInt32Int8 = transposeInt32Int8sse4 + TransposeInt32Uint8 = transposeInt32Uint8sse4 + TransposeInt32Int16 = transposeInt32Int16sse4 + TransposeInt32Uint16 = transposeInt32Uint16sse4 + TransposeInt32Int32 = transposeInt32Int32sse4 + TransposeInt32Uint32 = transposeInt32Uint32sse4 + TransposeInt32Int64 = transposeInt32Int64sse4 + TransposeInt32Uint64 = transposeInt32Uint64sse4 + + TransposeUint32Int8 = transposeUint32Int8sse4 + TransposeUint32Uint8 = transposeUint32Uint8sse4 + TransposeUint32Int16 = transposeUint32Int16sse4 + TransposeUint32Uint16 = transposeUint32Uint16sse4 + TransposeUint32Int32 = transposeUint32Int32sse4 + TransposeUint32Uint32 = transposeUint32Uint32sse4 + TransposeUint32Int64 = transposeUint32Int64sse4 + TransposeUint32Uint64 = transposeUint32Uint64sse4 + + TransposeInt64Int8 = transposeInt64Int8sse4 + TransposeInt64Uint8 = transposeInt64Uint8sse4 + TransposeInt64Int16 = transposeInt64Int16sse4 + TransposeInt64Uint16 = transposeInt64Uint16sse4 + TransposeInt64Int32 = transposeInt64Int32sse4 + TransposeInt64Uint32 = transposeInt64Uint32sse4 + TransposeInt64Int64 = transposeInt64Int64sse4 + TransposeInt64Uint64 = transposeInt64Uint64sse4 + + TransposeUint64Int8 = transposeUint64Int8sse4 + TransposeUint64Uint8 = transposeUint64Uint8sse4 + TransposeUint64Int16 = transposeUint64Int16sse4 + TransposeUint64Uint16 = transposeUint64Uint16sse4 + TransposeUint64Int32 = transposeUint64Int32sse4 + TransposeUint64Uint32 = transposeUint64Uint32sse4 + TransposeUint64Int64 = transposeUint64Int64sse4 + TransposeUint64Uint64 = 
transposeUint64Uint64sse4 + + } else { + + TransposeInt8Int8 = transposeInt8Int8 + TransposeInt8Uint8 = transposeInt8Uint8 + TransposeInt8Int16 = transposeInt8Int16 + TransposeInt8Uint16 = transposeInt8Uint16 + TransposeInt8Int32 = transposeInt8Int32 + TransposeInt8Uint32 = transposeInt8Uint32 + TransposeInt8Int64 = transposeInt8Int64 + TransposeInt8Uint64 = transposeInt8Uint64 + + TransposeUint8Int8 = transposeUint8Int8 + TransposeUint8Uint8 = transposeUint8Uint8 + TransposeUint8Int16 = transposeUint8Int16 + TransposeUint8Uint16 = transposeUint8Uint16 + TransposeUint8Int32 = transposeUint8Int32 + TransposeUint8Uint32 = transposeUint8Uint32 + TransposeUint8Int64 = transposeUint8Int64 + TransposeUint8Uint64 = transposeUint8Uint64 + + TransposeInt16Int8 = transposeInt16Int8 + TransposeInt16Uint8 = transposeInt16Uint8 + TransposeInt16Int16 = transposeInt16Int16 + TransposeInt16Uint16 = transposeInt16Uint16 + TransposeInt16Int32 = transposeInt16Int32 + TransposeInt16Uint32 = transposeInt16Uint32 + TransposeInt16Int64 = transposeInt16Int64 + TransposeInt16Uint64 = transposeInt16Uint64 + + TransposeUint16Int8 = transposeUint16Int8 + TransposeUint16Uint8 = transposeUint16Uint8 + TransposeUint16Int16 = transposeUint16Int16 + TransposeUint16Uint16 = transposeUint16Uint16 + TransposeUint16Int32 = transposeUint16Int32 + TransposeUint16Uint32 = transposeUint16Uint32 + TransposeUint16Int64 = transposeUint16Int64 + TransposeUint16Uint64 = transposeUint16Uint64 + + TransposeInt32Int8 = transposeInt32Int8 + TransposeInt32Uint8 = transposeInt32Uint8 + TransposeInt32Int16 = transposeInt32Int16 + TransposeInt32Uint16 = transposeInt32Uint16 + TransposeInt32Int32 = transposeInt32Int32 + TransposeInt32Uint32 = transposeInt32Uint32 + TransposeInt32Int64 = transposeInt32Int64 + TransposeInt32Uint64 = transposeInt32Uint64 + + TransposeUint32Int8 = transposeUint32Int8 + TransposeUint32Uint8 = transposeUint32Uint8 + TransposeUint32Int16 = transposeUint32Int16 + TransposeUint32Uint16 = 
transposeUint32Uint16 + TransposeUint32Int32 = transposeUint32Int32 + TransposeUint32Uint32 = transposeUint32Uint32 + TransposeUint32Int64 = transposeUint32Int64 + TransposeUint32Uint64 = transposeUint32Uint64 + + TransposeInt64Int8 = transposeInt64Int8 + TransposeInt64Uint8 = transposeInt64Uint8 + TransposeInt64Int16 = transposeInt64Int16 + TransposeInt64Uint16 = transposeInt64Uint16 + TransposeInt64Int32 = transposeInt64Int32 + TransposeInt64Uint32 = transposeInt64Uint32 + TransposeInt64Int64 = transposeInt64Int64 + TransposeInt64Uint64 = transposeInt64Uint64 + + TransposeUint64Int8 = transposeUint64Int8 + TransposeUint64Uint8 = transposeUint64Uint8 + TransposeUint64Int16 = transposeUint64Int16 + TransposeUint64Uint16 = transposeUint64Uint16 + TransposeUint64Int32 = transposeUint64Int32 + TransposeUint64Uint32 = transposeUint64Uint32 + TransposeUint64Int64 = transposeUint64Int64 + TransposeUint64Uint64 = transposeUint64Uint64 + + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl new file mode 100644 index 000000000..eac0208e5 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_amd64.go.tmpl @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm +// +build !noasm + +package utils + +import ( + "golang.org/x/sys/cpu" +) + +var ( +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} func([]{{$src}}, []{{$dest}}, []int32) +{{end}} +{{end}} +) + + +func init() { + if cpu.X86.HasAVX2 { +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }}avx2 +{{end}} +{{end}} + } else if cpu.X86.HasSSE42 { +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }}sse4 +{{end}} +{{end}} + } else { +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} = transpose{{ $srcName }}{{ $destName }} +{{end}} +{{end}} + } +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go new file mode 100644 index 000000000..cc957cdaa --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_arm64.go @@ -0,0 +1,96 @@ +// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( + TransposeInt8Int8 = transposeInt8Int8 + TransposeInt8Uint8 = transposeInt8Uint8 + TransposeInt8Int16 = transposeInt8Int16 + TransposeInt8Uint16 = transposeInt8Uint16 + TransposeInt8Int32 = transposeInt8Int32 + TransposeInt8Uint32 = transposeInt8Uint32 + TransposeInt8Int64 = transposeInt8Int64 + TransposeInt8Uint64 = transposeInt8Uint64 + + TransposeUint8Int8 = transposeUint8Int8 + TransposeUint8Uint8 = transposeUint8Uint8 + TransposeUint8Int16 = transposeUint8Int16 + TransposeUint8Uint16 = transposeUint8Uint16 + TransposeUint8Int32 = transposeUint8Int32 + TransposeUint8Uint32 = transposeUint8Uint32 + TransposeUint8Int64 = transposeUint8Int64 + TransposeUint8Uint64 = transposeUint8Uint64 + + TransposeInt16Int8 = transposeInt16Int8 + TransposeInt16Uint8 = transposeInt16Uint8 + TransposeInt16Int16 = transposeInt16Int16 + TransposeInt16Uint16 = transposeInt16Uint16 + TransposeInt16Int32 = transposeInt16Int32 + TransposeInt16Uint32 = transposeInt16Uint32 + TransposeInt16Int64 = transposeInt16Int64 + TransposeInt16Uint64 = transposeInt16Uint64 + + TransposeUint16Int8 = transposeUint16Int8 + TransposeUint16Uint8 = transposeUint16Uint8 + TransposeUint16Int16 = transposeUint16Int16 + TransposeUint16Uint16 = transposeUint16Uint16 + TransposeUint16Int32 = 
transposeUint16Int32 + TransposeUint16Uint32 = transposeUint16Uint32 + TransposeUint16Int64 = transposeUint16Int64 + TransposeUint16Uint64 = transposeUint16Uint64 + + TransposeInt32Int8 = transposeInt32Int8 + TransposeInt32Uint8 = transposeInt32Uint8 + TransposeInt32Int16 = transposeInt32Int16 + TransposeInt32Uint16 = transposeInt32Uint16 + TransposeInt32Int32 = transposeInt32Int32 + TransposeInt32Uint32 = transposeInt32Uint32 + TransposeInt32Int64 = transposeInt32Int64 + TransposeInt32Uint64 = transposeInt32Uint64 + + TransposeUint32Int8 = transposeUint32Int8 + TransposeUint32Uint8 = transposeUint32Uint8 + TransposeUint32Int16 = transposeUint32Int16 + TransposeUint32Uint16 = transposeUint32Uint16 + TransposeUint32Int32 = transposeUint32Int32 + TransposeUint32Uint32 = transposeUint32Uint32 + TransposeUint32Int64 = transposeUint32Int64 + TransposeUint32Uint64 = transposeUint32Uint64 + + TransposeInt64Int8 = transposeInt64Int8 + TransposeInt64Uint8 = transposeInt64Uint8 + TransposeInt64Int16 = transposeInt64Int16 + TransposeInt64Uint16 = transposeInt64Uint16 + TransposeInt64Int32 = transposeInt64Int32 + TransposeInt64Uint32 = transposeInt64Uint32 + TransposeInt64Int64 = transposeInt64Int64 + TransposeInt64Uint64 = transposeInt64Uint64 + + TransposeUint64Int8 = transposeUint64Int8 + TransposeUint64Uint8 = transposeUint64Uint8 + TransposeUint64Int16 = transposeUint64Int16 + TransposeUint64Uint16 = transposeUint64Uint16 + TransposeUint64Int32 = transposeUint64Int32 + TransposeUint64Uint32 = transposeUint64Uint32 + TransposeUint64Int64 = transposeUint64Int64 + TransposeUint64Uint64 = transposeUint64Uint64 +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go new file mode 100644 index 000000000..f1421ddf5 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.go @@ -0,0 +1,473 @@ +// Code generated by 
transpose_ints_simd.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +import ( + "unsafe" +) + +//go:noescape +func _transpose_int8_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int8avx2(src []int8, dest []int8, transposeMap []int32) { + _transpose_int8_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint8avx2(src []int8, dest []uint8, transposeMap []int32) { + _transpose_int8_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int16avx2(src []int8, dest []int16, transposeMap []int32) { + _transpose_int8_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap 
unsafe.Pointer) + +func transposeInt8Uint16avx2(src []int8, dest []uint16, transposeMap []int32) { + _transpose_int8_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int32avx2(src []int8, dest []int32, transposeMap []int32) { + _transpose_int8_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint32avx2(src []int8, dest []uint32, transposeMap []int32) { + _transpose_int8_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int64avx2(src []int8, dest []int64, transposeMap []int32) { + _transpose_int8_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint64avx2(src []int8, dest []uint64, transposeMap []int32) { + _transpose_int8_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int8avx2(src []uint8, dest []int8, transposeMap []int32) { + _transpose_uint8_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeUint8Uint8avx2(src []uint8, dest []uint8, transposeMap []int32) { + _transpose_uint8_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int16avx2(src []uint8, dest []int16, transposeMap []int32) { + _transpose_uint8_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint16avx2(src []uint8, dest []uint16, transposeMap []int32) { + _transpose_uint8_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int32avx2(src []uint8, dest []int32, transposeMap []int32) { + _transpose_uint8_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint32avx2(src []uint8, dest []uint32, transposeMap []int32) { + _transpose_uint8_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int64avx2(src []uint8, dest []int64, transposeMap []int32) { + _transpose_uint8_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeUint8Uint64avx2(src []uint8, dest []uint64, transposeMap []int32) { + _transpose_uint8_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int8avx2(src []int16, dest []int8, transposeMap []int32) { + _transpose_int16_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint8avx2(src []int16, dest []uint8, transposeMap []int32) { + _transpose_int16_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int16avx2(src []int16, dest []int16, transposeMap []int32) { + _transpose_int16_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint16avx2(src []int16, dest []uint16, transposeMap []int32) { + _transpose_int16_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int32avx2(src []int16, dest []int32, transposeMap []int32) { + _transpose_int16_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeInt16Uint32avx2(src []int16, dest []uint32, transposeMap []int32) { + _transpose_int16_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int64avx2(src []int16, dest []int64, transposeMap []int32) { + _transpose_int16_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint64avx2(src []int16, dest []uint64, transposeMap []int32) { + _transpose_int16_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int8avx2(src []uint16, dest []int8, transposeMap []int32) { + _transpose_uint16_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint8avx2(src []uint16, dest []uint8, transposeMap []int32) { + _transpose_uint16_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int16avx2(src []uint16, dest []int16, transposeMap []int32) { + _transpose_uint16_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeUint16Uint16avx2(src []uint16, dest []uint16, transposeMap []int32) { + _transpose_uint16_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int32avx2(src []uint16, dest []int32, transposeMap []int32) { + _transpose_uint16_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint32avx2(src []uint16, dest []uint32, transposeMap []int32) { + _transpose_uint16_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int64avx2(src []uint16, dest []int64, transposeMap []int32) { + _transpose_uint16_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint64avx2(src []uint16, dest []uint64, transposeMap []int32) { + _transpose_uint16_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int8avx2(src []int32, dest []int8, transposeMap []int32) { + _transpose_int32_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + 
+func transposeInt32Uint8avx2(src []int32, dest []uint8, transposeMap []int32) { + _transpose_int32_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int16avx2(src []int32, dest []int16, transposeMap []int32) { + _transpose_int32_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint16avx2(src []int32, dest []uint16, transposeMap []int32) { + _transpose_int32_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int32avx2(src []int32, dest []int32, transposeMap []int32) { + _transpose_int32_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint32avx2(src []int32, dest []uint32, transposeMap []int32) { + _transpose_int32_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int64avx2(src []int32, dest []int64, transposeMap []int32) { + _transpose_int32_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeInt32Uint64avx2(src []int32, dest []uint64, transposeMap []int32) { + _transpose_int32_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int8avx2(src []uint32, dest []int8, transposeMap []int32) { + _transpose_uint32_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint8avx2(src []uint32, dest []uint8, transposeMap []int32) { + _transpose_uint32_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int16avx2(src []uint32, dest []int16, transposeMap []int32) { + _transpose_uint32_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint16avx2(src []uint32, dest []uint16, transposeMap []int32) { + _transpose_uint32_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int32avx2(src []uint32, dest []int32, transposeMap []int32) { + _transpose_uint32_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + 
+func transposeUint32Uint32avx2(src []uint32, dest []uint32, transposeMap []int32) { + _transpose_uint32_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int64avx2(src []uint32, dest []int64, transposeMap []int32) { + _transpose_uint32_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint64avx2(src []uint32, dest []uint64, transposeMap []int32) { + _transpose_uint32_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int8avx2(src []int64, dest []int8, transposeMap []int32) { + _transpose_int64_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint8avx2(src []int64, dest []uint8, transposeMap []int32) { + _transpose_int64_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int16avx2(src []int64, dest []int16, transposeMap []int32) { + _transpose_int64_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + 
+func transposeInt64Uint16avx2(src []int64, dest []uint16, transposeMap []int32) { + _transpose_int64_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int32avx2(src []int64, dest []int32, transposeMap []int32) { + _transpose_int64_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint32avx2(src []int64, dest []uint32, transposeMap []int32) { + _transpose_int64_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int64avx2(src []int64, dest []int64, transposeMap []int32) { + _transpose_int64_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint64avx2(src []int64, dest []uint64, transposeMap []int32) { + _transpose_int64_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int8avx2(src []uint64, dest []int8, transposeMap []int32) { + _transpose_uint64_int8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint8_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func 
transposeUint64Uint8avx2(src []uint64, dest []uint8, transposeMap []int32) { + _transpose_uint64_uint8_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int16avx2(src []uint64, dest []int16, transposeMap []int32) { + _transpose_uint64_int16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint16_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint16avx2(src []uint64, dest []uint16, transposeMap []int32) { + _transpose_uint64_uint16_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int32avx2(src []uint64, dest []int32, transposeMap []int32) { + _transpose_uint64_int32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint32_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint32avx2(src []uint64, dest []uint32, transposeMap []int32) { + _transpose_uint64_uint32_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int64_avx2(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int64avx2(src []uint64, dest []int64, transposeMap []int32) { + _transpose_uint64_int64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint64_avx2(src, dest unsafe.Pointer, length int, transposeMap 
unsafe.Pointer) + +func transposeUint64Uint64avx2(src []uint64, dest []uint64, transposeMap []int32) { + _transpose_uint64_uint64_avx2(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s new file mode 100644 index 000000000..fbcc101eb --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_avx2_amd64.s @@ -0,0 +1,3074 @@ +//+build !noasm !appengine +// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT + +TEXT ·_transpose_uint8_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB0_1 + +LBB0_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB0_5 + +LBB0_1: + WORD $0xd285 // test edx, edx + JLE LBB0_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB0_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // 
add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB0_3 + +LBB0_4: + RET + +TEXT ·_transpose_int8_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB1_1 + +LBB1_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB1_5 + +LBB1_1: + WORD $0xd285 // test edx, edx + JLE LBB1_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB1_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB1_3 + +LBB1_4: + RET + +TEXT ·_transpose_uint16_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB2_1 + +LBB2_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte 
[rsi], dl + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB2_5 + +LBB2_1: + WORD $0xd285 // test edx, edx + JLE LBB2_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB2_3: + LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB2_3 + +LBB2_4: + RET + +TEXT ·_transpose_int16_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB3_1 + +LBB3_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // 
lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB3_5 + +LBB3_1: + WORD $0xd285 // test edx, edx + JLE LBB3_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB3_3: + LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB3_3 + +LBB3_4: + RET + +TEXT ·_transpose_uint32_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB4_1 + +LBB4_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB4_5 + +LBB4_1: + WORD $0xd285 // test edx, edx + JLE LBB4_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB4_3: + LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD 
$0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB4_3 + +LBB4_4: + RET + +TEXT ·_transpose_int32_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB5_1 + +LBB5_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB5_5 + +LBB5_1: + WORD $0xd285 // test edx, edx + JLE LBB5_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB5_3: + LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB5_3 + +LBB5_4: + RET + +TEXT ·_transpose_uint64_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB6_1 + +LBB6_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, 
byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB6_5 + +LBB6_1: + WORD $0xd285 // test edx, edx + JLE LBB6_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB6_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB6_3 + +LBB6_4: + RET + +TEXT ·_transpose_int64_uint8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB7_1 + +LBB7_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG 
LBB7_5 + +LBB7_1: + WORD $0xd285 // test edx, edx + JLE LBB7_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB7_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB7_3 + +LBB7_4: + RET + +TEXT ·_transpose_uint8_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB8_1 + +LBB8_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB8_5 + +LBB8_1: + WORD $0xd285 // test edx, edx + JLE LBB8_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB8_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB8_3 + +LBB8_4: + RET + +TEXT ·_transpose_int8_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ 
dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB9_1 + +LBB9_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB9_5 + +LBB9_1: + WORD $0xd285 // test edx, edx + JLE LBB9_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB9_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB9_3 + +LBB9_4: + RET + +TEXT ·_transpose_uint16_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB10_1 + +LBB10_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0457b70f // movzx edx, word 
[rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB10_5 + +LBB10_1: + WORD $0xd285 // test edx, edx + JLE LBB10_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB10_3: + LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB10_3 + +LBB10_4: + RET + +TEXT ·_transpose_int16_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB11_1 + +LBB11_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB11_5 + +LBB11_1: + WORD $0xd285 // test edx, edx + JLE LBB11_4 
+ WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB11_3: + LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB11_3 + +LBB11_4: + RET + +TEXT ·_transpose_uint32_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB12_1 + +LBB12_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB12_5 + +LBB12_1: + WORD $0xd285 // test edx, edx + JLE LBB12_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB12_3: + LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB12_3 + +LBB12_4: + RET + +TEXT ·_transpose_int32_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + 
MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB13_1 + +LBB13_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB13_5 + +LBB13_1: + WORD $0xd285 // test edx, edx + JLE LBB13_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB13_3: + LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB13_3 + +LBB13_4: + RET + +TEXT ·_transpose_uint64_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB14_1 + +LBB14_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + 
WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB14_5 + +LBB14_1: + WORD $0xd285 // test edx, edx + JLE LBB14_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB14_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB14_3 + +LBB14_4: + RET + +TEXT ·_transpose_int64_int8_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB15_1 + +LBB15_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB15_5 + +LBB15_1: + WORD $0xd285 // test edx, edx + JLE LBB15_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB15_3: + LONG 
$0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB15_3 + +LBB15_4: + RET + +TEXT ·_transpose_uint8_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB16_1 + +LBB16_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB16_5 + +LBB16_1: + WORD $0xd285 // test edx, edx + JLE LBB16_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB16_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB16_3 + +LBB16_4: + RET + +TEXT ·_transpose_int8_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB17_1 + +LBB17_5: + WORD $0xd089 
// mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB17_5 + +LBB17_1: + WORD $0xd285 // test edx, edx + JLE LBB17_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB17_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB17_3 + +LBB17_4: + RET + +TEXT ·_transpose_uint16_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB18_1 + +LBB18_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0657b70f // movzx edx, 
word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB18_5 + +LBB18_1: + WORD $0xd285 // test edx, edx + JLE LBB18_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB18_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB18_3 + +LBB18_4: + RET + +TEXT ·_transpose_int16_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB19_1 + +LBB19_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB19_5 + +LBB19_1: + WORD $0xd285 // test edx, edx + JLE LBB19_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB19_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + 
r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB19_3 + +LBB19_4: + RET + +TEXT ·_transpose_uint32_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB20_1 + +LBB20_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB20_5 + +LBB20_1: + WORD $0xd285 // test edx, edx + JLE LBB20_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB20_3: + LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB20_3 + +LBB20_4: + RET + +TEXT ·_transpose_int32_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB21_1 + +LBB21_5: + WORD $0xd089 // mov eax, edx + WORD 
$0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB21_5 + +LBB21_1: + WORD $0xd285 // test edx, edx + JLE LBB21_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB21_3: + LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB21_3 + +LBB21_4: + RET + +TEXT ·_transpose_uint64_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB22_1 + +LBB22_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word 
[rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB22_5 + +LBB22_1: + WORD $0xd285 // test edx, edx + JLE LBB22_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB22_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB22_3 + +LBB22_4: + RET + +TEXT ·_transpose_int64_uint16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB23_1 + +LBB23_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB23_5 + +LBB23_1: + WORD $0xd285 // test edx, edx + JLE LBB23_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB23_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + 
r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB23_3 + +LBB23_4: + RET + +TEXT ·_transpose_uint8_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB24_1 + +LBB24_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB24_5 + +LBB24_1: + WORD $0xd285 // test edx, edx + JLE LBB24_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB24_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB24_3 + +LBB24_4: + RET + +TEXT ·_transpose_int8_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB25_1 + +LBB25_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE 
$0x16 // mov word [rsi], dx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB25_5 + +LBB25_1: + WORD $0xd285 // test edx, edx + JLE LBB25_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB25_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB25_3 + +LBB25_4: + RET + +TEXT ·_transpose_uint16_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB26_1 + +LBB26_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE 
$0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB26_5 + +LBB26_1: + WORD $0xd285 // test edx, edx + JLE LBB26_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB26_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB26_3 + +LBB26_4: + RET + +TEXT ·_transpose_int16_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB27_1 + +LBB27_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB27_5 + +LBB27_1: + WORD $0xd285 // test edx, edx + JLE LBB27_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB27_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // 
add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB27_3 + +LBB27_4: + RET + +TEXT ·_transpose_uint32_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB28_1 + +LBB28_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB28_5 + +LBB28_1: + WORD $0xd285 // test edx, edx + JLE LBB28_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB28_3: + LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB28_3 + +LBB28_4: + RET + +TEXT ·_transpose_int32_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB29_1 + +LBB29_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], 
dx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB29_5 + +LBB29_1: + WORD $0xd285 // test edx, edx + JLE LBB29_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB29_3: + LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB29_3 + +LBB29_4: + RET + +TEXT ·_transpose_uint64_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB30_1 + +LBB30_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + 
LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB30_5 + +LBB30_1: + WORD $0xd285 // test edx, edx + JLE LBB30_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB30_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB30_3 + +LBB30_4: + RET + +TEXT ·_transpose_int64_int16_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB31_1 + +LBB31_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB31_5 + +LBB31_1: + WORD $0xd285 // test edx, edx + JLE LBB31_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB31_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB31_3 + 
+LBB31_4: + RET + +TEXT ·_transpose_uint8_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB32_1 + +LBB32_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB32_5 + +LBB32_1: + WORD $0xd285 // test edx, edx + JLE LBB32_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB32_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB32_3 + +LBB32_4: + RET + +TEXT ·_transpose_int8_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB33_1 + +LBB33_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + WORD 
$0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB33_5 + +LBB33_1: + WORD $0xd285 // test edx, edx + JLE LBB33_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB33_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB33_3 + +LBB33_4: + RET + +TEXT ·_transpose_uint16_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB34_1 + +LBB34_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; 
BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB34_5 + +LBB34_1: + WORD $0xd285 // test edx, edx + JLE LBB34_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB34_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB34_3 + +LBB34_4: + RET + +TEXT ·_transpose_int16_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB35_1 + +LBB35_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB35_5 + +LBB35_1: + WORD $0xd285 // test edx, edx + JLE LBB35_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB35_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG 
$0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB35_3 + +LBB35_4: + RET + +TEXT ·_transpose_uint32_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB36_1 + +LBB36_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB36_5 + +LBB36_1: + WORD $0xd285 // test edx, edx + JLE LBB36_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB36_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB36_3 + +LBB36_4: + RET + +TEXT ·_transpose_int32_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB37_1 + +LBB37_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd 
rdx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB37_5 + +LBB37_1: + WORD $0xd285 // test edx, edx + JLE LBB37_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB37_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB37_3 + +LBB37_4: + RET + +TEXT ·_transpose_uint64_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB38_1 + +LBB38_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword 
[rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB38_5 + +LBB38_1: + WORD $0xd285 // test edx, edx + JLE LBB38_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB38_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB38_3 + +LBB38_4: + RET + +TEXT ·_transpose_int64_uint32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB39_1 + +LBB39_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB39_5 + +LBB39_1: + WORD $0xd285 // test edx, edx + JLE LBB39_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB39_3: + LONG $0x47048b4a // mov rax, qword 
[rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB39_3 + +LBB39_4: + RET + +TEXT ·_transpose_uint8_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB40_1 + +LBB40_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB40_5 + +LBB40_1: + WORD $0xd285 // test edx, edx + JLE LBB40_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB40_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB40_3 + +LBB40_4: + RET + +TEXT ·_transpose_int8_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB41_1 + 
+LBB41_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB41_5 + +LBB41_1: + WORD $0xd285 // test edx, edx + JLE LBB41_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB41_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB41_3 + +LBB41_4: + RET + +TEXT ·_transpose_uint16_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB42_1 + +LBB42_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword 
[rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB42_5 + +LBB42_1: + WORD $0xd285 // test edx, edx + JLE LBB42_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB42_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB42_3 + +LBB42_4: + RET + +TEXT ·_transpose_int16_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB43_1 + +LBB43_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB43_5 + +LBB43_1: + WORD $0xd285 // test edx, edx 
+ JLE LBB43_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB43_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB43_3 + +LBB43_4: + RET + +TEXT ·_transpose_uint32_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB44_1 + +LBB44_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB44_5 + +LBB44_1: + WORD $0xd285 // test edx, edx + JLE LBB44_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB44_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB44_3 + +LBB44_4: + RET + +TEXT ·_transpose_int32_int32_avx2(SB), $0-32 + + 
MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB45_1 + +LBB45_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB45_5 + +LBB45_1: + WORD $0xd285 // test edx, edx + JLE LBB45_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB45_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB45_3 + +LBB45_4: + RET + +TEXT ·_transpose_uint64_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB46_1 + +LBB46_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE 
$0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB46_5 + +LBB46_1: + WORD $0xd285 // test edx, edx + JLE LBB46_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB46_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB46_3 + +LBB46_4: + RET + +TEXT ·_transpose_int64_int32_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB47_1 + +LBB47_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD 
$0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB47_5 + +LBB47_1: + WORD $0xd285 // test edx, edx + JLE LBB47_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB47_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB47_3 + +LBB47_4: + RET + +TEXT ·_transpose_uint8_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB48_1 + +LBB48_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB48_5 + +LBB48_1: + WORD $0xd285 // test edx, edx + JLE LBB48_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB48_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB48_3 + +LBB48_4: + RET + 
+TEXT ·_transpose_int8_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB49_1 + +LBB49_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB49_5 + +LBB49_1: + WORD $0xd285 // test edx, edx + JLE LBB49_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB49_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB49_3 + +LBB49_4: + RET + +TEXT ·_transpose_uint16_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB50_1 + +LBB50_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x91146348 // movsxd 
rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB50_5 + +LBB50_1: + WORD $0xd285 // test edx, edx + JLE LBB50_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB50_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB50_3 + +LBB50_4: + RET + +TEXT ·_transpose_int16_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB51_1 + +LBB51_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add 
rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB51_5 + +LBB51_1: + WORD $0xd285 // test edx, edx + JLE LBB51_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB51_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB51_3 + +LBB51_4: + RET + +TEXT ·_transpose_uint32_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB52_1 + +LBB52_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB52_5 + +LBB52_1: + WORD $0xd285 // test edx, edx + JLE LBB52_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB52_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB52_3 + 
+LBB52_4: + RET + +TEXT ·_transpose_int32_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB53_1 + +LBB53_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB53_5 + +LBB53_1: + WORD $0xd285 // test edx, edx + JLE LBB53_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB53_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB53_3 + +LBB53_4: + RET + +TEXT ·_transpose_uint64_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB54_1 + +LBB54_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword 
[rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB54_5 + +LBB54_1: + WORD $0xd285 // test edx, edx + JLE LBB54_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB54_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB54_3 + +LBB54_4: + RET + +TEXT ·_transpose_int64_uint64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB55_1 + +LBB55_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG 
LBB55_5 + +LBB55_1: + WORD $0xd285 // test edx, edx + JLE LBB55_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB55_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB55_3 + +LBB55_4: + RET + +TEXT ·_transpose_uint8_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB56_1 + +LBB56_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB56_5 + +LBB56_1: + WORD $0xd285 // test edx, edx + JLE LBB56_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB56_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB56_3 + +LBB56_4: + RET + +TEXT ·_transpose_int8_int64_avx2(SB), $0-32 + + 
MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB57_1 + +LBB57_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB57_5 + +LBB57_1: + WORD $0xd285 // test edx, edx + JLE LBB57_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB57_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB57_3 + +LBB57_4: + RET + +TEXT ·_transpose_uint16_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB58_1 + +LBB58_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov 
qword [rsi + 8], rdx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB58_5 + +LBB58_1: + WORD $0xd285 // test edx, edx + JLE LBB58_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB58_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB58_3 + +LBB58_4: + RET + +TEXT ·_transpose_int16_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB59_1 + +LBB59_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG 
LBB59_5 + +LBB59_1: + WORD $0xd285 // test edx, edx + JLE LBB59_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB59_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB59_3 + +LBB59_4: + RET + +TEXT ·_transpose_uint32_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB60_1 + +LBB60_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB60_5 + +LBB60_1: + WORD $0xd285 // test edx, edx + JLE LBB60_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB60_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB60_3 + +LBB60_4: + RET + +TEXT 
·_transpose_int32_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB61_1 + +LBB61_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB61_5 + +LBB61_1: + WORD $0xd285 // test edx, edx + JLE LBB61_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB61_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB61_3 + +LBB61_4: + RET + +TEXT ·_transpose_uint64_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB62_1 + +LBB62_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG 
$0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB62_5 + +LBB62_1: + WORD $0xd285 // test edx, edx + JLE LBB62_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB62_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB62_3 + +LBB62_4: + RET + +TEXT ·_transpose_int64_int64_avx2(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB63_1 + +LBB63_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB63_5 + +LBB63_1: + 
WORD $0xd285 // test edx, edx + JLE LBB63_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB63_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB63_3 + +LBB63_4: + RET diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go new file mode 100644 index 000000000..c52598d71 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_def.go @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package utils + +import ( + "errors" + + "github.com/apache/arrow/go/v15/arrow" +) + +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata -d arch=avx2 transpose_ints_simd.go.tmpl=transpose_ints_avx2_amd64.go +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata -d arch=sse4 transpose_ints_simd.go.tmpl=transpose_ints_sse4_amd64.go +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_s390x.go.tmpl=transpose_ints_s390x.go +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_s390x.go.tmpl=transpose_ints_arm64.go +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints_noasm.go.tmpl=transpose_ints_noasm.go +//go:generate go run ../../arrow/_tools/tmpl -i -data=transpose_ints.tmpldata transpose_ints.go.tmpl=transpose_ints.go + +func bufToTyped(typ arrow.DataType, buf []byte, offset, length int) (interface{}, error) { + switch typ.ID() { + case arrow.INT8: + return arrow.Int8Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.INT16: + return arrow.Int16Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.INT32: + return arrow.Int32Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.INT64: + return arrow.Int64Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.UINT8: + return arrow.Uint8Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.UINT16: + return arrow.Uint16Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.UINT32: + return arrow.Uint32Traits.CastFromBytes(buf)[offset : offset+length], nil + case arrow.UINT64: + return arrow.Uint64Traits.CastFromBytes(buf)[offset : offset+length], nil + } + return nil, errors.New("only accepts integral types") +} + +// TransposeIntsBuffers takes the data-types, byte buffers, and offsets of a source and destination +// buffer to perform TransposeInts on with the 
provided mapping data. +func TransposeIntsBuffers(inType, outType arrow.DataType, indata, outdata []byte, inOffset, outOffset int, length int, transposeMap []int32) error { + src, err := bufToTyped(inType, indata, inOffset, length) + if err != nil { + return err + } + dest, err := bufToTyped(outType, outdata, outOffset, length) + if err != nil { + return err + } + + return TransposeInts(src, dest, transposeMap) +} + +// TransposeInts expects two integral slices and the values they map to. Returning +// an error if either src or dest are not an integral type. +func TransposeInts(src, dest interface{}, mapping []int32) error { + switch s := src.(type) { + case []int8: + switch d := dest.(type) { + case []int8: + TransposeInt8Int8(s, d, mapping) + case []int16: + TransposeInt8Int16(s, d, mapping) + case []int32: + TransposeInt8Int32(s, d, mapping) + case []int64: + TransposeInt8Int64(s, d, mapping) + case []uint8: + TransposeInt8Uint8(s, d, mapping) + case []uint16: + TransposeInt8Uint16(s, d, mapping) + case []uint32: + TransposeInt8Uint32(s, d, mapping) + case []uint64: + TransposeInt8Uint64(s, d, mapping) + } + case []int16: + switch d := dest.(type) { + case []int8: + TransposeInt16Int8(s, d, mapping) + case []int16: + TransposeInt16Int16(s, d, mapping) + case []int32: + TransposeInt16Int32(s, d, mapping) + case []int64: + TransposeInt16Int64(s, d, mapping) + case []uint8: + TransposeInt16Uint8(s, d, mapping) + case []uint16: + TransposeInt16Uint16(s, d, mapping) + case []uint32: + TransposeInt16Uint32(s, d, mapping) + case []uint64: + TransposeInt16Uint64(s, d, mapping) + } + case []int32: + switch d := dest.(type) { + case []int8: + TransposeInt32Int8(s, d, mapping) + case []int16: + TransposeInt32Int16(s, d, mapping) + case []int32: + TransposeInt32Int32(s, d, mapping) + case []int64: + TransposeInt32Int64(s, d, mapping) + case []uint8: + TransposeInt32Uint8(s, d, mapping) + case []uint16: + TransposeInt32Uint16(s, d, mapping) + case []uint32: + 
TransposeInt32Uint32(s, d, mapping) + case []uint64: + TransposeInt32Uint64(s, d, mapping) + } + case []int64: + switch d := dest.(type) { + case []int8: + TransposeInt64Int8(s, d, mapping) + case []int16: + TransposeInt64Int16(s, d, mapping) + case []int32: + TransposeInt64Int32(s, d, mapping) + case []int64: + TransposeInt64Int64(s, d, mapping) + case []uint8: + TransposeInt64Uint8(s, d, mapping) + case []uint16: + TransposeInt64Uint16(s, d, mapping) + case []uint32: + TransposeInt64Uint32(s, d, mapping) + case []uint64: + TransposeInt64Uint64(s, d, mapping) + } + case []uint8: + switch d := dest.(type) { + case []int8: + TransposeUint8Int8(s, d, mapping) + case []int16: + TransposeUint8Int16(s, d, mapping) + case []int32: + TransposeUint8Int32(s, d, mapping) + case []int64: + TransposeUint8Int64(s, d, mapping) + case []uint8: + TransposeUint8Uint8(s, d, mapping) + case []uint16: + TransposeUint8Uint16(s, d, mapping) + case []uint32: + TransposeUint8Uint32(s, d, mapping) + case []uint64: + TransposeUint8Uint64(s, d, mapping) + } + case []uint16: + switch d := dest.(type) { + case []int8: + TransposeUint16Int8(s, d, mapping) + case []int16: + TransposeUint16Int16(s, d, mapping) + case []int32: + TransposeUint16Int32(s, d, mapping) + case []int64: + TransposeUint16Int64(s, d, mapping) + case []uint8: + TransposeUint16Uint8(s, d, mapping) + case []uint16: + TransposeUint16Uint16(s, d, mapping) + case []uint32: + TransposeUint16Uint32(s, d, mapping) + case []uint64: + TransposeUint16Uint64(s, d, mapping) + } + case []uint32: + switch d := dest.(type) { + case []int8: + TransposeUint32Int8(s, d, mapping) + case []int16: + TransposeUint32Int16(s, d, mapping) + case []int32: + TransposeUint32Int32(s, d, mapping) + case []int64: + TransposeUint32Int64(s, d, mapping) + case []uint8: + TransposeUint32Uint8(s, d, mapping) + case []uint16: + TransposeUint32Uint16(s, d, mapping) + case []uint32: + TransposeUint32Uint32(s, d, mapping) + case []uint64: + 
TransposeUint32Uint64(s, d, mapping) + } + case []uint64: + switch d := dest.(type) { + case []int8: + TransposeUint64Int8(s, d, mapping) + case []int16: + TransposeUint64Int16(s, d, mapping) + case []int32: + TransposeUint64Int32(s, d, mapping) + case []int64: + TransposeUint64Int64(s, d, mapping) + case []uint8: + TransposeUint64Uint8(s, d, mapping) + case []uint16: + TransposeUint64Uint16(s, d, mapping) + case []uint32: + TransposeUint64Uint32(s, d, mapping) + case []uint64: + TransposeUint64Uint64(s, d, mapping) + } + } + return nil +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go new file mode 100644 index 000000000..461aaf31f --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go @@ -0,0 +1,96 @@ +// Code generated by transpose_ints_noasm.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build noasm || (!amd64 && !arm64 && !s390x && !ppc64le) + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( + TransposeInt8Int8 = transposeInt8Int8 + TransposeInt8Uint8 = transposeInt8Uint8 + TransposeInt8Int16 = transposeInt8Int16 + TransposeInt8Uint16 = transposeInt8Uint16 + TransposeInt8Int32 = transposeInt8Int32 + TransposeInt8Uint32 = transposeInt8Uint32 + TransposeInt8Int64 = transposeInt8Int64 + TransposeInt8Uint64 = transposeInt8Uint64 + + TransposeUint8Int8 = transposeUint8Int8 + TransposeUint8Uint8 = transposeUint8Uint8 + TransposeUint8Int16 = transposeUint8Int16 + TransposeUint8Uint16 = transposeUint8Uint16 + TransposeUint8Int32 = transposeUint8Int32 + TransposeUint8Uint32 = transposeUint8Uint32 + TransposeUint8Int64 = transposeUint8Int64 + TransposeUint8Uint64 = transposeUint8Uint64 + + TransposeInt16Int8 = transposeInt16Int8 + TransposeInt16Uint8 = transposeInt16Uint8 + TransposeInt16Int16 = transposeInt16Int16 + TransposeInt16Uint16 = transposeInt16Uint16 + TransposeInt16Int32 = transposeInt16Int32 + TransposeInt16Uint32 = transposeInt16Uint32 + TransposeInt16Int64 = transposeInt16Int64 + TransposeInt16Uint64 = transposeInt16Uint64 + + TransposeUint16Int8 = transposeUint16Int8 + TransposeUint16Uint8 = transposeUint16Uint8 + TransposeUint16Int16 = transposeUint16Int16 + TransposeUint16Uint16 = transposeUint16Uint16 + TransposeUint16Int32 = transposeUint16Int32 + TransposeUint16Uint32 = transposeUint16Uint32 + TransposeUint16Int64 = transposeUint16Int64 + TransposeUint16Uint64 = transposeUint16Uint64 + + TransposeInt32Int8 = transposeInt32Int8 + TransposeInt32Uint8 = transposeInt32Uint8 + TransposeInt32Int16 = transposeInt32Int16 + TransposeInt32Uint16 = transposeInt32Uint16 + TransposeInt32Int32 = transposeInt32Int32 + TransposeInt32Uint32 = transposeInt32Uint32 + TransposeInt32Int64 = transposeInt32Int64 + TransposeInt32Uint64 = transposeInt32Uint64 + + TransposeUint32Int8 = 
transposeUint32Int8 + TransposeUint32Uint8 = transposeUint32Uint8 + TransposeUint32Int16 = transposeUint32Int16 + TransposeUint32Uint16 = transposeUint32Uint16 + TransposeUint32Int32 = transposeUint32Int32 + TransposeUint32Uint32 = transposeUint32Uint32 + TransposeUint32Int64 = transposeUint32Int64 + TransposeUint32Uint64 = transposeUint32Uint64 + + TransposeInt64Int8 = transposeInt64Int8 + TransposeInt64Uint8 = transposeInt64Uint8 + TransposeInt64Int16 = transposeInt64Int16 + TransposeInt64Uint16 = transposeInt64Uint16 + TransposeInt64Int32 = transposeInt64Int32 + TransposeInt64Uint32 = transposeInt64Uint32 + TransposeInt64Int64 = transposeInt64Int64 + TransposeInt64Uint64 = transposeInt64Uint64 + + TransposeUint64Int8 = transposeUint64Int8 + TransposeUint64Uint8 = transposeUint64Uint8 + TransposeUint64Int16 = transposeUint64Int16 + TransposeUint64Uint16 = transposeUint64Uint16 + TransposeUint64Int32 = transposeUint64Int32 + TransposeUint64Uint32 = transposeUint64Uint32 + TransposeUint64Int64 = transposeUint64Int64 + TransposeUint64Uint64 = transposeUint64Uint64 +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl new file mode 100644 index 000000000..faffdce35 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_noasm.go.tmpl @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build noasm +// +build noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} = transpose{{$srcName}}{{$destName}} +{{end}} +{{end}} +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go new file mode 100644 index 000000000..cc957cdaa --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_ppc64le.go @@ -0,0 +1,96 @@ +// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( + TransposeInt8Int8 = transposeInt8Int8 + TransposeInt8Uint8 = transposeInt8Uint8 + TransposeInt8Int16 = transposeInt8Int16 + TransposeInt8Uint16 = transposeInt8Uint16 + TransposeInt8Int32 = transposeInt8Int32 + TransposeInt8Uint32 = transposeInt8Uint32 + TransposeInt8Int64 = transposeInt8Int64 + TransposeInt8Uint64 = transposeInt8Uint64 + + TransposeUint8Int8 = transposeUint8Int8 + TransposeUint8Uint8 = transposeUint8Uint8 + TransposeUint8Int16 = transposeUint8Int16 + TransposeUint8Uint16 = transposeUint8Uint16 + TransposeUint8Int32 = transposeUint8Int32 + TransposeUint8Uint32 = transposeUint8Uint32 + TransposeUint8Int64 = transposeUint8Int64 + TransposeUint8Uint64 = transposeUint8Uint64 + + TransposeInt16Int8 = transposeInt16Int8 + TransposeInt16Uint8 = transposeInt16Uint8 + TransposeInt16Int16 = transposeInt16Int16 + TransposeInt16Uint16 = transposeInt16Uint16 + TransposeInt16Int32 = transposeInt16Int32 + TransposeInt16Uint32 = transposeInt16Uint32 + TransposeInt16Int64 = transposeInt16Int64 + TransposeInt16Uint64 = transposeInt16Uint64 + + TransposeUint16Int8 = transposeUint16Int8 + TransposeUint16Uint8 = transposeUint16Uint8 + TransposeUint16Int16 = transposeUint16Int16 + TransposeUint16Uint16 = transposeUint16Uint16 + TransposeUint16Int32 = transposeUint16Int32 + TransposeUint16Uint32 = transposeUint16Uint32 + TransposeUint16Int64 = transposeUint16Int64 + TransposeUint16Uint64 = transposeUint16Uint64 + + TransposeInt32Int8 = transposeInt32Int8 + TransposeInt32Uint8 = transposeInt32Uint8 + TransposeInt32Int16 = transposeInt32Int16 + TransposeInt32Uint16 = transposeInt32Uint16 + TransposeInt32Int32 = transposeInt32Int32 + TransposeInt32Uint32 = transposeInt32Uint32 + TransposeInt32Int64 = transposeInt32Int64 + TransposeInt32Uint64 
= transposeInt32Uint64 + + TransposeUint32Int8 = transposeUint32Int8 + TransposeUint32Uint8 = transposeUint32Uint8 + TransposeUint32Int16 = transposeUint32Int16 + TransposeUint32Uint16 = transposeUint32Uint16 + TransposeUint32Int32 = transposeUint32Int32 + TransposeUint32Uint32 = transposeUint32Uint32 + TransposeUint32Int64 = transposeUint32Int64 + TransposeUint32Uint64 = transposeUint32Uint64 + + TransposeInt64Int8 = transposeInt64Int8 + TransposeInt64Uint8 = transposeInt64Uint8 + TransposeInt64Int16 = transposeInt64Int16 + TransposeInt64Uint16 = transposeInt64Uint16 + TransposeInt64Int32 = transposeInt64Int32 + TransposeInt64Uint32 = transposeInt64Uint32 + TransposeInt64Int64 = transposeInt64Int64 + TransposeInt64Uint64 = transposeInt64Uint64 + + TransposeUint64Int8 = transposeUint64Int8 + TransposeUint64Uint8 = transposeUint64Uint8 + TransposeUint64Int16 = transposeUint64Int16 + TransposeUint64Uint16 = transposeUint64Uint16 + TransposeUint64Int32 = transposeUint64Int32 + TransposeUint64Uint32 = transposeUint64Uint32 + TransposeUint64Int64 = transposeUint64Int64 + TransposeUint64Uint64 = transposeUint64Uint64 +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go new file mode 100644 index 000000000..cc957cdaa --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go @@ -0,0 +1,96 @@ +// Code generated by transpose_ints_s390x.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( + TransposeInt8Int8 = transposeInt8Int8 + TransposeInt8Uint8 = transposeInt8Uint8 + TransposeInt8Int16 = transposeInt8Int16 + TransposeInt8Uint16 = transposeInt8Uint16 + TransposeInt8Int32 = transposeInt8Int32 + TransposeInt8Uint32 = transposeInt8Uint32 + TransposeInt8Int64 = transposeInt8Int64 + TransposeInt8Uint64 = transposeInt8Uint64 + + TransposeUint8Int8 = transposeUint8Int8 + TransposeUint8Uint8 = transposeUint8Uint8 + TransposeUint8Int16 = transposeUint8Int16 + TransposeUint8Uint16 = transposeUint8Uint16 + TransposeUint8Int32 = transposeUint8Int32 + TransposeUint8Uint32 = transposeUint8Uint32 + TransposeUint8Int64 = transposeUint8Int64 + TransposeUint8Uint64 = transposeUint8Uint64 + + TransposeInt16Int8 = transposeInt16Int8 + TransposeInt16Uint8 = transposeInt16Uint8 + TransposeInt16Int16 = transposeInt16Int16 + TransposeInt16Uint16 = transposeInt16Uint16 + TransposeInt16Int32 = transposeInt16Int32 + TransposeInt16Uint32 = transposeInt16Uint32 + TransposeInt16Int64 = transposeInt16Int64 + TransposeInt16Uint64 = transposeInt16Uint64 + + TransposeUint16Int8 = transposeUint16Int8 + TransposeUint16Uint8 = transposeUint16Uint8 + TransposeUint16Int16 = transposeUint16Int16 + TransposeUint16Uint16 = transposeUint16Uint16 + TransposeUint16Int32 = transposeUint16Int32 + TransposeUint16Uint32 = transposeUint16Uint32 + TransposeUint16Int64 = transposeUint16Int64 + TransposeUint16Uint64 = transposeUint16Uint64 + + 
TransposeInt32Int8 = transposeInt32Int8 + TransposeInt32Uint8 = transposeInt32Uint8 + TransposeInt32Int16 = transposeInt32Int16 + TransposeInt32Uint16 = transposeInt32Uint16 + TransposeInt32Int32 = transposeInt32Int32 + TransposeInt32Uint32 = transposeInt32Uint32 + TransposeInt32Int64 = transposeInt32Int64 + TransposeInt32Uint64 = transposeInt32Uint64 + + TransposeUint32Int8 = transposeUint32Int8 + TransposeUint32Uint8 = transposeUint32Uint8 + TransposeUint32Int16 = transposeUint32Int16 + TransposeUint32Uint16 = transposeUint32Uint16 + TransposeUint32Int32 = transposeUint32Int32 + TransposeUint32Uint32 = transposeUint32Uint32 + TransposeUint32Int64 = transposeUint32Int64 + TransposeUint32Uint64 = transposeUint32Uint64 + + TransposeInt64Int8 = transposeInt64Int8 + TransposeInt64Uint8 = transposeInt64Uint8 + TransposeInt64Int16 = transposeInt64Int16 + TransposeInt64Uint16 = transposeInt64Uint16 + TransposeInt64Int32 = transposeInt64Int32 + TransposeInt64Uint32 = transposeInt64Uint32 + TransposeInt64Int64 = transposeInt64Int64 + TransposeInt64Uint64 = transposeInt64Uint64 + + TransposeUint64Int8 = transposeUint64Int8 + TransposeUint64Uint8 = transposeUint64Uint8 + TransposeUint64Int16 = transposeUint64Int16 + TransposeUint64Uint16 = transposeUint64Uint16 + TransposeUint64Int32 = transposeUint64Int32 + TransposeUint64Uint32 = transposeUint64Uint32 + TransposeUint64Int64 = transposeUint64Int64 + TransposeUint64Uint64 = transposeUint64Uint64 +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl new file mode 100644 index 000000000..d93c8779c --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_s390x.go.tmpl @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm +// +build !noasm + +package utils + +// if building with the 'noasm' tag, then point to the pure go implementations +var ( +{{ $typelist := .In }} +{{range .In}} +{{ $src := .Type -}} +{{ $srcName := .Name -}} +{{ range $typelist -}} +{{ $dest := .Type -}} +{{ $destName := .Name -}} + Transpose{{$srcName}}{{$destName}} = transpose{{$srcName}}{{$destName}} +{{end}} +{{end}} +) diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl new file mode 100644 index 000000000..034d0e9d2 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_simd.go.tmpl @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm +// +build !noasm + +package utils + +import ( + "unsafe" +) + +{{ $arch := .D.arch}} +{{ $typelist := .In}} +{{range .In}} +{{ $src := .Type }} +{{ $srcName := .Name }} +{{ range $typelist}} +{{ $dest := .Type }} +{{ $destName := .Name }} + +//go:noescape +func _transpose_{{printf "%s_%s_%s" $src $dest $arch}}(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transpose{{ $srcName }}{{ $destName }}{{ $arch }}(src []{{$src}}, dest []{{$dest}}, transposeMap []int32) { + _transpose_{{printf "%s_%s_%s" $src $dest $arch}}(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} +{{ end }} +{{ end }} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go new file mode 100644 index 000000000..241ca74a7 --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.go @@ -0,0 +1,473 @@ +// Code generated by transpose_ints_simd.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !noasm + +package utils + +import ( + "unsafe" +) + +//go:noescape +func _transpose_int8_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int8sse4(src []int8, dest []int8, transposeMap []int32) { + _transpose_int8_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint8sse4(src []int8, dest []uint8, transposeMap []int32) { + _transpose_int8_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int16sse4(src []int8, dest []int16, transposeMap []int32) { + _transpose_int8_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint16sse4(src []int8, dest []uint16, transposeMap []int32) { + _transpose_int8_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int32sse4(src []int8, dest []int32, transposeMap []int32) { + 
_transpose_int8_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint32sse4(src []int8, dest []uint32, transposeMap []int32) { + _transpose_int8_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Int64sse4(src []int8, dest []int64, transposeMap []int32) { + _transpose_int8_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int8_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt8Uint64sse4(src []int8, dest []uint64, transposeMap []int32) { + _transpose_int8_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int8sse4(src []uint8, dest []int8, transposeMap []int32) { + _transpose_uint8_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint8sse4(src []uint8, dest []uint8, transposeMap []int32) { + _transpose_uint8_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int16sse4(src []uint8, dest []int16, transposeMap []int32) { + 
_transpose_uint8_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint16sse4(src []uint8, dest []uint16, transposeMap []int32) { + _transpose_uint8_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int32sse4(src []uint8, dest []int32, transposeMap []int32) { + _transpose_uint8_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint32sse4(src []uint8, dest []uint32, transposeMap []int32) { + _transpose_uint8_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Int64sse4(src []uint8, dest []int64, transposeMap []int32) { + _transpose_uint8_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint8_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint8Uint64sse4(src []uint8, dest []uint64, transposeMap []int32) { + _transpose_uint8_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int8sse4(src []int16, dest []int8, transposeMap []int32) { + 
_transpose_int16_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint8sse4(src []int16, dest []uint8, transposeMap []int32) { + _transpose_int16_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int16sse4(src []int16, dest []int16, transposeMap []int32) { + _transpose_int16_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint16sse4(src []int16, dest []uint16, transposeMap []int32) { + _transpose_int16_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int32sse4(src []int16, dest []int32, transposeMap []int32) { + _transpose_int16_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint32sse4(src []int16, dest []uint32, transposeMap []int32) { + _transpose_int16_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Int64sse4(src []int16, dest []int64, transposeMap []int32) { + 
_transpose_int16_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int16_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt16Uint64sse4(src []int16, dest []uint64, transposeMap []int32) { + _transpose_int16_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int8sse4(src []uint16, dest []int8, transposeMap []int32) { + _transpose_uint16_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint8sse4(src []uint16, dest []uint8, transposeMap []int32) { + _transpose_uint16_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int16sse4(src []uint16, dest []int16, transposeMap []int32) { + _transpose_uint16_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint16sse4(src []uint16, dest []uint16, transposeMap []int32) { + _transpose_uint16_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int32sse4(src []uint16, dest []int32, transposeMap []int32) 
{ + _transpose_uint16_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint32sse4(src []uint16, dest []uint32, transposeMap []int32) { + _transpose_uint16_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Int64sse4(src []uint16, dest []int64, transposeMap []int32) { + _transpose_uint16_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint16_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint16Uint64sse4(src []uint16, dest []uint64, transposeMap []int32) { + _transpose_uint16_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int8sse4(src []int32, dest []int8, transposeMap []int32) { + _transpose_int32_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint8sse4(src []int32, dest []uint8, transposeMap []int32) { + _transpose_int32_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int16sse4(src []int32, dest []int16, transposeMap []int32) { 
+ _transpose_int32_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint16sse4(src []int32, dest []uint16, transposeMap []int32) { + _transpose_int32_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int32sse4(src []int32, dest []int32, transposeMap []int32) { + _transpose_int32_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint32sse4(src []int32, dest []uint32, transposeMap []int32) { + _transpose_int32_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Int64sse4(src []int32, dest []int64, transposeMap []int32) { + _transpose_int32_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int32_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt32Uint64sse4(src []int32, dest []uint64, transposeMap []int32) { + _transpose_int32_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int8sse4(src []uint32, dest []int8, transposeMap []int32) { + 
_transpose_uint32_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint8sse4(src []uint32, dest []uint8, transposeMap []int32) { + _transpose_uint32_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int16sse4(src []uint32, dest []int16, transposeMap []int32) { + _transpose_uint32_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint16sse4(src []uint32, dest []uint16, transposeMap []int32) { + _transpose_uint32_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int32sse4(src []uint32, dest []int32, transposeMap []int32) { + _transpose_uint32_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint32sse4(src []uint32, dest []uint32, transposeMap []int32) { + _transpose_uint32_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Int64sse4(src []uint32, dest []int64, transposeMap 
[]int32) { + _transpose_uint32_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint32_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint32Uint64sse4(src []uint32, dest []uint64, transposeMap []int32) { + _transpose_uint32_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int8sse4(src []int64, dest []int8, transposeMap []int32) { + _transpose_int64_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint8sse4(src []int64, dest []uint8, transposeMap []int32) { + _transpose_int64_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int16sse4(src []int64, dest []int16, transposeMap []int32) { + _transpose_int64_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint16sse4(src []int64, dest []uint16, transposeMap []int32) { + _transpose_int64_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int32sse4(src []int64, dest []int32, transposeMap []int32) { 
+ _transpose_int64_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint32sse4(src []int64, dest []uint32, transposeMap []int32) { + _transpose_int64_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Int64sse4(src []int64, dest []int64, transposeMap []int32) { + _transpose_int64_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_int64_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeInt64Uint64sse4(src []int64, dest []uint64, transposeMap []int32) { + _transpose_int64_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int8sse4(src []uint64, dest []int8, transposeMap []int32) { + _transpose_uint64_int8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint8_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint8sse4(src []uint64, dest []uint8, transposeMap []int32) { + _transpose_uint64_uint8_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int16sse4(src []uint64, dest []int16, transposeMap []int32) { + 
_transpose_uint64_int16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint16_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint16sse4(src []uint64, dest []uint16, transposeMap []int32) { + _transpose_uint64_uint16_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int32sse4(src []uint64, dest []int32, transposeMap []int32) { + _transpose_uint64_int32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint32_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint32sse4(src []uint64, dest []uint32, transposeMap []int32) { + _transpose_uint64_uint32_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_int64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Int64sse4(src []uint64, dest []int64, transposeMap []int32) { + _transpose_uint64_int64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} + +//go:noescape +func _transpose_uint64_uint64_sse4(src, dest unsafe.Pointer, length int, transposeMap unsafe.Pointer) + +func transposeUint64Uint64sse4(src []uint64, dest []uint64, transposeMap []int32) { + _transpose_uint64_uint64_sse4(unsafe.Pointer(&src[0]), unsafe.Pointer(&dest[0]), len(dest), unsafe.Pointer(&transposeMap[0])) +} diff --git a/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s new file 
mode 100644 index 000000000..ee5199a5a --- /dev/null +++ b/vendor/github.com/apache/arrow/go/v15/internal/utils/transpose_ints_sse4_amd64.s @@ -0,0 +1,3074 @@ +//+build !noasm !appengine +// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT + +TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB0_1 + +LBB0_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB0_5 + +LBB0_1: + WORD $0xd285 // test edx, edx + JLE LBB0_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB0_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB0_3 + +LBB0_4: + RET + +TEXT ·_transpose_int8_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB1_1 + +LBB1_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG 
$0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB1_5 + +LBB1_1: + WORD $0xd285 // test edx, edx + JLE LBB1_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB1_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB1_3 + +LBB1_4: + RET + +TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB2_1 + +LBB2_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD 
$0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB2_5 + +LBB2_1: + WORD $0xd285 // test edx, edx + JLE LBB2_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB2_3: + LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB2_3 + +LBB2_4: + RET + +TEXT ·_transpose_int16_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB3_1 + +LBB3_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB3_5 + +LBB3_1: + WORD $0xd285 // test edx, edx + JLE LBB3_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB3_3: + LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // 
mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB3_3 + +LBB3_4: + RET + +TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB4_1 + +LBB4_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB4_5 + +LBB4_1: + WORD $0xd285 // test edx, edx + JLE LBB4_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB4_3: + LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB4_3 + +LBB4_4: + RET + +TEXT ·_transpose_int32_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB5_1 + +LBB5_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD 
$0x1688 // mov byte [rsi], dl + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB5_5 + +LBB5_1: + WORD $0xd285 // test edx, edx + JLE LBB5_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB5_3: + LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB5_3 + +LBB5_4: + RET + +TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB6_1 + +LBB6_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax 
- 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB6_5 + +LBB6_1: + WORD $0xd285 // test edx, edx + JLE LBB6_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB6_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB6_3 + +LBB6_4: + RET + +TEXT ·_transpose_int64_uint8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB7_1 + +LBB7_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB7_5 + +LBB7_1: + WORD $0xd285 // test edx, edx + JLE LBB7_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB7_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG 
LBB7_3 + +LBB7_4: + RET + +TEXT ·_transpose_uint8_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB8_1 + +LBB8_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB8_5 + +LBB8_1: + WORD $0xd285 // test edx, edx + JLE LBB8_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB8_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB8_3 + +LBB8_4: + RET + +TEXT ·_transpose_int8_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB9_1 + +LBB9_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; 
BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB9_5 + +LBB9_1: + WORD $0xd285 // test edx, edx + JLE LBB9_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB9_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB9_3 + +LBB9_4: + RET + +TEXT ·_transpose_uint16_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB10_1 + +LBB10_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG 
LBB10_5 + +LBB10_1: + WORD $0xd285 // test edx, edx + JLE LBB10_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB10_3: + LONG $0x04b70f42; BYTE $0x47 // movzx eax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB10_3 + +LBB10_4: + RET + +TEXT ·_transpose_int16_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB11_1 + +LBB11_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB11_5 + +LBB11_1: + WORD $0xd285 // test edx, edx + JLE LBB11_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB11_3: + LONG $0x04bf0f4a; BYTE $0x47 // movsx rax, word [rdi + 2*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB11_3 + +LBB11_4: + RET + +TEXT 
·_transpose_uint32_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB12_1 + +LBB12_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB12_5 + +LBB12_1: + WORD $0xd285 // test edx, edx + JLE LBB12_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB12_3: + LONG $0x87048b42 // mov eax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB12_3 + +LBB12_4: + RET + +TEXT ·_transpose_int32_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB13_1 + +LBB13_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte 
[rsi + 1], dl + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB13_5 + +LBB13_1: + WORD $0xd285 // test edx, edx + JLE LBB13_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB13_3: + LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB13_3 + +LBB13_4: + RET + +TEXT ·_transpose_uint64_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB14_1 + +LBB14_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB14_5 + +LBB14_1: + WORD $0xd285 // test 
edx, edx + JLE LBB14_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB14_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB14_3 + +LBB14_4: + RET + +TEXT ·_transpose_int64_int8_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB15_1 + +LBB15_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x1688 // mov byte [rsi], dl + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x01 // mov byte [rsi + 1], dl + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x02 // mov byte [rsi + 2], dl + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b60f // movzx edx, byte [rcx + 4*rdx] + WORD $0x5688; BYTE $0x03 // mov byte [rsi + 3], dl + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x04c68348 // add rsi, 4 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB15_5 + +LBB15_1: + WORD $0xd285 // test edx, edx + JLE LBB15_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB15_3: + LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8] + LONG $0x8104b60f // movzx eax, byte [rcx + 4*rax] + LONG $0x06048842 // mov byte [rsi + r8], al + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB15_3 + +LBB15_4: + RET + +TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + 
MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB16_1 + +LBB16_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB16_5 + +LBB16_1: + WORD $0xd285 // test edx, edx + JLE LBB16_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB16_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB16_3 + +LBB16_4: + RET + +TEXT ·_transpose_int8_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB17_1 + +LBB17_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b70f // movzx 
edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB17_5 + +LBB17_1: + WORD $0xd285 // test edx, edx + JLE LBB17_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB17_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB17_3 + +LBB17_4: + RET + +TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB18_1 + +LBB18_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB18_5 + +LBB18_1: + WORD $0xd285 // test edx, edx + JLE LBB18_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE 
$0xc0 // xor r8d, r8d + +LBB18_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB18_3 + +LBB18_4: + RET + +TEXT ·_transpose_int16_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB19_1 + +LBB19_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB19_5 + +LBB19_1: + WORD $0xd285 // test edx, edx + JLE LBB19_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB19_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB19_3 + +LBB19_4: + RET + +TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ 
transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB20_1 + +LBB20_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB20_5 + +LBB20_1: + WORD $0xd285 // test edx, edx + JLE LBB20_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB20_3: + LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB20_3 + +LBB20_4: + RET + +TEXT ·_transpose_int32_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB21_1 + +LBB21_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 
4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB21_5 + +LBB21_1: + WORD $0xd285 // test edx, edx + JLE LBB21_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB21_3: + LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB21_3 + +LBB21_4: + RET + +TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB22_1 + +LBB22_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB22_5 + +LBB22_1: + WORD $0xd285 // test edx, edx + JLE LBB22_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB22_3: + LONG 
$0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB22_3 + +LBB22_4: + RET + +TEXT ·_transpose_int64_uint16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB23_1 + +LBB23_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB23_5 + +LBB23_1: + WORD $0xd285 // test edx, edx + JLE LBB23_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB23_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB23_3 + +LBB23_4: + RET + +TEXT ·_transpose_uint8_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB24_1 + +LBB24_5: + WORD $0xd089 // 
mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB24_5 + +LBB24_1: + WORD $0xd285 // test edx, edx + JLE LBB24_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB24_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB24_3 + +LBB24_4: + RET + +TEXT ·_transpose_int8_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB25_1 + +LBB25_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte 
[rdi + 3] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB25_5 + +LBB25_1: + WORD $0xd285 // test edx, edx + JLE LBB25_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB25_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB25_3 + +LBB25_4: + RET + +TEXT ·_transpose_uint16_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB26_1 + +LBB26_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB26_5 + +LBB26_1: + WORD $0xd285 // test edx, edx + JLE LBB26_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB26_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x8104b70f // movzx 
eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB26_3 + +LBB26_4: + RET + +TEXT ·_transpose_int16_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB27_1 + +LBB27_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB27_5 + +LBB27_1: + WORD $0xd285 // test edx, edx + JLE LBB27_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB27_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB27_3 + +LBB27_4: + RET + +TEXT ·_transpose_uint32_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB28_1 + +LBB28_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // 
mov edx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB28_5 + +LBB28_1: + WORD $0xd285 // test edx, edx + JLE LBB28_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB28_3: + LONG $0x47048b42 // mov eax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB28_3 + +LBB28_4: + RET + +TEXT ·_transpose_int32_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB29_1 + +LBB29_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x9114b70f // movzx edx, word [rcx 
+ 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB29_5 + +LBB29_1: + WORD $0xd285 // test edx, edx + JLE LBB29_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB29_3: + LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB29_3 + +LBB29_4: + RET + +TEXT ·_transpose_uint64_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB30_1 + +LBB30_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB30_5 + +LBB30_1: + WORD $0xd285 // test edx, edx + JLE LBB30_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB30_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + 
r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB30_3 + +LBB30_4: + RET + +TEXT ·_transpose_int64_int16_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB31_1 + +LBB31_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + WORD $0x8966; BYTE $0x16 // mov word [rsi], dx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x02568966 // mov word [rsi + 2], dx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x04568966 // mov word [rsi + 4], dx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx] + LONG $0x06568966 // mov word [rsi + 6], dx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x08c68348 // add rsi, 8 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB31_5 + +LBB31_1: + WORD $0xd285 // test edx, edx + JLE LBB31_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB31_3: + LONG $0x87048b4a // mov rax, qword [rdi + 4*r8] + LONG $0x8104b70f // movzx eax, word [rcx + 4*rax] + LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB31_3 + +LBB31_4: + RET + +TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB32_1 + +LBB32_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov 
dword [rsi], edx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB32_5 + +LBB32_1: + WORD $0xd285 // test edx, edx + JLE LBB32_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB32_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB32_3 + +LBB32_4: + RET + +TEXT ·_transpose_int8_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB33_1 + +LBB33_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 
4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB33_5 + +LBB33_1: + WORD $0xd285 // test edx, edx + JLE LBB33_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB33_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB33_3 + +LBB33_4: + RET + +TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB34_1 + +LBB34_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB34_5 + +LBB34_1: + WORD $0xd285 // test edx, edx + JLE LBB34_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB34_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + WORD $0x048b; BYTE 
$0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB34_3 + +LBB34_4: + RET + +TEXT ·_transpose_int16_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB35_1 + +LBB35_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB35_5 + +LBB35_1: + WORD $0xd285 // test edx, edx + JLE LBB35_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB35_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB35_3 + +LBB35_4: + RET + +TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB36_1 + 
+LBB36_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB36_5 + +LBB36_1: + WORD $0xd285 // test edx, edx + JLE LBB36_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB36_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB36_3 + +LBB36_4: + RET + +TEXT ·_transpose_int32_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB37_1 + +LBB37_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD 
$0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB37_5 + +LBB37_1: + WORD $0xd285 // test edx, edx + JLE LBB37_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB37_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB37_3 + +LBB37_4: + RET + +TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB38_1 + +LBB38_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB38_5 + +LBB38_1: + WORD $0xd285 // test edx, edx + JLE LBB38_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + 
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB38_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB38_3 + +LBB38_4: + RET + +TEXT ·_transpose_int64_uint32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB39_1 + +LBB39_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB39_5 + +LBB39_1: + WORD $0xd285 // test edx, edx + JLE LBB39_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB39_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB39_3 + +LBB39_4: + RET + +TEXT ·_transpose_uint8_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ 
transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB40_1 + +LBB40_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB40_5 + +LBB40_1: + WORD $0xd285 // test edx, edx + JLE LBB40_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB40_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB40_3 + +LBB40_4: + RET + +TEXT ·_transpose_int8_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB41_1 + +LBB41_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57be0f48; BYTE $0x02 // 
movsx rdx, byte [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB41_5 + +LBB41_1: + WORD $0xd285 // test edx, edx + JLE LBB41_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB41_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x86048942 // mov dword [rsi + 4*r8], eax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB41_3 + +LBB41_4: + RET + +TEXT ·_transpose_uint16_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB42_1 + +LBB42_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + 
JG LBB42_5 + +LBB42_1: + WORD $0xd285 // test edx, edx + JLE LBB42_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB42_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB42_3 + +LBB42_4: + RET + +TEXT ·_transpose_int16_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB43_1 + +LBB43_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB43_5 + +LBB43_1: + WORD $0xd285 // test edx, edx + JLE LBB43_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB43_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x46048942 // mov dword [rsi + 2*r8], eax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp 
edx, 1 + JG LBB43_3 + +LBB43_4: + RET + +TEXT ·_transpose_uint32_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB44_1 + +LBB44_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB44_5 + +LBB44_1: + WORD $0xd285 // test edx, edx + JLE LBB44_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB44_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB44_3 + +LBB44_4: + RET + +TEXT ·_transpose_int32_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB45_1 + +LBB45_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x04576348 // movsxd rdx, dword 
[rdi + 4] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB45_5 + +LBB45_1: + WORD $0xd285 // test edx, edx + JLE LBB45_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB45_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB45_3 + +LBB45_4: + RET + +TEXT ·_transpose_uint64_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB46_1 + +LBB46_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // 
lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB46_5 + +LBB46_1: + WORD $0xd285 // test edx, edx + JLE LBB46_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB46_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB46_3 + +LBB46_4: + RET + +TEXT ·_transpose_int64_int32_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB47_1 + +LBB47_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x1689 // mov dword [rsi], edx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx] + WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x10c68348 // add rsi, 16 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB47_5 + +LBB47_1: + WORD $0xd285 // test edx, edx + JLE LBB47_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB47_3: + LONG $0x47048b4a // mov rax, qword [rdi + 2*r8] + WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax] + LONG $0x06048942 // mov dword [rsi + r8], eax + LONG $0x04c08349 // add r8, 4 + 
WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB47_3 + +LBB47_4: + RET + +TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB48_1 + +LBB48_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB48_5 + +LBB48_1: + WORD $0xd285 // test edx, edx + JLE LBB48_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB48_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB48_3 + +LBB48_4: + RET + +TEXT ·_transpose_int8_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB49_1 + +LBB49_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx 
+ LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB49_5 + +LBB49_1: + WORD $0xd285 // test edx, edx + JLE LBB49_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB49_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB49_3 + +LBB49_4: + RET + +TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB50_1 + +LBB50_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE 
$0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB50_5 + +LBB50_1: + WORD $0xd285 // test edx, edx + JLE LBB50_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB50_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB50_3 + +LBB50_4: + RET + +TEXT ·_transpose_int16_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB51_1 + +LBB51_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB51_5 + +LBB51_1: + WORD $0xd285 // test edx, edx + JLE LBB51_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB51_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG 
$0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB51_3 + +LBB51_4: + RET + +TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB52_1 + +LBB52_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB52_5 + +LBB52_1: + WORD $0xd285 // test edx, edx + JLE LBB52_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB52_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB52_3 + +LBB52_4: + RET + +TEXT ·_transpose_int32_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB53_1 + +LBB53_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; 
BYTE $0x16 // mov qword [rsi], rdx + LONG $0x04576348 // movsxd rdx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB53_5 + +LBB53_1: + WORD $0xd285 // test edx, edx + JLE LBB53_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB53_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB53_3 + +LBB53_4: + RET + +TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB54_1 + +LBB54_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc 
// lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB54_5 + +LBB54_1: + WORD $0xd285 // test edx, edx + JLE LBB54_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB54_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB54_3 + +LBB54_4: + RET + +TEXT ·_transpose_int64_uint64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB55_1 + +LBB55_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB55_5 + +LBB55_1: + WORD $0xd285 // test edx, edx + JLE LBB55_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB55_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + 
WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB55_3 + +LBB55_4: + RET + +TEXT ·_transpose_uint8_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB56_1 + +LBB56_5: + WORD $0xd089 // mov eax, edx + WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0157b60f // movzx edx, byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0257b60f // movzx edx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0357b60f // movzx edx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB56_5 + +LBB56_1: + WORD $0xd285 // test edx, edx + JLE LBB56_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB56_3: + LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB56_3 + +LBB56_4: + RET + +TEXT ·_transpose_int8_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB57_1 + +LBB57_5: + WORD $0xd089 // mov eax, edx + LONG $0x17be0f48 // movsx rdx, byte [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57be0f48; BYTE $0x01 // movsx rdx, 
byte [rdi + 1] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x04c78348 // add rdi, 4 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB57_5 + +LBB57_1: + WORD $0xd285 // test edx, edx + JLE LBB57_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB57_3: + LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0xc604894a // mov qword [rsi + 8*r8], rax + LONG $0x01c08349 // add r8, 1 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB57_3 + +LBB57_4: + RET + +TEXT ·_transpose_uint16_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB58_1 + +LBB58_5: + WORD $0xd089 // mov eax, edx + WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x0257b70f // movzx edx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x0457b70f // movzx edx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0657b70f // movzx edx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 
// add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB58_5 + +LBB58_1: + WORD $0xd285 // test edx, edx + JLE LBB58_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB58_3: + LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB58_3 + +LBB58_4: + RET + +TEXT ·_transpose_int16_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB59_1 + +LBB59_5: + WORD $0xd089 // mov eax, edx + LONG $0x17bf0f48 // movsx rdx, word [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x08c78348 // add rdi, 8 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB59_5 + +LBB59_1: + WORD $0xd285 // test edx, edx + JLE LBB59_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB59_3: + LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x8604894a // mov qword [rsi + 4*r8], rax + LONG $0x02c08349 // add r8, 2 + WORD $0xc283; BYTE $0xff // 
add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB59_3 + +LBB59_4: + RET + +TEXT ·_transpose_uint32_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB60_1 + +LBB60_5: + WORD $0xd089 // mov eax, edx + WORD $0x178b // mov edx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB60_5 + +LBB60_1: + WORD $0xd285 // test edx, edx + JLE LBB60_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB60_3: + LONG $0x07048b42 // mov eax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB60_3 + +LBB60_4: + RET + +TEXT ·_transpose_int32_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB61_1 + +LBB61_5: + WORD $0xd089 // mov eax, edx + WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x04576348 // 
movsxd rdx, dword [rdi + 4] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x08576348 // movsxd rdx, dword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x0c576348 // movsxd rdx, dword [rdi + 12] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x10c78348 // add rdi, 16 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB61_5 + +LBB61_1: + WORD $0xd285 // test edx, edx + JLE LBB61_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB61_3: + LONG $0x0704634a // movsxd rax, dword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x4604894a // mov qword [rsi + 2*r8], rax + LONG $0x04c08349 // add r8, 4 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB61_3 + +LBB61_4: + RET + +TEXT ·_transpose_uint64_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB62_1 + +LBB62_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + 
LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB62_5 + +LBB62_1: + WORD $0xd285 // test edx, edx + JLE LBB62_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB62_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB62_3 + +LBB62_4: + RET + +TEXT ·_transpose_int64_int64_sse4(SB), $0-32 + + MOVQ src+0(FP), DI + MOVQ dest+8(FP), SI + MOVQ length+16(FP), DX + MOVQ transposeMap+24(FP), CX + + WORD $0xfa83; BYTE $0x04 // cmp edx, 4 + JL LBB63_1 + +LBB63_5: + WORD $0xd089 // mov eax, edx + WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx + LONG $0x08578b48 // mov rdx, qword [rdi + 8] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x08568948 // mov qword [rsi + 8], rdx + LONG $0x10578b48 // mov rdx, qword [rdi + 16] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x10568948 // mov qword [rsi + 16], rdx + LONG $0x18578b48 // mov rdx, qword [rdi + 24] + LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx] + LONG $0x18568948 // mov qword [rsi + 24], rdx + WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4] + LONG $0x20c78348 // add rdi, 32 + LONG $0x20c68348 // add rsi, 32 + WORD $0xf883; BYTE $0x07 // cmp eax, 7 + JG LBB63_5 + +LBB63_1: + WORD $0xd285 // test edx, edx + JLE LBB63_4 + WORD $0xc283; BYTE $0x01 // add edx, 1 + WORD $0x3145; BYTE $0xc0 // xor r8d, r8d + +LBB63_3: + LONG $0x07048b4a // mov rax, qword [rdi + r8] + LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax] + LONG $0x0604894a // mov qword [rsi + r8], rax + LONG $0x08c08349 // add r8, 8 + WORD $0xc283; BYTE $0xff // add edx, -1 + WORD $0xfa83; BYTE $0x01 // cmp edx, 1 + JG LBB63_3 + 
+LBB63_4: + RET |
