xhex_amd64.go raw

   1  // Copyright (c) 2020. Temple3x (temple3x@gmail.com)
   2  // Copyright (c) 2017 Zach Bjornson
   3  //
   4  // Licensed under the Apache License, Version 2.0 (the "License");
   5  // you may not use this file except in compliance with the License.
   6  // You may obtain a copy of the License at
   7  //
   8  //      http://www.apache.org/licenses/LICENSE-2.0
   9  //
  10  // Unless required by applicable law or agreed to in writing, software
  11  // distributed under the License is distributed on an "AS IS" BASIS,
  12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  // See the License for the specific language governing permissions and
  14  // limitations under the License.
  15  
  16  package xhex
  17  
  18  import "github.com/templexxx/cpu"
  19  
  20  func init() {
  21  	if cpu.X86.HasAVX2 {
  22  		encode = func(dst, src []byte) {
  23  			n := len(src)
  24  			if n == 0 {
  25  				return
  26  			}
  27  			encodeAVX2(&dst[0], &src[0], n)
  28  			done := n >> 4 << 4
  29  			if done == n {
  30  				return
  31  			}
  32  
  33  			// Deal with unaligned part.
  34  			dst = dst[done*2:]
  35  			src = src[done:]
  36  			encodeBase(dst, src)
  37  		}
  38  
  39  		decode = func(dst, src []byte) error {
  40  			n := len(src)
  41  			if n == 0 {
  42  				return nil
  43  			}
  44  			decodeAVX2(&dst[0], &src[0], n)
  45  			done := n >> 5 << 5
  46  			if done == n {
  47  				return nil
  48  			}
  49  			// Deal with unaligned part.
  50  			dst = dst[done/2:]
  51  			src = src[done:]
  52  			return decodeBase(dst, src)
  53  		}
  54  	}
  55  }
  56  
  57  // This mask will help to replace the byte in the higher position with byte in lower (each two bytes),
  58  // and leave the lower part 0.
  59  // e.g.
  60  // Before Packed Shuffle Bytes:
  61  // [82 0 253 0 252 0 7 0 33 0 130 0 101 0 79 0 22 0 63 0 95 0 15 0 154 0 98 0 29 0 114 0]
  62  // After Packed Shuffle Bytes with this mask:
  63  // [0 82 0 253 0 252 0 7 0 33 0 130 0 101 0 79 0 22 0 63 0 95 0 15 0 154 0 98 0 29 0 114]
  64  //
  65  // If negative integer make you uncomfortable, you could use:
  66  // []uint8{129, 0, 129, 2, 129, 4, 129, 6, 129, 8, 129, 10, 129, 12, 129, 14,
  67  //	129, 0, 129, 2, 129, 4, 129, 6, 129, 8, 129, 10, 129, 12, 129, 14}
  68  // They have same effect indeed.
  69  var replaceHighMask = []int8{-1, 0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14,
  70  	-1, 0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14}
  71  
  72  // This two masks are used for separating high and low nibbles and extend into 16-bit elements.
  73  var decodeMask1 = []int8{0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1}
  74  var decodeMask2 = []int8{1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}
  75  
// encodeAVX2 hex-encodes src into dst with AVX2 instructions.
// It only handles the part of src whose length is a multiple of 16 bytes;
// the caller in init encodes any remaining tail with encodeBase.
// n is the length of src in bytes. The function is only declared here; its
// body is provided outside this Go file.
//
// After lots of attempts, the algorithm described in
// https://github.com/zbjornson/fast-hex was chosen as the final answer.
// There are several ways to achieve the same goal; fast-hex is one of the
// fastest, and the algorithm is easy to understand.
//
// More details about the alternatives:
// I have roughly read a Go SIMD hex encoding implementation,
// https://github.com/tmthrgd/go-hex. It should be the best choice, but its
// assembly code is generated rather than handwritten, which makes it awful to
// review, and I found an AVX-SSE transition penalty in that code.
//go:noescape
func encodeAVX2(dst, src *byte, n int)
  87  
  88  // decodeAVX2 decodes bytes multiple of 32(src) with AVX2 instructions.
  89  // The main idea is still from https://github.com/zbjornson/fast-hex, but with some modification:
  90  // In fast-hex, decode 64bytes in src every loop,
  91  // in decodeAVX2, decode 32bytes in src every loop.
  92  //go:noesacpe
  93  func decodeAVX2(dst, src *byte, n int)
  94