1 // Copyright (c) 2020. Temple3x (temple3x@gmail.com)
2 // Copyright (c) 2017 Zach Bjornson
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 16 package xhex
17 18 import "github.com/templexxx/cpu"
19 20 func init() {
21 if cpu.X86.HasAVX2 {
22 encode = func(dst, src []byte) {
23 n := len(src)
24 if n == 0 {
25 return
26 }
27 encodeAVX2(&dst[0], &src[0], n)
28 done := n >> 4 << 4
29 if done == n {
30 return
31 }
32 33 // Deal with unaligned part.
34 dst = dst[done*2:]
35 src = src[done:]
36 encodeBase(dst, src)
37 }
38 39 decode = func(dst, src []byte) error {
40 n := len(src)
41 if n == 0 {
42 return nil
43 }
44 decodeAVX2(&dst[0], &src[0], n)
45 done := n >> 5 << 5
46 if done == n {
47 return nil
48 }
49 // Deal with unaligned part.
50 dst = dst[done/2:]
51 src = src[done:]
52 return decodeBase(dst, src)
53 }
54 }
55 }
// replaceHighMask is a Packed Shuffle Bytes control mask. Within every pair of
// bytes it moves the lower-position byte into the higher position and leaves
// the lower position zero.
// e.g.
// Before Packed Shuffle Bytes:
// [82 0 253 0 252 0 7 0 33 0 130 0 101 0 79 0 22 0 63 0 95 0 15 0 154 0 98 0 29 0 114 0]
// After Packed Shuffle Bytes with this mask:
// [0 82 0 253 0 252 0 7 0 33 0 130 0 101 0 79 0 22 0 63 0 95 0 15 0 154 0 98 0 29 0 114]
//
// If negative integers make you uncomfortable, you could use:
// []uint8{129, 0, 129, 2, 129, 4, 129, 6, 129, 8, 129, 10, 129, 12, 129, 14,
// 129, 0, 129, 2, 129, 4, 129, 6, 129, 8, 129, 10, 129, 12, 129, 14}
// Both forms have the same effect: a control byte with its top bit set makes
// the shuffle write zero to that position.
var replaceHighMask = []int8{-1, 0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14,
	-1, 0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14}
// These two masks are used for separating the high and low nibbles and
// extending them into 16-bit elements.
// Used as byte-shuffle controls, decodeMask1 selects the even-indexed bytes
// (0, 2, ..., 14) and decodeMask2 the odd-indexed bytes (1, 3, ..., 15); each
// -1 entry zeroes the upper byte of the resulting 16-bit element.
var decodeMask1 = []int8{0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1}
var decodeMask2 = []int8{1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}
// encodeAVX2 hex-encodes src in multiples of 16 bytes with AVX2 instructions,
// writing the result to dst. n is the length of src in bytes; the caller in
// this file passes the full length and encodes any remainder past the last
// whole 16-byte group itself.
//
// After many attempts, the algorithm described in
// https://github.com/zbjornson/fast-hex turned out to be the final answer.
// There are other ways to achieve the same goal, but fast-hex is one of the
// fastest and its algorithm is easy to understand.
//
// More details about the alternatives:
// I have roughly read a Go SIMD hex encoder, https://github.com/tmthrgd/go-hex.
// It should be the best choice, but its assembly is generated rather than
// handwritten, which makes it awful to review, and I found AVX-SSE transition
// penalties in that code.
//go:noescape
func encodeAVX2(dst, src *byte, n int)
// decodeAVX2 hex-decodes src in multiples of 32 bytes with AVX2 instructions,
// writing the result to dst. n is the length of src in bytes; the caller in
// this file passes the full length and decodes any remainder past the last
// whole 32-byte group itself. It returns nothing, so it cannot report an
// error to the caller.
//
// The main idea is still from https://github.com/zbjornson/fast-hex, with one
// modification: fast-hex decodes 64 bytes of src per loop iteration, while
// decodeAVX2 decodes 32 bytes of src per loop iteration.
//go:noescape
func decodeAVX2(dst, src *byte, n int)
94