// xhex_amd64.s
1 #include "textflag.h"
2
// Register aliases shared by the routines in this file.
#define dst   R8  // output pointer, advanced after every store
#define src   R9  // input pointer, advanced after every load
#define len   R11 // loop counter: byte count n scaled down to whole chunks
#define mask  Y0  // encodeAVX2: 0x0f replicated into all 32 bytes
#define rmask Y1  // encodeAVX2: VPSHUFB control loaded through ·replaceHighMask
#define tbl   Y2  // encodeAVX2: hex digit lookup table (HEX_TBL)
9
// HEX_TBL is the 16-byte lowercase hex alphabet stored twice back to back,
// giving the 32 bytes needed to fill one AVX2 (256-bit) register.
DATA HEX_TBL<>+0x00(SB)/16, $"0123456789abcdef"
DATA HEX_TBL<>+0x10(SB)/16, $"0123456789abcdef"
GLOBL HEX_TBL<>(SB), RODATA, $32
13
// func encodeAVX2(dst, src *byte, n int)
//
// Hex-encodes the bytes at src into text at dst, 16 input bytes (32 output
// bytes) per iteration. Exactly n/16 iterations run, so a trailing n%16 bytes
// are not processed here, and nothing at all is written when n < 16.
TEXT ·encodeAVX2(SB), NOSPLIT, $0
	MOVQ  n+16(FP), len
	MOVQ  d+0(FP), dst
	MOVQ  s+8(FP), src
	SHRQ  $4, len                   // len = number of whole 16-byte chunks
	TESTQ len, len
	JEQ   done                      // fewer than 16 bytes: nothing to do

	// Loop-invariant constants.
	VMOVDQU HEX_TBL<>(SB), tbl      // digit table, one copy per 128-bit half
	MOVQ    ·replaceHighMask(SB), CX // the variable's first word is used as an address
	VMOVDQU (CX), rmask             // shuffle control; contents are defined outside this file
	MOVQ    $0x0f, AX
	MOVQ    AX, X0
	VPBROADCASTB X0, mask           // 0x0f in each of the 32 bytes

encode16:
	VMOVDQU   (src), X3             // 16 source bytes
	VPMOVZXBW X3, Y3                // widen each byte to a 16-bit word
	VPSHUFB   rmask, Y3, Y5         // rearrange bytes as directed by rmask
	VPSRLW    $4, Y3, Y4            // each word >> 4
	VPOR      Y5, Y4, Y4            // merge shifted and shuffled copies
	VPAND     mask, Y4, Y4          // keep the low 4 bits of every byte: indices 0-15
	VPSHUFB   Y4, tbl, Y4           // replace each index with its hex character
	VMOVDQU   Y4, (dst)             // 32 output characters

	ADDQ $32, dst
	ADDQ $16, src
	DECQ len
	JNE  encode16
	VZEROUPPER                      // leave the upper YMM halves clean for the caller

done:
	RET
50
// Register aliases used by decodeAVX2.
#define mask15 X0  // 0x000f in every 16-bit word
#define maska  X1  // VPSHUFB control loaded through ·decodeMask1
#define maskb  X2  // VPSHUFB control loaded through ·decodeMask2
#define mask9  X10 // 9 in every 16-bit word
55
// HEXVAL(v, lo, hi) converts the hex characters held in v to their numeric
// values, in place, using lo and hi as scratch registers:
//
//	v = (v & 0x0f) + (v >> 6) * 9      (per 16-bit word)
//
// For '0'-'9' (0x30-0x39) the shift yields 0, so the value is the low nibble.
// For 'A'-'F' and 'a'-'f' (0x41-0x46, 0x61-0x66) the shift yields 1, so the
// value is low nibble + 9, i.e. 10-15.
// This assumes every word of v holds one character with a zero high byte;
// that depends on the shuffle controls, which are defined outside this file.
// mask15 and mask9 must already be loaded. Characters are not validated.
#define HEXVAL(v, lo, hi) \
	VPAND      mask15, v, lo; \
	VPSRAW     $6, v, hi; \
	VPMADDUBSW mask9, hi, hi; \
	VPADDW     lo, hi, v

// func decodeAVX2(dst, src *byte, n int)
//
// Decodes hex text at src into bytes at dst, 32 input characters (16 output
// bytes) per iteration. Exactly n/32 iterations run, so a trailing n%32
// characters are not processed here, and nothing is written when n < 32.
// The input is not checked for non-hex characters.
TEXT ·decodeAVX2(SB), NOSPLIT, $0
	MOVQ  d+0(FP), dst
	MOVQ  s+8(FP), src
	MOVQ  n+16(FP), len
	SHRQ  $5, len                   // len = number of whole 32-character chunks
	TESTQ len, len
	JZ    ret                       // fewer than 32 characters: nothing to do

	// Loop-invariant constants.
	MOVQ $0x0f, DX
	MOVQ DX, X0
	VPBROADCASTW X0, mask15         // 0x000f in every word
	MOVQ $9, DX
	MOVQ DX, X10
	VPBROADCASTW X10, mask9         // 0x0009 in every word

	// Shuffle controls live outside this file; the first word of each
	// variable is used as the address of 16 bytes of control data.
	MOVQ    ·decodeMask1(SB), AX
	VMOVDQU (AX), maska
	MOVQ    ·decodeMask2(SB), BX
	VMOVDQU (BX), maskb

loop32b:
	VMOVDQU (src), X3               // characters 0-15
	VMOVDQU 16(src), X4             // characters 16-31

	// Split each half into two vectors: the maska selection becomes the
	// high nibble of each output byte, the maskb selection the low nibble
	// (see the shift/OR below).
	VPSHUFB maska, X3, X5
	VPSHUFB maskb, X3, X6
	VPSHUFB maska, X4, X7
	VPSHUFB maskb, X4, X8

	// Character -> value 0-15.
	HEXVAL(X5, X11, X12)
	HEXVAL(X6, X11, X12)
	HEXVAL(X7, X11, X12)
	HEXVAL(X8, X11, X12)

	// byte = high << 4 | low, one byte per 16-bit word.
	VPSLLW $4, X5, X5
	VPSLLW $4, X7, X7
	VPOR   X5, X6, X5
	VPOR   X7, X8, X7

	// Narrow words to bytes: X5 (from characters 0-15) fills the low
	// 8 bytes of X9, X7 (from characters 16-31) the high 8 bytes.
	// Packing in this operand order makes a follow-up VPSHUFD $78
	// half-swap unnecessary.
	VPACKUSWB X7, X5, X9

	VMOVDQU X9, (dst)               // 16 decoded bytes

	ADDQ $32, src
	ADDQ $16, dst
	SUBQ $1, len
	JNE  loop32b
	VZEROUPPER

ret:
	RET
124