mask_amd64.s raw
1 #include "textflag.h"
2
// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs, in place, the len bytes starting at b with the 4-byte key
// repeated cyclically. The least-significant byte of key is applied to the
// first byte, the next byte of key to the second byte, and so on.
//
// The returned value is the key rotated right by 8*(len mod 4) bits, i.e. the
// key whose least-significant byte lines up with the first byte that follows
// the processed range.
//
// Frame: no locals; 28 bytes of arguments and result
// (b 8 + len 8 + key 4 + 4 padding + ret 4).
//
// Register roles:
//   AX     = cursor into the buffer
//   CX     = length; kept up to date while aligning and in the SSE loop,
//            afterwards only its low bits are tested to size the tail
//   SI     = 32-bit key in its current rotation
//   DI     = 64-bit key: uint64(SI) | uint64(SI)<<32
//   DX     = scratch used while building DI
//   X0     = 128-bit key (DI in both halves)
//   X1-X4  = 64 bytes of data per SSE iteration
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	MOVQ b+0(FP), AX
	MOVQ len+8(FP), CX
	MOVL key+16(FP), SI

	// DI = SI<<32 | SI
	// (the 32-bit move clears the upper half of DI before it is widened)
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ  DX, DI

	// Dispatch on length:
	//   len <= 15       -> scalar tail only
	//   len <= 63       -> unrolled 8-byte XORs, then tail
	//   len <= 128      -> SSE loop without aligning first
	//   len >  128      -> bring AX to a 32-byte boundary, then SSE loop
	CMPQ  CX, $15
	JLE   less_than_16
	CMPQ  CX, $63
	JLE   less_than_64
	CMPQ  CX, $128
	JLE   sse
	TESTQ $31, AX
	JNZ   unaligned

	// AX is already 32-byte aligned: go straight to the SSE loop. Without this
	// jump execution would fall into the byte-wise loop below, which always
	// runs at least once and would mask the first 32 bytes the slow way.
	JMP sse

unaligned_loop_1byte:
	// Mask one byte at a time until AX is 8-byte aligned. Rotating left by 24
	// is rotating right by 8, which moves the next key byte into the low byte.
	XORB  SI, (AX)
	INCQ  AX
	DECQ  CX
	ROLL  $24, SI
	TESTQ $7, AX
	JNZ   unaligned_loop_1byte

	// SI was rotated, so rebuild DI.
	// DI = SI<<32 | SI
	MOVL SI, DI
	MOVQ DI, DX
	SHLQ $32, DI
	ORQ  DX, DI

	TESTQ $31, AX
	JZ    sse

unaligned:
	// If the pointer in AX is not 8-byte aligned, fix that byte by byte first.
	TESTQ $7, AX
	JNZ   unaligned_loop_1byte

unaligned_loop:
	// Mask 8 bytes at a time until AX is 32-byte aligned. CX needs no check:
	// this path is only entered with len > 128 and aligning consumes at most
	// 31 bytes in total.
	XORQ  DI, (AX)
	ADDQ  $8, AX
	SUBQ  $8, CX
	TESTQ $31, AX
	JNZ   unaligned_loop

	// AX is now 32-byte aligned; fall through to the SSE loop.

sse:
	CMPQ       CX, $0x40
	JL         less_than_64
	MOVQ       DI, X0
	PUNPCKLQDQ X0, X0             // X0 = DI repeated in both 64-bit halves

sse_loop:
	// Mask 64 bytes per iteration. Unaligned loads/stores are used because
	// this loop is also reached without any alignment work (64 <= len <= 128).
	MOVOU 0*16(AX), X1
	MOVOU 1*16(AX), X2
	MOVOU 2*16(AX), X3
	MOVOU 3*16(AX), X4
	PXOR  X0, X1
	PXOR  X0, X2
	PXOR  X0, X3
	PXOR  X0, X4
	MOVOU X1, 0*16(AX)
	MOVOU X2, 1*16(AX)
	MOVOU X3, 2*16(AX)
	MOVOU X4, 3*16(AX)
	ADDQ  $0x40, AX
	SUBQ  $0x40, CX
	CMPQ  CX, $0x40
	JAE   sse_loop

	// Tail: fewer than 64 bytes remain. CX is no longer decremented; each bit
	// of CX selects whether the block of that size is processed.

less_than_64:
	TESTQ $32, CX
	JZ    less_than_32
	XORQ  DI, (AX)
	XORQ  DI, 8(AX)
	XORQ  DI, 16(AX)
	XORQ  DI, 24(AX)
	ADDQ  $32, AX

less_than_32:
	TESTQ $16, CX
	JZ    less_than_16
	XORQ  DI, (AX)
	XORQ  DI, 8(AX)
	ADDQ  $16, AX

less_than_16:
	TESTQ $8, CX
	JZ    less_than_8
	XORQ  DI, (AX)
	ADDQ  $8, AX

less_than_8:
	TESTQ $4, CX
	JZ    less_than_4
	XORL  SI, (AX)                // a whole key: no rotation needed
	ADDQ  $4, AX

less_than_4:
	TESTQ $2, CX
	JZ    less_than_2
	XORW  SI, (AX)
	ROLL  $16, SI                 // two key bytes consumed
	ADDQ  $2, AX

less_than_2:
	TESTQ $1, CX
	JZ    done
	XORB  SI, (AX)
	ROLL  $24, SI                 // one key byte consumed (rotate right by 8)

done:
	// Return the key in its final rotation.
	MOVL SI, ret+24(FP)
	RET
128