mask_arm64.s raw
1 #include "textflag.h"
2
// func maskAsm(b *byte, len int, key uint32) uint32
//
// maskAsm XORs the len bytes starting at b, in place, with the 4-byte key
// repeated end to end. It returns the key rotated right by 8 bits for every
// byte handled by the 2-byte and 1-byte tail steps; every other step consumes
// a multiple of 4 bytes and therefore leaves the key phase unchanged.
//
// NOTE(review): len is compared as a signed value and is never checked for
// being negative; presumably callers always pass the length of a real buffer.
//
// Register use:
//   R0       = b, post-incremented past each chunk as it is stored
//   R1       = bytes remaining (only updated by the 64-byte loop)
//   R3       = key (uint32), rotated only by the 2- and 1-byte tail steps
//   R2       = uint64(key)<<32 | uint64(key)
//   V0       = R2 duplicated into both 64-bit lanes (key repeated 4 times)
//   V1-V4    = scratch for the vector paths
//   R11, R12 = scratch for the scalar tail
TEXT ·maskAsm(SB), NOSPLIT, $0-28
	// Argument names match the Go signature above (b, len, key) so that
	// go vet's asmdecl check can pair each FP reference with its parameter.
	MOVD  b+0(FP), R0
	MOVD  len+8(FP), R1
	MOVWU key+16(FP), R3         // zero-extending load: upper 32 bits of R3 are 0
	MOVD  R3, R2
	ORR   R2<<32, R2, R2         // R2 = key | key<<32
	VDUP  R2, V0.D2
	CMP   $64, R1
	BLT   less_than_64

loop_64:
	// Reached only while R1 >= 64: mask one 64-byte chunk per iteration.
	VLD1   (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
	VEOR   V1.B16, V0.B16, V1.B16
	VEOR   V2.B16, V0.B16, V2.B16
	VEOR   V3.B16, V0.B16, V3.B16
	VEOR   V4.B16, V0.B16, V4.B16
	VST1.P [V1.B16, V2.B16, V3.B16, V4.B16], 64(R0)
	SUB    $64, R1               // no flag update needed: CMP below sets them for BGE
	CMP    $64, R1
	BGE    loop_64

less_than_64:
	// R1 < 64 here. R1 is not modified again; bits 5..0 of R1 select which of
	// the 32/16/8/4/2/1-byte steps below still have to run.
	CBZ    R1, end
	TBZ    $5, R1, less_than_32
	VLD1   (R0), [V1.B16, V2.B16]
	VEOR   V1.B16, V0.B16, V1.B16
	VEOR   V2.B16, V0.B16, V2.B16
	VST1.P [V1.B16, V2.B16], 32(R0)

less_than_32:
	TBZ   $4, R1, less_than_16
	LDP   (R0), (R11, R12)
	EOR   R11, R2, R11
	EOR   R12, R2, R12
	STP.P (R11, R12), 16(R0)

less_than_16:
	TBZ    $3, R1, less_than_8
	MOVD   (R0), R11
	EOR    R2, R11, R11
	MOVD.P R11, 8(R0)

less_than_8:
	TBZ     $2, R1, less_than_4
	MOVWU   (R0), R11
	EORW    R2, R11, R11         // low 32 bits of R2 are the key itself
	MOVWU.P R11, 4(R0)

less_than_4:
	TBZ     $1, R1, less_than_2
	MOVHU   (R0), R11
	EORW    R3, R11, R11         // only the low 16 bits are stored back
	MOVHU.P R11, 2(R0)
	RORW    $16, R3              // two key bytes consumed: advance the key phase

less_than_2:
	TBZ     $0, R1, end
	MOVBU   (R0), R11
	EORW    R3, R11, R11         // only the low 8 bits are stored back
	MOVBU.P R11, 1(R0)
	RORW    $8, R3               // one key byte consumed: advance the key phase

end:
	// Return the (possibly rotated) key.
	MOVWU R3, ret+24(FP)
	RET
73