// sha256block_amd64_avx2.go — avo generator source (scrape header removed).
1 // Copyright 2024 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package main
6
7 import (
8 . "github.com/mmcloughlin/avo/build"
9 . "github.com/mmcloughlin/avo/operand"
10 . "github.com/mmcloughlin/avo/reg"
11 )
12
13 // The avx2-version is described in an Intel White-Paper:
14 // "Fast SHA-256 Implementations on Intel Architecture Processors"
15 // To find it, surf to http://www.intel.com/p/en_US/embedded
16 // and search for that title.
17 // AVX2 version by Intel, same algorithm as code in Linux kernel:
18 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
19 // by
20 // James Guilford <james.guilford@intel.com>
21 // Kirk Yap <kirk.s.yap@intel.com>
22 // Tim Chen <tim.c.chen@linux.intel.com>
23
// blockAVX2 emits the AVX2 SHA-256 block function: it folds every full
// 64-byte block of p into the digest state dig. Each helper called at
// the bottom emits one labeled section of the assembly, in order, so the
// call sequence here is the layout of the generated function.
func blockAVX2() {
	Implement("blockAVX2")
	AllocLocal(536) // stack scratch; layout given by _XFER/_INP_END/_INP below

	Load(Param("dig"), CTX) // d.h[8]
	Load(Param("p").Base(), INP)
	Load(Param("p").Len(), NUM_BYTES)

	LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
	MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))

	// Exactly one block in p: take the short single-block path.
	CMPQ(NUM_BYTES, INP)
	JE(LabelRef("avx2_only_one_block"))

	Comment("Load initial digest")
	CTX := Mem{Base: CTX} // shadow the register with a Mem so .Offset reads d.h[i]
	MOVL(CTX.Offset(0), a)  // a = H0
	MOVL(CTX.Offset(4), b)  // b = H1
	MOVL(CTX.Offset(8), c)  // c = H2
	MOVL(CTX.Offset(12), d) // d = H3
	MOVL(CTX.Offset(16), e) // e = H4
	MOVL(CTX.Offset(20), f) // f = H5
	MOVL(CTX.Offset(24), g) // g = H6
	MOVL(CTX.Offset(28), h) // h = H7

	avx2_loop0()
	avx2_last_block_enter()
	avx2_loop1()
	avx2_loop2()
	avx2_loop3()
	avx2_do_last_block()
	avx2_only_one_block()
	done_hash()
}
58
// avx2_loop0 emits the top of the multi-block loop: load input with four
// 32-byte reads, byte-swap each 32-bit word to big-endian, rearrange the
// words into XDWORD0..3, and point TBL at the round-constant table.
//
// NOTE: the four loads cover 128 bytes, i.e. TWO 64-byte blocks; after
// the VPERM2I128 transpose the low 128-bit lane of each XDWORD holds the
// first block's words and the high lane the second block's, which is
// replayed later by avx2_loop3.
func avx2_loop0() {
	Label("avx2_loop0")
	Comment("at each iteration works with one block (512 bit)")
	VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
	VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
	VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
	VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)

	flip_mask := flip_mask_DATA()

	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	Comment("Apply Byte Flip Mask: LE -> BE")
	VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
	VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
	VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
	VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)

	Comment("Transpose data into high/low parts")
	VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) // w3, w2, w1, w0
	VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) // w7, w6, w5, w4
	VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10, w9, w8
	VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12

	K256 := K256_DATA()
	LEAQ(K256, TBL) // Loading address of table with round-specific constants
}
86
// avx2_last_block_enter advances INP past the block just loaded, saves
// the updated input pointer to the stack (INP's register is reused as y3
// during the rounds), and zeroes SRND, the byte offset that selects the
// current 4-round group in both K256 and the stack XFER area. SRND
// shares a register with CTX, so CTX must be reloaded later.
func avx2_last_block_enter() {
	Label("avx2_last_block_enter")
	ADDQ(Imm(64), INP)
	MOVQ(INP, Mem{Base: SP}.Offset(_INP))
	XORQ(SRND, SRND)
}
93
// avx2_loop1 emits rounds 0-47 (w0 - w47). Each pass performs 16 rounds
// in four groups of four; every group first stages K+W in XFER and spills
// it to the stack (so avx2_loop3 can replay the second block from the
// +16 lane without rescheduling), then runs four rounds interleaved with
// the schedule computation for the next four message words. The state
// registers rotate one position per round; the XDWORD arguments rotate
// one position per group. SRND advances 4*32 bytes per pass and the loop
// exits at 3*4*32 (48 rounds).
func avx2_loop1() {
	Label("avx2_loop1")

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
	roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
	roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ(Imm(4*32), SRND)
	CMPQ(SRND, U32(3*4*32))
	JB(LabelRef("avx2_loop1"))
}
134
// avx2_loop2 emits rounds 48-63: w48 - w63 are processed with no further
// message scheduling (the last 16 rounds). K+W is still spilled to the
// XFER area for avx2_loop3's second-block replay. Afterwards the working
// registers are folded into the digest; if INP has passed the saved
// last-block pointer we are done, otherwise SRND is reset and execution
// falls through into avx2_loop3 for the second block.
func avx2_loop2() {
	Label("avx2_loop2")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
	doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)

	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)

	// Shift the remaining scheduled words down for the next 8 rounds.
	VMOVDQU(XDWORD2, XDWORD0)
	VMOVDQU(XDWORD3, XDWORD1)

	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop2"))

	Load(Param("dig"), CTX) // d.h[8]; CTX was clobbered (it aliases SRND)
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)

	// digest[i] += state register; register keeps the updated value.
	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JB(LabelRef("done_hash"))

	XORQ(SRND, SRND)
}
173
// avx2_loop3 emits the 64 rounds of the second block using the K+W
// values already stored by avx2_loop1/avx2_loop2: the +16 displacement
// selects the second 16-byte lane of each 32-byte XFER slot, so no
// message scheduling is needed. The state is then folded into the
// digest; if more than one block of input remains we jump back to
// avx2_loop0, if none remains we are done, and on the boundary case
// (exactly one block left) execution falls through to
// avx2_do_last_block.
func avx2_loop3() {
	Label("avx2_loop3")
	doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
	doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)

	doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)
	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop3"))

	Load(Param("dig"), CTX) // d.h[8]; CTX was clobbered (it aliases SRND)
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)
	ADDQ(Imm(64), INP) // account for the second block just hashed

	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JA(LabelRef("avx2_loop0"))
	JB(LabelRef("done_hash"))
}
204
// avx2_do_last_block emits the code that hashes the final 64-byte block:
// it is loaded with four 16-byte (XMM) reads — only one block, so only
// the low lane of each XDWORD is populated — byte-swapped via the XMM
// view of the flip mask, and then handed to the shared round-loop entry.
func avx2_do_last_block() {
	Label("avx2_do_last_block")
	VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
	VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
	VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
	VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)

	flip_mask := flip_mask_DATA()
	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	// X_BYTE_FLIP_MASK is the XMM (low-lane) view of BYTE_FLIP_MASK.
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)

	K256 := K256_DATA()
	LEAQ(K256, TBL)

	JMP(LabelRef("avx2_last_block_enter"))
}
225
226 // Load initial digest
227 func avx2_only_one_block() {
228 Label("avx2_only_one_block")
229 registers := []GPPhysical{a, b, c, d, e, f, g, h}
230 for i, reg := range registers {
231 MOVL(Mem{Base: CTX}.Offset(i*4), reg)
232 }
233 JMP(LabelRef("avx2_do_last_block"))
234 }
235
// done_hash emits the function epilogue: VZEROUPPER clears the upper
// YMM state before returning to non-AVX code.
func done_hash() {
	Label("done_hash")
	VZEROUPPER()
	RET()
}
241
// addm emits: P1 (mem) += P2 (reg); P2 = P1.
// It folds a working-state register into the corresponding digest word
// in memory and keeps the updated value in the register.
func addm(P1 Mem, P2 GPPhysical) {
	ADDL(P2, P1)
	MOVL(P1, P2)
}
248
// Register allocation and stack layout.
//
// Note the deliberate register aliases, all visible below:
//   - SRND shares RSI with CTX, so CTX is reloaded from Param("dig")
//     after the round loops;
//   - e shares EDX with NUM_BYTES, and y3 shares EDI with INP — both
//     64-bit values are saved to the stack (_INP_END, _INP) before the
//     round code runs;
//   - old_h shares R11L with h.
var (
	XDWORD0 VecPhysical = Y4
	XDWORD1             = Y5
	XDWORD2             = Y6
	XDWORD3             = Y7

	// XMM (low-lane) views of XDWORD0..3, used for the final block.
	XWORD0 = X4
	XWORD1 = X5
	XWORD2 = X6
	XWORD3 = X7

	XTMP0 = Y0
	XTMP1 = Y1
	XTMP2 = Y2
	XTMP3 = Y3
	XTMP4 = Y8
	XTMP5 = Y11

	XFER = Y9

	BYTE_FLIP_MASK   = Y13 // mask to convert LE -> BE
	X_BYTE_FLIP_MASK = X13 // low 128 bits of BYTE_FLIP_MASK

	NUM_BYTES GPPhysical = RDX
	INP                  = RDI

	CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)

	a = EAX
	b = EBX
	c = ECX
	d = R8L
	e = EDX
	f = R9L
	g = R10L
	h = R11L

	old_h = R11L

	TBL = RBP // address of the K256 round-constant table

	SRND = RSI // SRND is same register as CTX

	T1 = R12L

	// Scalar scratch registers for the round computations.
	y0 = R13L
	y1 = R14L
	y2 = R15L
	y3 = EDI

	// Offsets into the stack frame.
	XFER_SIZE    = 2 * 64 * 4 // 16 slots x 32 bytes of staged K+W values
	INP_END_SIZE = 8
	INP_SIZE     = 8

	_XFER      = 0
	_INP_END   = _XFER + XFER_SIZE
	_INP       = _INP_END + INP_END_SIZE
	STACK_SIZE = _INP + INP_SIZE
)
309
// roundAndSchedN0 emits round N+0 of a 4-round group: one SHA-256 round
// on the rotated state (a..h), interleaved with the start of the message
// schedule for the next four words (W[-7]+W[-16] and the first steps of
// s0 from W[-15]). disp selects this round's K+W value in the stack XFER
// area (indexed by SRND).
func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	// ############################# RND N + 0 ############################//
	MOVL(a, y3)           // y3 = a
	RORXL(Imm(25), e, y0) // y0 = e >> 25
	RORXL(Imm(11), e, y1) // y1 = e >> 11

	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c
	VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)                       // XTMP0 = W[-7]
	MOVL(f, y2)                                                     // y2 = f
	RORXL(Imm(13), a, T1)                                           // T1 = a >> 13

	XORL(y1, y0)                  // y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   // y2 = f^g
	VPADDD(XDWORD0, XTMP0, XTMP0) // XTMP0 = W[-7] + W[-16]
	RORXL(Imm(6), e, y1)          // y1 = (e >> 6)

	ANDL(e, y2)           // y2 = (f^g)&e
	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	ADDL(h, d)            // d = k + w + h + d

	ANDL(b, y3)                               // y3 = (a|c)&b
	VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) // XTMP1 = W[-15]
	XORL(T1, y1)                              // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                      // T1 = (a >> 2)

	XORL(g, y2)                  // y2 = CH = ((f^g)&e)^g
	VPSRLD(Imm(7), XTMP1, XTMP2) // XTMP2 = W[-15] >> 7
	XORL(T1, y1)                 // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                  // T1 = a
	ANDL(c, T1)                  // T1 = a&c

	ADDL(y0, y2)                    // y2 = S1 + CH
	VPSLLD(Imm(32-7), XTMP1, XTMP3) // XTMP3 = W[-15] << (32-7)
	ORL(T1, y3)                     // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h)                     // h = k + w + h + S0

	ADDL(y2, d)               // d = k + w + h + d + S1 + CH = d + t1
	VPOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7

	VPSRLD(Imm(18), XTMP1, XTMP2) // XTMP2 = W[-15] >> 18 (continued in RND N+1)
	ADDL(y2, h)                   // h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h)                   // h = t1 + S0 + MAJ
}
355
// roundAndSchedN1 emits round N+1 of a 4-round group: one SHA-256 round
// interleaved with finishing s0 (the ror 7 ^ ror 18 ^ shr 3 of W[-15]),
// adding it into the schedule accumulator, and starting s1 from W[-2].
func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	// ################################### RND N + 1 ############################
	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(3), XTMP1, XTMP4) // XTMP4 = W[-15] >> 3
	MOVL(f, y2)                  // y2 = f
	RORXL(Imm(13), a, T1)        // T1 = a >> 13
	XORL(y1, y0)                 // y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                  // y2 = f^g

	RORXL(Imm(6), e, y1)  // y1 = (e >> 6)
	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	ANDL(e, y2)           // y2 = (f^g)&e
	ADDL(h, d)            // d = k + w + h + d

	VPSLLD(Imm(32-18), XTMP1, XTMP1) // XTMP1 = W[-15] << (32-18)
	ANDL(b, y3)                      // y3 = (a|c)&b
	XORL(T1, y1)                     // y1 = (a>>22) ^ (a>>13)

	VPXOR(XTMP1, XTMP3, XTMP3) // combine the ror 18 shift halves
	RORXL(Imm(2), a, T1)       // T1 = (a >> 2)
	XORL(g, y2)                // y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL(T1, y1)               // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                // T1 = a
	ANDL(c, T1)                // T1 = a&c
	ADDL(y0, y2)               // y2 = S1 + CH

	VPXOR(XTMP4, XTMP3, XTMP1)         // XTMP1 = s0
	VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) // XTMP2 = W[-2] {BBAA}
	ORL(T1, y3)                        // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h)                        // h = k + w + h + S0

	VPADDD(XTMP1, XTMP0, XTMP0) // XTMP0 = W[-16] + W[-7] + s0
	ADDL(y2, d)                 // d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h)                 // h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h)                 // h = t1 + S0 + MAJ

	VPSRLD(Imm(10), XTMP2, XTMP4) // XTMP4 = W[-2] >> 10 {BBAA}
}
402
// roundAndSchedN2 emits round N+2 of a 4-round group: one SHA-256 round
// interleaved with completing s1 for the lower half of W[-2] ({xBxA}),
// shuffling it to {00BA}, adding it into the schedule accumulator, and
// setting up the {DDCC} half for round N+3.
func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	// ################################### RND N + 2 ############################
	var shuff_00BA Mem = shuff_00BA_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h

	VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL(Imm(11), e, y1)         // y1 = e >> 11
	ORL(c, y3)                    // y3 = a|c
	MOVL(f, y2)                   // y2 = f
	XORL(g, y2)                   // y2 = f^g

	RORXL(Imm(13), a, T1)         // T1 = a >> 13
	XORL(y1, y0)                  // y0 = (e>>25) ^ (e>>11)
	VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL(e, y2)                   // y2 = (f^g)&e

	RORXL(Imm(6), e, y1)       // y1 = (e >> 6)
	VPXOR(XTMP3, XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 ^ ror 19
	ADDL(h, d)                 // d = k + w + h + d
	ANDL(b, y3)                // y3 = (a|c)&b

	XORL(y1, y0)               // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1)      // y1 = a >> 22
	VPXOR(XTMP2, XTMP4, XTMP4) // XTMP4 = s1 {xBxA}
	XORL(g, y2)                // y2 = CH = ((f^g)&e)^g

	VPSHUFB(shuff_00BA, XTMP4, XTMP4) // XTMP4 = s1 {00BA}

	XORL(T1, y1)                // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)        // T1 = (a >> 2)
	VPADDD(XTMP4, XTMP0, XTMP0) // XTMP0 = {..., ..., W[1], W[0]}

	XORL(T1, y1)                   // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                    // T1 = a
	ANDL(c, T1)                    // T1 = a&c
	ADDL(y0, y2)                   // y2 = S1 + CH
	VPSHUFD(Imm(80), XTMP0, XTMP2) // XTMP2 = W[-2] {DDCC}

	ORL(T1, y3)  // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h)  // h = k + w + h + S0
	ADDL(y2, d)  // d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h)  // h = k + w + h + S0 + S1 + CH = t1 + S0

	ADDL(y3, h) // h = t1 + S0 + MAJ
}
451
// roundAndSchedN3 emits round N+3 of a 4-round group: one SHA-256 round
// interleaved with computing s1 for the upper half of W[-2] ({xDxC}),
// shuffling it to {DC00}, and producing the group's four new message
// words in XDWORD0.
func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	// ################################### RND N + 3 ############################
	var shuff_DC00 Mem = shuff_DC00_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(10), XTMP2, XTMP5) // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL(f, y2)                   // y2 = f
	RORXL(Imm(13), a, T1)         // T1 = a >> 13
	XORL(y1, y0)                  // y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   // y2 = f^g

	VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL(Imm(6), e, y1)          // y1 = (e >> 6)
	ANDL(e, y2)                   // y2 = (f^g)&e
	ADDL(h, d)                    // d = k + w + h + d
	ANDL(b, y3)                   // y3 = (a|c)&b

	VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xDxC}
	XORL(y1, y0)                  // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	XORL(g, y2)                   // y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP3, XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 ^ ror 19
	RORXL(Imm(22), a, y1)      // y1 = a >> 22
	ADDL(y0, y2)               // y2 = S1 + CH

	VPXOR(XTMP2, XTMP5, XTMP5) // XTMP5 = s1 {xDxC}
	XORL(T1, y1)               // y1 = (a>>22) ^ (a>>13)
	ADDL(y2, d)                // d = k + w + h + d + S1 + CH = d + t1

	RORXL(Imm(2), a, T1) // T1 = (a >> 2)

	VPSHUFB(shuff_DC00, XTMP5, XTMP5) // XTMP5 = s1 {DC00}

	VPADDD(XTMP0, XTMP5, XDWORD0) // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL(T1, y1)                  // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                   // T1 = a
	ANDL(c, T1)                   // T1 = a&c
	ORL(T1, y3)                   // y3 = MAJ = (a|c)&b)|(a&c)

	ADDL(y1, h) // h = k + w + h + S0
	ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h) // h = t1 + S0 + MAJ
}
500
// doRoundN0 emits round N+0 of a 4-round group without message
// scheduling (used for the last 16 rounds and for the pre-scheduled
// second block). old_h is unused here; it exists only to keep the
// signature uniform with doRoundN1..N3, which finish the previous
// round's h late. This round's own h additions (y2, y3) are therefore
// deferred to the next doRound call.
func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	// ################################### RND N + 0 ###########################
	MOVL(f, y2)           // y2 = f
	RORXL(Imm(25), e, y0) // y0 = e >> 25
	RORXL(Imm(11), e, y1) // y1 = e >> 11
	XORL(g, y2)           // y2 = f^g

	XORL(y1, y0)         // y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) // y1 = (e >> 6)
	ANDL(e, y2)          // y2 = (f^g)&e

	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) // T1 = a >> 13
	XORL(g, y2)           // y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	MOVL(a, y3)           // y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  // T1 = a
	ANDL(b, y3)  // y3 = (a|c)&b
	ANDL(c, T1)  // T1 = a&c
	ADDL(y0, y2) // y2 = S1 + CH

	ADDL(h, d)  // d = k + w + h + d
	ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) // h = k + w + h + S0
	ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
}
534
// doRoundN1 emits round N+1 without message scheduling. It first
// finishes the PREVIOUS round's h (old_h) by adding that round's y2 and
// y3, then performs this round; its own y2/y3 additions to h are in turn
// deferred to the next doRound call.
func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	// ################################### RND N + 1 ###########################
	ADDL(y2, old_h)       // h = k + w + h + S0 + S1 + CH = t1 + S0 (previous round)
	MOVL(f, y2)           // y2 = f
	RORXL(Imm(25), e, y0) // y0 = e >> 25
	RORXL(Imm(11), e, y1) // y1 = e >> 11
	XORL(g, y2)           // y2 = f^g

	XORL(y1, y0)         // y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) // y1 = (e >> 6)
	ANDL(e, y2)          // y2 = (f^g)&e
	ADDL(y3, old_h)      // h = t1 + S0 + MAJ (previous round)

	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) // T1 = a >> 13
	XORL(g, y2)           // y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	MOVL(a, y3)           // y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  // T1 = a
	ANDL(b, y3)  // y3 = (a|c)&b
	ANDL(c, T1)  // T1 = a&c
	ADDL(y0, y2) // y2 = S1 + CH

	ADDL(h, d)  // d = k + w + h + d
	ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) // h = k + w + h + S0

	ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
}
571
// doRoundN2 emits round N+2 without message scheduling. Like doRoundN1
// it first completes the previous round's h (old_h) and defers its own
// final h additions to the next doRound call.
func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	// ################################### RND N + 2 ##############################
	ADDL(y2, old_h)       // h = k + w + h + S0 + S1 + CH = t1 + S0 (previous round)
	MOVL(f, y2)           // y2 = f
	RORXL(Imm(25), e, y0) // y0 = e >> 25
	RORXL(Imm(11), e, y1) // y1 = e >> 11
	XORL(g, y2)           // y2 = f^g

	XORL(y1, y0)         // y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) // y1 = (e >> 6)
	ANDL(e, y2)          // y2 = (f^g)&e
	ADDL(y3, old_h)      // h = t1 + S0 + MAJ (previous round)

	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) // T1 = a >> 13
	XORL(g, y2)           // y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	MOVL(a, y3)           // y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  // T1 = a
	ANDL(b, y3)  // y3 = (a|c)&b
	ANDL(c, T1)  // T1 = a&c
	ADDL(y0, y2) // y2 = S1 + CH

	ADDL(h, d)  // d = k + w + h + d
	ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) // h = k + w + h + S0

	ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
}
608
// doRoundN3 emits round N+3 without message scheduling. It completes the
// previous round's h (old_h) at the top and — being the last round of
// the group — also applies its own final h additions (y2, y3) at the
// bottom instead of deferring them.
func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	// ################################### RND N + 3 ###########################
	ADDL(y2, old_h)       // h = k + w + h + S0 + S1 + CH = t1 + S0 (previous round)
	MOVL(f, y2)           // y2 = f
	RORXL(Imm(25), e, y0) // y0 = e >> 25
	RORXL(Imm(11), e, y1) // y1 = e >> 11
	XORL(g, y2)           // y2 = f^g

	XORL(y1, y0)         // y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) // y1 = (e >> 6)
	ANDL(e, y2)          // y2 = (f^g)&e
	ADDL(y3, old_h)      // h = t1 + S0 + MAJ (previous round)

	XORL(y1, y0)          // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) // T1 = a >> 13
	XORL(g, y2)           // y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) // y1 = a >> 22
	MOVL(a, y3)           // y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  // T1 = a
	ANDL(b, y3)  // y3 = (a|c)&b
	ANDL(c, T1)  // T1 = a&c
	ADDL(y0, y2) // y2 = S1 + CH

	ADDL(h, d)  // d = k + w + h + d
	ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) // h = k + w + h + S0

	ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1

	ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0

	ADDL(y3, h) // h = t1 + S0 + MAJ
}
649
// Pointers for memoizing data-section symbols: each *_DATA function
// emits its GLOBL/DATA directives only on the first call and returns
// the cached Mem on subsequent calls.
var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem
652
653 // shuffle byte order from LE to BE
654 func flip_mask_DATA() Mem {
655 if flip_maskPtr != nil {
656 return *flip_maskPtr
657 }
658
659 flip_mask := GLOBL("flip_mask", RODATA)
660 flip_maskPtr = &flip_mask
661
662 DATA(0x00, U64(0x0405060700010203))
663 DATA(0x08, U64(0x0c0d0e0f08090a0b))
664 DATA(0x10, U64(0x0405060700010203))
665 DATA(0x18, U64(0x0c0d0e0f08090a0b))
666 return flip_mask
667 }
668
669 // shuffle xBxA -> 00BA
670 func shuff_00BA_DATA() Mem {
671 if shuff_00BAPtr != nil {
672 return *shuff_00BAPtr
673 }
674
675 shuff_00BA := GLOBL("shuff_00BA", RODATA)
676 shuff_00BAPtr = &shuff_00BA
677
678 DATA(0x00, U64(0x0b0a090803020100))
679 DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
680 DATA(0x10, U64(0x0b0a090803020100))
681 DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
682 return shuff_00BA
683 }
684
685 // shuffle xDxC -> DC00
686 func shuff_DC00_DATA() Mem {
687 if shuff_DC00Ptr != nil {
688 return *shuff_DC00Ptr
689 }
690
691 shuff_DC00 := GLOBL("shuff_DC00", RODATA)
692 shuff_DC00Ptr = &shuff_DC00
693
694 DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
695 DATA(0x08, U64(0x0b0a090803020100))
696 DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
697 DATA(0x18, U64(0x0b0a090803020100))
698 return shuff_DC00
699 }
700
701 // Round specific constants
702 func K256_DATA() Mem {
703 if K256Ptr != nil {
704 return *K256Ptr
705 }
706
707 K256 := GLOBL("K256", NOPTR+RODATA)
708 K256Ptr = &K256
709
710 offset_idx := 0
711
712 for i := 0; i < len(_K); i += 4 {
713 DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
714 DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
715 DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
716 DATA((offset_idx+3)*4, U32(_K[i+3])) // k4
717
718 DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
719 DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
720 DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
721 DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
722 offset_idx += 8
723 }
724 return K256
725 }
726