// field_amd64.s
//go:build amd64

#include "textflag.h"

// Field multiplication assembly for secp256k1 using the 5x52-bit limb representation.
// Ported from bitcoin-core/secp256k1 field_5x52_asm_impl.h.
//
// A field element is represented as 5 limbs of 52 bits each:
// n[0..4], where value = sum(n[i] * 2^(52*i)).
//
// Field prime p = 2^256 - 2^32 - 977.
// Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1.
// For 5x52 the shifted constant R' = R << 4 = 0x1000003D10 is used, because the
// top limb holds only 48 bits (52-bit alignment of the fold).
//
// Stack layout for fieldMulAsm (96-byte frame):
//   0(SP)  - d_lo   (low half of 128-bit accumulator d)
//   8(SP)  - d_hi   (high half of d)
//   16(SP) - c_lo   (low half of 128-bit accumulator c)
//   24(SP) - c_hi   (high half of c)
//   32(SP) - t3
//   40(SP) - t4
//   48(SP) - tx
//   56(SP) - u0
//   64(SP) - temp storage
//   72(SP) - temp storage 2
//   80(SP) - saved b pointer
//
// Macro-like operation implemented inline throughout:
// rshift52: shift a 128-bit value right by 52:
//   result_lo = (in_lo >> 52) | (in_hi << 12)
//   result_hi = in_hi >> 52
33 // func fieldMulAsm(r, a, b *FieldElement)
34 TEXT ·fieldMulAsm(SB), NOSPLIT, $96-24
35 MOVQ r+0(FP), DI
36 MOVQ a+8(FP), SI
37 MOVQ b+16(FP), BX
38
39 // Save b pointer
40 MOVQ BX, 80(SP)
41
42 // Load a[0..4] into registers
43 MOVQ 0(SI), R8 // a0
44 MOVQ 8(SI), R9 // a1
45 MOVQ 16(SI), R10 // a2
46 MOVQ 24(SI), R11 // a3
47 MOVQ 32(SI), R12 // a4
48
49 // Constants we'll use frequently
50 // M = 0xFFFFFFFFFFFFF (2^52 - 1)
51 // R = 0x1000003D10
52
53 // === Step 1: d = a0*b3 + a1*b2 + a2*b1 + a3*b0 ===
54 MOVQ R8, AX
55 MULQ 24(BX) // a0 * b3
56 MOVQ AX, 0(SP) // d_lo
57 MOVQ DX, 8(SP) // d_hi
58
59 MOVQ R9, AX
60 MULQ 16(BX) // a1 * b2
61 ADDQ AX, 0(SP)
62 ADCQ DX, 8(SP)
63
64 MOVQ R10, AX
65 MULQ 8(BX) // a2 * b1
66 ADDQ AX, 0(SP)
67 ADCQ DX, 8(SP)
68
69 MOVQ R11, AX
70 MULQ 0(BX) // a3 * b0
71 ADDQ AX, 0(SP)
72 ADCQ DX, 8(SP)
73
74 // === Step 2: c = a4*b4 ===
75 MOVQ R12, AX
76 MULQ 32(BX) // a4 * b4
77 MOVQ AX, 16(SP) // c_lo
78 MOVQ DX, 24(SP) // c_hi
79
80 // === Step 3: d += R * c_lo ===
81 // Note: we use full c_lo (64 bits), NOT c_lo & M
82 MOVQ 16(SP), AX // c_lo (full 64 bits)
83 MOVQ $0x1000003D10, CX // R
84 MULQ CX // R * c_lo -> DX:AX
85 ADDQ AX, 0(SP) // d_lo += product_lo
86 ADCQ DX, 8(SP) // d_hi += product_hi + carry
87
88 // === Step 4: c >>= 64 (just take c_hi) ===
89 MOVQ 24(SP), AX // c_hi
90 MOVQ AX, 16(SP) // new c = c_hi (single 64-bit now)
91 MOVQ $0, 24(SP) // c_hi = 0
92
93 // === Step 5: t3 = d & M; d >>= 52 ===
94 MOVQ 0(SP), AX // d_lo
95 MOVQ $0xFFFFFFFFFFFFF, CX
96 ANDQ CX, AX // t3 = d & M
97 MOVQ AX, 32(SP) // save t3
98
99 // d >>= 52: d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52
100 MOVQ 0(SP), AX // d_lo
101 MOVQ 8(SP), CX // d_hi
102 SHRQ $52, AX // d_lo >> 52
103 MOVQ CX, DX
104 SHLQ $12, DX // d_hi << 12
105 ORQ DX, AX // new d_lo
106 SHRQ $52, CX // new d_hi
107 MOVQ AX, 0(SP)
108 MOVQ CX, 8(SP)
109
110 // === Step 6: d += a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 ===
111 MOVQ 80(SP), BX // restore b pointer
112
113 MOVQ R8, AX
114 MULQ 32(BX) // a0 * b4
115 ADDQ AX, 0(SP)
116 ADCQ DX, 8(SP)
117
118 MOVQ R9, AX
119 MULQ 24(BX) // a1 * b3
120 ADDQ AX, 0(SP)
121 ADCQ DX, 8(SP)
122
123 MOVQ R10, AX
124 MULQ 16(BX) // a2 * b2
125 ADDQ AX, 0(SP)
126 ADCQ DX, 8(SP)
127
128 MOVQ R11, AX
129 MULQ 8(BX) // a3 * b1
130 ADDQ AX, 0(SP)
131 ADCQ DX, 8(SP)
132
133 MOVQ R12, AX
134 MULQ 0(BX) // a4 * b0
135 ADDQ AX, 0(SP)
136 ADCQ DX, 8(SP)
137
138 // === Step 7: d += (R << 12) * c ===
139 // R << 12 = 0x1000003D10 << 12 = 0x1000003D10000
140 MOVQ 16(SP), AX // c (from c >>= 64)
141 MOVQ $0x1000003D10000, CX
142 MULQ CX // (R << 12) * c
143 ADDQ AX, 0(SP)
144 ADCQ DX, 8(SP)
145
146 // === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
147 MOVQ 0(SP), AX // d_lo
148 MOVQ $0xFFFFFFFFFFFFF, CX
149 ANDQ CX, AX // t4 = d & M
150 MOVQ AX, 40(SP) // save t4 (before modifications)
151
152 SHRQ $48, AX // tx = t4 >> 48
153 MOVQ AX, 48(SP) // save tx
154
155 MOVQ 40(SP), AX
156 MOVQ $0x0FFFFFFFFFFFF, CX // M >> 4 = 2^48 - 1
157 ANDQ CX, AX // t4 &= (M >> 4)
158 MOVQ AX, 40(SP) // save final t4
159
160 // === Step 9: d >>= 52 ===
161 MOVQ 0(SP), AX
162 MOVQ 8(SP), CX
163 SHRQ $52, AX
164 MOVQ CX, DX
165 SHLQ $12, DX
166 ORQ DX, AX
167 SHRQ $52, CX
168 MOVQ AX, 0(SP)
169 MOVQ CX, 8(SP)
170
171 // === Step 10: c = a0*b0 ===
172 MOVQ R8, AX
173 MULQ 0(BX) // a0 * b0
174 MOVQ AX, 16(SP) // c_lo
175 MOVQ DX, 24(SP) // c_hi
176
177 // === Step 11: d += a1*b4 + a2*b3 + a3*b2 + a4*b1 ===
178 MOVQ R9, AX
179 MULQ 32(BX) // a1 * b4
180 ADDQ AX, 0(SP)
181 ADCQ DX, 8(SP)
182
183 MOVQ R10, AX
184 MULQ 24(BX) // a2 * b3
185 ADDQ AX, 0(SP)
186 ADCQ DX, 8(SP)
187
188 MOVQ R11, AX
189 MULQ 16(BX) // a3 * b2
190 ADDQ AX, 0(SP)
191 ADCQ DX, 8(SP)
192
193 MOVQ R12, AX
194 MULQ 8(BX) // a4 * b1
195 ADDQ AX, 0(SP)
196 ADCQ DX, 8(SP)
197
198 // === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
199 MOVQ 0(SP), AX
200 MOVQ $0xFFFFFFFFFFFFF, CX
201 ANDQ CX, AX // u0 = d & M
202 SHLQ $4, AX // u0 << 4
203 ORQ 48(SP), AX // u0 |= tx
204 MOVQ AX, 56(SP) // save u0
205
206 // d >>= 52
207 MOVQ 0(SP), AX
208 MOVQ 8(SP), CX
209 SHRQ $52, AX
210 MOVQ CX, DX
211 SHLQ $12, DX
212 ORQ DX, AX
213 SHRQ $52, CX
214 MOVQ AX, 0(SP)
215 MOVQ CX, 8(SP)
216
217 // === Step 13: c += (R >> 4) * u0 ===
218 // R >> 4 = 0x1000003D10 >> 4 = 0x1000003D1
219 MOVQ 56(SP), AX // u0
220 MOVQ $0x1000003D1, CX
221 MULQ CX // (R >> 4) * u0
222 ADDQ AX, 16(SP) // c_lo
223 ADCQ DX, 24(SP) // c_hi
224
225 // === Step 14: r[0] = c & M; c >>= 52 ===
226 MOVQ 16(SP), AX
227 MOVQ $0xFFFFFFFFFFFFF, CX
228 ANDQ CX, AX
229 MOVQ AX, 0(DI) // store r[0]
230
231 MOVQ 16(SP), AX
232 MOVQ 24(SP), CX
233 SHRQ $52, AX
234 MOVQ CX, DX
235 SHLQ $12, DX
236 ORQ DX, AX
237 SHRQ $52, CX
238 MOVQ AX, 16(SP)
239 MOVQ CX, 24(SP)
240
241 // === Step 15: c += a0*b1 + a1*b0 ===
242 MOVQ R8, AX
243 MULQ 8(BX) // a0 * b1
244 ADDQ AX, 16(SP)
245 ADCQ DX, 24(SP)
246
247 MOVQ R9, AX
248 MULQ 0(BX) // a1 * b0
249 ADDQ AX, 16(SP)
250 ADCQ DX, 24(SP)
251
252 // === Step 16: d += a2*b4 + a3*b3 + a4*b2 ===
253 MOVQ R10, AX
254 MULQ 32(BX) // a2 * b4
255 ADDQ AX, 0(SP)
256 ADCQ DX, 8(SP)
257
258 MOVQ R11, AX
259 MULQ 24(BX) // a3 * b3
260 ADDQ AX, 0(SP)
261 ADCQ DX, 8(SP)
262
263 MOVQ R12, AX
264 MULQ 16(BX) // a4 * b2
265 ADDQ AX, 0(SP)
266 ADCQ DX, 8(SP)
267
268 // === Step 17: c += R * (d & M); d >>= 52 ===
269 MOVQ 0(SP), AX
270 MOVQ $0xFFFFFFFFFFFFF, CX
271 ANDQ CX, AX // d & M
272 MOVQ $0x1000003D10, CX // R
273 MULQ CX // R * (d & M)
274 ADDQ AX, 16(SP)
275 ADCQ DX, 24(SP)
276
277 // d >>= 52
278 MOVQ 0(SP), AX
279 MOVQ 8(SP), CX
280 SHRQ $52, AX
281 MOVQ CX, DX
282 SHLQ $12, DX
283 ORQ DX, AX
284 SHRQ $52, CX
285 MOVQ AX, 0(SP)
286 MOVQ CX, 8(SP)
287
288 // === Step 18: r[1] = c & M; c >>= 52 ===
289 MOVQ 16(SP), AX
290 MOVQ $0xFFFFFFFFFFFFF, CX
291 ANDQ CX, AX
292 MOVQ AX, 8(DI) // store r[1]
293
294 MOVQ 16(SP), AX
295 MOVQ 24(SP), CX
296 SHRQ $52, AX
297 MOVQ CX, DX
298 SHLQ $12, DX
299 ORQ DX, AX
300 SHRQ $52, CX
301 MOVQ AX, 16(SP)
302 MOVQ CX, 24(SP)
303
304 // === Step 19: c += a0*b2 + a1*b1 + a2*b0 ===
305 MOVQ R8, AX
306 MULQ 16(BX) // a0 * b2
307 ADDQ AX, 16(SP)
308 ADCQ DX, 24(SP)
309
310 MOVQ R9, AX
311 MULQ 8(BX) // a1 * b1
312 ADDQ AX, 16(SP)
313 ADCQ DX, 24(SP)
314
315 MOVQ R10, AX
316 MULQ 0(BX) // a2 * b0
317 ADDQ AX, 16(SP)
318 ADCQ DX, 24(SP)
319
320 // === Step 20: d += a3*b4 + a4*b3 ===
321 MOVQ R11, AX
322 MULQ 32(BX) // a3 * b4
323 ADDQ AX, 0(SP)
324 ADCQ DX, 8(SP)
325
326 MOVQ R12, AX
327 MULQ 24(BX) // a4 * b3
328 ADDQ AX, 0(SP)
329 ADCQ DX, 8(SP)
330
331 // === Step 21: c += R * d_lo; d >>= 64 ===
332 // Note: use full d_lo here, not d & M
333 MOVQ 0(SP), AX // d_lo
334 MOVQ $0x1000003D10, CX // R
335 MULQ CX // R * d_lo
336 ADDQ AX, 16(SP)
337 ADCQ DX, 24(SP)
338
339 // d >>= 64 (just take d_hi)
340 MOVQ 8(SP), AX
341 MOVQ AX, 0(SP)
342 MOVQ $0, 8(SP)
343
344 // === Step 22: r[2] = c & M; c >>= 52 ===
345 MOVQ 16(SP), AX
346 MOVQ $0xFFFFFFFFFFFFF, CX
347 ANDQ CX, AX
348 MOVQ AX, 16(DI) // store r[2]
349
350 MOVQ 16(SP), AX
351 MOVQ 24(SP), CX
352 SHRQ $52, AX
353 MOVQ CX, DX
354 SHLQ $12, DX
355 ORQ DX, AX
356 SHRQ $52, CX
357 MOVQ AX, 16(SP)
358 MOVQ CX, 24(SP)
359
360 // === Step 23: c += (R << 12) * d + t3 ===
361 MOVQ 0(SP), AX // d (after d >>= 64)
362 MOVQ $0x1000003D10000, CX // R << 12
363 MULQ CX // (R << 12) * d
364 ADDQ AX, 16(SP)
365 ADCQ DX, 24(SP)
366
367 MOVQ 32(SP), AX // t3
368 ADDQ AX, 16(SP)
369 ADCQ $0, 24(SP)
370
371 // === Step 24: r[3] = c & M; c >>= 52 ===
372 MOVQ 16(SP), AX
373 MOVQ $0xFFFFFFFFFFFFF, CX
374 ANDQ CX, AX
375 MOVQ AX, 24(DI) // store r[3]
376
377 MOVQ 16(SP), AX
378 MOVQ 24(SP), CX
379 SHRQ $52, AX
380 MOVQ CX, DX
381 SHLQ $12, DX
382 ORQ DX, AX
383
384 // === Step 25: r[4] = c + t4 ===
385 ADDQ 40(SP), AX // c + t4
386 MOVQ AX, 32(DI) // store r[4]
387
388 RET
389
390 // func fieldSqrAsm(r, a *FieldElement)
391 // Squares a field element in 5x52 representation.
392 // This follows the bitcoin-core secp256k1_fe_sqr_inner algorithm.
393 // Squaring is optimized since a*a has symmetric terms: a[i]*a[j] appears twice.
394 TEXT ·fieldSqrAsm(SB), NOSPLIT, $96-16
395 MOVQ r+0(FP), DI
396 MOVQ a+8(FP), SI
397
398 // Load a[0..4] into registers
399 MOVQ 0(SI), R8 // a0
400 MOVQ 8(SI), R9 // a1
401 MOVQ 16(SI), R10 // a2
402 MOVQ 24(SI), R11 // a3
403 MOVQ 32(SI), R12 // a4
404
405 // === Step 1: d = 2*a0*a3 + 2*a1*a2 ===
406 MOVQ R8, AX
407 ADDQ AX, AX // 2*a0
408 MULQ R11 // 2*a0 * a3
409 MOVQ AX, 0(SP) // d_lo
410 MOVQ DX, 8(SP) // d_hi
411
412 MOVQ R9, AX
413 ADDQ AX, AX // 2*a1
414 MULQ R10 // 2*a1 * a2
415 ADDQ AX, 0(SP)
416 ADCQ DX, 8(SP)
417
418 // === Step 2: c = a4*a4 ===
419 MOVQ R12, AX
420 MULQ R12 // a4 * a4
421 MOVQ AX, 16(SP) // c_lo
422 MOVQ DX, 24(SP) // c_hi
423
424 // === Step 3: d += R * c_lo ===
425 // Note: use full c_lo (64 bits), NOT c_lo & M
426 MOVQ 16(SP), AX // c_lo (full 64 bits)
427 MOVQ $0x1000003D10, CX
428 MULQ CX
429 ADDQ AX, 0(SP)
430 ADCQ DX, 8(SP)
431
432 // === Step 4: c >>= 64 ===
433 MOVQ 24(SP), AX
434 MOVQ AX, 16(SP)
435 MOVQ $0, 24(SP)
436
437 // === Step 5: t3 = d & M; d >>= 52 ===
438 MOVQ 0(SP), AX
439 MOVQ $0xFFFFFFFFFFFFF, CX
440 ANDQ CX, AX
441 MOVQ AX, 32(SP) // t3
442
443 MOVQ 0(SP), AX
444 MOVQ 8(SP), CX
445 SHRQ $52, AX
446 MOVQ CX, DX
447 SHLQ $12, DX
448 ORQ DX, AX
449 SHRQ $52, CX
450 MOVQ AX, 0(SP)
451 MOVQ CX, 8(SP)
452
453 // === Step 6: d += 2*a0*a4 + 2*a1*a3 + a2*a2 ===
454 // Pre-compute 2*a4 for later use
455 MOVQ R12, CX
456 ADDQ CX, CX // 2*a4
457 MOVQ CX, 64(SP) // save 2*a4
458
459 MOVQ R8, AX
460 MULQ CX // a0 * 2*a4
461 ADDQ AX, 0(SP)
462 ADCQ DX, 8(SP)
463
464 MOVQ R9, AX
465 ADDQ AX, AX // 2*a1
466 MULQ R11 // 2*a1 * a3
467 ADDQ AX, 0(SP)
468 ADCQ DX, 8(SP)
469
470 MOVQ R10, AX
471 MULQ R10 // a2 * a2
472 ADDQ AX, 0(SP)
473 ADCQ DX, 8(SP)
474
475 // === Step 7: d += (R << 12) * c ===
476 MOVQ 16(SP), AX
477 MOVQ $0x1000003D10000, CX
478 MULQ CX
479 ADDQ AX, 0(SP)
480 ADCQ DX, 8(SP)
481
482 // === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
483 MOVQ 0(SP), AX
484 MOVQ $0xFFFFFFFFFFFFF, CX
485 ANDQ CX, AX
486 MOVQ AX, 40(SP) // full t4
487
488 SHRQ $48, AX
489 MOVQ AX, 48(SP) // tx
490
491 MOVQ 40(SP), AX
492 MOVQ $0x0FFFFFFFFFFFF, CX
493 ANDQ CX, AX
494 MOVQ AX, 40(SP) // t4
495
496 // === Step 9: d >>= 52 ===
497 MOVQ 0(SP), AX
498 MOVQ 8(SP), CX
499 SHRQ $52, AX
500 MOVQ CX, DX
501 SHLQ $12, DX
502 ORQ DX, AX
503 SHRQ $52, CX
504 MOVQ AX, 0(SP)
505 MOVQ CX, 8(SP)
506
507 // === Step 10: c = a0*a0 ===
508 MOVQ R8, AX
509 MULQ R8
510 MOVQ AX, 16(SP)
511 MOVQ DX, 24(SP)
512
513 // === Step 11: d += a1*2*a4 + 2*a2*a3 ===
514 MOVQ R9, AX
515 MULQ 64(SP) // a1 * 2*a4
516 ADDQ AX, 0(SP)
517 ADCQ DX, 8(SP)
518
519 MOVQ R10, AX
520 ADDQ AX, AX // 2*a2
521 MULQ R11 // 2*a2 * a3
522 ADDQ AX, 0(SP)
523 ADCQ DX, 8(SP)
524
525 // === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
526 MOVQ 0(SP), AX
527 MOVQ $0xFFFFFFFFFFFFF, CX
528 ANDQ CX, AX
529 SHLQ $4, AX
530 ORQ 48(SP), AX
531 MOVQ AX, 56(SP) // u0
532
533 MOVQ 0(SP), AX
534 MOVQ 8(SP), CX
535 SHRQ $52, AX
536 MOVQ CX, DX
537 SHLQ $12, DX
538 ORQ DX, AX
539 SHRQ $52, CX
540 MOVQ AX, 0(SP)
541 MOVQ CX, 8(SP)
542
543 // === Step 13: c += (R >> 4) * u0 ===
544 MOVQ 56(SP), AX
545 MOVQ $0x1000003D1, CX
546 MULQ CX
547 ADDQ AX, 16(SP)
548 ADCQ DX, 24(SP)
549
550 // === Step 14: r[0] = c & M; c >>= 52 ===
551 MOVQ 16(SP), AX
552 MOVQ $0xFFFFFFFFFFFFF, CX
553 ANDQ CX, AX
554 MOVQ AX, 0(DI)
555
556 MOVQ 16(SP), AX
557 MOVQ 24(SP), CX
558 SHRQ $52, AX
559 MOVQ CX, DX
560 SHLQ $12, DX
561 ORQ DX, AX
562 SHRQ $52, CX
563 MOVQ AX, 16(SP)
564 MOVQ CX, 24(SP)
565
566 // === Step 15: c += 2*a0*a1 ===
567 MOVQ R8, AX
568 ADDQ AX, AX
569 MULQ R9
570 ADDQ AX, 16(SP)
571 ADCQ DX, 24(SP)
572
573 // === Step 16: d += a2*2*a4 + a3*a3 ===
574 MOVQ R10, AX
575 MULQ 64(SP) // a2 * 2*a4
576 ADDQ AX, 0(SP)
577 ADCQ DX, 8(SP)
578
579 MOVQ R11, AX
580 MULQ R11 // a3 * a3
581 ADDQ AX, 0(SP)
582 ADCQ DX, 8(SP)
583
584 // === Step 17: c += R * (d & M); d >>= 52 ===
585 MOVQ 0(SP), AX
586 MOVQ $0xFFFFFFFFFFFFF, CX
587 ANDQ CX, AX
588 MOVQ $0x1000003D10, CX
589 MULQ CX
590 ADDQ AX, 16(SP)
591 ADCQ DX, 24(SP)
592
593 MOVQ 0(SP), AX
594 MOVQ 8(SP), CX
595 SHRQ $52, AX
596 MOVQ CX, DX
597 SHLQ $12, DX
598 ORQ DX, AX
599 SHRQ $52, CX
600 MOVQ AX, 0(SP)
601 MOVQ CX, 8(SP)
602
603 // === Step 18: r[1] = c & M; c >>= 52 ===
604 MOVQ 16(SP), AX
605 MOVQ $0xFFFFFFFFFFFFF, CX
606 ANDQ CX, AX
607 MOVQ AX, 8(DI)
608
609 MOVQ 16(SP), AX
610 MOVQ 24(SP), CX
611 SHRQ $52, AX
612 MOVQ CX, DX
613 SHLQ $12, DX
614 ORQ DX, AX
615 SHRQ $52, CX
616 MOVQ AX, 16(SP)
617 MOVQ CX, 24(SP)
618
619 // === Step 19: c += 2*a0*a2 + a1*a1 ===
620 MOVQ R8, AX
621 ADDQ AX, AX
622 MULQ R10
623 ADDQ AX, 16(SP)
624 ADCQ DX, 24(SP)
625
626 MOVQ R9, AX
627 MULQ R9
628 ADDQ AX, 16(SP)
629 ADCQ DX, 24(SP)
630
631 // === Step 20: d += a3*2*a4 ===
632 MOVQ R11, AX
633 MULQ 64(SP)
634 ADDQ AX, 0(SP)
635 ADCQ DX, 8(SP)
636
637 // === Step 21: c += R * d_lo; d >>= 64 ===
638 MOVQ 0(SP), AX
639 MOVQ $0x1000003D10, CX
640 MULQ CX
641 ADDQ AX, 16(SP)
642 ADCQ DX, 24(SP)
643
644 MOVQ 8(SP), AX
645 MOVQ AX, 0(SP)
646 MOVQ $0, 8(SP)
647
648 // === Step 22: r[2] = c & M; c >>= 52 ===
649 MOVQ 16(SP), AX
650 MOVQ $0xFFFFFFFFFFFFF, CX
651 ANDQ CX, AX
652 MOVQ AX, 16(DI)
653
654 MOVQ 16(SP), AX
655 MOVQ 24(SP), CX
656 SHRQ $52, AX
657 MOVQ CX, DX
658 SHLQ $12, DX
659 ORQ DX, AX
660 SHRQ $52, CX
661 MOVQ AX, 16(SP)
662 MOVQ CX, 24(SP)
663
664 // === Step 23: c += (R << 12) * d + t3 ===
665 MOVQ 0(SP), AX
666 MOVQ $0x1000003D10000, CX
667 MULQ CX
668 ADDQ AX, 16(SP)
669 ADCQ DX, 24(SP)
670
671 MOVQ 32(SP), AX
672 ADDQ AX, 16(SP)
673 ADCQ $0, 24(SP)
674
675 // === Step 24: r[3] = c & M; c >>= 52 ===
676 MOVQ 16(SP), AX
677 MOVQ $0xFFFFFFFFFFFFF, CX
678 ANDQ CX, AX
679 MOVQ AX, 24(DI)
680
681 MOVQ 16(SP), AX
682 MOVQ 24(SP), CX
683 SHRQ $52, AX
684 MOVQ CX, DX
685 SHLQ $12, DX
686 ORQ DX, AX
687
688 // === Step 25: r[4] = c + t4 ===
689 ADDQ 40(SP), AX
690 MOVQ AX, 32(DI)
691
692 RET
693