field_amd64_bmi2.s raw
1 //go:build amd64
2
3 #include "textflag.h"
4
5 // Field multiplication assembly for secp256k1 using BMI2+ADX instructions.
6 // Uses MULX for flag-free multiplication and ADCX/ADOX for parallel carry chains.
7 //
8 // The field element is represented as 5 limbs of 52 bits each:
9 // n[0..4] where value = sum(n[i] * 2^(52*i))
10 //
11 // Field prime p = 2^256 - 2^32 - 977
12 // Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1
// For 5x52 the limbs wrap at 2^(5*52) = 2^260, so the code uses R << 4 = 0x1000003D10 (= 2^260 mod p)
14 //
15 // BMI2 Instructions used:
16 // MULXQ src, lo, hi - unsigned multiply RDX * src -> hi:lo (flags unchanged)
17 //
18 // ADX Instructions used:
19 // ADCXQ src, dst - dst += src + CF (only modifies CF)
20 // ADOXQ src, dst - dst += src + OF (only modifies OF)
21 //
22 // ADCX/ADOX allow parallel carry chains: ADCX uses CF only, ADOX uses OF only.
23 // This enables the CPU to execute two independent addition chains in parallel.
24 //
// Stack layout for fieldMulAsmBMI2 (96-byte frame):
//    0(SP) - d_lo  (low  64 bits of the 128-bit accumulator d)
//    8(SP) - d_hi  (high 64 bits of d)
//   16(SP) - c_lo  (low  64 bits of the 128-bit accumulator c)
//   24(SP) - c_hi  (high 64 bits of c)
//   32(SP) - t3
//   40(SP) - t4
//   48(SP) - tx
//   56(SP) - u0
//   64(SP) - saved r pointer (while DI is borrowed as d_hi)
//   72(SP) - r[0], held back until b has been read for the last time
//   80(SP) - r[1], held back until b has been read for the last time
//   88(SP) - unused

// func fieldMulAsmBMI2(r, a, b *FieldElement)
//
// Multiplies the 5x52-limb field elements a and b and writes the five
// result limbs to r, folding the upper partial products back in with the
// constants R = 0x1000003D10, R << 12 and R >> 4.
//
// Aliasing: a[0..4] is loaded into registers before anything is stored, and
// r[0]/r[1] are parked on the stack until the last read of b, so r may alias
// a and/or b.
//
// Register roles:
//   DI       - r pointer (temporarily d_hi inside the ADCX/ADOX sections)
//   SI       - a pointer, later d_lo inside the ADCX/ADOX sections
//   BX       - b pointer (never modified)
//   R8..R12  - a0..a4
//   R13      - reduction constant / c_lo
//   R14      - M = 2^52 - 1
//   R15      - R << 12 / c_hi
//   AX,CX,DX - scratch; DX is the implicit MULX multiplicand
//
// NOTE(review): no overflow checks are performed on the 128-bit
// accumulators; limb magnitude bounds are presumably guaranteed by the
// caller - confirm against the Go wrapper.
TEXT ·fieldMulAsmBMI2(SB), NOSPLIT, $96-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), BX

	// Load a[0..4] into registers
	MOVQ 0(SI), R8           // a0
	MOVQ 8(SI), R9           // a1
	MOVQ 16(SI), R10         // a2
	MOVQ 24(SI), R11         // a3
	MOVQ 32(SI), R12         // a4

	// Constants:
	//   M = 0xFFFFFFFFFFFFF (2^52 - 1)
	//   R = 0x1000003D10

	// === Step 1: d = a0*b3 + a1*b2 + a2*b1 + a3*b0 ===
	// MULX takes one factor from DX and leaves the flags untouched.
	MOVQ 24(BX), DX          // b3
	MULXQ R8, AX, CX         // a0 * b3 -> CX:AX
	MOVQ AX, 0(SP)           // d_lo
	MOVQ CX, 8(SP)           // d_hi

	MOVQ 16(BX), DX          // b2
	MULXQ R9, AX, CX         // a1 * b2 -> CX:AX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 8(BX), DX           // b1
	MULXQ R10, AX, CX        // a2 * b1 -> CX:AX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 0(BX), DX           // b0
	MULXQ R11, AX, CX        // a3 * b0 -> CX:AX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 2: c = a4*b4 ===
	MOVQ 32(BX), DX          // b4
	MULXQ R12, AX, CX        // a4 * b4 -> CX:AX
	MOVQ AX, 16(SP)          // c_lo
	MOVQ CX, 24(SP)          // c_hi

	// === Step 3: d += R * c_lo ===
	MOVQ 16(SP), DX          // c_lo
	MOVQ $0x1000003D10, R13  // R constant
	MULXQ R13, AX, CX        // R * c_lo -> CX:AX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 4: c >>= 64 ===
	MOVQ 24(SP), AX
	MOVQ AX, 16(SP)
	MOVQ $0, 24(SP)

	// === Step 5: t3 = d & M; d >>= 52 ===
	MOVQ 0(SP), AX
	MOVQ $0xFFFFFFFFFFFFF, R14 // M constant (stays in R14 until RET)
	ANDQ R14, AX
	MOVQ AX, 32(SP)          // t3

	// 128-bit right shift by 52: lo = (lo >> 52) | (hi << 12); hi >>= 52
	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 6: d += a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 ===
	MOVQ 32(BX), DX          // b4
	MULXQ R8, AX, CX         // a0 * b4
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 24(BX), DX          // b3
	MULXQ R9, AX, CX         // a1 * b3
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 16(BX), DX          // b2
	MULXQ R10, AX, CX        // a2 * b2
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 8(BX), DX           // b1
	MULXQ R11, AX, CX        // a3 * b1
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 0(BX), DX           // b0
	MULXQ R12, AX, CX        // a4 * b0
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 7: d += (R << 12) * c ===
	MOVQ 16(SP), DX          // c
	MOVQ $0x1000003D10000, R15 // R << 12
	MULXQ R15, AX, CX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
	MOVQ 0(SP), AX
	ANDQ R14, AX             // t4 = d & M
	MOVQ AX, 40(SP)

	SHRQ $48, AX
	MOVQ AX, 48(SP)          // tx

	MOVQ 40(SP), AX
	MOVQ $0x0FFFFFFFFFFFF, CX // M >> 4
	ANDQ CX, AX
	MOVQ AX, 40(SP)          // t4

	// === Step 9: d >>= 52 ===
	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 10: c = a0*b0 ===
	MOVQ 0(BX), DX           // b0
	MULXQ R8, AX, CX         // a0 * b0
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Step 11: d += a1*b4 + a2*b3 + a3*b2 + a4*b1 ===
	MOVQ 32(BX), DX          // b4
	MULXQ R9, AX, CX         // a1 * b4
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 24(BX), DX          // b3
	MULXQ R10, AX, CX        // a2 * b3
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 16(BX), DX          // b2
	MULXQ R11, AX, CX        // a3 * b2
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ 8(BX), DX           // b1
	MULXQ R12, AX, CX        // a4 * b1
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
	MOVQ 0(SP), AX
	ANDQ R14, AX             // u0 = d & M
	SHLQ $4, AX
	ORQ 48(SP), AX
	MOVQ AX, 56(SP)          // u0

	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 13: c += (R >> 4) * u0 ===
	MOVQ 56(SP), DX          // u0
	MOVQ $0x1000003D1, R13   // R >> 4
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	// === Step 14: r[0] = c & M; c >>= 52 ===
	// b[0] is still read in steps 15 and 19, so r[0] is parked on the
	// stack instead of being written through DI (r may alias b).
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 72(SP)          // deferred r[0]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Steps 15-16: interleaved c and d updates using ADCX/ADOX ===
	// Step 15: c += a0*b1 + a1*b0          (CF chain via ADCX)
	// Step 16: d += a2*b4 + a3*b3 + a4*b2  (OF chain via ADOX)
	MOVQ DI, 64(SP)          // save r pointer; DI becomes d_hi below

	// ADCX/ADOX need register destinations, so pull the accumulators in.
	MOVQ 16(SP), R13         // c_lo
	MOVQ 24(SP), R15         // c_hi
	MOVQ 0(SP), SI           // d_lo (the a pointer is no longer needed)
	MOVQ 8(SP), DI           // d_hi

	// Clear CF and OF; MOVQ and MULXQ below leave them alone.
	XORQ AX, AX

	// First pair: c += a0*b1, d += a2*b4
	MOVQ 8(BX), DX           // b1
	MULXQ R8, AX, CX         // a0 * b1 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo (CF chain)
	ADCXQ CX, R15            // c_hi += hi + CF

	MOVQ 32(BX), DX          // b4
	MULXQ R10, AX, CX        // a2 * b4 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo (OF chain)
	ADOXQ CX, DI             // d_hi += hi + OF

	// Second pair: c += a1*b0, d += a3*b3
	MOVQ 0(BX), DX           // b0
	MULXQ R9, AX, CX         // a1 * b0 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, R15            // c_hi += hi + CF

	MOVQ 24(BX), DX          // b3
	MULXQ R11, AX, CX        // a3 * b3 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// Third: d += a4*b2 (only d, no more c operations)
	MOVQ 16(BX), DX          // b2
	MULXQ R12, AX, CX        // a4 * b2 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// Store results back
	MOVQ R13, 16(SP)         // c_lo
	MOVQ R15, 24(SP)         // c_hi
	MOVQ SI, 0(SP)           // d_lo
	MOVQ DI, 8(SP)           // d_hi
	MOVQ 64(SP), DI          // restore r pointer

	// === Step 17: c += R * (d & M); d >>= 52 ===
	MOVQ 0(SP), AX
	ANDQ R14, AX             // d & M
	MOVQ AX, DX
	MOVQ $0x1000003D10, R13  // R
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 18: r[1] = c & M; c >>= 52 ===
	// b[1] is still read in step 19, so r[1] is parked on the stack too.
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 80(SP)          // deferred r[1]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Steps 19-20: interleaved c and d updates using ADCX/ADOX ===
	// Step 19: c += a0*b2 + a1*b1 + a2*b0  (CF chain via ADCX)
	// Step 20: d += a3*b4 + a4*b3          (OF chain via ADOX)
	MOVQ DI, 64(SP)          // save r pointer; DI becomes d_hi below

	// Load all accumulators into registers
	MOVQ 16(SP), R13         // c_lo
	MOVQ 24(SP), R15         // c_hi
	MOVQ 0(SP), SI           // d_lo
	MOVQ 8(SP), DI           // d_hi

	// Clear CF and OF
	XORQ AX, AX

	// First pair: c += a0*b2, d += a3*b4
	MOVQ 16(BX), DX          // b2
	MULXQ R8, AX, CX         // a0 * b2 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, R15            // c_hi += hi + CF

	MOVQ 32(BX), DX          // b4
	MULXQ R11, AX, CX        // a3 * b4 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// Second pair: c += a1*b1, d += a4*b3
	MOVQ 8(BX), DX           // b1
	MULXQ R9, AX, CX         // a1 * b1 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, R15            // c_hi += hi + CF

	MOVQ 24(BX), DX          // b3
	MULXQ R12, AX, CX        // a4 * b3 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// Third: c += a2*b0 (only c, no more d operations).
	// This is the last read through the b pointer.
	MOVQ 0(BX), DX           // b0
	MULXQ R10, AX, CX        // a2 * b0 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, R15            // c_hi += hi + CF

	// Store results back
	MOVQ R13, 16(SP)         // c_lo
	MOVQ R15, 24(SP)         // c_hi
	MOVQ SI, 0(SP)           // d_lo
	MOVQ DI, 8(SP)           // d_hi
	MOVQ 64(SP), DI          // restore r pointer

	// === Step 21: c += R * d_lo; d >>= 64 ===
	MOVQ 0(SP), DX           // d_lo
	MOVQ $0x1000003D10, R13  // R
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 8(SP), AX
	MOVQ AX, 0(SP)
	MOVQ $0, 8(SP)

	// === Step 22: r[2] = c & M; c >>= 52 ===
	// b is dead from here on, so writing through DI is alias-safe.
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 16(DI)          // store r[2]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Step 23: c += (R << 12) * d + t3 ===
	MOVQ 0(SP), DX           // d
	MOVQ $0x1000003D10000, R15 // R << 12 (reloaded: R15 served as c_hi)
	MULXQ R15, AX, CX        // (R << 12) * d
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 32(SP), AX          // t3
	ADDQ AX, 16(SP)
	ADCQ $0, 24(SP)

	// === Step 24: r[3] = c & M; c >>= 52 ===
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 24(DI)          // store r[3]

	// Only the low 64 bits of c >> 52 are needed for the final limb.
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX

	// === Step 25: r[4] = c + t4 ===
	ADDQ 40(SP), AX
	MOVQ AX, 32(DI)          // store r[4]

	// Publish the two limbs that were held back in steps 14 and 18.
	MOVQ 72(SP), AX
	MOVQ AX, 0(DI)           // store r[0]
	MOVQ 80(SP), AX
	MOVQ AX, 8(DI)           // store r[1]

	RET
427
428
// func fieldSqrAsmBMI2(r, a *FieldElement)
//
// Squares the 5x52-limb field element a and writes the five result limbs to
// r, using MULX for the partial products and ADCX/ADOX for the interleaved
// c/d accumulation. All limbs of a are loaded into registers before the
// first store to r, so r may alias a.
//
// Stack layout (96-byte frame):
//    0(SP) - d_lo      8(SP) - d_hi
//   16(SP) - c_lo     24(SP) - c_hi
//   32(SP) - t3       40(SP) - t4
//   48(SP) - tx       56(SP) - u0
//   64(SP) - saved r pointer (while DI is borrowed as d_hi)
//   72(SP)..95(SP) - unused
//
// Register roles:
//   DI       - r pointer (temporarily d_hi inside the ADCX/ADOX sections)
//   SI       - a pointer, later d_lo inside the ADCX/ADOX sections
//   BX       - c_hi inside the ADCX/ADOX sections
//   R8..R12  - a0..a4 (never modified; doubling is done in DX or R15)
//   R13      - reduction constant / c_lo
//   R14      - M = 2^52 - 1
//   R15      - 2*a4 (from step 6 onwards)
//   AX,CX,DX - scratch; DX is the implicit MULX multiplicand
//
// NOTE(review): no overflow checks are performed on the 128-bit
// accumulators or on the limb doublings; magnitude bounds are presumably
// guaranteed by the caller - confirm against the Go wrapper.
TEXT ·fieldSqrAsmBMI2(SB), NOSPLIT, $96-16
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI

	// Load a[0..4] into registers
	MOVQ 0(SI), R8           // a0
	MOVQ 8(SI), R9           // a1
	MOVQ 16(SI), R10         // a2
	MOVQ 24(SI), R11         // a3
	MOVQ 32(SI), R12         // a4

	// Keep M constant in R14
	MOVQ $0xFFFFFFFFFFFFF, R14

	// === Step 1: d = 2*a0*a3 + 2*a1*a2 ===
	MOVQ R8, DX
	ADDQ DX, DX              // 2*a0
	MULXQ R11, AX, CX        // 2*a0 * a3
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	MOVQ R9, DX
	ADDQ DX, DX              // 2*a1
	MULXQ R10, AX, CX        // 2*a1 * a2
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 2: c = a4*a4 ===
	MOVQ R12, DX
	MULXQ R12, AX, CX        // a4 * a4
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Step 3: d += R * c_lo ===
	MOVQ 16(SP), DX
	MOVQ $0x1000003D10, R13  // R
	MULXQ R13, AX, CX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 4: c >>= 64 ===
	MOVQ 24(SP), AX
	MOVQ AX, 16(SP)
	MOVQ $0, 24(SP)

	// === Step 5: t3 = d & M; d >>= 52 ===
	MOVQ 0(SP), AX
	ANDQ R14, AX
	MOVQ AX, 32(SP)          // t3

	// 128-bit right shift by 52: lo = (lo >> 52) | (hi << 12); hi >>= 52
	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 6: d += 2*a0*a4 + 2*a1*a3 + a2*a2 ===
	// Pre-compute 2*a4; R15 keeps it for steps 11, 16 and 20.
	MOVQ R12, R15
	ADDQ R15, R15            // 2*a4

	MOVQ R8, DX
	MULXQ R15, AX, CX        // a0 * 2*a4
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ R9, DX
	ADDQ DX, DX              // 2*a1
	MULXQ R11, AX, CX        // 2*a1 * a3
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ R10, DX
	MULXQ R10, AX, CX        // a2 * a2
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 7: d += (R << 12) * c ===
	MOVQ 16(SP), DX
	MOVQ $0x1000003D10000, R13 // R << 12
	MULXQ R13, AX, CX
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
	MOVQ 0(SP), AX
	ANDQ R14, AX
	MOVQ AX, 40(SP)

	SHRQ $48, AX
	MOVQ AX, 48(SP)          // tx

	MOVQ 40(SP), AX
	MOVQ $0x0FFFFFFFFFFFF, CX // M >> 4
	ANDQ CX, AX
	MOVQ AX, 40(SP)          // t4

	// === Step 9: d >>= 52 ===
	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 10: c = a0*a0 ===
	MOVQ R8, DX
	MULXQ R8, AX, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Step 11: d += a1*2*a4 + 2*a2*a3 ===
	MOVQ R9, DX
	MULXQ R15, AX, CX        // a1 * 2*a4
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	MOVQ R10, DX             // doubling happens in DX; R10 keeps a2
	ADDQ DX, DX              // 2*a2
	MULXQ R11, AX, CX        // 2*a2 * a3
	ADDQ AX, 0(SP)
	ADCQ CX, 8(SP)

	// === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
	MOVQ 0(SP), AX
	ANDQ R14, AX
	SHLQ $4, AX
	ORQ 48(SP), AX
	MOVQ AX, 56(SP)          // u0

	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 13: c += (R >> 4) * u0 ===
	MOVQ 56(SP), DX
	MOVQ $0x1000003D1, R13   // R >> 4
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	// === Step 14: r[0] = c & M; c >>= 52 ===
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 0(DI)           // store r[0]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Steps 15-16: interleaved c and d updates using ADCX/ADOX ===
	// Step 15: c += 2*a0*a1          (CF chain via ADCX)
	// Step 16: d += a2*2*a4 + a3*a3  (OF chain via ADOX)
	MOVQ DI, 64(SP)          // save r pointer; DI becomes d_hi below

	MOVQ 16(SP), R13         // c_lo
	MOVQ 24(SP), BX          // c_hi
	MOVQ 0(SP), SI           // d_lo
	MOVQ 8(SP), DI           // d_hi

	// Clear CF and OF. Everything up to the first ADCX/ADOX must leave the
	// flags alone, hence LEAQ (not ADDQ) for the doubling below.
	XORQ AX, AX

	// c += 2*a0*a1
	LEAQ (R8)(R8*1), DX      // 2*a0, flags untouched
	MULXQ R9, AX, CX         // 2*a0 * a1 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo (CF chain)
	ADCXQ CX, BX             // c_hi += hi + CF

	// d += a2*2*a4
	MOVQ R10, DX             // a2
	MULXQ R15, AX, CX        // a2 * 2*a4 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo (OF chain)
	ADOXQ CX, DI             // d_hi += hi + OF

	// d += a3*a3
	MOVQ R11, DX
	MULXQ R11, AX, CX        // a3 * a3 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// Store results back
	MOVQ R13, 16(SP)         // c_lo
	MOVQ BX, 24(SP)          // c_hi
	MOVQ SI, 0(SP)           // d_lo
	MOVQ DI, 8(SP)           // d_hi
	MOVQ 64(SP), DI          // restore r pointer

	// === Step 17: c += R * (d & M); d >>= 52 ===
	MOVQ 0(SP), AX
	ANDQ R14, AX
	MOVQ AX, DX
	MOVQ $0x1000003D10, R13  // R
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 0(SP), AX
	MOVQ 8(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 0(SP)
	MOVQ CX, 8(SP)

	// === Step 18: r[1] = c & M; c >>= 52 ===
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 8(DI)           // store r[1]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Steps 19-20: interleaved c and d updates using ADCX/ADOX ===
	// Step 19: c += 2*a0*a2 + a1*a1  (CF chain via ADCX)
	// Step 20: d += a3*2*a4          (OF chain via ADOX)
	MOVQ DI, 64(SP)          // save r pointer; DI becomes d_hi below

	MOVQ 16(SP), R13         // c_lo
	MOVQ 24(SP), BX          // c_hi
	MOVQ 0(SP), SI           // d_lo
	MOVQ 8(SP), DI           // d_hi

	// Clear CF and OF (see the note in steps 15-16)
	XORQ AX, AX

	// c += 2*a0*a2
	LEAQ (R8)(R8*1), DX      // 2*a0, flags untouched
	MULXQ R10, AX, CX        // 2*a0 * a2 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, BX             // c_hi += hi + CF

	// d += a3*2*a4
	MOVQ R11, DX
	MULXQ R15, AX, CX        // a3 * 2*a4 -> CX:AX
	ADOXQ AX, SI             // d_lo += lo
	ADOXQ CX, DI             // d_hi += hi + OF

	// c += a1*a1
	MOVQ R9, DX
	MULXQ R9, AX, CX         // a1 * a1 -> CX:AX
	ADCXQ AX, R13            // c_lo += lo
	ADCXQ CX, BX             // c_hi += hi + CF

	// Store results back
	MOVQ R13, 16(SP)         // c_lo
	MOVQ BX, 24(SP)          // c_hi
	MOVQ SI, 0(SP)           // d_lo
	MOVQ DI, 8(SP)           // d_hi
	MOVQ 64(SP), DI          // restore r pointer

	// === Step 21: c += R * d_lo; d >>= 64 ===
	MOVQ 0(SP), DX
	MOVQ $0x1000003D10, R13  // R
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 8(SP), AX
	MOVQ AX, 0(SP)
	MOVQ $0, 8(SP)

	// === Step 22: r[2] = c & M; c >>= 52 ===
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 16(DI)          // store r[2]

	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX
	SHRQ $52, CX
	MOVQ AX, 16(SP)
	MOVQ CX, 24(SP)

	// === Step 23: c += (R << 12) * d + t3 ===
	MOVQ 0(SP), DX
	MOVQ $0x1000003D10000, R13 // R << 12
	MULXQ R13, AX, CX
	ADDQ AX, 16(SP)
	ADCQ CX, 24(SP)

	MOVQ 32(SP), AX          // t3
	ADDQ AX, 16(SP)
	ADCQ $0, 24(SP)

	// === Step 24: r[3] = c & M; c >>= 52 ===
	MOVQ 16(SP), AX
	ANDQ R14, AX
	MOVQ AX, 24(DI)          // store r[3]

	// Only the low 64 bits of c >> 52 are needed for the final limb.
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	SHRQ $52, AX
	MOVQ CX, DX
	SHLQ $12, DX
	ORQ DX, AX

	// === Step 25: r[4] = c + t4 ===
	ADDQ 40(SP), AX
	MOVQ AX, 32(DI)          // store r[4]

	RET
772