field_amd64_bmi2.s raw

   1  //go:build amd64
   2  
   3  #include "textflag.h"
   4  
   5  // Field multiplication assembly for secp256k1 using BMI2+ADX instructions.
   6  // Uses MULX for flag-free multiplication and ADCX/ADOX for parallel carry chains.
   7  //
   8  // The field element is represented as 5 limbs of 52 bits each:
   9  //   n[0..4] where value = sum(n[i] * 2^(52*i))
  10  //
  11  // Field prime p = 2^256 - 2^32 - 977
  12  // Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1
  13  // For 5x52: R shifted = 0x1000003D10 (for 52-bit alignment)
  14  //
  15  // BMI2 Instructions used:
  16  //   MULXQ src, lo, hi  - unsigned multiply RDX * src -> hi:lo (flags unchanged)
  17  //
  18  // ADX Instructions used:
  19  //   ADCXQ src, dst     - dst += src + CF (only modifies CF)
  20  //   ADOXQ src, dst     - dst += src + OF (only modifies OF)
  21  //
  22  // ADCX/ADOX allow parallel carry chains: ADCX uses CF only, ADOX uses OF only.
  23  // This enables the CPU to execute two independent addition chains in parallel.
  24  //
  25  // Stack layout for fieldMulAsmBMI2 (96 bytes):
  26  //   0(SP)  - d_lo
  27  //   8(SP)  - d_hi
  28  //   16(SP) - c_lo
  29  //   24(SP) - c_hi
  30  //   32(SP) - t3
  31  //   40(SP) - t4
  32  //   48(SP) - tx
  33  //   56(SP) - u0
  34  //   64(SP) - temp storage
  35  //   72(SP) - temp storage 2
  36  //   80(SP) - saved b pointer
  37  
  38  // func fieldMulAsmBMI2(r, a, b *FieldElement)
  39  TEXT ·fieldMulAsmBMI2(SB), NOSPLIT, $96-24
  40  	MOVQ r+0(FP), DI
  41  	MOVQ a+8(FP), SI
  42  	MOVQ b+16(FP), BX
  43  
  44  	// Save b pointer
  45  	MOVQ BX, 80(SP)
  46  
  47  	// Load a[0..4] into registers
  48  	MOVQ 0(SI), R8       // a0
  49  	MOVQ 8(SI), R9       // a1
  50  	MOVQ 16(SI), R10     // a2
  51  	MOVQ 24(SI), R11     // a3
  52  	MOVQ 32(SI), R12     // a4
  53  
  54  	// Constants:
  55  	// M = 0xFFFFFFFFFFFFF (2^52 - 1)
  56  	// R = 0x1000003D10
  57  
  58  	// === Step 1: d = a0*b3 + a1*b2 + a2*b1 + a3*b0 ===
  59  	// Using MULX: put multiplier in RDX, result in specified regs
  60  	MOVQ 24(BX), DX      // b3
  61  	MULXQ R8, AX, CX     // a0 * b3 -> CX:AX
  62  	MOVQ AX, 0(SP)       // d_lo
  63  	MOVQ CX, 8(SP)       // d_hi
  64  
  65  	MOVQ 16(BX), DX      // b2
  66  	MULXQ R9, AX, CX     // a1 * b2 -> CX:AX
  67  	ADDQ AX, 0(SP)
  68  	ADCQ CX, 8(SP)
  69  
  70  	MOVQ 8(BX), DX       // b1
  71  	MULXQ R10, AX, CX    // a2 * b1 -> CX:AX
  72  	ADDQ AX, 0(SP)
  73  	ADCQ CX, 8(SP)
  74  
  75  	MOVQ 0(BX), DX       // b0
  76  	MULXQ R11, AX, CX    // a3 * b0 -> CX:AX
  77  	ADDQ AX, 0(SP)
  78  	ADCQ CX, 8(SP)
  79  
  80  	// === Step 2: c = a4*b4 ===
  81  	MOVQ 32(BX), DX      // b4
  82  	MULXQ R12, AX, CX    // a4 * b4 -> CX:AX
  83  	MOVQ AX, 16(SP)      // c_lo
  84  	MOVQ CX, 24(SP)      // c_hi
  85  
  86  	// === Step 3: d += R * c_lo ===
  87  	MOVQ 16(SP), DX      // c_lo
  88  	MOVQ $0x1000003D10, R13  // R constant
  89  	MULXQ R13, AX, CX    // R * c_lo -> CX:AX
  90  	ADDQ AX, 0(SP)
  91  	ADCQ CX, 8(SP)
  92  
  93  	// === Step 4: c >>= 64 ===
  94  	MOVQ 24(SP), AX
  95  	MOVQ AX, 16(SP)
  96  	MOVQ $0, 24(SP)
  97  
  98  	// === Step 5: t3 = d & M; d >>= 52 ===
  99  	MOVQ 0(SP), AX
 100  	MOVQ $0xFFFFFFFFFFFFF, R14  // M constant (keep in register)
 101  	ANDQ R14, AX
 102  	MOVQ AX, 32(SP)      // t3
 103  
 104  	MOVQ 0(SP), AX
 105  	MOVQ 8(SP), CX
 106  	SHRQ $52, AX
 107  	MOVQ CX, DX
 108  	SHLQ $12, DX
 109  	ORQ DX, AX
 110  	SHRQ $52, CX
 111  	MOVQ AX, 0(SP)
 112  	MOVQ CX, 8(SP)
 113  
 114  	// === Step 6: d += a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 ===
 115  	MOVQ 80(SP), BX      // restore b pointer
 116  
 117  	MOVQ 32(BX), DX      // b4
 118  	MULXQ R8, AX, CX     // a0 * b4
 119  	ADDQ AX, 0(SP)
 120  	ADCQ CX, 8(SP)
 121  
 122  	MOVQ 24(BX), DX      // b3
 123  	MULXQ R9, AX, CX     // a1 * b3
 124  	ADDQ AX, 0(SP)
 125  	ADCQ CX, 8(SP)
 126  
 127  	MOVQ 16(BX), DX      // b2
 128  	MULXQ R10, AX, CX    // a2 * b2
 129  	ADDQ AX, 0(SP)
 130  	ADCQ CX, 8(SP)
 131  
 132  	MOVQ 8(BX), DX       // b1
 133  	MULXQ R11, AX, CX    // a3 * b1
 134  	ADDQ AX, 0(SP)
 135  	ADCQ CX, 8(SP)
 136  
 137  	MOVQ 0(BX), DX       // b0
 138  	MULXQ R12, AX, CX    // a4 * b0
 139  	ADDQ AX, 0(SP)
 140  	ADCQ CX, 8(SP)
 141  
 142  	// === Step 7: d += (R << 12) * c ===
 143  	MOVQ 16(SP), DX      // c
 144  	MOVQ $0x1000003D10000, R15  // R << 12
 145  	MULXQ R15, AX, CX
 146  	ADDQ AX, 0(SP)
 147  	ADCQ CX, 8(SP)
 148  
 149  	// === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
 150  	MOVQ 0(SP), AX
 151  	ANDQ R14, AX         // t4 = d & M
 152  	MOVQ AX, 40(SP)
 153  
 154  	SHRQ $48, AX
 155  	MOVQ AX, 48(SP)      // tx
 156  
 157  	MOVQ 40(SP), AX
 158  	MOVQ $0x0FFFFFFFFFFFF, CX
 159  	ANDQ CX, AX
 160  	MOVQ AX, 40(SP)      // t4
 161  
 162  	// === Step 9: d >>= 52 ===
 163  	MOVQ 0(SP), AX
 164  	MOVQ 8(SP), CX
 165  	SHRQ $52, AX
 166  	MOVQ CX, DX
 167  	SHLQ $12, DX
 168  	ORQ DX, AX
 169  	SHRQ $52, CX
 170  	MOVQ AX, 0(SP)
 171  	MOVQ CX, 8(SP)
 172  
 173  	// === Step 10: c = a0*b0 ===
 174  	MOVQ 0(BX), DX       // b0
 175  	MULXQ R8, AX, CX     // a0 * b0
 176  	MOVQ AX, 16(SP)
 177  	MOVQ CX, 24(SP)
 178  
 179  	// === Step 11: d += a1*b4 + a2*b3 + a3*b2 + a4*b1 ===
 180  	MOVQ 32(BX), DX      // b4
 181  	MULXQ R9, AX, CX     // a1 * b4
 182  	ADDQ AX, 0(SP)
 183  	ADCQ CX, 8(SP)
 184  
 185  	MOVQ 24(BX), DX      // b3
 186  	MULXQ R10, AX, CX    // a2 * b3
 187  	ADDQ AX, 0(SP)
 188  	ADCQ CX, 8(SP)
 189  
 190  	MOVQ 16(BX), DX      // b2
 191  	MULXQ R11, AX, CX    // a3 * b2
 192  	ADDQ AX, 0(SP)
 193  	ADCQ CX, 8(SP)
 194  
 195  	MOVQ 8(BX), DX       // b1
 196  	MULXQ R12, AX, CX    // a4 * b1
 197  	ADDQ AX, 0(SP)
 198  	ADCQ CX, 8(SP)
 199  
 200  	// === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
 201  	MOVQ 0(SP), AX
 202  	ANDQ R14, AX         // u0 = d & M
 203  	SHLQ $4, AX
 204  	ORQ 48(SP), AX
 205  	MOVQ AX, 56(SP)      // u0
 206  
 207  	MOVQ 0(SP), AX
 208  	MOVQ 8(SP), CX
 209  	SHRQ $52, AX
 210  	MOVQ CX, DX
 211  	SHLQ $12, DX
 212  	ORQ DX, AX
 213  	SHRQ $52, CX
 214  	MOVQ AX, 0(SP)
 215  	MOVQ CX, 8(SP)
 216  
 217  	// === Step 13: c += (R >> 4) * u0 ===
 218  	MOVQ 56(SP), DX      // u0
 219  	MOVQ $0x1000003D1, R13  // R >> 4
 220  	MULXQ R13, AX, CX
 221  	ADDQ AX, 16(SP)
 222  	ADCQ CX, 24(SP)
 223  
 224  	// === Step 14: r[0] = c & M; c >>= 52 ===
 225  	MOVQ 16(SP), AX
 226  	ANDQ R14, AX
 227  	MOVQ AX, 0(DI)       // store r[0]
 228  
 229  	MOVQ 16(SP), AX
 230  	MOVQ 24(SP), CX
 231  	SHRQ $52, AX
 232  	MOVQ CX, DX
 233  	SHLQ $12, DX
 234  	ORQ DX, AX
 235  	SHRQ $52, CX
 236  	MOVQ AX, 16(SP)
 237  	MOVQ CX, 24(SP)
 238  
 239  	// === Steps 15-16: Parallel c and d updates using ADCX/ADOX ===
 240  	// Step 15: c += a0*b1 + a1*b0 (CF chain via ADCX)
 241  	// Step 16: d += a2*b4 + a3*b3 + a4*b2 (OF chain via ADOX)
 242  	// Save r pointer before reusing DI
 243  	MOVQ DI, 64(SP)      // save r pointer
 244  
 245  	// Load all accumulators into registers for ADCX/ADOX (register-only ops)
 246  	MOVQ 16(SP), R13     // c_lo
 247  	MOVQ 24(SP), R15     // c_hi
 248  	MOVQ 0(SP), SI       // d_lo (reuse SI since we don't need 'a' anymore)
 249  	MOVQ 8(SP), DI       // d_hi (reuse DI)
 250  
 251  	// Clear CF and OF
 252  	XORQ AX, AX
 253  
 254  	// First pair: c += a0*b1, d += a2*b4
 255  	MOVQ 8(BX), DX       // b1
 256  	MULXQ R8, AX, CX     // a0 * b1 -> CX:AX
 257  	ADCXQ AX, R13        // c_lo += lo (CF chain)
 258  	ADCXQ CX, R15        // c_hi += hi + CF
 259  
 260  	MOVQ 32(BX), DX      // b4
 261  	MULXQ R10, AX, CX    // a2 * b4 -> CX:AX
 262  	ADOXQ AX, SI         // d_lo += lo (OF chain)
 263  	ADOXQ CX, DI         // d_hi += hi + OF
 264  
 265  	// Second pair: c += a1*b0, d += a3*b3
 266  	MOVQ 0(BX), DX       // b0
 267  	MULXQ R9, AX, CX     // a1 * b0 -> CX:AX
 268  	ADCXQ AX, R13        // c_lo += lo
 269  	ADCXQ CX, R15        // c_hi += hi + CF
 270  
 271  	MOVQ 24(BX), DX      // b3
 272  	MULXQ R11, AX, CX    // a3 * b3 -> CX:AX
 273  	ADOXQ AX, SI         // d_lo += lo
 274  	ADOXQ CX, DI         // d_hi += hi + OF
 275  
 276  	// Third: d += a4*b2 (only d, no more c operations)
 277  	MOVQ 16(BX), DX      // b2
 278  	MULXQ R12, AX, CX    // a4 * b2 -> CX:AX
 279  	ADOXQ AX, SI         // d_lo += lo
 280  	ADOXQ CX, DI         // d_hi += hi + OF
 281  
 282  	// Store results back
 283  	MOVQ R13, 16(SP)     // c_lo
 284  	MOVQ R15, 24(SP)     // c_hi
 285  	MOVQ SI, 0(SP)       // d_lo
 286  	MOVQ DI, 8(SP)       // d_hi
 287  	MOVQ 64(SP), DI      // restore r pointer
 288  
 289  	// === Step 17: c += R * (d & M); d >>= 52 ===
 290  	MOVQ 0(SP), AX
 291  	ANDQ R14, AX         // d & M
 292  	MOVQ AX, DX
 293  	MOVQ $0x1000003D10, R13  // R
 294  	MULXQ R13, AX, CX
 295  	ADDQ AX, 16(SP)
 296  	ADCQ CX, 24(SP)
 297  
 298  	MOVQ 0(SP), AX
 299  	MOVQ 8(SP), CX
 300  	SHRQ $52, AX
 301  	MOVQ CX, DX
 302  	SHLQ $12, DX
 303  	ORQ DX, AX
 304  	SHRQ $52, CX
 305  	MOVQ AX, 0(SP)
 306  	MOVQ CX, 8(SP)
 307  
 308  	// === Step 18: r[1] = c & M; c >>= 52 ===
 309  	MOVQ 16(SP), AX
 310  	ANDQ R14, AX
 311  	MOVQ AX, 8(DI)       // store r[1]
 312  
 313  	MOVQ 16(SP), AX
 314  	MOVQ 24(SP), CX
 315  	SHRQ $52, AX
 316  	MOVQ CX, DX
 317  	SHLQ $12, DX
 318  	ORQ DX, AX
 319  	SHRQ $52, CX
 320  	MOVQ AX, 16(SP)
 321  	MOVQ CX, 24(SP)
 322  
 323  	// === Steps 19-20: Parallel c and d updates using ADCX/ADOX ===
 324  	// Step 19: c += a0*b2 + a1*b1 + a2*b0 (CF chain via ADCX)
 325  	// Step 20: d += a3*b4 + a4*b3 (OF chain via ADOX)
 326  	// Save r pointer before reusing DI
 327  	MOVQ DI, 64(SP)      // save r pointer
 328  
 329  	// Load all accumulators into registers
 330  	MOVQ 16(SP), R13     // c_lo
 331  	MOVQ 24(SP), R15     // c_hi
 332  	MOVQ 0(SP), SI       // d_lo
 333  	MOVQ 8(SP), DI       // d_hi
 334  
 335  	// Clear CF and OF
 336  	XORQ AX, AX
 337  
 338  	// First pair: c += a0*b2, d += a3*b4
 339  	MOVQ 16(BX), DX      // b2
 340  	MULXQ R8, AX, CX     // a0 * b2 -> CX:AX
 341  	ADCXQ AX, R13        // c_lo += lo
 342  	ADCXQ CX, R15        // c_hi += hi + CF
 343  
 344  	MOVQ 32(BX), DX      // b4
 345  	MULXQ R11, AX, CX    // a3 * b4 -> CX:AX
 346  	ADOXQ AX, SI         // d_lo += lo
 347  	ADOXQ CX, DI         // d_hi += hi + OF
 348  
 349  	// Second pair: c += a1*b1, d += a4*b3
 350  	MOVQ 8(BX), DX       // b1
 351  	MULXQ R9, AX, CX     // a1 * b1 -> CX:AX
 352  	ADCXQ AX, R13        // c_lo += lo
 353  	ADCXQ CX, R15        // c_hi += hi + CF
 354  
 355  	MOVQ 24(BX), DX      // b3
 356  	MULXQ R12, AX, CX    // a4 * b3 -> CX:AX
 357  	ADOXQ AX, SI         // d_lo += lo
 358  	ADOXQ CX, DI         // d_hi += hi + OF
 359  
 360  	// Third: c += a2*b0 (only c, no more d operations)
 361  	MOVQ 0(BX), DX       // b0
 362  	MULXQ R10, AX, CX    // a2 * b0 -> CX:AX
 363  	ADCXQ AX, R13        // c_lo += lo
 364  	ADCXQ CX, R15        // c_hi += hi + CF
 365  
 366  	// Store results back
 367  	MOVQ R13, 16(SP)     // c_lo
 368  	MOVQ R15, 24(SP)     // c_hi
 369  	MOVQ SI, 0(SP)       // d_lo
 370  	MOVQ DI, 8(SP)       // d_hi
 371  	MOVQ 64(SP), DI      // restore r pointer
 372  
 373  	// === Step 21: c += R * d_lo; d >>= 64 ===
 374  	MOVQ 0(SP), DX       // d_lo
 375  	MOVQ $0x1000003D10, R13  // R
 376  	MULXQ R13, AX, CX
 377  	ADDQ AX, 16(SP)
 378  	ADCQ CX, 24(SP)
 379  
 380  	MOVQ 8(SP), AX
 381  	MOVQ AX, 0(SP)
 382  	MOVQ $0, 8(SP)
 383  
 384  	// === Step 22: r[2] = c & M; c >>= 52 ===
 385  	MOVQ 16(SP), AX
 386  	ANDQ R14, AX
 387  	MOVQ AX, 16(DI)      // store r[2]
 388  
 389  	MOVQ 16(SP), AX
 390  	MOVQ 24(SP), CX
 391  	SHRQ $52, AX
 392  	MOVQ CX, DX
 393  	SHLQ $12, DX
 394  	ORQ DX, AX
 395  	SHRQ $52, CX
 396  	MOVQ AX, 16(SP)
 397  	MOVQ CX, 24(SP)
 398  
 399  	// === Step 23: c += (R << 12) * d + t3 ===
 400  	MOVQ 0(SP), DX       // d
 401  	MOVQ $0x1000003D10000, R15  // R << 12 (reload since R15 was used for c_hi)
 402  	MULXQ R15, AX, CX    // (R << 12) * d
 403  	ADDQ AX, 16(SP)
 404  	ADCQ CX, 24(SP)
 405  
 406  	MOVQ 32(SP), AX      // t3
 407  	ADDQ AX, 16(SP)
 408  	ADCQ $0, 24(SP)
 409  
 410  	// === Step 24: r[3] = c & M; c >>= 52 ===
 411  	MOVQ 16(SP), AX
 412  	ANDQ R14, AX
 413  	MOVQ AX, 24(DI)      // store r[3]
 414  
 415  	MOVQ 16(SP), AX
 416  	MOVQ 24(SP), CX
 417  	SHRQ $52, AX
 418  	MOVQ CX, DX
 419  	SHLQ $12, DX
 420  	ORQ DX, AX
 421  
 422  	// === Step 25: r[4] = c + t4 ===
 423  	ADDQ 40(SP), AX
 424  	MOVQ AX, 32(DI)      // store r[4]
 425  
 426  	RET
 427  
 428  
 429  // func fieldSqrAsmBMI2(r, a *FieldElement)
 430  // Squares a field element using BMI2 instructions.
 431  TEXT ·fieldSqrAsmBMI2(SB), NOSPLIT, $96-16
 432  	MOVQ r+0(FP), DI
 433  	MOVQ a+8(FP), SI
 434  
 435  	// Load a[0..4] into registers
 436  	MOVQ 0(SI), R8       // a0
 437  	MOVQ 8(SI), R9       // a1
 438  	MOVQ 16(SI), R10     // a2
 439  	MOVQ 24(SI), R11     // a3
 440  	MOVQ 32(SI), R12     // a4
 441  
 442  	// Keep M constant in R14
 443  	MOVQ $0xFFFFFFFFFFFFF, R14
 444  
 445  	// === Step 1: d = 2*a0*a3 + 2*a1*a2 ===
 446  	MOVQ R8, DX
 447  	ADDQ DX, DX          // 2*a0
 448  	MULXQ R11, AX, CX    // 2*a0 * a3
 449  	MOVQ AX, 0(SP)
 450  	MOVQ CX, 8(SP)
 451  
 452  	MOVQ R9, DX
 453  	ADDQ DX, DX          // 2*a1
 454  	MULXQ R10, AX, CX    // 2*a1 * a2
 455  	ADDQ AX, 0(SP)
 456  	ADCQ CX, 8(SP)
 457  
 458  	// === Step 2: c = a4*a4 ===
 459  	MOVQ R12, DX
 460  	MULXQ R12, AX, CX    // a4 * a4
 461  	MOVQ AX, 16(SP)
 462  	MOVQ CX, 24(SP)
 463  
 464  	// === Step 3: d += R * c_lo ===
 465  	MOVQ 16(SP), DX
 466  	MOVQ $0x1000003D10, R13
 467  	MULXQ R13, AX, CX
 468  	ADDQ AX, 0(SP)
 469  	ADCQ CX, 8(SP)
 470  
 471  	// === Step 4: c >>= 64 ===
 472  	MOVQ 24(SP), AX
 473  	MOVQ AX, 16(SP)
 474  	MOVQ $0, 24(SP)
 475  
 476  	// === Step 5: t3 = d & M; d >>= 52 ===
 477  	MOVQ 0(SP), AX
 478  	ANDQ R14, AX
 479  	MOVQ AX, 32(SP)      // t3
 480  
 481  	MOVQ 0(SP), AX
 482  	MOVQ 8(SP), CX
 483  	SHRQ $52, AX
 484  	MOVQ CX, DX
 485  	SHLQ $12, DX
 486  	ORQ DX, AX
 487  	SHRQ $52, CX
 488  	MOVQ AX, 0(SP)
 489  	MOVQ CX, 8(SP)
 490  
 491  	// === Step 6: d += 2*a0*a4 + 2*a1*a3 + a2*a2 ===
 492  	// Pre-compute 2*a4
 493  	MOVQ R12, R15
 494  	ADDQ R15, R15        // 2*a4
 495  
 496  	MOVQ R8, DX
 497  	MULXQ R15, AX, CX    // a0 * 2*a4
 498  	ADDQ AX, 0(SP)
 499  	ADCQ CX, 8(SP)
 500  
 501  	MOVQ R9, DX
 502  	ADDQ DX, DX          // 2*a1
 503  	MULXQ R11, AX, CX    // 2*a1 * a3
 504  	ADDQ AX, 0(SP)
 505  	ADCQ CX, 8(SP)
 506  
 507  	MOVQ R10, DX
 508  	MULXQ R10, AX, CX    // a2 * a2
 509  	ADDQ AX, 0(SP)
 510  	ADCQ CX, 8(SP)
 511  
 512  	// === Step 7: d += (R << 12) * c ===
 513  	MOVQ 16(SP), DX
 514  	MOVQ $0x1000003D10000, R13
 515  	MULXQ R13, AX, CX
 516  	ADDQ AX, 0(SP)
 517  	ADCQ CX, 8(SP)
 518  
 519  	// === Step 8: t4 = d & M; tx = t4 >> 48; t4 &= (M >> 4) ===
 520  	MOVQ 0(SP), AX
 521  	ANDQ R14, AX
 522  	MOVQ AX, 40(SP)
 523  
 524  	SHRQ $48, AX
 525  	MOVQ AX, 48(SP)      // tx
 526  
 527  	MOVQ 40(SP), AX
 528  	MOVQ $0x0FFFFFFFFFFFF, CX
 529  	ANDQ CX, AX
 530  	MOVQ AX, 40(SP)      // t4
 531  
 532  	// === Step 9: d >>= 52 ===
 533  	MOVQ 0(SP), AX
 534  	MOVQ 8(SP), CX
 535  	SHRQ $52, AX
 536  	MOVQ CX, DX
 537  	SHLQ $12, DX
 538  	ORQ DX, AX
 539  	SHRQ $52, CX
 540  	MOVQ AX, 0(SP)
 541  	MOVQ CX, 8(SP)
 542  
 543  	// === Step 10: c = a0*a0 ===
 544  	MOVQ R8, DX
 545  	MULXQ R8, AX, CX
 546  	MOVQ AX, 16(SP)
 547  	MOVQ CX, 24(SP)
 548  
 549  	// === Step 11: d += a1*2*a4 + 2*a2*a3 ===
 550  	// Save a2 before doubling (needed later in step 16 and 19)
 551  	MOVQ R10, 64(SP)     // save original a2
 552  
 553  	MOVQ R9, DX
 554  	MULXQ R15, AX, CX    // a1 * 2*a4
 555  	ADDQ AX, 0(SP)
 556  	ADCQ CX, 8(SP)
 557  
 558  	MOVQ R10, DX
 559  	ADDQ DX, DX          // 2*a2
 560  	MULXQ R11, AX, CX    // 2*a2 * a3
 561  	ADDQ AX, 0(SP)
 562  	ADCQ CX, 8(SP)
 563  
 564  	// === Step 12: u0 = d & M; d >>= 52; u0 = (u0 << 4) | tx ===
 565  	MOVQ 0(SP), AX
 566  	ANDQ R14, AX
 567  	SHLQ $4, AX
 568  	ORQ 48(SP), AX
 569  	MOVQ AX, 56(SP)      // u0
 570  
 571  	MOVQ 0(SP), AX
 572  	MOVQ 8(SP), CX
 573  	SHRQ $52, AX
 574  	MOVQ CX, DX
 575  	SHLQ $12, DX
 576  	ORQ DX, AX
 577  	SHRQ $52, CX
 578  	MOVQ AX, 0(SP)
 579  	MOVQ CX, 8(SP)
 580  
 581  	// === Step 13: c += (R >> 4) * u0 ===
 582  	MOVQ 56(SP), DX
 583  	MOVQ $0x1000003D1, R13
 584  	MULXQ R13, AX, CX
 585  	ADDQ AX, 16(SP)
 586  	ADCQ CX, 24(SP)
 587  
 588  	// === Step 14: r[0] = c & M; c >>= 52 ===
 589  	MOVQ 16(SP), AX
 590  	ANDQ R14, AX
 591  	MOVQ AX, 0(DI)
 592  
 593  	MOVQ 16(SP), AX
 594  	MOVQ 24(SP), CX
 595  	SHRQ $52, AX
 596  	MOVQ CX, DX
 597  	SHLQ $12, DX
 598  	ORQ DX, AX
 599  	SHRQ $52, CX
 600  	MOVQ AX, 16(SP)
 601  	MOVQ CX, 24(SP)
 602  
 603  	// === Steps 15-16: Parallel c and d updates using ADCX/ADOX ===
 604  	// Step 15: c += 2*a0*a1 (CF chain via ADCX)
 605  	// Step 16: d += a2*2*a4 + a3*a3 (OF chain via ADOX)
 606  	// Save r pointer and load accumulators
 607  	MOVQ DI, 72(SP)      // save r pointer (64(SP) has saved a2)
 608  
 609  	MOVQ 16(SP), R13     // c_lo
 610  	MOVQ 24(SP), BX      // c_hi (use BX since we need SI/DI)
 611  	MOVQ 0(SP), SI       // d_lo
 612  	MOVQ 8(SP), DI       // d_hi
 613  
 614  	// Clear CF and OF
 615  	XORQ AX, AX
 616  
 617  	// c += 2*a0*a1
 618  	MOVQ R8, DX
 619  	ADDQ DX, DX          // 2*a0
 620  	MULXQ R9, AX, CX     // 2*a0 * a1 -> CX:AX
 621  	ADCXQ AX, R13        // c_lo += lo (CF chain)
 622  	ADCXQ CX, BX         // c_hi += hi + CF
 623  
 624  	// d += a2*2*a4
 625  	MOVQ 64(SP), DX      // load saved original a2
 626  	MULXQ R15, AX, CX    // a2 * 2*a4 -> CX:AX
 627  	ADOXQ AX, SI         // d_lo += lo (OF chain)
 628  	ADOXQ CX, DI         // d_hi += hi + OF
 629  
 630  	// d += a3*a3
 631  	MOVQ R11, DX
 632  	MULXQ R11, AX, CX    // a3 * a3 -> CX:AX
 633  	ADOXQ AX, SI         // d_lo += lo
 634  	ADOXQ CX, DI         // d_hi += hi + OF
 635  
 636  	// Store results back
 637  	MOVQ R13, 16(SP)     // c_lo
 638  	MOVQ BX, 24(SP)      // c_hi
 639  	MOVQ SI, 0(SP)       // d_lo
 640  	MOVQ DI, 8(SP)       // d_hi
 641  	MOVQ 72(SP), DI      // restore r pointer
 642  
 643  	// === Step 17: c += R * (d & M); d >>= 52 ===
 644  	MOVQ 0(SP), AX
 645  	ANDQ R14, AX
 646  	MOVQ AX, DX
 647  	MOVQ $0x1000003D10, R13
 648  	MULXQ R13, AX, CX
 649  	ADDQ AX, 16(SP)
 650  	ADCQ CX, 24(SP)
 651  
 652  	MOVQ 0(SP), AX
 653  	MOVQ 8(SP), CX
 654  	SHRQ $52, AX
 655  	MOVQ CX, DX
 656  	SHLQ $12, DX
 657  	ORQ DX, AX
 658  	SHRQ $52, CX
 659  	MOVQ AX, 0(SP)
 660  	MOVQ CX, 8(SP)
 661  
 662  	// === Step 18: r[1] = c & M; c >>= 52 ===
 663  	MOVQ 16(SP), AX
 664  	ANDQ R14, AX
 665  	MOVQ AX, 8(DI)
 666  
 667  	MOVQ 16(SP), AX
 668  	MOVQ 24(SP), CX
 669  	SHRQ $52, AX
 670  	MOVQ CX, DX
 671  	SHLQ $12, DX
 672  	ORQ DX, AX
 673  	SHRQ $52, CX
 674  	MOVQ AX, 16(SP)
 675  	MOVQ CX, 24(SP)
 676  
 677  	// === Steps 19-20: Parallel c and d updates using ADCX/ADOX ===
 678  	// Step 19: c += 2*a0*a2 + a1*a1 (CF chain via ADCX)
 679  	// Step 20: d += a3*2*a4 (OF chain via ADOX)
 680  	// Save r pointer and load accumulators
 681  	MOVQ DI, 72(SP)      // save r pointer
 682  
 683  	MOVQ 16(SP), R13     // c_lo
 684  	MOVQ 24(SP), BX      // c_hi
 685  	MOVQ 0(SP), SI       // d_lo
 686  	MOVQ 8(SP), DI       // d_hi
 687  
 688  	// Clear CF and OF
 689  	XORQ AX, AX
 690  
 691  	// c += 2*a0*a2
 692  	MOVQ R8, DX          // a0 (R8 was never modified)
 693  	ADDQ DX, DX          // 2*a0
 694  	MOVQ 64(SP), AX      // load saved original a2
 695  	MULXQ AX, AX, CX     // 2*a0 * a2 -> CX:AX
 696  	ADCXQ AX, R13        // c_lo += lo
 697  	ADCXQ CX, BX         // c_hi += hi + CF
 698  
 699  	// d += a3*2*a4
 700  	MOVQ R11, DX
 701  	MULXQ R15, AX, CX    // a3 * 2*a4 -> CX:AX
 702  	ADOXQ AX, SI         // d_lo += lo
 703  	ADOXQ CX, DI         // d_hi += hi + OF
 704  
 705  	// c += a1*a1
 706  	MOVQ R9, DX
 707  	MULXQ R9, AX, CX     // a1 * a1 -> CX:AX
 708  	ADCXQ AX, R13        // c_lo += lo
 709  	ADCXQ CX, BX         // c_hi += hi + CF
 710  
 711  	// Store results back
 712  	MOVQ R13, 16(SP)     // c_lo
 713  	MOVQ BX, 24(SP)      // c_hi
 714  	MOVQ SI, 0(SP)       // d_lo
 715  	MOVQ DI, 8(SP)       // d_hi
 716  	MOVQ 72(SP), DI      // restore r pointer
 717  
 718  	// === Step 21: c += R * d_lo; d >>= 64 ===
 719  	MOVQ 0(SP), DX
 720  	MOVQ $0x1000003D10, R13
 721  	MULXQ R13, AX, CX
 722  	ADDQ AX, 16(SP)
 723  	ADCQ CX, 24(SP)
 724  
 725  	MOVQ 8(SP), AX
 726  	MOVQ AX, 0(SP)
 727  	MOVQ $0, 8(SP)
 728  
 729  	// === Step 22: r[2] = c & M; c >>= 52 ===
 730  	MOVQ 16(SP), AX
 731  	ANDQ R14, AX
 732  	MOVQ AX, 16(DI)
 733  
 734  	MOVQ 16(SP), AX
 735  	MOVQ 24(SP), CX
 736  	SHRQ $52, AX
 737  	MOVQ CX, DX
 738  	SHLQ $12, DX
 739  	ORQ DX, AX
 740  	SHRQ $52, CX
 741  	MOVQ AX, 16(SP)
 742  	MOVQ CX, 24(SP)
 743  
 744  	// === Step 23: c += (R << 12) * d + t3 ===
 745  	MOVQ 0(SP), DX
 746  	MOVQ $0x1000003D10000, R13
 747  	MULXQ R13, AX, CX
 748  	ADDQ AX, 16(SP)
 749  	ADCQ CX, 24(SP)
 750  
 751  	MOVQ 32(SP), AX
 752  	ADDQ AX, 16(SP)
 753  	ADCQ $0, 24(SP)
 754  
 755  	// === Step 24: r[3] = c & M; c >>= 52 ===
 756  	MOVQ 16(SP), AX
 757  	ANDQ R14, AX
 758  	MOVQ AX, 24(DI)
 759  
 760  	MOVQ 16(SP), AX
 761  	MOVQ 24(SP), CX
 762  	SHRQ $52, AX
 763  	MOVQ CX, DX
 764  	SHLQ $12, DX
 765  	ORQ DX, AX
 766  
 767  	// === Step 25: r[4] = c + t4 ===
 768  	ADDQ 40(SP), AX
 769  	MOVQ AX, 32(DI)
 770  
 771  	RET
 772