// scalar_amd64.s — secp256k1 scalar arithmetic (add/sub/mul mod n) in Go assembly for amd64.

   1  //go:build amd64
   2  
   3  #include "textflag.h"
   4  
// Constants for scalar reduction.
// n is the order of the secp256k1 group, stored as four little-endian
// 64-bit limbs (offset 0x00 is the least significant limb):
// n = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
DATA p256k1ScalarN<>+0x00(SB)/8, $0xBFD25E8CD0364141  // n[0]
DATA p256k1ScalarN<>+0x08(SB)/8, $0xBAAEDCE6AF48A03B  // n[1]
DATA p256k1ScalarN<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFE  // n[2]
DATA p256k1ScalarN<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF  // n[3]
GLOBL p256k1ScalarN<>(SB), RODATA|NOPTR, $32
  12  
// NC = 2^256 - n (for reduction): since 2^256 ≡ NC (mod n), adding NC is
// equivalent to subtracting n modulo 2^256. Only the low three limbs are
// nonzero:
// NC0 = 0x402DA1732FC9BEBF
// NC1 = 0x4551231950B75FC4
// NC2 = 1
DATA p256k1ScalarNC<>+0x00(SB)/8, $0x402DA1732FC9BEBF  // NC[0]
DATA p256k1ScalarNC<>+0x08(SB)/8, $0x4551231950B75FC4  // NC[1]
DATA p256k1ScalarNC<>+0x10(SB)/8, $0x0000000000000001  // NC[2]
DATA p256k1ScalarNC<>+0x18(SB)/8, $0x0000000000000000  // NC[3]
GLOBL p256k1ScalarNC<>(SB), RODATA|NOPTR, $32
  22  
  23  // func scalarAddAVX2(r, a, b *Scalar)
  24  // Adds two 256-bit scalars with carry chain and modular reduction.
  25  TEXT ·scalarAddAVX2(SB), NOSPLIT, $0-24
  26  	MOVQ r+0(FP), DI
  27  	MOVQ a+8(FP), SI
  28  	MOVQ b+16(FP), DX
  29  
  30  	// Load a and b into registers (scalar loads for carry chain)
  31  	MOVQ 0(SI), AX      // a.d[0]
  32  	MOVQ 8(SI), BX      // a.d[1]
  33  	MOVQ 16(SI), CX     // a.d[2]
  34  	MOVQ 24(SI), R8     // a.d[3]
  35  
  36  	// Add b with carry chain
  37  	ADDQ 0(DX), AX      // a.d[0] + b.d[0]
  38  	ADCQ 8(DX), BX      // a.d[1] + b.d[1] + carry
  39  	ADCQ 16(DX), CX     // a.d[2] + b.d[2] + carry
  40  	ADCQ 24(DX), R8     // a.d[3] + b.d[3] + carry
  41  
  42  	// Save carry flag
  43  	SETCS R9B
  44  
  45  	// Store preliminary result
  46  	MOVQ AX, 0(DI)
  47  	MOVQ BX, 8(DI)
  48  	MOVQ CX, 16(DI)
  49  	MOVQ R8, 24(DI)
  50  
  51  	// Check if we need to reduce (carry set or result >= n)
  52  	TESTB R9B, R9B
  53  	JNZ add_reduce
  54  
  55  	// Compare with n (from high to low)
  56  	MOVQ $0xFFFFFFFFFFFFFFFF, R10
  57  	CMPQ R8, R10
  58  	JB add_done
  59  	JA add_reduce
  60  	MOVQ p256k1ScalarN<>+0x10(SB), R10
  61  	CMPQ CX, R10
  62  	JB add_done
  63  	JA add_reduce
  64  	MOVQ p256k1ScalarN<>+0x08(SB), R10
  65  	CMPQ BX, R10
  66  	JB add_done
  67  	JA add_reduce
  68  	MOVQ p256k1ScalarN<>+0x00(SB), R10
  69  	CMPQ AX, R10
  70  	JB add_done
  71  
  72  add_reduce:
  73  	// Add 2^256 - n (which is equivalent to subtracting n)
  74  	MOVQ 0(DI), AX
  75  	MOVQ 8(DI), BX
  76  	MOVQ 16(DI), CX
  77  	MOVQ 24(DI), R8
  78  
  79  	MOVQ p256k1ScalarNC<>+0x00(SB), R10
  80  	ADDQ R10, AX
  81  	MOVQ p256k1ScalarNC<>+0x08(SB), R10
  82  	ADCQ R10, BX
  83  	MOVQ p256k1ScalarNC<>+0x10(SB), R10
  84  	ADCQ R10, CX
  85  	MOVQ p256k1ScalarNC<>+0x18(SB), R10
  86  	ADCQ R10, R8
  87  
  88  	MOVQ AX, 0(DI)
  89  	MOVQ BX, 8(DI)
  90  	MOVQ CX, 16(DI)
  91  	MOVQ R8, 24(DI)
  92  
  93  add_done:
  94  	VZEROUPPER
  95  	RET
  96  
// func scalarSubAVX2(r, a, b *Scalar)
// Computes r = (a - b) mod n for 256-bit scalars held as four
// little-endian 64-bit limbs. Assumes a, b < n, so if the subtraction
// borrows, adding n back exactly once yields the canonical result.
// Note: despite the name, only scalar (non-AVX) instructions are used.
TEXT ·scalarSubAVX2(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Load a limbs
	MOVQ 0(SI), AX      // a.d[0]
	MOVQ 8(SI), BX      // a.d[1]
	MOVQ 16(SI), CX     // a.d[2]
	MOVQ 24(SI), R8     // a.d[3]

	// Subtract b with borrow chain
	SUBQ 0(DX), AX
	SBBQ 8(DX), BX
	SBBQ 16(DX), CX
	SBBQ 24(DX), R8

	// Capture the borrow out of the top limb (CF after SBBQ) before
	// flags are clobbered
	SETCS R9B

	// Store preliminary result (correct already if no borrow occurred)
	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

	// If the subtraction went negative, add n back; AX/BX/CX/R8 still
	// hold the wrapped difference
	TESTB R9B, R9B
	JZ sub_done

	// Add n limb-by-limb with carry
	MOVQ p256k1ScalarN<>+0x00(SB), R10
	ADDQ R10, AX
	MOVQ p256k1ScalarN<>+0x08(SB), R10
	ADCQ R10, BX
	MOVQ p256k1ScalarN<>+0x10(SB), R10
	ADCQ R10, CX
	MOVQ p256k1ScalarN<>+0x18(SB), R10
	ADCQ R10, R8

	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

sub_done:
	VZEROUPPER
	RET
 147  
// func scalarMulAVX2(r, a, b *Scalar)
// Computes r = (a * b) mod n for 256-bit scalars held as four
// little-endian 64-bit limbs each.
// Step 1: schoolbook multiply into a 512-bit product l[0..7] on the stack.
// Step 2: three-phase reduction patterned on the bitcoin-core secp256k1
// scalar code (scalar_4x64_impl.h), using 2^256 ≡ NC (mod n) where
// NC = 2^256 - n = {NC0, NC1, 1, 0}.
// Note: despite the name, only scalar (non-AVX) instructions are used.
// Frame layout ($128 bytes):
//   0(SP)..56(SP)   l0..l7 (512-bit product); 64..96 later reused for p0..p4
//   64(SP)..112(SP) m0..m6 (phase-1 output)
//   120(SP)         c2, the third word of the 192-bit accumulator (c0=R14, c1=R15)
TEXT ·scalarMulAVX2(SB), NOSPLIT, $128-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Load a limbs
	MOVQ 0(SI), R8      // a0
	MOVQ 8(SI), R9      // a1
	MOVQ 16(SI), R10    // a2
	MOVQ 24(SI), R11    // a3

	// Store b pointer for later use (MULQ clobbers DX)
	MOVQ DX, R12

	// Compute 512-bit product using schoolbook multiplication
	// Product stored on stack at SP+0 to SP+56 (8 limbs: l0..l7)
	//
	// NOTE(review): several rows below end their carry chain early —
	// e.g. the final ADCQ of the a1*b3 and a2*b3 steps does not
	// propagate a possible carry out of l5/l6 any further. This is
	// safe only if that limb cannot overflow at that point; the
	// reference C code sidesteps this with a wider accumulator.
	// Verify the bounds or extend the chains — TODO confirm.

	// Initialize product to zero (l5..l7 must start at 0; l0..l4 are
	// fully overwritten by the a0 row, so zeroing them is defensive)
	XORQ AX, AX
	MOVQ AX, 0(SP)      // l0
	MOVQ AX, 8(SP)      // l1
	MOVQ AX, 16(SP)     // l2
	MOVQ AX, 24(SP)     // l3
	MOVQ AX, 32(SP)     // l4
	MOVQ AX, 40(SP)     // l5
	MOVQ AX, 48(SP)     // l6
	MOVQ AX, 56(SP)     // l7

	// Multiply a0 * b[0..3]; each MULQ leaves lo in AX, hi in DX
	MOVQ R8, AX
	MULQ 0(R12)         // a0 * b0
	MOVQ AX, 0(SP)
	MOVQ DX, R13        // carry (high half) into next limb

	MOVQ R8, AX
	MULQ 8(R12)         // a0 * b1
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 8(SP)
	MOVQ DX, R13

	MOVQ R8, AX
	MULQ 16(R12)        // a0 * b2
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 16(SP)
	MOVQ DX, R13

	MOVQ R8, AX
	MULQ 24(R12)        // a0 * b3
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 24(SP)
	MOVQ DX, 32(SP)

	// Multiply a1 * b[0..3] and add into l1..l5 (memory-destination
	// add-with-carry chains)
	MOVQ R9, AX
	MULQ 0(R12)         // a1 * b0
	ADDQ AX, 8(SP)
	ADCQ DX, 16(SP)
	ADCQ $0, 24(SP)
	ADCQ $0, 32(SP)

	MOVQ R9, AX
	MULQ 8(R12)         // a1 * b1
	ADDQ AX, 16(SP)
	ADCQ DX, 24(SP)
	ADCQ $0, 32(SP)

	MOVQ R9, AX
	MULQ 16(R12)        // a1 * b2
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R9, AX
	MULQ 24(R12)        // a1 * b3
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)     // NOTE(review): carry out of l5 not propagated to l6

	// Multiply a2 * b[0..3] and add into l2..l6
	MOVQ R10, AX
	MULQ 0(R12)         // a2 * b0
	ADDQ AX, 16(SP)
	ADCQ DX, 24(SP)
	ADCQ $0, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R10, AX
	MULQ 8(R12)         // a2 * b1
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R10, AX
	MULQ 16(R12)        // a2 * b2
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R10, AX
	MULQ 24(R12)        // a2 * b3
	ADDQ AX, 40(SP)
	ADCQ DX, 48(SP)     // NOTE(review): carry out of l6 not propagated to l7

	// Multiply a3 * b[0..3] and add into l3..l7
	MOVQ R11, AX
	MULQ 0(R12)         // a3 * b0
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R11, AX
	MULQ 8(R12)         // a3 * b1
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R11, AX
	MULQ 16(R12)        // a3 * b2
	ADDQ AX, 40(SP)
	ADCQ DX, 48(SP)
	ADCQ $0, 56(SP)

	MOVQ R11, AX
	MULQ 24(R12)        // a3 * b3; top limb — total product < 2^512, so
	ADDQ AX, 48(SP)     // the final ADCQ cannot carry out of l7
	ADCQ DX, 56(SP)

	// Now we have the 512-bit product in SP+0..SP+56 (l[0..7])
	// Reduce using the algorithm from bitcoin-core secp256k1
	//
	// Phase 1: Reduce 512 bits into 385 bits
	// m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C
	// where n[0..3] = l[4..7] (high 256 bits)
	//
	// NC0 = 0x402DA1732FC9BEBF
	// NC1 = 0x4551231950B75FC4
	// NC2 = 1
	//
	// Accumulator convention below (mirrors the C helpers):
	//   muladd(x, y):  c0 += lo(x*y); c1 += hi(x*y) + carry; c2 += carry
	//   sumadd(x):     c0 += x; c1 += carry; c2 += carry
	//   extract():     out = c0; c0 = c1; c1 = c2; c2 = 0
	//   *_fast variants omit the c2 leg (proven not to carry that far)

	// Load high limbs (l4..l7 = n0..n3)
	MOVQ 32(SP), R8     // n0 = l4
	MOVQ 40(SP), R9     // n1 = l5
	MOVQ 48(SP), R10    // n2 = l6
	MOVQ 56(SP), R11    // n3 = l7

	// Load constants; R12/R13 stay live as NC0/NC1 through the whole
	// reduction (MULQ reads them as sources only)
	MOVQ $0x402DA1732FC9BEBF, R12  // NC0
	MOVQ $0x4551231950B75FC4, R13  // NC1

	// Use stack locations 64-112 for intermediate m values
	// 160-bit accumulator: c0 (R14), c1 (R15), c2 (120(SP))

	// === m0 ===
	// c0 = l[0], c1 = 0
	// muladd_fast(n0, NC0): hi,lo = n0*NC0; c0 += lo, c1 += hi + carry
	// m0 = extract_fast() = c0; c0 = c1; c1 = 0
	MOVQ 0(SP), R14     // c0 = l0
	XORQ R15, R15       // c1 = 0
	MOVQ R8, AX
	MULQ R12            // DX:AX = n0 * NC0
	ADDQ AX, R14        // c0 += lo
	ADCQ DX, R15        // c1 += hi + carry
	MOVQ R14, 64(SP)    // m0 = c0
	MOVQ R15, R14       // c0 = c1
	XORQ R15, R15       // c1 = 0
	MOVQ $0, 120(SP)    // c2 = 0

	// === m1 ===
	// sumadd_fast(l[1])
	// muladd(n1, NC0)
	// muladd(n0, NC1)
	// m1 = extract()
	ADDQ 8(SP), R14     // c0 += l1
	ADCQ $0, R15        // c1 += carry

	MOVQ R9, AX
	MULQ R12            // DX:AX = n1 * NC0
	ADDQ AX, R14        // c0 += lo
	ADCQ DX, R15        // c1 += hi + carry
	ADCQ $0, 120(SP)    // c2 += carry

	MOVQ R8, AX
	MULQ R13            // DX:AX = n0 * NC1
	ADDQ AX, R14        // c0 += lo
	ADCQ DX, R15        // c1 += hi + carry
	ADCQ $0, 120(SP)    // c2 += carry

	MOVQ R14, 72(SP)    // m1 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2
	MOVQ $0, 120(SP)    // c2 = 0

	// === m2 ===
	// sumadd(l[2])
	// muladd(n2, NC0)
	// muladd(n1, NC1)
	// sumadd(n0)  (because NC2 = 1)
	// m2 = extract()
	ADDQ 16(SP), R14    // c0 += l2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R12            // DX:AX = n2 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R9, AX
	MULQ R13            // DX:AX = n1 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R8, R14        // c0 += n0 (n0 * NC2 = n0 * 1)
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 80(SP)    // m2 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2
	MOVQ $0, 120(SP)    // c2 = 0

	// === m3 ===
	// sumadd(l[3])
	// muladd(n3, NC0)
	// muladd(n2, NC1)
	// sumadd(n1)
	// m3 = extract()
	ADDQ 24(SP), R14    // c0 += l3
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R11, AX
	MULQ R12            // DX:AX = n3 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R13            // DX:AX = n2 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R9, R14        // c0 += n1
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 88(SP)    // m3 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2
	MOVQ $0, 120(SP)    // c2 = 0

	// === m4 ===
	// muladd(n3, NC1)
	// sumadd(n2)
	// m4 = extract()
	MOVQ R11, AX
	MULQ R13            // DX:AX = n3 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R10, R14       // c0 += n2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 96(SP)    // m4 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2 (c2 not re-zeroed: only _fast ops follow)

	// === m5 ===
	// sumadd_fast(n3)
	// m5 = extract_fast()
	ADDQ R11, R14       // c0 += n3
	ADCQ $0, R15        // c1 += carry

	MOVQ R14, 104(SP)   // m5 = c0
	MOVQ R15, R14       // c0 = c1

	// === m6 ===
	// m6 = c0 (at most 33 bits per the reference analysis, but the full
	// 64-bit word is kept for simplicity)
	MOVQ R14, 112(SP)   // m6 = c0

	// Phase 2: Reduce 385 bits into 258 bits
	// p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C
	// m4, m5 are 64-bit, m6 is at most 33 bits

	// Load m values
	MOVQ 96(SP), R8     // m4
	MOVQ 104(SP), R9    // m5
	MOVQ 112(SP), R10   // m6

	// === p0 ===
	// c0 = m0, c1 = 0
	// muladd_fast(m4, NC0)
	// p0 = extract_fast()
	MOVQ 64(SP), R14    // c0 = m0
	XORQ R15, R15       // c1 = 0

	MOVQ R8, AX
	MULQ R12            // DX:AX = m4 * NC0
	ADDQ AX, R14
	ADCQ DX, R15

	MOVQ R14, 64(SP)    // p0 = c0 (reuse m0 location)
	MOVQ R15, R14       // c0 = c1
	XORQ R15, R15       // c1 = 0
	MOVQ $0, 120(SP)    // c2 = 0

	// === p1 ===
	// sumadd_fast(m1)
	// muladd(m5, NC0)
	// muladd(m4, NC1)
	// p1 = extract()
	ADDQ 72(SP), R14    // c0 += m1
	ADCQ $0, R15

	MOVQ R9, AX
	MULQ R12            // DX:AX = m5 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R8, AX
	MULQ R13            // DX:AX = m4 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 72(SP)    // p1 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2
	MOVQ $0, 120(SP)    // c2 = 0

	// === p2 ===
	// sumadd(m2)
	// muladd(m6, NC0)
	// muladd(m5, NC1)
	// sumadd(m4)
	// p2 = extract()
	ADDQ 80(SP), R14    // c0 += m2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R12            // DX:AX = m6 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R9, AX
	MULQ R13            // DX:AX = m5 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R8, R14        // c0 += m4
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 80(SP)    // p2 = c0
	MOVQ R15, R14       // c0 = c1
	MOVQ 120(SP), R15   // c1 = c2 (only _fast ops follow)

	// === p3 ===
	// sumadd_fast(m3)
	// muladd_fast(m6, NC1)
	// sumadd_fast(m5)
	// p3 = extract_fast()
	ADDQ 88(SP), R14    // c0 += m3
	ADCQ $0, R15

	MOVQ R10, AX
	MULQ R13            // DX:AX = m6 * NC1
	ADDQ AX, R14
	ADCQ DX, R15

	ADDQ R9, R14        // c0 += m5
	ADCQ $0, R15

	MOVQ R14, 88(SP)    // p3 = c0
	// p4 = c1 + m6 (matches the reference: after extract_fast, c0 = c1)
	ADDQ R15, R10       // p4 = c1 + m6

	// === p4 ===
	MOVQ R10, 96(SP)    // p4

	// Phase 3: Reduce 258 bits into 256 bits
	// r[0..3] = p[0..3] + p[4] * SECP256K1_N_C
	// Then check for overflow and reduce once more if needed

	// t = p0 + p4 * NC0, carried limb-by-limb through r0..r3
	MOVQ 96(SP), R11    // p4

	// r0 = (p0 + p4 * NC0) mod 2^64, carry to next
	MOVQ R11, AX
	MULQ R12            // DX:AX = p4 * NC0
	ADDQ 64(SP), AX     // AX = p0 + lo
	ADCQ $0, DX         // DX = hi + carry
	MOVQ AX, R8         // r0
	MOVQ DX, R14        // carry

	// r1 = p1 + p4 * NC1 + carry
	MOVQ R11, AX
	MULQ R13            // DX:AX = p4 * NC1
	ADDQ R14, AX        // AX += carry
	ADCQ $0, DX
	ADDQ 72(SP), AX     // AX += p1
	ADCQ $0, DX
	MOVQ AX, R9         // r1
	MOVQ DX, R14        // carry

	// r2 = p2 + p4 * NC2 + carry = p2 + p4 + carry
	// (two separate adds, so DX accumulates up to 2 carries)
	MOVQ 80(SP), AX
	ADDQ R14, AX        // AX = p2 + carry
	MOVQ $0, DX
	ADCQ $0, DX
	ADDQ R11, AX        // AX += p4 (NC2 = 1)
	ADCQ $0, DX
	MOVQ AX, R10        // r2
	MOVQ DX, R14        // carry

	// r3 = p3 + carry
	MOVQ 88(SP), AX
	ADDQ R14, AX
	SETCS R14B          // final carry out of 2^256
	MOVQ AX, R11        // r3

	// Check if we need to reduce (carry or result >= n)
	TESTB R14B, R14B
	JNZ mul_do_final_reduce

	// Lexicographic compare with n, most significant limb first
	// (immediates are the limbs of n, matching p256k1ScalarN)
	MOVQ $0xFFFFFFFFFFFFFFFF, R15
	CMPQ R11, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xFFFFFFFFFFFFFFFE, R15
	CMPQ R10, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xBAAEDCE6AF48A03B, R15
	CMPQ R9, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xBFD25E8CD0364141, R15
	CMPQ R8, R15
	JB mul_store_result

mul_do_final_reduce:
	// Subtract n by adding 2^256 - n (R12/R13 still hold NC0/NC1)
	ADDQ R12, R8        // r0 += NC0
	ADCQ R13, R9        // r1 += NC1
	ADCQ $1, R10        // r2 += NC2 = 1
	ADCQ $0, R11        // r3 += 0

mul_store_result:
	// Store result (DI reloaded defensively; it was not clobbered)
	MOVQ r+0(FP), DI
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	VZEROUPPER
	RET
 623