field_4x64_amd64.s raw

   1  //go:build amd64 && !purego
   2  
   3  #include "textflag.h"
   4  
   5  // Field multiplication for secp256k1 using 4x64-bit limbs with BMI2 instructions.
   6  // Uses MULX for flag-free multiplication.
   7  //
   8  // The field element is represented as 4 limbs of 64 bits each:
   9  //   n[0..3] where value = n[0] + n[1]*2^64 + n[2]*2^128 + n[3]*2^192
  10  //
  11  // Field prime p = 2^256 - 2^32 - 977
  12  // Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1
  13  //
  14  // func field4x64MulAsm(r, a, b *[4]uint64)
  15  TEXT ·field4x64MulAsm(SB), NOSPLIT, $0-24
  16      MOVQ r+0(FP), DI      // result pointer
  17      MOVQ a+8(FP), SI      // a pointer
  18      MOVQ b+16(FP), CX     // b pointer
  19  
  20      // Load a[0..3]
  21      MOVQ 0(SI), R8        // a0
  22      MOVQ 8(SI), R9        // a1
  23      MOVQ 16(SI), R10      // a2
  24      MOVQ 24(SI), R11      // a3
  25  
  26      // We'll compute the 512-bit product in R12:R13:R14:R15:AX:BX:BP:DX
  27      // Actually, we'll use a different approach: accumulate column by column
  28  
  29      // Column 0: a0*b0
  30      MOVQ 0(CX), DX        // b0 into DX for MULX
  31      MULXQ R8, R12, R13    // a0*b0 -> R13:R12 (hi:lo)
  32  
  33      // Column 1: a0*b1 + a1*b0
  34      MOVQ 8(CX), DX        // b1
  35      MULXQ R8, AX, BX      // a0*b1 -> BX:AX
  36      ADDQ AX, R13
  37      ADCQ $0, BX
  38  
  39      MOVQ 0(CX), DX        // b0
  40      MULXQ R9, AX, R14     // a1*b0 -> R14:AX
  41      ADDQ AX, R13
  42      ADCQ BX, R14
  43      MOVQ $0, R15
  44      ADCQ $0, R15
  45  
  46      // Column 2: a0*b2 + a1*b1 + a2*b0
  47      MOVQ 16(CX), DX       // b2
  48      MULXQ R8, AX, BX      // a0*b2 -> BX:AX
  49      ADDQ AX, R14
  50      ADCQ BX, R15
  51  
  52      MOVQ 8(CX), DX        // b1
  53      MULXQ R9, AX, BX      // a1*b1 -> BX:AX
  54      ADDQ AX, R14
  55      ADCQ BX, R15
  56      MOVQ $0, BP
  57      ADCQ $0, BP
  58  
  59      MOVQ 0(CX), DX        // b0
  60      MULXQ R10, AX, BX     // a2*b0 -> BX:AX
  61      ADDQ AX, R14
  62      ADCQ BX, R15
  63      ADCQ $0, BP
  64  
  65      // Column 3: a0*b3 + a1*b2 + a2*b1 + a3*b0
  66      // Save R12-R14 (columns 0-2), use them for column 3+
  67      MOVQ R12, 0(DI)       // Save r0
  68      MOVQ R13, 8(DI)       // Save r1
  69      MOVQ R14, 16(DI)      // Save r2
  70  
  71      // Now R12, R13, R14 are free
  72      MOVQ R15, R12         // r3 accumulator low
  73      MOVQ BP, R13          // r3 accumulator high
  74      XORQ R14, R14         // r4 accumulator
  75  
  76      MOVQ 24(CX), DX       // b3
  77      MULXQ R8, AX, BX      // a0*b3 -> BX:AX
  78      ADDQ AX, R12
  79      ADCQ BX, R13
  80      ADCQ $0, R14
  81  
  82      MOVQ 16(CX), DX       // b2
  83      MULXQ R9, AX, BX      // a1*b2 -> BX:AX
  84      ADDQ AX, R12
  85      ADCQ BX, R13
  86      ADCQ $0, R14
  87  
  88      MOVQ 8(CX), DX        // b1
  89      MULXQ R10, AX, BX     // a2*b1 -> BX:AX
  90      ADDQ AX, R12
  91      ADCQ BX, R13
  92      ADCQ $0, R14
  93  
  94      MOVQ 0(CX), DX        // b0
  95      MULXQ R11, AX, BX     // a3*b0 -> BX:AX
  96      ADDQ AX, R12
  97      ADCQ BX, R13
  98      ADCQ $0, R14
  99  
 100      MOVQ R12, 24(DI)      // Save r3
 101  
 102      // Column 4: a1*b3 + a2*b2 + a3*b1
 103      MOVQ R13, R12         // r4 accumulator low
 104      MOVQ R14, R13         // r4 accumulator high
 105      XORQ R14, R14
 106  
 107      MOVQ 24(CX), DX       // b3
 108      MULXQ R9, AX, BX      // a1*b3 -> BX:AX
 109      ADDQ AX, R12
 110      ADCQ BX, R13
 111      ADCQ $0, R14
 112  
 113      MOVQ 16(CX), DX       // b2
 114      MULXQ R10, AX, BX     // a2*b2 -> BX:AX
 115      ADDQ AX, R12
 116      ADCQ BX, R13
 117      ADCQ $0, R14
 118  
 119      MOVQ 8(CX), DX        // b1
 120      MULXQ R11, AX, BX     // a3*b1 -> BX:AX
 121      ADDQ AX, R12
 122      ADCQ BX, R13
 123      ADCQ $0, R14
 124  
 125      // r4 is in R12, carry in R13:R14
 126  
 127      // Column 5: a2*b3 + a3*b2
 128      MOVQ R13, R15         // r5 accumulator low
 129      MOVQ R14, BP          // r5 accumulator high
 130      XORQ R8, R8           // reuse R8 for r6
 131  
 132      MOVQ 24(CX), DX       // b3
 133      MULXQ R10, AX, BX     // a2*b3 -> BX:AX
 134      ADDQ AX, R15
 135      ADCQ BX, BP
 136      ADCQ $0, R8
 137  
 138      MOVQ 16(CX), DX       // b2
 139      MULXQ R11, AX, BX     // a3*b2 -> BX:AX
 140      ADDQ AX, R15
 141      ADCQ BX, BP
 142      ADCQ $0, R8
 143  
 144      // Column 6: a3*b3
 145      MOVQ BP, R9           // r6 accumulator low
 146      MOVQ R8, R10          // r6 accumulator high (will be r7)
 147  
 148      MOVQ 24(CX), DX       // b3
 149      MULXQ R11, AX, BX     // a3*b3 -> BX:AX
 150      ADDQ AX, R9
 151      ADCQ BX, R10
 152  
 153      // Now we have:
 154      // r[0..3] in memory at DI
 155      // r[4] = R12
 156      // r[5] = R15
 157      // r[6] = R9
 158      // r[7] = R10
 159  
 160      // === Reduction: r[4..7] * R where R = 0x1000003D1 ===
 161      // t[i] = r[i+4] * R, then add t to r[0..3]
 162  
 163      MOVQ $0x1000003D1, DX // R constant
 164  
 165      // t0 = r4 * R
 166      MULXQ R12, R8, R11    // r4 * R -> R11:R8 (hi:lo)
 167  
 168      // t1 = r5 * R + hi(t0)
 169      MULXQ R15, AX, BX     // r5 * R -> BX:AX
 170      ADDQ R11, AX
 171      ADCQ $0, BX
 172      MOVQ AX, R11          // t1 low
 173      MOVQ BX, R12          // t1 hi -> will be t2
 174  
 175      // t2 = r6 * R + hi(t1)
 176      MULXQ R9, AX, BX      // r6 * R -> BX:AX
 177      ADDQ R12, AX
 178      ADCQ $0, BX
 179      MOVQ AX, R12          // t2 low
 180      MOVQ BX, R13          // t2 hi -> will be t3
 181  
 182      // t3 = r7 * R + hi(t2)
 183      MULXQ R10, AX, BX     // r7 * R -> BX:AX
 184      ADDQ R13, AX
 185      ADCQ $0, BX
 186      MOVQ AX, R13          // t3 low
 187      MOVQ BX, R14          // t4 (overflow)
 188  
 189      // Add t[0..3] to r[0..3]
 190      ADDQ R8, 0(DI)        // r0 += t0
 191      ADCQ R11, 8(DI)       // r1 += t1
 192      ADCQ R12, 16(DI)      // r2 += t2
 193      ADCQ R13, 24(DI)      // r3 += t3
 194      ADCQ $0, R14          // capture final carry into t4
 195  
 196      // If t4 != 0, we need another reduction round
 197      TESTQ R14, R14
 198      JZ done
 199  
 200      // overflow * R
 201      MULXQ R14, AX, BX     // t4 * R -> BX:AX
 202      ADDQ AX, 0(DI)
 203      ADCQ BX, 8(DI)
 204      ADCQ $0, 16(DI)
 205      ADCQ $0, 24(DI)
 206      // If this still overflows, add R one more time (extremely rare)
 207      JNC done
 208      MOVQ $0x1000003D1, AX
 209      ADDQ AX, 0(DI)
 210      ADCQ $0, 8(DI)
 211      ADCQ $0, 16(DI)
 212      ADCQ $0, 24(DI)
 213  
 214  done:
 215      RET
 216  
 217  // func field4x64SqrAsm(r, a *[4]uint64)
 218  // Optimized squaring: exploits symmetry a[i]*a[j] = a[j]*a[i]
 219  // For now, inline calls to mul logic with b=a
 220  TEXT ·field4x64SqrAsm(SB), NOSPLIT, $0-16
 221      MOVQ r+0(FP), DI      // result pointer
 222      MOVQ a+8(FP), SI      // a pointer
 223      MOVQ SI, CX           // b = a (same pointer)
 224  
 225      // Load a[0..3]
 226      MOVQ 0(SI), R8        // a0
 227      MOVQ 8(SI), R9        // a1
 228      MOVQ 16(SI), R10      // a2
 229      MOVQ 24(SI), R11      // a3
 230  
 231      // Column 0: a0*a0
 232      MOVQ R8, DX           // a0 into DX for MULX
 233      MULXQ R8, R12, R13    // a0*a0 -> R13:R12 (hi:lo)
 234  
 235      // Column 1: 2*a0*a1
 236      // Need to compute: R14:R13 += 2*(BX:AX) where BX:AX = a0*a1
 237      MOVQ R9, DX           // a1
 238      MULXQ R8, AX, BX      // a0*a1 -> BX:AX
 239      XORQ R14, R14
 240      XORQ R15, R15
 241      ADDQ AX, R13          // R13 += AX, CF1
 242      ADCQ $0, R14          // R14 = CF1
 243      ADDQ AX, R13          // R13 += AX again (2*AX total), CF2
 244      ADCQ BX, R14          // R14 += BX + CF2
 245      ADCQ $0, R15          // R15 = overflow from R14
 246      ADDQ BX, R14          // R14 += BX again (2*BX total), CF3
 247      ADCQ $0, R15          // R15 += CF3
 248  
 249      // Column 2: 2*a0*a2 + a1*a1
 250      MOVQ R10, DX          // a2
 251      MULXQ R8, AX, BX      // a0*a2 -> BX:AX
 252      ADDQ AX, R14
 253      ADCQ BX, R15
 254      ADDQ AX, R14          // double it
 255      ADCQ BX, R15
 256      MOVQ $0, BP
 257      ADCQ $0, BP
 258  
 259      MOVQ R9, DX           // a1
 260      MULXQ R9, AX, BX      // a1*a1 -> BX:AX
 261      ADDQ AX, R14
 262      ADCQ BX, R15
 263      ADCQ $0, BP
 264  
 265      // Save r0, r1, r2
 266      MOVQ R12, 0(DI)
 267      MOVQ R13, 8(DI)
 268      MOVQ R14, 16(DI)
 269  
 270      // Column 3: 2*a0*a3 + 2*a1*a2
 271      MOVQ R15, R12
 272      MOVQ BP, R13
 273      XORQ R14, R14
 274  
 275      MOVQ R11, DX          // a3
 276      MULXQ R8, AX, BX      // a0*a3 -> BX:AX
 277      ADDQ AX, R12
 278      ADCQ BX, R13
 279      ADCQ $0, R14
 280      ADDQ AX, R12          // double
 281      ADCQ BX, R13
 282      ADCQ $0, R14
 283  
 284      MOVQ R10, DX          // a2
 285      MULXQ R9, AX, BX      // a1*a2 -> BX:AX
 286      ADDQ AX, R12
 287      ADCQ BX, R13
 288      ADCQ $0, R14
 289      ADDQ AX, R12          // double
 290      ADCQ BX, R13
 291      ADCQ $0, R14
 292  
 293      MOVQ R12, 24(DI)      // Save r3
 294  
 295      // Column 4: 2*a1*a3 + a2*a2
 296      MOVQ R13, R12
 297      MOVQ R14, R13
 298      XORQ R14, R14
 299  
 300      MOVQ R11, DX          // a3
 301      MULXQ R9, AX, BX      // a1*a3 -> BX:AX
 302      ADDQ AX, R12
 303      ADCQ BX, R13
 304      ADCQ $0, R14
 305      ADDQ AX, R12          // double
 306      ADCQ BX, R13
 307      ADCQ $0, R14
 308  
 309      MOVQ R10, DX          // a2
 310      MULXQ R10, AX, BX     // a2*a2 -> BX:AX
 311      ADDQ AX, R12
 312      ADCQ BX, R13
 313      ADCQ $0, R14
 314  
 315      // Column 5: 2*a2*a3
 316      MOVQ R13, R15
 317      MOVQ R14, BP
 318      XORQ R8, R8
 319  
 320      MOVQ R11, DX          // a3
 321      MULXQ R10, AX, BX     // a2*a3 -> BX:AX
 322      ADDQ AX, R15
 323      ADCQ BX, BP
 324      ADCQ $0, R8
 325      ADDQ AX, R15          // double
 326      ADCQ BX, BP
 327      ADCQ $0, R8
 328  
 329      // Column 6: a3*a3
 330      MOVQ BP, R9
 331      MOVQ R8, R10
 332  
 333      MOVQ R11, DX          // a3
 334      MULXQ R11, AX, BX     // a3*a3 -> BX:AX
 335      ADDQ AX, R9
 336      ADCQ BX, R10
 337  
 338      // Now we have:
 339      // r[0..3] in memory at DI
 340      // r[4] = R12, r[5] = R15, r[6] = R9, r[7] = R10
 341  
 342      // === Reduction: r[4..7] * R where R = 0x1000003D1 ===
 343      MOVQ $0x1000003D1, DX
 344  
 345      // t0 = r4 * R
 346      MULXQ R12, R8, R11    // r4 * R -> R11:R8
 347  
 348      // t1 = r5 * R + hi(t0)
 349      MULXQ R15, AX, BX     // r5 * R -> BX:AX
 350      ADDQ R11, AX
 351      ADCQ $0, BX
 352      MOVQ AX, R11
 353      MOVQ BX, R12
 354  
 355      // t2 = r6 * R + hi(t1)
 356      MULXQ R9, AX, BX      // r6 * R -> BX:AX
 357      ADDQ R12, AX
 358      ADCQ $0, BX
 359      MOVQ AX, R12
 360      MOVQ BX, R13
 361  
 362      // t3 = r7 * R + hi(t2)
 363      MULXQ R10, AX, BX     // r7 * R -> BX:AX
 364      ADDQ R13, AX
 365      ADCQ $0, BX
 366      MOVQ AX, R13
 367      MOVQ BX, R14
 368  
 369      // Add t[0..3] to r[0..3]
 370      ADDQ R8, 0(DI)
 371      ADCQ R11, 8(DI)
 372      ADCQ R12, 16(DI)
 373      ADCQ R13, 24(DI)
 374      ADCQ $0, R14
 375  
 376      // If t4 != 0, we need another reduction round
 377      TESTQ R14, R14
 378      JZ sqr_done
 379  
 380      // overflow * R
 381      MULXQ R14, AX, BX
 382      ADDQ AX, 0(DI)
 383      ADCQ BX, 8(DI)
 384      ADCQ $0, 16(DI)
 385      ADCQ $0, 24(DI)
 386      JNC sqr_done
 387      MOVQ $0x1000003D1, AX
 388      ADDQ AX, 0(DI)
 389      ADCQ $0, 8(DI)
 390      ADCQ $0, 16(DI)
 391      ADCQ $0, 24(DI)
 392  
 393  sqr_done:
 394      RET
 395