field_amd64.s raw

   1  //go:build amd64 && !purego
   2  
   3  #include "textflag.h"
   4  
   5  // Montgomery multiplication constants for the Gnarl 216-bit prime.
   6  // P = 0x9563a6_b6d81bb9b02e5e5d_121e79ccd682cc99_31f9791e0f9ee4f5
   7  // pPrime = -P^{-1} mod 2^64 = 0xf07d39ef3ea058a3
   8  
   9  DATA p0<>+0(SB)/8, $0x31f9791e0f9ee4f5    // P bits 0..63 (least-significant limb)
  10  GLOBL p0<>(SB), RODATA|NOPTR, $8
  11  DATA p1<>+0(SB)/8, $0x121e79ccd682cc99    // P bits 64..127
  12  GLOBL p1<>(SB), RODATA|NOPTR, $8
  13  DATA p2<>+0(SB)/8, $0xb6d81bb9b02e5e5d    // P bits 128..191
  14  GLOBL p2<>(SB), RODATA|NOPTR, $8
  15  DATA p3<>+0(SB)/8, $0x00000000009563a6    // P bits 192..215 (top limb, 24 significant bits)
  16  GLOBL p3<>(SB), RODATA|NOPTR, $8
  17  DATA pp<>+0(SB)/8, $0xf07d39ef3ea058a3    // pPrime, multiplied by t0 to obtain the reduction factor m
  18  GLOBL pp<>(SB), RODATA|NOPTR, $8
  19  
  20  // func montMul(r, a, b *fe)
  21  //
  22  // 4-limb CIOS Montgomery multiplication.
  23  // r = a * b * R^{-1} mod P where R = 2^256.
  24  //
      // Each operand is accessed as four 64-bit limbs at byte offsets 0, 8, 16
      // and 24, least-significant limb first. Every load from a and b happens
      // before the first store to r, so r may alias a or b.
      //
      // The four outer iterations (i = 0..3) are fully unrolled. Each one:
      //   1. adds a[i] * b into the 5-limb accumulator t0..t4,
      //   2. computes m = t0 * pPrime mod 2^64 and adds m * P, which makes t0
      //      zero mod 2^64,
      //   3. drops the now-zero low limb (shift right by one limb).
      // A branch-free conditional subtraction of P finishes the routine.
      //
      // The ADDQ/ADCQ/SBBQ sequences pass the carry flag between adjacent
      // instructions; do not reorder or interleave them.
      //
      // NOTE(review): the single conditional subtraction only produces a value
      // below P if the value before it is below 2P. That presumably relies on
      // callers passing a, b < P -- confirm against the Go-side callers.
      //
      // Frame: $0-24 = no locals, 24 bytes of arguments (three pointers).
      //
  25  // Register allocation:
  26  //   BX = r pointer
  27  //   SI = a pointer
  28  //   DI = b pointer
  29  //   R8,R9,R10,R11 = t0,t1,t2,t3 (accumulator)
  30  //   R12 = t4 (overflow limb)
  31  //   R13 = current a[i] or m value (top limb of t - P in the final step)
  32  //   R14 = carry limb handed from one column to the next
  33  //   AX,DX = MULQ operands/results
      //   AX,CX,DX,R13 = t - P candidate during the final subtraction
  34  TEXT ·montMul(SB), NOSPLIT, $0-24
  35  	MOVQ r+0(FP), BX
  36  	MOVQ a+8(FP), SI
  37  	MOVQ b+16(FP), DI
  38  
  39  	// Zero accumulator.
  40  	XORQ R8, R8
  41  	XORQ R9, R9
  42  	XORQ R10, R10
  43  	XORQ R11, R11
  44  	XORQ R12, R12
  45  
  46  	// ---- i = 0: t += a[0] * b ----
  47  	MOVQ 0(SI), R13       // R13 = a[0]
  48  
  49  	MOVQ R13, AX
  50  	MULQ 0(DI)            // DX:AX = a[0] * b[0]
  51  	ADDQ AX, R8           // t0 += lo
  52  	ADCQ $0, DX           // fold the carry out of t0 into hi
  53  	MOVQ DX, R14          // R14 = carry
  54  
  55  	MOVQ R13, AX
  56  	MULQ 8(DI)            // DX:AX = a[0] * b[1]
  57  	ADDQ R14, R9          // t1 += prev carry
  58  	ADCQ $0, DX
  59  	ADDQ AX, R9           // t1 += lo
  60  	ADCQ $0, DX
  61  	MOVQ DX, R14          // R14 = carry into t2
  62  
  63  	MOVQ R13, AX
  64  	MULQ 16(DI)           // DX:AX = a[0] * b[2]
  65  	ADDQ R14, R10         // t2 += prev carry
  66  	ADCQ $0, DX
  67  	ADDQ AX, R10          // t2 += lo
  68  	ADCQ $0, DX
  69  	MOVQ DX, R14          // R14 = carry into t3
  70  
  71  	MOVQ R13, AX
  72  	MULQ 24(DI)           // DX:AX = a[0] * b[3]
  73  	ADDQ R14, R11         // t3 += prev carry
  74  	ADCQ $0, DX
  75  	ADDQ AX, R11          // t3 += lo
  76  	ADCQ DX, R12          // t4 += hi + CF
  77  
  78  	// Montgomery reduction: m = t0 * pPrime; t += m * P; shift right.
  79  	MOVQ R8, AX
  80  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
  81  	MOVQ AX, R13          // R13 = m (only low 64 bits matter)
  82  
  83  	MOVQ R13, AX
  84  	MULQ p0<>(SB)         // DX:AX = m * P[0]
  85  	ADDQ AX, R8           // t0 += lo (cancels to 0 mod 2^64)
  86  	ADCQ $0, DX
  87  	MOVQ DX, R14          // R14 = carry
  88  
  89  	MOVQ R13, AX
  90  	MULQ p1<>(SB)         // DX:AX = m * P[1]
  91  	ADDQ R14, R9
  92  	ADCQ $0, DX
  93  	ADDQ AX, R9
  94  	ADCQ $0, DX
  95  	MOVQ DX, R14
  96  
  97  	MOVQ R13, AX
  98  	MULQ p2<>(SB)         // DX:AX = m * P[2]
  99  	ADDQ R14, R10
 100  	ADCQ $0, DX
 101  	ADDQ AX, R10
 102  	ADCQ $0, DX
 103  	MOVQ DX, R14
 104  
 105  	MOVQ R13, AX
 106  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 107  	ADDQ R14, R11
 108  	ADCQ $0, DX
 109  	ADDQ AX, R11
 110  	ADCQ DX, R12          // t4 += hi + CF
 111  
 112  	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 113  	MOVQ R9, R8
 114  	MOVQ R10, R9
 115  	MOVQ R11, R10
 116  	MOVQ R12, R11
 117  	XORQ R12, R12
 118  
 119  	// ---- i = 1: t += a[1] * b ----
 120  	MOVQ 8(SI), R13       // R13 = a[1]
 121  
 122  	MOVQ R13, AX
 123  	MULQ 0(DI)            // DX:AX = a[1] * b[0]
 124  	ADDQ AX, R8
 125  	ADCQ $0, DX
 126  	MOVQ DX, R14
 127  
 128  	MOVQ R13, AX
 129  	MULQ 8(DI)            // DX:AX = a[1] * b[1]
 130  	ADDQ R14, R9
 131  	ADCQ $0, DX
 132  	ADDQ AX, R9
 133  	ADCQ $0, DX
 134  	MOVQ DX, R14
 135  
 136  	MOVQ R13, AX
 137  	MULQ 16(DI)           // DX:AX = a[1] * b[2]
 138  	ADDQ R14, R10
 139  	ADCQ $0, DX
 140  	ADDQ AX, R10
 141  	ADCQ $0, DX
 142  	MOVQ DX, R14
 143  
 144  	MOVQ R13, AX
 145  	MULQ 24(DI)           // DX:AX = a[1] * b[3]
 146  	ADDQ R14, R11
 147  	ADCQ $0, DX
 148  	ADDQ AX, R11
 149  	ADCQ DX, R12          // t4 += hi + CF
 150  
 151  	// Montgomery reduction i=1
 152  	MOVQ R8, AX
 153  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 154  	MOVQ AX, R13          // R13 = m
 155  
 156  	MOVQ R13, AX
 157  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 158  	ADDQ AX, R8
 159  	ADCQ $0, DX
 160  	MOVQ DX, R14
 161  
 162  	MOVQ R13, AX
 163  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 164  	ADDQ R14, R9
 165  	ADCQ $0, DX
 166  	ADDQ AX, R9
 167  	ADCQ $0, DX
 168  	MOVQ DX, R14
 169  
 170  	MOVQ R13, AX
 171  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 172  	ADDQ R14, R10
 173  	ADCQ $0, DX
 174  	ADDQ AX, R10
 175  	ADCQ $0, DX
 176  	MOVQ DX, R14
 177  
 178  	MOVQ R13, AX
 179  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 180  	ADDQ R14, R11
 181  	ADCQ $0, DX
 182  	ADDQ AX, R11
 183  	ADCQ DX, R12          // t4 += hi + CF
 184  
      	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 185  	MOVQ R9, R8
 186  	MOVQ R10, R9
 187  	MOVQ R11, R10
 188  	MOVQ R12, R11
 189  	XORQ R12, R12
 190  
 191  	// ---- i = 2: t += a[2] * b ----
 192  	MOVQ 16(SI), R13      // R13 = a[2]
 193  
 194  	MOVQ R13, AX
 195  	MULQ 0(DI)            // DX:AX = a[2] * b[0]
 196  	ADDQ AX, R8
 197  	ADCQ $0, DX
 198  	MOVQ DX, R14
 199  
 200  	MOVQ R13, AX
 201  	MULQ 8(DI)            // DX:AX = a[2] * b[1]
 202  	ADDQ R14, R9
 203  	ADCQ $0, DX
 204  	ADDQ AX, R9
 205  	ADCQ $0, DX
 206  	MOVQ DX, R14
 207  
 208  	MOVQ R13, AX
 209  	MULQ 16(DI)           // DX:AX = a[2] * b[2]
 210  	ADDQ R14, R10
 211  	ADCQ $0, DX
 212  	ADDQ AX, R10
 213  	ADCQ $0, DX
 214  	MOVQ DX, R14
 215  
 216  	MOVQ R13, AX
 217  	MULQ 24(DI)           // DX:AX = a[2] * b[3]
 218  	ADDQ R14, R11
 219  	ADCQ $0, DX
 220  	ADDQ AX, R11
 221  	ADCQ DX, R12          // t4 += hi + CF
 222  
 223  	// Montgomery reduction i=2
 224  	MOVQ R8, AX
 225  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 226  	MOVQ AX, R13          // R13 = m
 227  
 228  	MOVQ R13, AX
 229  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 230  	ADDQ AX, R8
 231  	ADCQ $0, DX
 232  	MOVQ DX, R14
 233  
 234  	MOVQ R13, AX
 235  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 236  	ADDQ R14, R9
 237  	ADCQ $0, DX
 238  	ADDQ AX, R9
 239  	ADCQ $0, DX
 240  	MOVQ DX, R14
 241  
 242  	MOVQ R13, AX
 243  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 244  	ADDQ R14, R10
 245  	ADCQ $0, DX
 246  	ADDQ AX, R10
 247  	ADCQ $0, DX
 248  	MOVQ DX, R14
 249  
 250  	MOVQ R13, AX
 251  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 252  	ADDQ R14, R11
 253  	ADCQ $0, DX
 254  	ADDQ AX, R11
 255  	ADCQ DX, R12          // t4 += hi + CF
 256  
      	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 257  	MOVQ R9, R8
 258  	MOVQ R10, R9
 259  	MOVQ R11, R10
 260  	MOVQ R12, R11
 261  	XORQ R12, R12
 262  
 263  	// ---- i = 3: t += a[3] * b ----
 264  	MOVQ 24(SI), R13      // R13 = a[3]
 265  
 266  	MOVQ R13, AX
 267  	MULQ 0(DI)            // DX:AX = a[3] * b[0]
 268  	ADDQ AX, R8
 269  	ADCQ $0, DX
 270  	MOVQ DX, R14
 271  
 272  	MOVQ R13, AX
 273  	MULQ 8(DI)            // DX:AX = a[3] * b[1]
 274  	ADDQ R14, R9
 275  	ADCQ $0, DX
 276  	ADDQ AX, R9
 277  	ADCQ $0, DX
 278  	MOVQ DX, R14
 279  
 280  	MOVQ R13, AX
 281  	MULQ 16(DI)           // DX:AX = a[3] * b[2]
 282  	ADDQ R14, R10
 283  	ADCQ $0, DX
 284  	ADDQ AX, R10
 285  	ADCQ $0, DX
 286  	MOVQ DX, R14
 287  
 288  	MOVQ R13, AX
 289  	MULQ 24(DI)           // DX:AX = a[3] * b[3] (last read of b)
 290  	ADDQ R14, R11
 291  	ADCQ $0, DX
 292  	ADDQ AX, R11
 293  	ADCQ DX, R12          // t4 += hi + CF
 294  
 295  	// Montgomery reduction i=3
 296  	MOVQ R8, AX
 297  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 298  	MOVQ AX, R13          // R13 = m
 299  
 300  	MOVQ R13, AX
 301  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 302  	ADDQ AX, R8
 303  	ADCQ $0, DX
 304  	MOVQ DX, R14
 305  
 306  	MOVQ R13, AX
 307  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 308  	ADDQ R14, R9
 309  	ADCQ $0, DX
 310  	ADDQ AX, R9
 311  	ADCQ $0, DX
 312  	MOVQ DX, R14
 313  
 314  	MOVQ R13, AX
 315  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 316  	ADDQ R14, R10
 317  	ADCQ $0, DX
 318  	ADDQ AX, R10
 319  	ADCQ $0, DX
 320  	MOVQ DX, R14
 321  
 322  	MOVQ R13, AX
 323  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 324  	ADDQ R14, R11
 325  	ADCQ $0, DX
 326  	ADDQ AX, R11
 327  	ADCQ DX, R12          // t4 += hi + CF
 328  
      	// Final shift: t0=t1, t1=t2, t2=t3, t3=t4. R12 is not cleared here
      	// because it is not read again.
 329  	MOVQ R9, R8
 330  	MOVQ R10, R9
 331  	MOVQ R11, R10
 332  	MOVQ R12, R11
 333  
 334  	// Conditional subtraction: if t >= P, t -= P.
      	// AX,CX,DX,R13 receive t - P while R8..R11 keep t for the select below.
 335  	MOVQ R8, AX
 336  	MOVQ R9, CX
 337  	MOVQ R10, DX
 338  	MOVQ R11, R13
 339  
 340  	SUBQ p0<>(SB), AX
 341  	SBBQ p1<>(SB), CX
 342  	SBBQ p2<>(SB), DX
 343  	SBBQ p3<>(SB), R13    // CF = 1 here iff t < P
 344  
 345  	// If borrow (CF=1), keep original t; else use t-P.
      	// The conditional moves select without branching.
 346  	CMOVQCS R8, AX
 347  	CMOVQCS R9, CX
 348  	CMOVQCS R10, DX
 349  	CMOVQCS R11, R13
 350  
      	// Store the result limbs, least-significant first.
 351  	MOVQ AX, 0(BX)
 352  	MOVQ CX, 8(BX)
 353  	MOVQ DX, 16(BX)
 354  	MOVQ R13, 24(BX)
 355  	RET
 356  
 357  // func montSquare(r, a *fe)
 358  //
 359  // Computes r = a^2 * R^{-1} mod P. Inlined CIOS with b = a.
      //
      // The body follows the same unrolled sequence as montMul with DI set to
      // SI, so each of the four iterations performs four full 64x64 multiplies
      // of a[i] by a[0..3], a reduction by m * P, and a one-limb shift. No
      // squaring-specific shortcut (such as reusing cross products) is taken.
      // The four iterations each drop one 64-bit limb, hence R = 2^256.
      //
      // a is read as four 64-bit limbs at offsets 0, 8, 16, 24; all reads
      // happen before r is written, so r may alias a.
      //
      // Registers: BX = r, SI = DI = a, R8..R11 = t0..t3, R12 = t4,
      // R13 = a[i] or m, R14 = carry limb, AX/DX = MULQ operands/results,
      // AX,CX,DX,R13 = t - P candidate in the final step.
      //
      // The ADDQ/ADCQ/SBBQ sequences pass the carry flag between adjacent
      // instructions; do not reorder or interleave them.
      //
      // NOTE(review): as with montMul, the single conditional subtraction
      // presumably relies on the caller passing a < P -- confirm.
 360  TEXT ·montSquare(SB), NOSPLIT, $0-16
 361  	MOVQ r+0(FP), BX
 362  	MOVQ a+8(FP), SI
 363  	MOVQ SI, DI           // b = a
 364  
      	// Zero the 5-limb accumulator t0..t4.
 365  	XORQ R8, R8
 366  	XORQ R9, R9
 367  	XORQ R10, R10
 368  	XORQ R11, R11
 369  	XORQ R12, R12
 370  
 371  	// ---- i = 0 ----
      	// t += a[0] * a
 372  	MOVQ 0(SI), R13       // R13 = a[0]
 373  	MOVQ R13, AX
 374  	MULQ 0(DI)            // DX:AX = a[0] * a[0]
 375  	ADDQ AX, R8           // t0 += lo
 376  	ADCQ $0, DX
 377  	MOVQ DX, R14          // R14 = carry into t1
 378  	MOVQ R13, AX
 379  	MULQ 8(DI)            // DX:AX = a[0] * a[1]
 380  	ADDQ R14, R9          // t1 += prev carry
 381  	ADCQ $0, DX
 382  	ADDQ AX, R9           // t1 += lo
 383  	ADCQ $0, DX
 384  	MOVQ DX, R14
 385  	MOVQ R13, AX
 386  	MULQ 16(DI)           // DX:AX = a[0] * a[2]
 387  	ADDQ R14, R10
 388  	ADCQ $0, DX
 389  	ADDQ AX, R10
 390  	ADCQ $0, DX
 391  	MOVQ DX, R14
 392  	MOVQ R13, AX
 393  	MULQ 24(DI)           // DX:AX = a[0] * a[3]
 394  	ADDQ R14, R11
 395  	ADCQ $0, DX
 396  	ADDQ AX, R11
 397  	ADCQ DX, R12          // t4 += hi + CF
      	// Reduction: m = t0 * pPrime mod 2^64; t += m * P.
 398  	MOVQ R8, AX
 399  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 400  	MOVQ AX, R13          // R13 = m (low 64 bits only)
 401  	MOVQ R13, AX
 402  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 403  	ADDQ AX, R8           // t0 becomes 0 mod 2^64
 404  	ADCQ $0, DX
 405  	MOVQ DX, R14
 406  	MOVQ R13, AX
 407  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 408  	ADDQ R14, R9
 409  	ADCQ $0, DX
 410  	ADDQ AX, R9
 411  	ADCQ $0, DX
 412  	MOVQ DX, R14
 413  	MOVQ R13, AX
 414  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 415  	ADDQ R14, R10
 416  	ADCQ $0, DX
 417  	ADDQ AX, R10
 418  	ADCQ $0, DX
 419  	MOVQ DX, R14
 420  	MOVQ R13, AX
 421  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 422  	ADDQ R14, R11
 423  	ADCQ $0, DX
 424  	ADDQ AX, R11
 425  	ADCQ DX, R12          // t4 += hi + CF
      	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 426  	MOVQ R9, R8
 427  	MOVQ R10, R9
 428  	MOVQ R11, R10
 429  	MOVQ R12, R11
 430  	XORQ R12, R12
 431  
 432  	// ---- i = 1 ----
      	// t += a[1] * a
 433  	MOVQ 8(SI), R13       // R13 = a[1]
 434  	MOVQ R13, AX
 435  	MULQ 0(DI)            // DX:AX = a[1] * a[0]
 436  	ADDQ AX, R8
 437  	ADCQ $0, DX
 438  	MOVQ DX, R14
 439  	MOVQ R13, AX
 440  	MULQ 8(DI)            // DX:AX = a[1] * a[1]
 441  	ADDQ R14, R9
 442  	ADCQ $0, DX
 443  	ADDQ AX, R9
 444  	ADCQ $0, DX
 445  	MOVQ DX, R14
 446  	MOVQ R13, AX
 447  	MULQ 16(DI)           // DX:AX = a[1] * a[2]
 448  	ADDQ R14, R10
 449  	ADCQ $0, DX
 450  	ADDQ AX, R10
 451  	ADCQ $0, DX
 452  	MOVQ DX, R14
 453  	MOVQ R13, AX
 454  	MULQ 24(DI)           // DX:AX = a[1] * a[3]
 455  	ADDQ R14, R11
 456  	ADCQ $0, DX
 457  	ADDQ AX, R11
 458  	ADCQ DX, R12          // t4 += hi + CF
      	// Reduction: m = t0 * pPrime mod 2^64; t += m * P.
 459  	MOVQ R8, AX
 460  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 461  	MOVQ AX, R13          // R13 = m
 462  	MOVQ R13, AX
 463  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 464  	ADDQ AX, R8
 465  	ADCQ $0, DX
 466  	MOVQ DX, R14
 467  	MOVQ R13, AX
 468  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 469  	ADDQ R14, R9
 470  	ADCQ $0, DX
 471  	ADDQ AX, R9
 472  	ADCQ $0, DX
 473  	MOVQ DX, R14
 474  	MOVQ R13, AX
 475  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 476  	ADDQ R14, R10
 477  	ADCQ $0, DX
 478  	ADDQ AX, R10
 479  	ADCQ $0, DX
 480  	MOVQ DX, R14
 481  	MOVQ R13, AX
 482  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 483  	ADDQ R14, R11
 484  	ADCQ $0, DX
 485  	ADDQ AX, R11
 486  	ADCQ DX, R12          // t4 += hi + CF
      	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 487  	MOVQ R9, R8
 488  	MOVQ R10, R9
 489  	MOVQ R11, R10
 490  	MOVQ R12, R11
 491  	XORQ R12, R12
 492  
 493  	// ---- i = 2 ----
      	// t += a[2] * a
 494  	MOVQ 16(SI), R13      // R13 = a[2]
 495  	MOVQ R13, AX
 496  	MULQ 0(DI)            // DX:AX = a[2] * a[0]
 497  	ADDQ AX, R8
 498  	ADCQ $0, DX
 499  	MOVQ DX, R14
 500  	MOVQ R13, AX
 501  	MULQ 8(DI)            // DX:AX = a[2] * a[1]
 502  	ADDQ R14, R9
 503  	ADCQ $0, DX
 504  	ADDQ AX, R9
 505  	ADCQ $0, DX
 506  	MOVQ DX, R14
 507  	MOVQ R13, AX
 508  	MULQ 16(DI)           // DX:AX = a[2] * a[2]
 509  	ADDQ R14, R10
 510  	ADCQ $0, DX
 511  	ADDQ AX, R10
 512  	ADCQ $0, DX
 513  	MOVQ DX, R14
 514  	MOVQ R13, AX
 515  	MULQ 24(DI)           // DX:AX = a[2] * a[3]
 516  	ADDQ R14, R11
 517  	ADCQ $0, DX
 518  	ADDQ AX, R11
 519  	ADCQ DX, R12          // t4 += hi + CF
      	// Reduction: m = t0 * pPrime mod 2^64; t += m * P.
 520  	MOVQ R8, AX
 521  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 522  	MOVQ AX, R13          // R13 = m
 523  	MOVQ R13, AX
 524  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 525  	ADDQ AX, R8
 526  	ADCQ $0, DX
 527  	MOVQ DX, R14
 528  	MOVQ R13, AX
 529  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 530  	ADDQ R14, R9
 531  	ADCQ $0, DX
 532  	ADDQ AX, R9
 533  	ADCQ $0, DX
 534  	MOVQ DX, R14
 535  	MOVQ R13, AX
 536  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 537  	ADDQ R14, R10
 538  	ADCQ $0, DX
 539  	ADDQ AX, R10
 540  	ADCQ $0, DX
 541  	MOVQ DX, R14
 542  	MOVQ R13, AX
 543  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 544  	ADDQ R14, R11
 545  	ADCQ $0, DX
 546  	ADDQ AX, R11
 547  	ADCQ DX, R12          // t4 += hi + CF
      	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
 548  	MOVQ R9, R8
 549  	MOVQ R10, R9
 550  	MOVQ R11, R10
 551  	MOVQ R12, R11
 552  	XORQ R12, R12
 553  
 554  	// ---- i = 3 ----
      	// t += a[3] * a
 555  	MOVQ 24(SI), R13      // R13 = a[3]
 556  	MOVQ R13, AX
 557  	MULQ 0(DI)            // DX:AX = a[3] * a[0]
 558  	ADDQ AX, R8
 559  	ADCQ $0, DX
 560  	MOVQ DX, R14
 561  	MOVQ R13, AX
 562  	MULQ 8(DI)            // DX:AX = a[3] * a[1]
 563  	ADDQ R14, R9
 564  	ADCQ $0, DX
 565  	ADDQ AX, R9
 566  	ADCQ $0, DX
 567  	MOVQ DX, R14
 568  	MOVQ R13, AX
 569  	MULQ 16(DI)           // DX:AX = a[3] * a[2]
 570  	ADDQ R14, R10
 571  	ADCQ $0, DX
 572  	ADDQ AX, R10
 573  	ADCQ $0, DX
 574  	MOVQ DX, R14
 575  	MOVQ R13, AX
 576  	MULQ 24(DI)           // DX:AX = a[3] * a[3] (last read of a)
 577  	ADDQ R14, R11
 578  	ADCQ $0, DX
 579  	ADDQ AX, R11
 580  	ADCQ DX, R12          // t4 += hi + CF
      	// Reduction: m = t0 * pPrime mod 2^64; t += m * P.
 581  	MOVQ R8, AX
 582  	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
 583  	MOVQ AX, R13          // R13 = m
 584  	MOVQ R13, AX
 585  	MULQ p0<>(SB)         // DX:AX = m * P[0]
 586  	ADDQ AX, R8
 587  	ADCQ $0, DX
 588  	MOVQ DX, R14
 589  	MOVQ R13, AX
 590  	MULQ p1<>(SB)         // DX:AX = m * P[1]
 591  	ADDQ R14, R9
 592  	ADCQ $0, DX
 593  	ADDQ AX, R9
 594  	ADCQ $0, DX
 595  	MOVQ DX, R14
 596  	MOVQ R13, AX
 597  	MULQ p2<>(SB)         // DX:AX = m * P[2]
 598  	ADDQ R14, R10
 599  	ADCQ $0, DX
 600  	ADDQ AX, R10
 601  	ADCQ $0, DX
 602  	MOVQ DX, R14
 603  	MOVQ R13, AX
 604  	MULQ p3<>(SB)         // DX:AX = m * P[3]
 605  	ADDQ R14, R11
 606  	ADCQ $0, DX
 607  	ADDQ AX, R11
 608  	ADCQ DX, R12          // t4 += hi + CF
      	// Final shift: t0=t1, t1=t2, t2=t3, t3=t4. R12 is not cleared here
      	// because it is not read again.
 609  	MOVQ R9, R8
 610  	MOVQ R10, R9
 611  	MOVQ R11, R10
 612  	MOVQ R12, R11
 613  
 614  	// Conditional subtraction.
      	// Compute t - P into AX,CX,DX,R13 while R8..R11 keep t.
 615  	MOVQ R8, AX
 616  	MOVQ R9, CX
 617  	MOVQ R10, DX
 618  	MOVQ R11, R13
 619  	SUBQ p0<>(SB), AX
 620  	SBBQ p1<>(SB), CX
 621  	SBBQ p2<>(SB), DX
 622  	SBBQ p3<>(SB), R13    // CF = 1 here iff t < P
      	// On borrow keep t, otherwise keep t - P; selected without branching.
 623  	CMOVQCS R8, AX
 624  	CMOVQCS R9, CX
 625  	CMOVQCS R10, DX
 626  	CMOVQCS R11, R13
      	// Store the result limbs, least-significant first.
 627  	MOVQ AX, 0(BX)
 628  	MOVQ CX, 8(BX)
 629  	MOVQ DX, 16(BX)
 630  	MOVQ R13, 24(BX)
 631  	RET
 632