p256_asm_amd64.s raw

   1  // Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT.
   2  
   3  //go:build !purego
   4  
   5  #include "textflag.h"
   6  
   7  // func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
   8  // Requires: SSE2
      //
      // Branch-free select over six 16-byte lanes (96 bytes in total): res receives
      // the bytes at a when the low 32 bits of cond are non-zero, and the bytes at b
      // when they are zero. Every lane of a and b is loaded before res is written.
   9  TEXT ·p256MovCond(SB), NOSPLIT, $0-32
  10  	MOVQ    cond+24(FP), X12
  11  	MOVQ    a+8(FP), SI
  12  	MOVQ    b+16(FP), CX
  13  	MOVQ    res+0(FP), DI
      	// X12 = mask: every 32-bit lane is all ones when cond == 0, else all zeros
  14  	PSHUFD  $0x00, X12, X12
  15  	PXOR    X13, X13
  16  	PCMPEQL X13, X12
      	// X0..X5 = a[i] with the masked bits cleared (PANDN: dst = ^dst & src)
  17  	MOVOU   (SI), X6
  18  	MOVOU   16(SI), X7
  19  	MOVOU   32(SI), X8
  20  	MOVOU   48(SI), X9
  21  	MOVOU   64(SI), X10
  22  	MOVOU   80(SI), X11
  23  	MOVOU   X12, X0
  24  	MOVOU   X12, X1
  25  	MOVOU   X12, X2
  26  	MOVOU   X12, X3
  27  	MOVOU   X12, X4
  28  	MOVOU   X12, X5
  29  	PANDN   X6, X0
  30  	PANDN   X7, X1
  31  	PANDN   X8, X2
  32  	PANDN   X9, X3
  33  	PANDN   X10, X4
  34  	PANDN   X11, X5
      	// Lane by lane: X0..X5 ^= b[i] & mask
  35  	MOVOU   (CX), X6
  36  	PAND    X12, X6
  37  	PXOR    X6, X0
  38  	MOVOU   16(CX), X7
  39  	PAND    X12, X7
  40  	PXOR    X7, X1
  41  	MOVOU   32(CX), X8
  42  	PAND    X12, X8
  43  	PXOR    X8, X2
  44  	MOVOU   48(CX), X9
  45  	PAND    X12, X9
  46  	PXOR    X9, X3
  47  	MOVOU   64(CX), X10
  48  	PAND    X12, X10
  49  	PXOR    X10, X4
  50  	MOVOU   80(CX), X11
  51  	PAND    X12, X11
  52  	PXOR    X11, X5
      	// Write the selected 96 bytes
  53  	MOVOU   X0, (DI)
  54  	MOVOU   X1, 16(DI)
  55  	MOVOU   X2, 32(DI)
  56  	MOVOU   X3, 48(DI)
  57  	MOVOU   X4, 64(DI)
  58  	MOVOU   X5, 80(DI)
  59  	RET
  60  
  61  // func p256NegCond(val *p256Element, cond int)
  62  // Requires: CMOV
      //
      // Rewrites the four 64-bit limbs at val in place. With cond == 0 the loaded
      // limbs are stored back unchanged; otherwise p256 - val is stored, where p256
      // is the constant {-1, p256const0, 0, p256const1}. No reduction follows the
      // subtraction, so an all-zero val with cond != 0 yields p256 itself.
  63  TEXT ·p256NegCond(SB), NOSPLIT, $0-16
  64  	MOVQ val+0(FP), DI
  65  
  66  	// Fetch the current limbs; they double as the cond == 0 result
  67  	MOVQ (DI), R13
  68  	MOVQ 8(DI), SI
  69  	MOVQ 16(DI), CX
  70  	MOVQ 24(DI), R15
  71  
  72  	// Seed the accumulator R8..R11 with p256
  73  	MOVQ $-1, R8
  74  	MOVQ p256const0<>+0(SB), R9
  75  	MOVQ $+0, R10
  76  	MOVQ p256const1<>+0(SB), R11
  77  
  78  	// Tentatively form p256 - val
  79  	SUBQ R13, R8
  80  	SBBQ SI, R9
  81  	SBBQ CX, R10
  82  	SBBQ R15, R11
  83  
  84  	// cond == 0 discards the difference in favour of the loaded limbs
  85  	MOVQ    cond+8(FP), R14
  86  	TESTQ   R14, R14
  87  	CMOVQEQ R13, R8
  88  	CMOVQEQ SI, R9
  89  	CMOVQEQ CX, R10
  90  	CMOVQEQ R15, R11
  91  
  92  	// Write the selected limbs back
  93  	MOVQ R8, (DI)
  94  	MOVQ R9, 8(DI)
  95  	MOVQ R10, 16(DI)
  96  	MOVQ R11, 24(DI)
  97  	RET
  98  
  99  DATA p256const0<>+0(SB)/8, $0x00000000ffffffff // limb 1 of p256 (limb 0 is supplied as the immediate -1)
 100  GLOBL p256const0<>(SB), RODATA, $8
 101  
 102  DATA p256const1<>+0(SB)/8, $0xffffffff00000001 // limb 3 of p256 (limb 2 is supplied as the immediate 0)
 103  GLOBL p256const1<>(SB), RODATA, $8
 104  
 105  // func p256Sqr(res *p256Element, in *p256Element, n int)
 106  // Requires: CMOV
      //
      // Runs n squaring passes. Each pass reads four 64-bit limbs through SI (in on
      // the first pass, res afterwards), builds the eight-limb square, runs four
      // reduction steps that fold the low limbs upwards, adds the high half,
      // conditionally subtracts p256 and stores four limbs at res.
      //
      // Each reduction step folds the lowest live limb v into the limbs above it as
      // (v << 32), (v >> 32) and the 128-bit product v * p256const1, then reuses
      // the register that held v for the step's top limb.
      //
      // The counter is tested after the body (DECQ/JNE), so the body always runs at
      // least once and n == 0 would wrap around.
      // NOTE(review): confirm callers never pass n == 0.
 107  TEXT ·p256Sqr(SB), NOSPLIT, $0-24
 108  	MOVQ res+0(FP), DI
 109  	MOVQ in+8(FP), SI
 110  	MOVQ n+16(FP), BX
 111  
 112  sqrLoop:
 113  	// y[1:] * y[0]
 114  	MOVQ (SI), R14
 115  	MOVQ 8(SI), AX
 116  	MULQ R14
 117  	MOVQ AX, R9
 118  	MOVQ DX, R10
 119  	MOVQ 16(SI), AX
 120  	MULQ R14
 121  	ADDQ AX, R10
 122  	ADCQ $0x00, DX
 123  	MOVQ DX, R11
 124  	MOVQ 24(SI), AX
 125  	MULQ R14
 126  	ADDQ AX, R11
 127  	ADCQ $0x00, DX
 128  	MOVQ DX, R12
 129  
 130  	// y[2:] * y[1]
 131  	MOVQ 8(SI), R14
 132  	MOVQ 16(SI), AX
 133  	MULQ R14
 134  	ADDQ AX, R11
 135  	ADCQ $0x00, DX
 136  	MOVQ DX, R15
 137  	MOVQ 24(SI), AX
 138  	MULQ R14
 139  	ADDQ R15, R12
 140  	ADCQ $0x00, DX
 141  	ADDQ AX, R12
 142  	ADCQ $0x00, DX
 143  	MOVQ DX, R13
 144  
 145  	// y[3] * y[2]
 146  	MOVQ 16(SI), R14
 147  	MOVQ 24(SI), AX
 148  	MULQ R14
 149  	ADDQ AX, R13
 150  	ADCQ $0x00, DX
 151  	MOVQ DX, CX
 152  	XORQ R15, R15
 153  
 154  	// *2 (the cross products above each occur twice in the square; R15 takes the carry out)
 155  	ADDQ R9, R9
 156  	ADCQ R10, R10
 157  	ADCQ R11, R11
 158  	ADCQ R12, R12
 159  	ADCQ R13, R13
 160  	ADCQ CX, CX
 161  	ADCQ $0x00, R15
 162  
 163  	// Missing products (the diagonal terms y[i] * y[i])
 164  	MOVQ (SI), AX
 165  	MULQ AX
 166  	MOVQ AX, R8
 167  	MOVQ DX, R14
 168  	MOVQ 8(SI), AX
 169  	MULQ AX
 170  	ADDQ R14, R9
 171  	ADCQ AX, R10
 172  	ADCQ $0x00, DX
 173  	MOVQ DX, R14
 174  	MOVQ 16(SI), AX
 175  	MULQ AX
 176  	ADDQ R14, R11
 177  	ADCQ AX, R12
 178  	ADCQ $0x00, DX
 179  	MOVQ DX, R14
 180  	MOVQ 24(SI), AX
 181  	MULQ AX
 182  	ADDQ R14, R13
 183  	ADCQ AX, CX
 184  	ADCQ DX, R15
      	// The input has been fully consumed for this pass; SI now carries the top
      	// limb of the square. Square = R8, R9, R10, R11, R12, R13, CX, SI (low to high).
 185  	MOVQ R15, SI
 186  
 187  	// First reduction step
 188  	MOVQ R8, AX
 189  	MOVQ R8, R15
 190  	SHLQ $0x20, R8
 191  	MULQ p256const1<>+0(SB)
 192  	SHRQ $0x20, R15
 193  	ADDQ R8, R9
 194  	ADCQ R15, R10
 195  	ADCQ AX, R11
 196  	ADCQ $0x00, DX
 197  	MOVQ DX, R8
 198  
 199  	// Second reduction step
 200  	MOVQ R9, AX
 201  	MOVQ R9, R15
 202  	SHLQ $0x20, R9
 203  	MULQ p256const1<>+0(SB)
 204  	SHRQ $0x20, R15
 205  	ADDQ R9, R10
 206  	ADCQ R15, R11
 207  	ADCQ AX, R8
 208  	ADCQ $0x00, DX
 209  	MOVQ DX, R9
 210  
 211  	// Third reduction step
 212  	MOVQ R10, AX
 213  	MOVQ R10, R15
 214  	SHLQ $0x20, R10
 215  	MULQ p256const1<>+0(SB)
 216  	SHRQ $0x20, R15
 217  	ADDQ R10, R11
 218  	ADCQ R15, R8
 219  	ADCQ AX, R9
 220  	ADCQ $0x00, DX
 221  	MOVQ DX, R10
 222  
 223  	// Last reduction step (R14 is cleared first to collect the final carry below)
 224  	XORQ R14, R14
 225  	MOVQ R11, AX
 226  	MOVQ R11, R15
 227  	SHLQ $0x20, R11
 228  	MULQ p256const1<>+0(SB)
 229  	SHRQ $0x20, R15
 230  	ADDQ R11, R8
 231  	ADCQ R15, R9
 232  	ADCQ AX, R10
 233  	ADCQ $0x00, DX
 234  	MOVQ DX, R11
 235  
 236  	// Add bits [511:256] of the sqr result
      	// The first ADCQ consumes the carry flag left by the ADCQ $0x00, DX above
      	// (MOVQ does not modify flags); R14 receives the carry out of the sum.
 237  	ADCQ R12, R8
 238  	ADCQ R13, R9
 239  	ADCQ CX, R10
 240  	ADCQ SI, R11
 241  	ADCQ $0x00, R14
      	// Keep an unreduced copy for the conditional restore
 242  	MOVQ R8, R12
 243  	MOVQ R9, R13
 244  	MOVQ R10, CX
 245  	MOVQ R11, R15
 246  
 247  	// Subtract p256
 248  	SUBQ    $-1, R8
 249  	SBBQ    p256const0<>+0(SB), R9
 250  	SBBQ    $0x00, R10
 251  	SBBQ    p256const1<>+0(SB), R11
 252  	SBBQ    $0x00, R14
      	// A borrow out of the five-limb subtraction restores the unreduced copy
 253  	CMOVQCS R12, R8
 254  	CMOVQCS R13, R9
 255  	CMOVQCS CX, R10
 256  	CMOVQCS R15, R11
 257  	MOVQ    R8, (DI)
 258  	MOVQ    R9, 8(DI)
 259  	MOVQ    R10, 16(DI)
 260  	MOVQ    R11, 24(DI)
      	// Any further pass squares the value just stored at res
 261  	MOVQ    DI, SI
 262  	DECQ    BX
 263  	JNE     sqrLoop
 264  	RET
 265  
 266  // func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
 267  // Requires: CMOV
      //
      // Multiplies the four limbs at in1 (x, via SI) by the four limbs at in2
      // (y, via CX) one row x * y[i] at a time. Each row is followed by a reduction
      // step that folds the lowest live limb v into the limbs above it as (v << 32),
      // (v >> 32) and v * p256const1; the register that held v is then cleared and
      // reused as the next top limb. The result is conditionally reduced by p256
      // and stored as four limbs at res.
      // NOTE(review): the shape matches a word-by-word Montgomery multiplication
      // (cf. the sibling p256FromMont) - inferred from structure, confirm.
 268  TEXT ·p256Mul(SB), NOSPLIT, $0-24
 269  	MOVQ res+0(FP), DI
 270  	MOVQ in1+8(FP), SI
 271  	MOVQ in2+16(FP), CX
 272  
 273  	// x * y[0]
 274  	MOVQ (CX), R14
 275  	MOVQ (SI), AX
 276  	MULQ R14
 277  	MOVQ AX, R8
 278  	MOVQ DX, R9
 279  	MOVQ 8(SI), AX
 280  	MULQ R14
 281  	ADDQ AX, R9
 282  	ADCQ $0x00, DX
 283  	MOVQ DX, R10
 284  	MOVQ 16(SI), AX
 285  	MULQ R14
 286  	ADDQ AX, R10
 287  	ADCQ $0x00, DX
 288  	MOVQ DX, R11
 289  	MOVQ 24(SI), AX
 290  	MULQ R14
 291  	ADDQ AX, R11
 292  	ADCQ $0x00, DX
 293  	MOVQ DX, R12
 294  	XORQ R13, R13
 295  
 296  	// First reduction step (folds R8 into R9..R13, then clears R8 for reuse)
 297  	MOVQ R8, AX
 298  	MOVQ R8, R15
 299  	SHLQ $0x20, R8
 300  	MULQ p256const1<>+0(SB)
 301  	SHRQ $0x20, R15
 302  	ADDQ R8, R9
 303  	ADCQ R15, R10
 304  	ADCQ AX, R11
 305  	ADCQ DX, R12
 306  	ADCQ $0x00, R13
 307  	XORQ R8, R8
 308  
 309  	// x * y[1]
 310  	MOVQ 8(CX), R14
 311  	MOVQ (SI), AX
 312  	MULQ R14
 313  	ADDQ AX, R9
 314  	ADCQ $0x00, DX
 315  	MOVQ DX, R15
 316  	MOVQ 8(SI), AX
 317  	MULQ R14
 318  	ADDQ R15, R10
 319  	ADCQ $0x00, DX
 320  	ADDQ AX, R10
 321  	ADCQ $0x00, DX
 322  	MOVQ DX, R15
 323  	MOVQ 16(SI), AX
 324  	MULQ R14
 325  	ADDQ R15, R11
 326  	ADCQ $0x00, DX
 327  	ADDQ AX, R11
 328  	ADCQ $0x00, DX
 329  	MOVQ DX, R15
 330  	MOVQ 24(SI), AX
 331  	MULQ R14
 332  	ADDQ R15, R12
 333  	ADCQ $0x00, DX
 334  	ADDQ AX, R12
 335  	ADCQ DX, R13
 336  	ADCQ $0x00, R8
 337  
 338  	// Second reduction step (folds R9 into R10..R8, then clears R9 for reuse)
 339  	MOVQ R9, AX
 340  	MOVQ R9, R15
 341  	SHLQ $0x20, R9
 342  	MULQ p256const1<>+0(SB)
 343  	SHRQ $0x20, R15
 344  	ADDQ R9, R10
 345  	ADCQ R15, R11
 346  	ADCQ AX, R12
 347  	ADCQ DX, R13
 348  	ADCQ $0x00, R8
 349  	XORQ R9, R9
 350  
 351  	// x * y[2]
 352  	MOVQ 16(CX), R14
 353  	MOVQ (SI), AX
 354  	MULQ R14
 355  	ADDQ AX, R10
 356  	ADCQ $0x00, DX
 357  	MOVQ DX, R15
 358  	MOVQ 8(SI), AX
 359  	MULQ R14
 360  	ADDQ R15, R11
 361  	ADCQ $0x00, DX
 362  	ADDQ AX, R11
 363  	ADCQ $0x00, DX
 364  	MOVQ DX, R15
 365  	MOVQ 16(SI), AX
 366  	MULQ R14
 367  	ADDQ R15, R12
 368  	ADCQ $0x00, DX
 369  	ADDQ AX, R12
 370  	ADCQ $0x00, DX
 371  	MOVQ DX, R15
 372  	MOVQ 24(SI), AX
 373  	MULQ R14
 374  	ADDQ R15, R13
 375  	ADCQ $0x00, DX
 376  	ADDQ AX, R13
 377  	ADCQ DX, R8
 378  	ADCQ $0x00, R9
 379  
 380  	// Third reduction step (folds R10 into R11..R9, then clears R10 for reuse)
 381  	MOVQ R10, AX
 382  	MOVQ R10, R15
 383  	SHLQ $0x20, R10
 384  	MULQ p256const1<>+0(SB)
 385  	SHRQ $0x20, R15
 386  	ADDQ R10, R11
 387  	ADCQ R15, R12
 388  	ADCQ AX, R13
 389  	ADCQ DX, R8
 390  	ADCQ $0x00, R9
 391  	XORQ R10, R10
 392  
 393  	// x * y[3]
 394  	MOVQ 24(CX), R14
 395  	MOVQ (SI), AX
 396  	MULQ R14
 397  	ADDQ AX, R11
 398  	ADCQ $0x00, DX
 399  	MOVQ DX, R15
 400  	MOVQ 8(SI), AX
 401  	MULQ R14
 402  	ADDQ R15, R12
 403  	ADCQ $0x00, DX
 404  	ADDQ AX, R12
 405  	ADCQ $0x00, DX
 406  	MOVQ DX, R15
 407  	MOVQ 16(SI), AX
 408  	MULQ R14
 409  	ADDQ R15, R13
 410  	ADCQ $0x00, DX
 411  	ADDQ AX, R13
 412  	ADCQ $0x00, DX
 413  	MOVQ DX, R15
 414  	MOVQ 24(SI), AX
 415  	MULQ R14
 416  	ADDQ R15, R8
 417  	ADCQ $0x00, DX
 418  	ADDQ AX, R8
 419  	ADCQ DX, R9
 420  	ADCQ $0x00, R10
 421  
 422  	// Last reduction step (folds R11 into R12, R13, R8, R9; R10 holds the carry limb)
 423  	MOVQ R11, AX
 424  	MOVQ R11, R15
 425  	SHLQ $0x20, R11
 426  	MULQ p256const1<>+0(SB)
 427  	SHRQ $0x20, R15
 428  	ADDQ R11, R12
 429  	ADCQ R15, R13
 430  	ADCQ AX, R8
 431  	ADCQ DX, R9
 432  	ADCQ $0x00, R10
 433  
 434  	// Copy result [255:0] (SI is reused as scratch; in1 is no longer read)
 435  	MOVQ R12, SI
 436  	MOVQ R13, R11
 437  	MOVQ R8, R14
 438  	MOVQ R9, R15
 439  
 440  	// Subtract p256
 441  	SUBQ    $-1, R12
 442  	SBBQ    p256const0<>+0(SB), R13
 443  	SBBQ    $0x00, R8
 444  	SBBQ    p256const1<>+0(SB), R9
 445  	SBBQ    $0x00, R10
      	// A borrow out of the five-limb subtraction restores the saved copy
 446  	CMOVQCS SI, R12
 447  	CMOVQCS R11, R13
 448  	CMOVQCS R14, R8
 449  	CMOVQCS R15, R9
 450  	MOVQ    R12, (DI)
 451  	MOVQ    R13, 8(DI)
 452  	MOVQ    R8, 16(DI)
 453  	MOVQ    R9, 24(DI)
 454  	RET
 455  
 456  // func p256FromMont(res *p256Element, in *p256Element)
 457  // Requires: CMOV
      //
      // Loads the four limbs at in and runs the same four reduction stages used by
      // p256Mul, without any multiplication rows. Each stage folds the lowest live
      // limb v into the limbs above it as (v << 32), (v >> 32) and v * p256const1.
      // The remaining four limbs are conditionally reduced by p256 and stored at res.
 458  TEXT ·p256FromMont(SB), NOSPLIT, $0-16
 459  	MOVQ res+0(FP), DI
 460  	MOVQ in+8(FP), SI
 461  	MOVQ (SI), R8
 462  	MOVQ 8(SI), R9
 463  	MOVQ 16(SI), R10
 464  	MOVQ 24(SI), R11
 465  	XORQ R12, R12
 466  
 467  	// Only reduce, no multiplications are needed
 468  	// First stage
 469  	MOVQ R8, AX
 470  	MOVQ R8, R15
 471  	SHLQ $0x20, R8
 472  	MULQ p256const1<>+0(SB)
 473  	SHRQ $0x20, R15
 474  	ADDQ R8, R9
 475  	ADCQ R15, R10
 476  	ADCQ AX, R11
 477  	ADCQ DX, R12
 478  	XORQ R13, R13
 479  
 480  	// Second stage
 481  	MOVQ R9, AX
 482  	MOVQ R9, R15
 483  	SHLQ $0x20, R9
 484  	MULQ p256const1<>+0(SB)
 485  	SHRQ $0x20, R15
 486  	ADDQ R9, R10
 487  	ADCQ R15, R11
 488  	ADCQ AX, R12
 489  	ADCQ DX, R13
 490  	XORQ R8, R8
 491  
 492  	// Third stage
 493  	MOVQ R10, AX
 494  	MOVQ R10, R15
 495  	SHLQ $0x20, R10
 496  	MULQ p256const1<>+0(SB)
 497  	SHRQ $0x20, R15
 498  	ADDQ R10, R11
 499  	ADCQ R15, R12
 500  	ADCQ AX, R13
 501  	ADCQ DX, R8
 502  	XORQ R9, R9
 503  
 504  	// Last stage
 505  	MOVQ    R11, AX
 506  	MOVQ    R11, R15
 507  	SHLQ    $0x20, R11
 508  	MULQ    p256const1<>+0(SB)
 509  	SHRQ    $0x20, R15
 510  	ADDQ    R11, R12
 511  	ADCQ    R15, R13
 512  	ADCQ    AX, R8
 513  	ADCQ    DX, R9
      	// Save the four result limbs, then try subtracting p256 (four limbs only;
      	// no carry limb is tracked in this function)
 514  	MOVQ    R12, SI
 515  	MOVQ    R13, R11
 516  	MOVQ    R8, R14
 517  	MOVQ    R9, R15
 518  	SUBQ    $-1, R12
 519  	SBBQ    p256const0<>+0(SB), R13
 520  	SBBQ    $0x00, R8
 521  	SBBQ    p256const1<>+0(SB), R9
      	// A borrow restores the saved copy
 522  	CMOVQCS SI, R12
 523  	CMOVQCS R11, R13
 524  	CMOVQCS R14, R8
 525  	CMOVQCS R15, R9
 526  	MOVQ    R12, (DI)
 527  	MOVQ    R13, 8(DI)
 528  	MOVQ    R8, 16(DI)
 529  	MOVQ    R9, 24(DI)
 530  	RET
 531  
 532  // func p256Select(res *P256Point, table *p256Table, idx int)
 533  // Requires: SSE2
      //
      // Scans sixteen consecutive 96-byte entries starting at table and writes to res
      // the entry whose 1-based position equals the low 32 bits of idx. Every entry
      // is read whatever idx is; if no position matches (for example idx == 0), res
      // is written with zero bytes.
 534  TEXT ·p256Select(SB), NOSPLIT, $0-24
 535  	MOVQ    res+0(FP), DX
 536  	MOVQ    table+8(FP), DI
 537  	MOVQ    idx+16(FP), AX
      	// X15 = 1 in every 32-bit lane, X14 = idx broadcast, X13 = running position
 538  	PCMPEQL X14, X14
 539  	PXOR    X15, X15
 540  	PSUBL   X14, X15
 541  	MOVL    AX, X14
 542  	PSHUFD  $0x00, X14, X14
 543  	MOVOU   X15, X13
      	// AX = number of entries left to scan; X0..X5 accumulate the selection
 544  	MOVQ    $0x00000010, AX
 545  	PXOR    X0, X0
 546  	PXOR    X1, X1
 547  	PXOR    X2, X2
 548  	PXOR    X3, X3
 549  	PXOR    X4, X4
 550  	PXOR    X5, X5
 551  
 552  loop_select:
      	// X12 = all ones when this entry's position equals idx, else zero
 553  	MOVOU   X13, X12
 554  	PCMPEQL X14, X12
 555  	PADDL   X15, X13
      	// Lane by lane: accumulator ^= entry lane & mask
 556  	MOVOU   (DI), X6
 557  	PAND    X12, X6
 558  	PXOR    X6, X0
 559  	MOVOU   16(DI), X7
 560  	PAND    X12, X7
 561  	PXOR    X7, X1
 562  	MOVOU   32(DI), X8
 563  	PAND    X12, X8
 564  	PXOR    X8, X2
 565  	MOVOU   48(DI), X9
 566  	PAND    X12, X9
 567  	PXOR    X9, X3
 568  	MOVOU   64(DI), X10
 569  	PAND    X12, X10
 570  	PXOR    X10, X4
 571  	MOVOU   80(DI), X11
 572  	PAND    X12, X11
 573  	PXOR    X11, X5
      	// Step to the next 96-byte entry
 574  	ADDQ    $0x60, DI
 575  	DECQ    AX
 576  	JNE     loop_select
 577  	MOVOU   X0, (DX)
 578  	MOVOU   X1, 16(DX)
 579  	MOVOU   X2, 32(DX)
 580  	MOVOU   X3, 48(DX)
 581  	MOVOU   X4, 64(DX)
 582  	MOVOU   X5, 80(DX)
 583  	RET
 584  
 585  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
 586  // Requires: SSE2
      //
      // Scans thirty-two consecutive 64-byte entries starting at table (two per loop
      // iteration) and writes to res the entry whose 1-based position equals the low
      // 32 bits of idx. Every entry is read whatever idx is; if no position matches
      // (for example idx == 0), res is written with zero bytes.
 587  TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
 588  	MOVQ    res+0(FP), DX
 589  	MOVQ    table+8(FP), DI
 590  	MOVQ    idx+16(FP), AX
      	// X15 = 1 in every 32-bit lane, X14 = idx broadcast, X13 = running position
 591  	PCMPEQL X14, X14
 592  	PXOR    X15, X15
 593  	PSUBL   X14, X15
 594  	MOVL    AX, X14
 595  	PSHUFD  $0x00, X14, X14
 596  	MOVOU   X15, X13
      	// AX = number of entry pairs left to scan; X0..X3 accumulate the selection
 597  	MOVQ    $0x00000010, AX
 598  	PXOR    X0, X0
 599  	PXOR    X1, X1
 600  	PXOR    X2, X2
 601  	PXOR    X3, X3
 602  
 603  loop_select_base:
      	// First entry of the pair: mask on its position, then fold in its four lanes
 604  	MOVOU   X13, X12
 605  	PCMPEQL X14, X12
 606  	PADDL   X15, X13
 607  	MOVOU   (DI), X4
 608  	PAND    X12, X4
 609  	PXOR    X4, X0
 610  	MOVOU   16(DI), X5
 611  	PAND    X12, X5
 612  	PXOR    X5, X1
 613  	MOVOU   32(DI), X6
 614  	PAND    X12, X6
 615  	PXOR    X6, X2
 616  	MOVOU   48(DI), X7
 617  	PAND    X12, X7
 618  	PXOR    X7, X3
      	// Second entry of the pair
 619  	MOVOU   X13, X12
 620  	PCMPEQL X14, X12
 621  	PADDL   X15, X13
 622  	MOVOU   64(DI), X8
 623  	PAND    X12, X8
 624  	PXOR    X8, X0
 625  	MOVOU   80(DI), X9
 626  	PAND    X12, X9
 627  	PXOR    X9, X1
 628  	MOVOU   96(DI), X10
 629  	PAND    X12, X10
 630  	PXOR    X10, X2
 631  	MOVOU   112(DI), X11
 632  	PAND    X12, X11
 633  	PXOR    X11, X3
      	// Step past both 64-byte entries
 634  	ADDQ    $0x80, DI
 635  	DECQ    AX
 636  	JNE     loop_select_base
 637  	MOVOU   X0, (DX)
 638  	MOVOU   X1, 16(DX)
 639  	MOVOU   X2, 32(DX)
 640  	MOVOU   X3, 48(DX)
 641  	RET
 642  
 643  // func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
 644  // Requires: CMOV
      //
      // Multiplies the four limbs at in1 (x, via SI) by the four limbs at in2
      // (y, via CX) one row x * y[i] at a time. Each row is followed by a reduction
      // step that computes t = (lowest live limb * p256ordK0) mod 2^64 and adds
      // t * p256ord across the accumulator. The result is conditionally reduced by
      // p256ord and stored as four limbs at res.
      //
      // After each reduction step the register that held the lowest limb is reused
      // as the new top limb without being cleared (ADCQ $0x00, Rn), so the code
      // relies on that limb having become zero.
      // NOTE(review): presumably guaranteed by the choice of p256ordK0 - confirm.
 645  TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
 646  	MOVQ res+0(FP), DI
 647  	MOVQ in1+8(FP), SI
 648  	MOVQ in2+16(FP), CX
 649  
 650  	// x * y[0]
 651  	MOVQ (CX), R14
 652  	MOVQ (SI), AX
 653  	MULQ R14
 654  	MOVQ AX, R8
 655  	MOVQ DX, R9
 656  	MOVQ 8(SI), AX
 657  	MULQ R14
 658  	ADDQ AX, R9
 659  	ADCQ $0x00, DX
 660  	MOVQ DX, R10
 661  	MOVQ 16(SI), AX
 662  	MULQ R14
 663  	ADDQ AX, R10
 664  	ADCQ $0x00, DX
 665  	MOVQ DX, R11
 666  	MOVQ 24(SI), AX
 667  	MULQ R14
 668  	ADDQ AX, R11
 669  	ADCQ $0x00, DX
 670  	MOVQ DX, R12
 671  	XORQ R13, R13
 672  
 673  	// First reduction step (R14 = t = R8 * p256ordK0, low 64 bits)
 674  	MOVQ R8, AX
 675  	MULQ p256ordK0<>+0(SB)
 676  	MOVQ AX, R14
 677  	MOVQ p256ord<>+0(SB), AX
 678  	MULQ R14
 679  	ADDQ AX, R8
 680  	ADCQ $0x00, DX
 681  	MOVQ DX, R15
 682  	MOVQ p256ord<>+8(SB), AX
 683  	MULQ R14
 684  	ADDQ R15, R9
 685  	ADCQ $0x00, DX
 686  	ADDQ AX, R9
 687  	ADCQ $0x00, DX
 688  	MOVQ DX, R15
 689  	MOVQ p256ord<>+16(SB), AX
 690  	MULQ R14
 691  	ADDQ R15, R10
 692  	ADCQ $0x00, DX
 693  	ADDQ AX, R10
 694  	ADCQ $0x00, DX
 695  	MOVQ DX, R15
 696  	MOVQ p256ord<>+24(SB), AX
 697  	MULQ R14
 698  	ADDQ R15, R11
 699  	ADCQ $0x00, DX
 700  	ADDQ AX, R11
 701  	ADCQ DX, R12
 702  	ADCQ $0x00, R13
 703  
 704  	// x * y[1]
 705  	MOVQ 8(CX), R14
 706  	MOVQ (SI), AX
 707  	MULQ R14
 708  	ADDQ AX, R9
 709  	ADCQ $0x00, DX
 710  	MOVQ DX, R15
 711  	MOVQ 8(SI), AX
 712  	MULQ R14
 713  	ADDQ R15, R10
 714  	ADCQ $0x00, DX
 715  	ADDQ AX, R10
 716  	ADCQ $0x00, DX
 717  	MOVQ DX, R15
 718  	MOVQ 16(SI), AX
 719  	MULQ R14
 720  	ADDQ R15, R11
 721  	ADCQ $0x00, DX
 722  	ADDQ AX, R11
 723  	ADCQ $0x00, DX
 724  	MOVQ DX, R15
 725  	MOVQ 24(SI), AX
 726  	MULQ R14
 727  	ADDQ R15, R12
 728  	ADCQ $0x00, DX
 729  	ADDQ AX, R12
 730  	ADCQ DX, R13
 731  	ADCQ $0x00, R8
 732  
 733  	// Second reduction step (R14 = t = R9 * p256ordK0, low 64 bits)
 734  	MOVQ R9, AX
 735  	MULQ p256ordK0<>+0(SB)
 736  	MOVQ AX, R14
 737  	MOVQ p256ord<>+0(SB), AX
 738  	MULQ R14
 739  	ADDQ AX, R9
 740  	ADCQ $0x00, DX
 741  	MOVQ DX, R15
 742  	MOVQ p256ord<>+8(SB), AX
 743  	MULQ R14
 744  	ADDQ R15, R10
 745  	ADCQ $0x00, DX
 746  	ADDQ AX, R10
 747  	ADCQ $0x00, DX
 748  	MOVQ DX, R15
 749  	MOVQ p256ord<>+16(SB), AX
 750  	MULQ R14
 751  	ADDQ R15, R11
 752  	ADCQ $0x00, DX
 753  	ADDQ AX, R11
 754  	ADCQ $0x00, DX
 755  	MOVQ DX, R15
 756  	MOVQ p256ord<>+24(SB), AX
 757  	MULQ R14
 758  	ADDQ R15, R12
 759  	ADCQ $0x00, DX
 760  	ADDQ AX, R12
 761  	ADCQ DX, R13
 762  	ADCQ $0x00, R8
 763  
 764  	// x * y[2]
 765  	MOVQ 16(CX), R14
 766  	MOVQ (SI), AX
 767  	MULQ R14
 768  	ADDQ AX, R10
 769  	ADCQ $0x00, DX
 770  	MOVQ DX, R15
 771  	MOVQ 8(SI), AX
 772  	MULQ R14
 773  	ADDQ R15, R11
 774  	ADCQ $0x00, DX
 775  	ADDQ AX, R11
 776  	ADCQ $0x00, DX
 777  	MOVQ DX, R15
 778  	MOVQ 16(SI), AX
 779  	MULQ R14
 780  	ADDQ R15, R12
 781  	ADCQ $0x00, DX
 782  	ADDQ AX, R12
 783  	ADCQ $0x00, DX
 784  	MOVQ DX, R15
 785  	MOVQ 24(SI), AX
 786  	MULQ R14
 787  	ADDQ R15, R13
 788  	ADCQ $0x00, DX
 789  	ADDQ AX, R13
 790  	ADCQ DX, R8
 791  	ADCQ $0x00, R9
 792  
 793  	// Third reduction step (R14 = t = R10 * p256ordK0, low 64 bits)
 794  	MOVQ R10, AX
 795  	MULQ p256ordK0<>+0(SB)
 796  	MOVQ AX, R14
 797  	MOVQ p256ord<>+0(SB), AX
 798  	MULQ R14
 799  	ADDQ AX, R10
 800  	ADCQ $0x00, DX
 801  	MOVQ DX, R15
 802  	MOVQ p256ord<>+8(SB), AX
 803  	MULQ R14
 804  	ADDQ R15, R11
 805  	ADCQ $0x00, DX
 806  	ADDQ AX, R11
 807  	ADCQ $0x00, DX
 808  	MOVQ DX, R15
 809  	MOVQ p256ord<>+16(SB), AX
 810  	MULQ R14
 811  	ADDQ R15, R12
 812  	ADCQ $0x00, DX
 813  	ADDQ AX, R12
 814  	ADCQ $0x00, DX
 815  	MOVQ DX, R15
 816  	MOVQ p256ord<>+24(SB), AX
 817  	MULQ R14
 818  	ADDQ R15, R13
 819  	ADCQ $0x00, DX
 820  	ADDQ AX, R13
 821  	ADCQ DX, R8
 822  	ADCQ $0x00, R9
 823  
 824  	// x * y[3]
 825  	MOVQ 24(CX), R14
 826  	MOVQ (SI), AX
 827  	MULQ R14
 828  	ADDQ AX, R11
 829  	ADCQ $0x00, DX
 830  	MOVQ DX, R15
 831  	MOVQ 8(SI), AX
 832  	MULQ R14
 833  	ADDQ R15, R12
 834  	ADCQ $0x00, DX
 835  	ADDQ AX, R12
 836  	ADCQ $0x00, DX
 837  	MOVQ DX, R15
 838  	MOVQ 16(SI), AX
 839  	MULQ R14
 840  	ADDQ R15, R13
 841  	ADCQ $0x00, DX
 842  	ADDQ AX, R13
 843  	ADCQ $0x00, DX
 844  	MOVQ DX, R15
 845  	MOVQ 24(SI), AX
 846  	MULQ R14
 847  	ADDQ R15, R8
 848  	ADCQ $0x00, DX
 849  	ADDQ AX, R8
 850  	ADCQ DX, R9
 851  	ADCQ $0x00, R10
 852  
 853  	// Last reduction step (R14 = t = R11 * p256ordK0, low 64 bits)
 854  	MOVQ R11, AX
 855  	MULQ p256ordK0<>+0(SB)
 856  	MOVQ AX, R14
 857  	MOVQ p256ord<>+0(SB), AX
 858  	MULQ R14
 859  	ADDQ AX, R11
 860  	ADCQ $0x00, DX
 861  	MOVQ DX, R15
 862  	MOVQ p256ord<>+8(SB), AX
 863  	MULQ R14
 864  	ADDQ R15, R12
 865  	ADCQ $0x00, DX
 866  	ADDQ AX, R12
 867  	ADCQ $0x00, DX
 868  	MOVQ DX, R15
 869  	MOVQ p256ord<>+16(SB), AX
 870  	MULQ R14
 871  	ADDQ R15, R13
 872  	ADCQ $0x00, DX
 873  	ADDQ AX, R13
 874  	ADCQ $0x00, DX
 875  	MOVQ DX, R15
 876  	MOVQ p256ord<>+24(SB), AX
 877  	MULQ R14
 878  	ADDQ R15, R8
 879  	ADCQ $0x00, DX
 880  	ADDQ AX, R8
 881  	ADCQ DX, R9
 882  	ADCQ $0x00, R10
 883  
 884  	// Copy result [255:0] (SI is reused as scratch; in1 is no longer read)
 885  	MOVQ R12, SI
 886  	MOVQ R13, R11
 887  	MOVQ R8, R14
 888  	MOVQ R9, R15
 889  
 890  	// Subtract p256ord (R10 is the fifth, carry limb)
 891  	SUBQ    p256ord<>+0(SB), R12
 892  	SBBQ    p256ord<>+8(SB), R13
 893  	SBBQ    p256ord<>+16(SB), R8
 894  	SBBQ    p256ord<>+24(SB), R9
 895  	SBBQ    $0x00, R10
      	// A borrow out of the five-limb subtraction restores the saved copy
 896  	CMOVQCS SI, R12
 897  	CMOVQCS R11, R13
 898  	CMOVQCS R14, R8
 899  	CMOVQCS R15, R9
 900  	MOVQ    R12, (DI)
 901  	MOVQ    R13, 8(DI)
 902  	MOVQ    R8, 16(DI)
 903  	MOVQ    R9, 24(DI)
 904  	RET
 905  
 906  DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f // per-step factor: the Ord reduction steps use t = low limb * p256ordK0 (low 64 bits)
 907  GLOBL p256ordK0<>(SB), RODATA, $8
 908  
 909  DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551 // p256ord limb 0 (least significant: first in the SUBQ/SBBQ chains)
 910  DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84 // p256ord limb 1
 911  DATA p256ord<>+16(SB)/8, $0xffffffffffffffff // p256ord limb 2
 912  DATA p256ord<>+24(SB)/8, $0xffffffff00000000 // p256ord limb 3 (most significant)
 913  GLOBL p256ord<>(SB), RODATA, $32
 914  
 915  // func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
 916  // Requires: CMOV
      //
      // Runs n squaring passes. Each pass reads four 64-bit limbs through SI (in on
      // the first pass, res afterwards), builds the eight-limb square, runs four
      // reduction steps, adds the high half, conditionally subtracts p256ord and
      // stores four limbs at res.
      //
      // Each reduction step computes t = (lowest live limb * p256ordK0) mod 2^64 and
      // adds t * p256ord. The two low limbs of p256ord are multiplied with MULQ; the
      // two high limbs (0xffffffffffffffff and 0xffffffff00000000) are handled with
      // shifts, additions and subtractions of t instead.
      //
      // The counter is tested after the body (DECQ/JNE), so the body always runs at
      // least once and n == 0 would wrap around.
      // NOTE(review): confirm callers never pass n == 0.
 917  TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
 918  	MOVQ res+0(FP), DI
 919  	MOVQ in+8(FP), SI
 920  	MOVQ n+16(FP), BX
 921  
 922  ordSqrLoop:
 923  	// y[1:] * y[0]
 924  	MOVQ (SI), R14
 925  	MOVQ 8(SI), AX
 926  	MULQ R14
 927  	MOVQ AX, R9
 928  	MOVQ DX, R10
 929  	MOVQ 16(SI), AX
 930  	MULQ R14
 931  	ADDQ AX, R10
 932  	ADCQ $0x00, DX
 933  	MOVQ DX, R11
 934  	MOVQ 24(SI), AX
 935  	MULQ R14
 936  	ADDQ AX, R11
 937  	ADCQ $0x00, DX
 938  	MOVQ DX, R12
 939  
 940  	// y[2:] * y[1]
 941  	MOVQ 8(SI), R14
 942  	MOVQ 16(SI), AX
 943  	MULQ R14
 944  	ADDQ AX, R11
 945  	ADCQ $0x00, DX
 946  	MOVQ DX, R15
 947  	MOVQ 24(SI), AX
 948  	MULQ R14
 949  	ADDQ R15, R12
 950  	ADCQ $0x00, DX
 951  	ADDQ AX, R12
 952  	ADCQ $0x00, DX
 953  	MOVQ DX, R13
 954  
 955  	// y[3] * y[2]
 956  	MOVQ 16(SI), R14
 957  	MOVQ 24(SI), AX
 958  	MULQ R14
 959  	ADDQ AX, R13
 960  	ADCQ $0x00, DX
 961  	MOVQ DX, CX
 962  	XORQ R15, R15
 963  
 964  	// *2 (the cross products above each occur twice in the square; R15 takes the carry out)
 965  	ADDQ R9, R9
 966  	ADCQ R10, R10
 967  	ADCQ R11, R11
 968  	ADCQ R12, R12
 969  	ADCQ R13, R13
 970  	ADCQ CX, CX
 971  	ADCQ $0x00, R15
 972  
 973  	// Missing products (the diagonal terms y[i] * y[i])
 974  	MOVQ (SI), AX
 975  	MULQ AX
 976  	MOVQ AX, R8
 977  	MOVQ DX, R14
 978  	MOVQ 8(SI), AX
 979  	MULQ AX
 980  	ADDQ R14, R9
 981  	ADCQ AX, R10
 982  	ADCQ $0x00, DX
 983  	MOVQ DX, R14
 984  	MOVQ 16(SI), AX
 985  	MULQ AX
 986  	ADDQ R14, R11
 987  	ADCQ AX, R12
 988  	ADCQ $0x00, DX
 989  	MOVQ DX, R14
 990  	MOVQ 24(SI), AX
 991  	MULQ AX
 992  	ADDQ R14, R13
 993  	ADCQ AX, CX
 994  	ADCQ DX, R15
      	// The input has been fully consumed for this pass; SI now carries the top
      	// limb of the square. Square = R8, R9, R10, R11, R12, R13, CX, SI (low to high).
 995  	MOVQ R15, SI
 996  
 997  	// First reduction step (R14 = t = R8 * p256ordK0, low 64 bits)
 998  	MOVQ R8, AX
 999  	MULQ p256ordK0<>+0(SB)
1000  	MOVQ AX, R14
1001  	MOVQ p256ord<>+0(SB), AX
1002  	MULQ R14
1003  	ADDQ AX, R8
1004  	ADCQ $0x00, DX
1005  	MOVQ DX, R15
1006  	MOVQ p256ord<>+8(SB), AX
1007  	MULQ R14
1008  	ADDQ R15, R9
1009  	ADCQ $0x00, DX
1010  	ADDQ AX, R9
      	// Limb 2 of p256ord is all ones: add t one limb higher (R15) and subtract t here
1011  	MOVQ R14, R15
1012  	ADCQ DX, R10
1013  	ADCQ $0x00, R15
1014  	SUBQ R14, R10
1015  	SBBQ $0x00, R15
      	// Limb 3 of p256ord is 0xffffffff00000000: add t one limb higher (R8) and
      	// subtract the 128-bit value t << 32, split as AX (low) and DX (high)
1016  	MOVQ R14, AX
1017  	MOVQ R14, DX
1018  	MOVQ R14, R8
1019  	SHLQ $0x20, AX
1020  	SHRQ $0x20, DX
1021  	ADDQ R15, R11
1022  	ADCQ $0x00, R8
1023  	SUBQ AX, R11
1024  	SBBQ DX, R8
1025  
1026  	// Second reduction step (same pattern, accumulator rotated by one limb)
1027  	MOVQ R9, AX
1028  	MULQ p256ordK0<>+0(SB)
1029  	MOVQ AX, R14
1030  	MOVQ p256ord<>+0(SB), AX
1031  	MULQ R14
1032  	ADDQ AX, R9
1033  	ADCQ $0x00, DX
1034  	MOVQ DX, R15
1035  	MOVQ p256ord<>+8(SB), AX
1036  	MULQ R14
1037  	ADDQ R15, R10
1038  	ADCQ $0x00, DX
1039  	ADDQ AX, R10
1040  	MOVQ R14, R15
1041  	ADCQ DX, R11
1042  	ADCQ $0x00, R15
1043  	SUBQ R14, R11
1044  	SBBQ $0x00, R15
1045  	MOVQ R14, AX
1046  	MOVQ R14, DX
1047  	MOVQ R14, R9
1048  	SHLQ $0x20, AX
1049  	SHRQ $0x20, DX
1050  	ADDQ R15, R8
1051  	ADCQ $0x00, R9
1052  	SUBQ AX, R8
1053  	SBBQ DX, R9
1054  
1055  	// Third reduction step
1056  	MOVQ R10, AX
1057  	MULQ p256ordK0<>+0(SB)
1058  	MOVQ AX, R14
1059  	MOVQ p256ord<>+0(SB), AX
1060  	MULQ R14
1061  	ADDQ AX, R10
1062  	ADCQ $0x00, DX
1063  	MOVQ DX, R15
1064  	MOVQ p256ord<>+8(SB), AX
1065  	MULQ R14
1066  	ADDQ R15, R11
1067  	ADCQ $0x00, DX
1068  	ADDQ AX, R11
1069  	MOVQ R14, R15
1070  	ADCQ DX, R8
1071  	ADCQ $0x00, R15
1072  	SUBQ R14, R8
1073  	SBBQ $0x00, R15
1074  	MOVQ R14, AX
1075  	MOVQ R14, DX
1076  	MOVQ R14, R10
1077  	SHLQ $0x20, AX
1078  	SHRQ $0x20, DX
1079  	ADDQ R15, R9
1080  	ADCQ $0x00, R10
1081  	SUBQ AX, R9
1082  	SBBQ DX, R10
1083  
1084  	// Last reduction step
1085  	MOVQ R11, AX
1086  	MULQ p256ordK0<>+0(SB)
1087  	MOVQ AX, R14
1088  	MOVQ p256ord<>+0(SB), AX
1089  	MULQ R14
1090  	ADDQ AX, R11
1091  	ADCQ $0x00, DX
1092  	MOVQ DX, R15
1093  	MOVQ p256ord<>+8(SB), AX
1094  	MULQ R14
1095  	ADDQ R15, R8
1096  	ADCQ $0x00, DX
1097  	ADDQ AX, R8
      	// Unlike the earlier steps, the carry is folded into DX first.
      	// NOTE(review): the MOVQ DX, R15 below is overwritten by the very next
      	// instruction and has no effect.
1098  	ADCQ $0x00, DX
1099  	MOVQ DX, R15
1100  	MOVQ R14, R15
1101  	ADCQ DX, R9
1102  	ADCQ $0x00, R15
1103  	SUBQ R14, R9
1104  	SBBQ $0x00, R15
1105  	MOVQ R14, AX
1106  	MOVQ R14, DX
1107  	MOVQ R14, R11
1108  	SHLQ $0x20, AX
1109  	SHRQ $0x20, DX
1110  	ADDQ R15, R10
1111  	ADCQ $0x00, R11
1112  	SUBQ AX, R10
1113  	SBBQ DX, R11
      	// XORQ clears both R14 (later the carry limb) and the carry flag
1114  	XORQ R14, R14
1115  
1116  	// Add bits [511:256] of the sqr result
1117  	ADCQ R12, R8
1118  	ADCQ R13, R9
1119  	ADCQ CX, R10
1120  	ADCQ SI, R11
1121  	ADCQ $0x00, R14
      	// Keep an unreduced copy for the conditional restore
1122  	MOVQ R8, R12
1123  	MOVQ R9, R13
1124  	MOVQ R10, CX
1125  	MOVQ R11, R15
1126  
1127  	// Subtract p256ord (R14 is the fifth, carry limb)
1128  	SUBQ    p256ord<>+0(SB), R8
1129  	SBBQ    p256ord<>+8(SB), R9
1130  	SBBQ    p256ord<>+16(SB), R10
1131  	SBBQ    p256ord<>+24(SB), R11
1132  	SBBQ    $0x00, R14
      	// A borrow out of the five-limb subtraction restores the unreduced copy
1133  	CMOVQCS R12, R8
1134  	CMOVQCS R13, R9
1135  	CMOVQCS CX, R10
1136  	CMOVQCS R15, R11
1137  	MOVQ    R8, (DI)
1138  	MOVQ    R9, 8(DI)
1139  	MOVQ    R10, 16(DI)
1140  	MOVQ    R11, 24(DI)
      	// Any further pass squares the value just stored at res
1141  	MOVQ    DI, SI
1142  	DECQ    BX
1143  	JNE     ordSqrLoop
1144  	RET
1145  
1146  // func p256SubInternal()
1147  // Requires: CMOV
      //
      // Register-only helper (no stack arguments).
      //   In:  minuend in R10, R11, R12, R13; subtrahend in R14, R15, DI, SI
      //        (four 64-bit limbs each, least significant first)
      //   Out: R10..R13 = minuend - subtrahend, with p256 added back when the
      //        subtraction borrowed
      //   Clobbers: AX, BX, CX, R8, R9, flags
1148  TEXT p256SubInternal(SB), NOSPLIT, $0
      	// Four-limb subtraction; AX ends as 0 (no borrow) or all ones (borrow)
1149  	XORQ    AX, AX
1150  	SUBQ    R14, R10
1151  	SBBQ    R15, R11
1152  	SBBQ    DI, R12
1153  	SBBQ    SI, R13
1154  	SBBQ    $0x00, AX
      	// Back up each limb of the raw difference just before p256 is added to it
      	// (register-to-register MOVQ leaves the carry chain untouched)
1155  	MOVQ    R10, BX
1156  	ADDQ    $-1, R10
1157  	MOVQ    R11, CX
1158  	ADCQ    p256const0<>+0(SB), R11
1159  	MOVQ    R12, R8
1160  	ADCQ    $0x00, R12
1161  	MOVQ    R13, R9
1162  	ADCQ    p256const1<>+0(SB), R13
      	// Borrow bit clear (ZF set): keep the raw difference instead
1163  	ANDQ    $0x01, AX
1164  	CMOVQEQ BX, R10
1165  	CMOVQEQ CX, R11
1166  	CMOVQEQ R8, R12
1167  	CMOVQEQ R9, R13
1168  	RET
1169  
1170  // func p256MulInternal()
1171  // Requires: CMOV
      //
      // Register-only helper (no stack arguments).
      //   In:  one operand in R10, R11, R12, R13; the other in R14, R15, DI, SI
      //        (four 64-bit limbs each, least significant first)
      //   Out: R10..R13 = the product after four reduction steps and a conditional
      //        subtraction of p256
      //   Clobbers: AX, BX, CX, DX, R8, R9, BP, flags; R14, R15, DI, SI are only read
      //
      // The full eight-limb product is built first (BX, CX, R8, R9 = low half,
      // R10..R13 = high half), then the low half is folded upwards limb by limb.
      // NOTE(review): a frame of $8 is declared although the body never references
      // SP; presumably it exists so the assembler preserves BP - confirm.
1172  TEXT p256MulInternal(SB), NOSPLIT, $8
      	// Row for limb R10
1173  	MOVQ R10, AX
1174  	MULQ R14
1175  	MOVQ AX, BX
1176  	MOVQ DX, CX
1177  	MOVQ R10, AX
1178  	MULQ R15
1179  	ADDQ AX, CX
1180  	ADCQ $0x00, DX
1181  	MOVQ DX, R8
1182  	MOVQ R10, AX
1183  	MULQ DI
1184  	ADDQ AX, R8
1185  	ADCQ $0x00, DX
1186  	MOVQ DX, R9
1187  	MOVQ R10, AX
1188  	MULQ SI
1189  	ADDQ AX, R9
1190  	ADCQ $0x00, DX
1191  	MOVQ DX, R10
      	// Row for limb R11
1192  	MOVQ R11, AX
1193  	MULQ R14
1194  	ADDQ AX, CX
1195  	ADCQ $0x00, DX
1196  	MOVQ DX, BP
1197  	MOVQ R11, AX
1198  	MULQ R15
1199  	ADDQ BP, R8
1200  	ADCQ $0x00, DX
1201  	ADDQ AX, R8
1202  	ADCQ $0x00, DX
1203  	MOVQ DX, BP
1204  	MOVQ R11, AX
1205  	MULQ DI
1206  	ADDQ BP, R9
1207  	ADCQ $0x00, DX
1208  	ADDQ AX, R9
1209  	ADCQ $0x00, DX
1210  	MOVQ DX, BP
1211  	MOVQ R11, AX
1212  	MULQ SI
1213  	ADDQ BP, R10
1214  	ADCQ $0x00, DX
1215  	ADDQ AX, R10
1216  	ADCQ $0x00, DX
1217  	MOVQ DX, R11
      	// Row for limb R12
1218  	MOVQ R12, AX
1219  	MULQ R14
1220  	ADDQ AX, R8
1221  	ADCQ $0x00, DX
1222  	MOVQ DX, BP
1223  	MOVQ R12, AX
1224  	MULQ R15
1225  	ADDQ BP, R9
1226  	ADCQ $0x00, DX
1227  	ADDQ AX, R9
1228  	ADCQ $0x00, DX
1229  	MOVQ DX, BP
1230  	MOVQ R12, AX
1231  	MULQ DI
1232  	ADDQ BP, R10
1233  	ADCQ $0x00, DX
1234  	ADDQ AX, R10
1235  	ADCQ $0x00, DX
1236  	MOVQ DX, BP
1237  	MOVQ R12, AX
1238  	MULQ SI
1239  	ADDQ BP, R11
1240  	ADCQ $0x00, DX
1241  	ADDQ AX, R11
1242  	ADCQ $0x00, DX
1243  	MOVQ DX, R12
      	// Row for limb R13
1244  	MOVQ R13, AX
1245  	MULQ R14
1246  	ADDQ AX, R9
1247  	ADCQ $0x00, DX
1248  	MOVQ DX, BP
1249  	MOVQ R13, AX
1250  	MULQ R15
1251  	ADDQ BP, R10
1252  	ADCQ $0x00, DX
1253  	ADDQ AX, R10
1254  	ADCQ $0x00, DX
1255  	MOVQ DX, BP
1256  	MOVQ R13, AX
1257  	MULQ DI
1258  	ADDQ BP, R11
1259  	ADCQ $0x00, DX
1260  	ADDQ AX, R11
1261  	ADCQ $0x00, DX
1262  	MOVQ DX, BP
1263  	MOVQ R13, AX
1264  	MULQ SI
1265  	ADDQ BP, R12
1266  	ADCQ $0x00, DX
1267  	ADDQ AX, R12
1268  	ADCQ $0x00, DX
1269  	MOVQ DX, R13
1270  
1271  	// First reduction step (folds BX upwards as BX << 32, BX >> 32 and BX * p256const1)
1272  	MOVQ BX, AX
1273  	MOVQ BX, BP
1274  	SHLQ $0x20, BX
1275  	MULQ p256const1<>+0(SB)
1276  	SHRQ $0x20, BP
1277  	ADDQ BX, CX
1278  	ADCQ BP, R8
1279  	ADCQ AX, R9
1280  	ADCQ $0x00, DX
1281  	MOVQ DX, BX
1282  
1283  	// Second reduction step
1284  	MOVQ CX, AX
1285  	MOVQ CX, BP
1286  	SHLQ $0x20, CX
1287  	MULQ p256const1<>+0(SB)
1288  	SHRQ $0x20, BP
1289  	ADDQ CX, R8
1290  	ADCQ BP, R9
1291  	ADCQ AX, BX
1292  	ADCQ $0x00, DX
1293  	MOVQ DX, CX
1294  
1295  	// Third reduction step
1296  	MOVQ R8, AX
1297  	MOVQ R8, BP
1298  	SHLQ $0x20, R8
1299  	MULQ p256const1<>+0(SB)
1300  	SHRQ $0x20, BP
1301  	ADDQ R8, R9
1302  	ADCQ BP, BX
1303  	ADCQ AX, CX
1304  	ADCQ $0x00, DX
1305  	MOVQ DX, R8
1306  
1307  	// Last reduction step
1308  	MOVQ R9, AX
1309  	MOVQ R9, BP
1310  	SHLQ $0x20, R9
1311  	MULQ p256const1<>+0(SB)
1312  	SHRQ $0x20, BP
1313  	ADDQ R9, BX
1314  	ADCQ BP, CX
1315  	ADCQ AX, R8
1316  	ADCQ $0x00, DX
1317  	MOVQ DX, R9
      	// BP is zeroed with MOVQ here; the ADCQ chain below starts from the carry
      	// flag left by the ADCQ $0x00, DX above
1318  	MOVQ $0x00000000, BP
1319  
1320  	// Add bits [511:256] of the result
1321  	ADCQ BX, R10
1322  	ADCQ CX, R11
1323  	ADCQ R8, R12
1324  	ADCQ R9, R13
1325  	ADCQ $0x00, BP
1326  
1327  	// Copy result
1328  	MOVQ R10, BX
1329  	MOVQ R11, CX
1330  	MOVQ R12, R8
1331  	MOVQ R13, R9
1332  
1333  	// Subtract p256 (BP is the fifth, carry limb)
1334  	SUBQ $-1, R10
1335  	SBBQ p256const0<>+0(SB), R11
1336  	SBBQ $0x00, R12
1337  	SBBQ p256const1<>+0(SB), R13
1338  	SBBQ $0x00, BP
1339  
1340  	// If the result of the subtraction is negative, restore the previous result
1341  	CMOVQCS BX, R10
1342  	CMOVQCS CX, R11
1343  	CMOVQCS R8, R12
1344  	CMOVQCS R9, R13
1345  	RET
1346  
1347  // func p256SqrInternal()
1348  // Requires: CMOV
      //
      // Register-only helper (no stack arguments).
      //   In:  operand in R10, R11, R12, R13 (four 64-bit limbs, least significant first)
      //   Out: R10..R13 = the square after four reduction steps and a conditional
      //        subtraction of p256
      //   Clobbers: AX, BX, CX, DX, R8, R9, R14, R15, DI, SI, BP, flags
      //
      // The eight-limb square is held in BX, CX, R8, R9 (low half) and
      // R14, R15, DI, SI (high half) before reduction.
      // NOTE(review): a frame of $8 is declared although the body never references
      // SP; presumably it exists so the assembler preserves BP - confirm.
1349  TEXT p256SqrInternal(SB), NOSPLIT, $8
      	// Cross products x[i] * x[j] for i < j
1350  	MOVQ R10, AX
1351  	MULQ R11
1352  	MOVQ AX, CX
1353  	MOVQ DX, R8
1354  	MOVQ R10, AX
1355  	MULQ R12
1356  	ADDQ AX, R8
1357  	ADCQ $0x00, DX
1358  	MOVQ DX, R9
1359  	MOVQ R10, AX
1360  	MULQ R13
1361  	ADDQ AX, R9
1362  	ADCQ $0x00, DX
1363  	MOVQ DX, R14
1364  	MOVQ R11, AX
1365  	MULQ R12
1366  	ADDQ AX, R9
1367  	ADCQ $0x00, DX
1368  	MOVQ DX, BP
1369  	MOVQ R11, AX
1370  	MULQ R13
1371  	ADDQ BP, R14
1372  	ADCQ $0x00, DX
1373  	ADDQ AX, R14
1374  	ADCQ $0x00, DX
1375  	MOVQ DX, R15
1376  	MOVQ R12, AX
1377  	MULQ R13
1378  	ADDQ AX, R15
1379  	ADCQ $0x00, DX
1380  	MOVQ DX, DI
1381  	XORQ SI, SI
1382  
1383  	// *2 (each cross product occurs twice in the square; SI takes the carry out)
1384  	ADDQ CX, CX
1385  	ADCQ R8, R8
1386  	ADCQ R9, R9
1387  	ADCQ R14, R14
1388  	ADCQ R15, R15
1389  	ADCQ DI, DI
1390  	ADCQ $0x00, SI
1391  
1392  	// Missing products (the diagonal terms x[i] * x[i]); R10 is reused as the
      	// carry temporary once its own square has been taken
1393  	MOVQ R10, AX
1394  	MULQ AX
1395  	MOVQ AX, BX
1396  	MOVQ DX, R10
1397  	MOVQ R11, AX
1398  	MULQ AX
1399  	ADDQ R10, CX
1400  	ADCQ AX, R8
1401  	ADCQ $0x00, DX
1402  	MOVQ DX, R10
1403  	MOVQ R12, AX
1404  	MULQ AX
1405  	ADDQ R10, R9
1406  	ADCQ AX, R14
1407  	ADCQ $0x00, DX
1408  	MOVQ DX, R10
1409  	MOVQ R13, AX
1410  	MULQ AX
1411  	ADDQ R10, R15
1412  	ADCQ AX, DI
1413  	ADCQ DX, SI
1414  
1415  	// First reduction step (folds BX upwards as BX << 32, BX >> 32 and BX * p256const1)
1416  	MOVQ BX, AX
1417  	MOVQ BX, BP
1418  	SHLQ $0x20, BX
1419  	MULQ p256const1<>+0(SB)
1420  	SHRQ $0x20, BP
1421  	ADDQ BX, CX
1422  	ADCQ BP, R8
1423  	ADCQ AX, R9
1424  	ADCQ $0x00, DX
1425  	MOVQ DX, BX
1426  
1427  	// Second reduction step
1428  	MOVQ CX, AX
1429  	MOVQ CX, BP
1430  	SHLQ $0x20, CX
1431  	MULQ p256const1<>+0(SB)
1432  	SHRQ $0x20, BP
1433  	ADDQ CX, R8
1434  	ADCQ BP, R9
1435  	ADCQ AX, BX
1436  	ADCQ $0x00, DX
1437  	MOVQ DX, CX
1438  
1439  	// Third reduction step
1440  	MOVQ R8, AX
1441  	MOVQ R8, BP
1442  	SHLQ $0x20, R8
1443  	MULQ p256const1<>+0(SB)
1444  	SHRQ $0x20, BP
1445  	ADDQ R8, R9
1446  	ADCQ BP, BX
1447  	ADCQ AX, CX
1448  	ADCQ $0x00, DX
1449  	MOVQ DX, R8
1450  
1451  	// Last reduction step
1452  	MOVQ R9, AX
1453  	MOVQ R9, BP
1454  	SHLQ $0x20, R9
1455  	MULQ p256const1<>+0(SB)
1456  	SHRQ $0x20, BP
1457  	ADDQ R9, BX
1458  	ADCQ BP, CX
1459  	ADCQ AX, R8
1460  	ADCQ $0x00, DX
1461  	MOVQ DX, R9
      	// BP is zeroed with MOVQ here; the ADCQ chain below starts from the carry
      	// flag left by the ADCQ $0x00, DX above
1462  	MOVQ $0x00000000, BP
1463  
1464  	// Add bits [511:256] of the result
1465  	ADCQ BX, R14
1466  	ADCQ CX, R15
1467  	ADCQ R8, DI
1468  	ADCQ R9, SI
1469  	ADCQ $0x00, BP
1470  
1471  	// Copy result (R14, R15, DI, SI keep the unreduced value for the restore below)
1472  	MOVQ R14, R10
1473  	MOVQ R15, R11
1474  	MOVQ DI, R12
1475  	MOVQ SI, R13
1476  
1477  	// Subtract p256 (BP is the fifth, carry limb)
1478  	SUBQ $-1, R10
1479  	SBBQ p256const0<>+0(SB), R11
1480  	SBBQ $0x00, R12
1481  	SBBQ p256const1<>+0(SB), R13
1482  	SBBQ $0x00, BP
1483  
1484  	// If the result of the subtraction is negative, restore the previous result
1485  	CMOVQCS R14, R10
1486  	CMOVQCS R15, R11
1487  	CMOVQCS DI, R12
1488  	CMOVQCS SI, R13
1489  	RET
1490  
1491  // func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
1492  // Requires: CMOV, SSE2
//
// Adds in2 (whose second coordinate is optionally negated) to in1 and writes
// 96 bytes to res: three 32-byte coordinates at offsets 0, 32 and 64.
//
//   sign != 0  the 32 bytes at in2+32 (y2) are replaced by p - y2 before use,
//              where p is the value with limbs (-1, p256const0, 0, p256const1).
//   sel  == 0  the computed sum is discarded and res receives a copy of in1.
//   zero == 0  res receives (x2, possibly negated y2, p256one). This selection
//              runs after the sel one, so it wins when both are 0.
//
// Only the low 32 bits of sel and zero are examined: they are spilled with
// MOVL and compared as 32-bit lanes. Both selections use SSE masks and the
// negation uses CMOV; no conditional jumps appear in this routine.
//
// Frame layout (offsets from SP):
//     0.. 95  copy of in1: x1 at 0, y1 at 32, z1 at 64
//    96..127  x2
//   128..159  y2 after the conditional negation
//   160..255  computed result: x3 at 160, y3 at 192, z3 at 224
//   256..479  32-byte temporaries (see the step comments)
//   480       res pointer
//   488, 492  low 32 bits of sel and zero
//
// NOTE(review): the step comments below read the helpers as
//   p256SqrInternal: acc = acc^2, p256MulInternal: acc = acc * t,
//   p256SubInternal: acc = acc - t   (all in the field, acc = R10..R13,
//   t = R14, R15, DI, SI), and assume p256MulInternal leaves t intact.
// The helpers are not defined in this part of the file - TODO confirm.
1493  TEXT ·p256PointAddAffineAsm(SB), $512-48
1494  	MOVQ  res+0(FP), AX
1495  	MOVQ  in1+8(FP), BX
1496  	MOVQ  in2+16(FP), CX
1497  	MOVQ  sign+24(FP), DX
1498  	MOVQ  sel+32(FP), R15
1499  	MOVQ  zero+40(FP), DI
	// Copy the 96 bytes of in1 to 0(SP)..95(SP).
1500  	MOVOU (BX), X0
1501  	MOVOU 16(BX), X1
1502  	MOVOU 32(BX), X2
1503  	MOVOU 48(BX), X3
1504  	MOVOU 64(BX), X4
1505  	MOVOU 80(BX), X5
1506  	MOVOU X0, (SP)
1507  	MOVOU X1, 16(SP)
1508  	MOVOU X2, 32(SP)
1509  	MOVOU X3, 48(SP)
1510  	MOVOU X4, 64(SP)
1511  	MOVOU X5, 80(SP)
	// Copy the first 32 bytes of in2 (x2) to 96(SP)..127(SP).
1512  	MOVOU (CX), X0
1513  	MOVOU 16(CX), X1
1514  	MOVOU X0, 96(SP)
1515  	MOVOU X1, 112(SP)
1516  
1517  	// Store pointer to result
	// sel and zero are spilled next to it as 32-bit values; AX, R15 and DI
	// are all overwritten before the final selection.
1518  	MOVQ AX, 480(SP)
1519  	MOVL R15, 488(SP)
1520  	MOVL DI, 492(SP)
1521  
1522  	// Negate y2in based on sign
	// R10..R13 = y2; BX, CX, R8, R9 = p; AX collects the borrow.
1523  	MOVQ 32(CX), R10
1524  	MOVQ 40(CX), R11
1525  	MOVQ 48(CX), R12
1526  	MOVQ 56(CX), R13
1527  	MOVQ $-1, BX
1528  	MOVQ p256const0<>+0(SB), CX
1529  	MOVQ $0x00000000, R8
1530  	MOVQ p256const1<>+0(SB), R9
1531  	XORQ AX, AX
1532  
1533  	// Speculatively subtract
	// [BX, CX, R8, R9] = p - y2; AX becomes -1 if the subtraction borrowed.
	// The difference is saved in R14, R15, DI, SI.
1534  	SUBQ R10, BX
1535  	SBBQ R11, CX
1536  	SBBQ R12, R8
1537  	SBBQ R13, R9
1538  	SBBQ $0x00, AX
1539  	MOVQ BX, R14
1540  	MOVQ CX, R15
1541  	MOVQ R8, DI
1542  	MOVQ R9, SI
1543  
1544  	// Add in case the operand was > p256
	// After adding p, AX is zero only if the earlier borrow was cancelled by
	// a carry. CMOVQNE tests the flags of the last ADCQ: when AX is non-zero
	// (no borrow happened) the saved difference is restored.
1545  	ADDQ    $-1, BX
1546  	ADCQ    p256const0<>+0(SB), CX
1547  	ADCQ    $0x00, R8
1548  	ADCQ    p256const1<>+0(SB), R9
1549  	ADCQ    $0x00, AX
1550  	CMOVQNE R14, BX
1551  	CMOVQNE R15, CX
1552  	CMOVQNE DI, R8
1553  	CMOVQNE SI, R9
1554  
1555  	// If condition is 0, keep original value
	// DX holds sign; R10..R13 still hold the unmodified y2.
1556  	TESTQ   DX, DX
1557  	CMOVQEQ R10, BX
1558  	CMOVQEQ R11, CX
1559  	CMOVQEQ R12, R8
1560  	CMOVQEQ R13, R9
1561  
1562  	// Store result
1563  	MOVQ BX, 128(SP)
1564  	MOVQ CX, 136(SP)
1565  	MOVQ R8, 144(SP)
1566  	MOVQ R9, 152(SP)
1567  
1568  	// Begin point add
1569  	MOVQ    64(SP), R10
1570  	MOVQ    72(SP), R11
1571  	MOVQ    80(SP), R12
1572  	MOVQ    88(SP), R13
1573  	CALL    p256SqrInternal(SB) // acc = z1^2
1574  	MOVQ    R10, 288(SP) // 288(SP) = z1^2
1575  	MOVQ    R11, 296(SP)
1576  	MOVQ    R12, 304(SP)
1577  	MOVQ    R13, 312(SP)
1578  	MOVQ    96(SP), R14
1579  	MOVQ    104(SP), R15
1580  	MOVQ    112(SP), DI
1581  	MOVQ    120(SP), SI
1582  	CALL    p256MulInternal(SB) // acc = x2 * z1^2
1583  	MOVQ    (SP), R14
1584  	MOVQ    8(SP), R15
1585  	MOVQ    16(SP), DI
1586  	MOVQ    24(SP), SI
1587  	CALL    p256SubInternal(SB) // acc = h = x2*z1^2 - x1
1588  	MOVQ    R10, 320(SP) // 320(SP) = h
1589  	MOVQ    R11, 328(SP)
1590  	MOVQ    R12, 336(SP)
1591  	MOVQ    R13, 344(SP)
1592  	MOVQ    64(SP), R14
1593  	MOVQ    72(SP), R15
1594  	MOVQ    80(SP), DI
1595  	MOVQ    88(SP), SI
1596  	CALL    p256MulInternal(SB) // acc = h * z1
1597  	MOVQ    R10, 224(SP) // 224(SP) = z3
1598  	MOVQ    R11, 232(SP)
1599  	MOVQ    R12, 240(SP)
1600  	MOVQ    R13, 248(SP)
1601  	MOVQ    288(SP), R10
1602  	MOVQ    296(SP), R11
1603  	MOVQ    304(SP), R12
1604  	MOVQ    312(SP), R13
1605  	CALL    p256MulInternal(SB) // acc = z1^2 * t; t is not reloaded, presumably still z1, giving z1^3
1606  	MOVQ    128(SP), R14
1607  	MOVQ    136(SP), R15
1608  	MOVQ    144(SP), DI
1609  	MOVQ    152(SP), SI
1610  	CALL    p256MulInternal(SB) // acc = s2 = (possibly negated y2) * z1^3
1611  	MOVQ    R10, 256(SP) // 256(SP) = s2
1612  	MOVQ    R11, 264(SP)
1613  	MOVQ    R12, 272(SP)
1614  	MOVQ    R13, 280(SP)
1615  	MOVQ    32(SP), R14
1616  	MOVQ    40(SP), R15
1617  	MOVQ    48(SP), DI
1618  	MOVQ    56(SP), SI
1619  	CALL    p256SubInternal(SB) // acc = r = s2 - y1
1620  	MOVQ    R10, 352(SP) // 352(SP) = r
1621  	MOVQ    R11, 360(SP)
1622  	MOVQ    R12, 368(SP)
1623  	MOVQ    R13, 376(SP)
1624  	CALL    p256SqrInternal(SB) // acc = r^2
1625  	MOVQ    R10, 416(SP) // 416(SP) = r^2
1626  	MOVQ    R11, 424(SP)
1627  	MOVQ    R12, 432(SP)
1628  	MOVQ    R13, 440(SP)
1629  	MOVQ    320(SP), R10
1630  	MOVQ    328(SP), R11
1631  	MOVQ    336(SP), R12
1632  	MOVQ    344(SP), R13
1633  	CALL    p256SqrInternal(SB) // acc = h^2
1634  	MOVQ    R10, 384(SP) // 384(SP) = h^2
1635  	MOVQ    R11, 392(SP)
1636  	MOVQ    R12, 400(SP)
1637  	MOVQ    R13, 408(SP)
1638  	MOVQ    320(SP), R14
1639  	MOVQ    328(SP), R15
1640  	MOVQ    336(SP), DI
1641  	MOVQ    344(SP), SI
1642  	CALL    p256MulInternal(SB) // acc = h^3
1643  	MOVQ    R10, 448(SP) // 448(SP) = h^3
1644  	MOVQ    R11, 456(SP)
1645  	MOVQ    R12, 464(SP)
1646  	MOVQ    R13, 472(SP)
1647  	MOVQ    32(SP), R14
1648  	MOVQ    40(SP), R15
1649  	MOVQ    48(SP), DI
1650  	MOVQ    56(SP), SI
1651  	CALL    p256MulInternal(SB) // acc = y1 * h^3
1652  	MOVQ    R10, 256(SP) // 256(SP) = y1*h^3 (s2 is no longer needed)
1653  	MOVQ    R11, 264(SP)
1654  	MOVQ    R12, 272(SP)
1655  	MOVQ    R13, 280(SP)
1656  	MOVQ    (SP), R10
1657  	MOVQ    8(SP), R11
1658  	MOVQ    16(SP), R12
1659  	MOVQ    24(SP), R13
1660  	MOVQ    384(SP), R14
1661  	MOVQ    392(SP), R15
1662  	MOVQ    400(SP), DI
1663  	MOVQ    408(SP), SI
1664  	CALL    p256MulInternal(SB) // acc = x1 * h^2
1665  	MOVQ    R10, 320(SP) // 320(SP) = x1*h^2 (h is no longer needed)
1666  	MOVQ    R11, 328(SP)
1667  	MOVQ    R12, 336(SP)
1668  	MOVQ    R13, 344(SP)
	// t = 2*acc mod p: double acc with the carry-out in AX, copy to t,
	// subtract p from t, and restore the doubled value if that borrowed.
1669  	XORQ    AX, AX
1670  	ADDQ    R10, R10
1671  	ADCQ    R11, R11
1672  	ADCQ    R12, R12
1673  	ADCQ    R13, R13
1674  	ADCQ    $+0, AX
1675  	MOVQ    R10, R14
1676  	MOVQ    R11, R15
1677  	MOVQ    R12, DI
1678  	MOVQ    R13, SI
1679  	SUBQ    $-1, R14
1680  	SBBQ    p256const0<>+0(SB), R15
1681  	SBBQ    $+0, DI
1682  	SBBQ    p256const1<>+0(SB), SI
1683  	SBBQ    $+0, AX
1684  	CMOVQCS R10, R14
1685  	CMOVQCS R11, R15
1686  	CMOVQCS R12, DI
1687  	CMOVQCS R13, SI
1688  	MOVQ    416(SP), R10
1689  	MOVQ    424(SP), R11
1690  	MOVQ    432(SP), R12
1691  	MOVQ    440(SP), R13
1692  	CALL    p256SubInternal(SB) // acc = r^2 - 2*x1*h^2
1693  	MOVQ    448(SP), R14
1694  	MOVQ    456(SP), R15
1695  	MOVQ    464(SP), DI
1696  	MOVQ    472(SP), SI
1697  	CALL    p256SubInternal(SB) // acc = x3 = r^2 - 2*x1*h^2 - h^3
1698  	MOVQ    R10, 160(SP) // 160(SP) = x3
1699  	MOVQ    R11, 168(SP)
1700  	MOVQ    R12, 176(SP)
1701  	MOVQ    R13, 184(SP)
1702  	MOVQ    R10, R14 // t = x3
1703  	MOVQ    R11, R15
1704  	MOVQ    R12, DI
1705  	MOVQ    R13, SI
1706  	MOVQ    320(SP), R10
1707  	MOVQ    328(SP), R11
1708  	MOVQ    336(SP), R12
1709  	MOVQ    344(SP), R13
1710  	CALL    p256SubInternal(SB) // acc = x1*h^2 - x3
1711  	MOVQ    352(SP), R14
1712  	MOVQ    360(SP), R15
1713  	MOVQ    368(SP), DI
1714  	MOVQ    376(SP), SI
1715  	CALL    p256MulInternal(SB) // acc = r * (x1*h^2 - x3)
1716  	MOVQ    256(SP), R14
1717  	MOVQ    264(SP), R15
1718  	MOVQ    272(SP), DI
1719  	MOVQ    280(SP), SI
1720  	CALL    p256SubInternal(SB) // acc = y3 = r*(x1*h^2 - x3) - y1*h^3
1721  	MOVQ    R10, 192(SP) // 192(SP) = y3
1722  	MOVQ    R11, 200(SP)
1723  	MOVQ    R12, 208(SP)
1724  	MOVQ    R13, 216(SP)
1725  
1726  	// Load stored values from stack
	// AX = res pointer, BX = sel (32 bits), CX = zero (32 bits).
1727  	MOVQ 480(SP), AX
1728  	MOVL 488(SP), BX
1729  	MOVL 492(SP), CX
1730  
1731  	// The result is not valid if (sel == 0), conditional choose
	// X0..X5 = computed (x3, y3, z3); X9..X14 = in1. Each is ANDed with a
	// mask and the two are merged with XOR.
1732  	MOVOU   160(SP), X0
1733  	MOVOU   176(SP), X1
1734  	MOVOU   192(SP), X2
1735  	MOVOU   208(SP), X3
1736  	MOVOU   224(SP), X4
1737  	MOVOU   240(SP), X5
1738  	MOVL    BX, X6
1739  	MOVL    CX, X7
1740  	PXOR    X8, X8 // X8 = 0
1741  	PCMPEQL X9, X9 // X9 = all ones
1742  	PSHUFD  $0x00, X6, X6 // broadcast sel to all four 32-bit lanes
1743  	PSHUFD  $0x00, X7, X7 // broadcast zero to all four 32-bit lanes
1744  	PCMPEQL X8, X6 // X6 = all ones if sel == 0, else 0
1745  	PCMPEQL X8, X7 // X7 = all ones if zero == 0, else 0
1746  	MOVOU   X6, X15
1747  	PANDN   X9, X15 // X15 = NOT X6
1748  	MOVOU   (SP), X9
1749  	MOVOU   16(SP), X10
1750  	MOVOU   32(SP), X11
1751  	MOVOU   48(SP), X12
1752  	MOVOU   64(SP), X13
1753  	MOVOU   80(SP), X14
1754  	PAND    X15, X0
1755  	PAND    X15, X1
1756  	PAND    X15, X2
1757  	PAND    X15, X3
1758  	PAND    X15, X4
1759  	PAND    X15, X5
1760  	PAND    X6, X9
1761  	PAND    X6, X10
1762  	PAND    X6, X11
1763  	PAND    X6, X12
1764  	PAND    X6, X13
1765  	PAND    X6, X14
1766  	PXOR    X9, X0
1767  	PXOR    X10, X1
1768  	PXOR    X11, X2
1769  	PXOR    X12, X3
1770  	PXOR    X13, X4
1771  	PXOR    X14, X5
1772  
1773  	// Similarly if zero == 0
	// The alternative is (x2, possibly negated y2, p256one), taken from
	// 96(SP), 128(SP) and the p256one table.
1774  	PCMPEQL X9, X9 // X9 = all ones again (it was overwritten above)
1775  	MOVOU   X7, X15
1776  	PANDN   X9, X15 // X15 = NOT X7
1777  	MOVOU   96(SP), X9
1778  	MOVOU   112(SP), X10
1779  	MOVOU   128(SP), X11
1780  	MOVOU   144(SP), X12
1781  	MOVOU   p256one<>+0(SB), X13
1782  	MOVOU   p256one<>+16(SB), X14
1783  	PAND    X15, X0
1784  	PAND    X15, X1
1785  	PAND    X15, X2
1786  	PAND    X15, X3
1787  	PAND    X15, X4
1788  	PAND    X15, X5
1789  	PAND    X7, X9
1790  	PAND    X7, X10
1791  	PAND    X7, X11
1792  	PAND    X7, X12
1793  	PAND    X7, X13
1794  	PAND    X7, X14
1795  	PXOR    X9, X0
1796  	PXOR    X10, X1
1797  	PXOR    X11, X2
1798  	PXOR    X12, X3
1799  	PXOR    X13, X4
1800  	PXOR    X14, X5
1801  
1802  	// Finally output the result
1803  	MOVOU X0, (AX)
1804  	MOVOU X1, 16(AX)
1805  	MOVOU X2, 32(AX)
1806  	MOVOU X3, 48(AX)
1807  	MOVOU X4, 64(AX)
1808  	MOVOU X5, 80(AX)
1809  	MOVQ  $0x00000000, 480(SP) // overwrite the spilled res pointer with 0 before returning
1810  	RET
1811  
// p256one is a 32-byte read-only constant stored as four 64-bit words at
// offsets 0, 8, 16 and 24. p256PointAddAffineAsm loads it as the third
// coordinate of its result when its zero argument is 0.
// NOTE(review): read as a little-endian 256-bit integer this is
// 2^224 - 2^192 - 2^96 + 1, presumably 2^256 mod p, i.e. the Montgomery
// form of 1 - confirm against the p256const0/p256const1 definitions.
1812  DATA p256one<>+0(SB)/8, $0x0000000000000001
1813  DATA p256one<>+8(SB)/8, $0xffffffff00000000
1814  DATA p256one<>+16(SB)/8, $0xffffffffffffffff
1815  DATA p256one<>+24(SB)/8, $0x00000000fffffffe
1816  GLOBL p256one<>(SB), RODATA, $32
1817  
1818  // func p256IsZero()
1819  // Requires: CMOV
//
// Internal helper with a register-based interface (no stack arguments).
//   In:  R10, R11, R12, R13 = four 64-bit limbs of the value to test.
//   Out: AX = 1 if all four limbs are zero, or if they equal
//        (-1, p256const0, 0, p256const1); otherwise AX = 0.
//   Clobbers: R14, R15, flags, and R10, R11, R13 (XOR-ed in place and then
//        OR-ed together). R12 is only read.
// Both tests are always executed and combined with CMOV.
1820  TEXT p256IsZero(SB), NOSPLIT, $0
1821  	// AX contains a flag that is set if the input is zero.
1822  	XORQ AX, AX
1823  	MOVQ $0x00000001, R15
1824  
1825  	// Check whether [acc4..acc7] are all zero.
	// (acc4..acc7 are R10..R13.) ZF after the last ORQ is set only if the
	// OR of all four limbs is zero.
1826  	MOVQ R10, R14
1827  	ORQ  R11, R14
1828  	ORQ  R12, R14
1829  	ORQ  R13, R14
1830  
1831  	// Set the zero flag if so. (CMOV of a constant to a register doesn't
1832  	// appear to be supported in Go. Thus t1 = 1.)
	// (t1 is R15, loaded with 1 above.)
1833  	CMOVQEQ R15, AX
1834  
1835  	// XOR [acc4..acc7] with P and compare with zero again.
	// R12 is OR-ed in unchanged, i.e. the matching limb of P is taken as 0.
1836  	XORQ $-1, R10
1837  	XORQ p256const0<>+0(SB), R11
1838  	XORQ p256const1<>+0(SB), R13
1839  	ORQ  R11, R10
1840  	ORQ  R12, R10
1841  	ORQ  R13, R10
1842  
1843  	// Set the zero flag if so.
1844  	CMOVQEQ R15, AX
1845  	RET
1846  
1847  // func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
1848  // Requires: CMOV, SSE2
//
// Computes a sum of the two 96-byte points in1 and in2 (three 32-byte
// coordinates each, at offsets 0, 32 and 64) and writes 96 bytes to res.
//
// Returns the AND of two p256IsZero results: 1 only when both differences
// r and h computed below are zero (or equal to p), otherwise 0.
// NOTE(review): r == 0 and h == 0 presumably means both inputs are the same
// point, a case this formula does not handle; the caller is presumably
// expected to fall back to doubling - confirm against the Go call site.
//
// Both inputs are copied to the stack before anything is written to res.
//
// Frame layout (offsets from SP):
//     0.. 95  copy of in1: x1 at 0, y1 at 32, z1 at 64
//    96..191  copy of in2: x2 at 96, y2 at 128, z2 at 160
//   192..287  computed result: x3 at 192, y3 at 224, z3 at 256
//   288 u1   320 u2   352 s1   384 s2   416 z1^2   448 z2^2
//   480 h    512 r    544 h^2  576 r^2  608 h^3
//   640       res pointer
//   648       return flag
//
// NOTE(review): the step comments below read the helpers as
//   p256SqrInternal: acc = acc^2, p256MulInternal: acc = acc * t,
//   p256SubInternal: acc = acc - t   (all in the field, acc = R10..R13,
//   t = R14, R15, DI, SI). The helpers are not defined in this part of the
//   file - TODO confirm.
1849  TEXT ·p256PointAddAsm(SB), $680-32
1850  	// Move input to stack in order to free registers
1851  	MOVQ  res+0(FP), AX
1852  	MOVQ  in1+8(FP), BX
1853  	MOVQ  in2+16(FP), CX
1854  	MOVOU (BX), X0
1855  	MOVOU 16(BX), X1
1856  	MOVOU 32(BX), X2
1857  	MOVOU 48(BX), X3
1858  	MOVOU 64(BX), X4
1859  	MOVOU 80(BX), X5
1860  	MOVOU X0, (SP)
1861  	MOVOU X1, 16(SP)
1862  	MOVOU X2, 32(SP)
1863  	MOVOU X3, 48(SP)
1864  	MOVOU X4, 64(SP)
1865  	MOVOU X5, 80(SP)
1866  	MOVOU (CX), X0
1867  	MOVOU 16(CX), X1
1868  	MOVOU 32(CX), X2
1869  	MOVOU 48(CX), X3
1870  	MOVOU 64(CX), X4
1871  	MOVOU 80(CX), X5
1872  	MOVOU X0, 96(SP)
1873  	MOVOU X1, 112(SP)
1874  	MOVOU X2, 128(SP)
1875  	MOVOU X3, 144(SP)
1876  	MOVOU X4, 160(SP)
1877  	MOVOU X5, 176(SP)
1878  
1879  	// Store pointer to result
1880  	MOVQ AX, 640(SP)
1881  
1882  	// Begin point add
1883  	MOVQ    160(SP), R10
1884  	MOVQ    168(SP), R11
1885  	MOVQ    176(SP), R12
1886  	MOVQ    184(SP), R13
1887  	CALL    p256SqrInternal(SB) // acc = z2^2
1888  	MOVQ    R10, 448(SP) // 448(SP) = z2^2
1889  	MOVQ    R11, 456(SP)
1890  	MOVQ    R12, 464(SP)
1891  	MOVQ    R13, 472(SP)
1892  	MOVQ    160(SP), R14
1893  	MOVQ    168(SP), R15
1894  	MOVQ    176(SP), DI
1895  	MOVQ    184(SP), SI
1896  	CALL    p256MulInternal(SB) // acc = z2^3
1897  	MOVQ    32(SP), R14
1898  	MOVQ    40(SP), R15
1899  	MOVQ    48(SP), DI
1900  	MOVQ    56(SP), SI
1901  	CALL    p256MulInternal(SB) // acc = s1 = y1 * z2^3
1902  	MOVQ    R10, 352(SP) // 352(SP) = s1
1903  	MOVQ    R11, 360(SP)
1904  	MOVQ    R12, 368(SP)
1905  	MOVQ    R13, 376(SP)
1906  	MOVQ    64(SP), R10
1907  	MOVQ    72(SP), R11
1908  	MOVQ    80(SP), R12
1909  	MOVQ    88(SP), R13
1910  	CALL    p256SqrInternal(SB) // acc = z1^2
1911  	MOVQ    R10, 416(SP) // 416(SP) = z1^2
1912  	MOVQ    R11, 424(SP)
1913  	MOVQ    R12, 432(SP)
1914  	MOVQ    R13, 440(SP)
1915  	MOVQ    64(SP), R14
1916  	MOVQ    72(SP), R15
1917  	MOVQ    80(SP), DI
1918  	MOVQ    88(SP), SI
1919  	CALL    p256MulInternal(SB) // acc = z1^3
1920  	MOVQ    128(SP), R14
1921  	MOVQ    136(SP), R15
1922  	MOVQ    144(SP), DI
1923  	MOVQ    152(SP), SI
1924  	CALL    p256MulInternal(SB) // acc = s2 = y2 * z1^3
1925  	MOVQ    R10, 384(SP) // 384(SP) = s2
1926  	MOVQ    R11, 392(SP)
1927  	MOVQ    R12, 400(SP)
1928  	MOVQ    R13, 408(SP)
1929  	MOVQ    352(SP), R14
1930  	MOVQ    360(SP), R15
1931  	MOVQ    368(SP), DI
1932  	MOVQ    376(SP), SI
1933  	CALL    p256SubInternal(SB) // acc = r = s2 - s1
1934  	MOVQ    R10, 512(SP) // 512(SP) = r
1935  	MOVQ    R11, 520(SP)
1936  	MOVQ    R12, 528(SP)
1937  	MOVQ    R13, 536(SP)
1938  	CALL    p256IsZero(SB) // AX = 1 if r is zero or p; destroys acc, which is reloaded below
1939  	MOVQ    AX, 648(SP)
1940  	MOVQ    448(SP), R10
1941  	MOVQ    456(SP), R11
1942  	MOVQ    464(SP), R12
1943  	MOVQ    472(SP), R13
1944  	MOVQ    (SP), R14
1945  	MOVQ    8(SP), R15
1946  	MOVQ    16(SP), DI
1947  	MOVQ    24(SP), SI
1948  	CALL    p256MulInternal(SB) // acc = u1 = x1 * z2^2
1949  	MOVQ    R10, 288(SP) // 288(SP) = u1
1950  	MOVQ    R11, 296(SP)
1951  	MOVQ    R12, 304(SP)
1952  	MOVQ    R13, 312(SP)
1953  	MOVQ    416(SP), R10
1954  	MOVQ    424(SP), R11
1955  	MOVQ    432(SP), R12
1956  	MOVQ    440(SP), R13
1957  	MOVQ    96(SP), R14
1958  	MOVQ    104(SP), R15
1959  	MOVQ    112(SP), DI
1960  	MOVQ    120(SP), SI
1961  	CALL    p256MulInternal(SB) // acc = u2 = x2 * z1^2
1962  	MOVQ    R10, 320(SP) // 320(SP) = u2
1963  	MOVQ    R11, 328(SP)
1964  	MOVQ    R12, 336(SP)
1965  	MOVQ    R13, 344(SP)
1966  	MOVQ    288(SP), R14
1967  	MOVQ    296(SP), R15
1968  	MOVQ    304(SP), DI
1969  	MOVQ    312(SP), SI
1970  	CALL    p256SubInternal(SB) // acc = h = u2 - u1
1971  	MOVQ    R10, 480(SP) // 480(SP) = h
1972  	MOVQ    R11, 488(SP)
1973  	MOVQ    R12, 496(SP)
1974  	MOVQ    R13, 504(SP)
1975  	CALL    p256IsZero(SB) // AX = 1 if h is zero or p
1976  	ANDQ    648(SP), AX // combine with the r test: 1 only if both are zero
1977  	MOVQ    AX, 648(SP)
1978  	MOVQ    512(SP), R10
1979  	MOVQ    520(SP), R11
1980  	MOVQ    528(SP), R12
1981  	MOVQ    536(SP), R13
1982  	CALL    p256SqrInternal(SB) // acc = r^2
1983  	MOVQ    R10, 576(SP) // 576(SP) = r^2
1984  	MOVQ    R11, 584(SP)
1985  	MOVQ    R12, 592(SP)
1986  	MOVQ    R13, 600(SP)
1987  	MOVQ    480(SP), R10
1988  	MOVQ    488(SP), R11
1989  	MOVQ    496(SP), R12
1990  	MOVQ    504(SP), R13
1991  	CALL    p256SqrInternal(SB) // acc = h^2
1992  	MOVQ    R10, 544(SP) // 544(SP) = h^2
1993  	MOVQ    R11, 552(SP)
1994  	MOVQ    R12, 560(SP)
1995  	MOVQ    R13, 568(SP)
1996  	MOVQ    480(SP), R14
1997  	MOVQ    488(SP), R15
1998  	MOVQ    496(SP), DI
1999  	MOVQ    504(SP), SI
2000  	CALL    p256MulInternal(SB) // acc = h^3
2001  	MOVQ    R10, 608(SP) // 608(SP) = h^3
2002  	MOVQ    R11, 616(SP)
2003  	MOVQ    R12, 624(SP)
2004  	MOVQ    R13, 632(SP)
2005  	MOVQ    352(SP), R14
2006  	MOVQ    360(SP), R15
2007  	MOVQ    368(SP), DI
2008  	MOVQ    376(SP), SI
2009  	CALL    p256MulInternal(SB) // acc = s1 * h^3
2010  	MOVQ    R10, 384(SP) // 384(SP) = s1*h^3 (s2 is no longer needed)
2011  	MOVQ    R11, 392(SP)
2012  	MOVQ    R12, 400(SP)
2013  	MOVQ    R13, 408(SP)
2014  	MOVQ    64(SP), R10
2015  	MOVQ    72(SP), R11
2016  	MOVQ    80(SP), R12
2017  	MOVQ    88(SP), R13
2018  	MOVQ    160(SP), R14
2019  	MOVQ    168(SP), R15
2020  	MOVQ    176(SP), DI
2021  	MOVQ    184(SP), SI
2022  	CALL    p256MulInternal(SB) // acc = z1 * z2
2023  	MOVQ    480(SP), R14
2024  	MOVQ    488(SP), R15
2025  	MOVQ    496(SP), DI
2026  	MOVQ    504(SP), SI
2027  	CALL    p256MulInternal(SB) // acc = z3 = z1 * z2 * h
2028  	MOVQ    R10, 256(SP) // 256(SP) = z3
2029  	MOVQ    R11, 264(SP)
2030  	MOVQ    R12, 272(SP)
2031  	MOVQ    R13, 280(SP)
2032  	MOVQ    544(SP), R10
2033  	MOVQ    552(SP), R11
2034  	MOVQ    560(SP), R12
2035  	MOVQ    568(SP), R13
2036  	MOVQ    288(SP), R14
2037  	MOVQ    296(SP), R15
2038  	MOVQ    304(SP), DI
2039  	MOVQ    312(SP), SI
2040  	CALL    p256MulInternal(SB) // acc = u1 * h^2
2041  	MOVQ    R10, 320(SP) // 320(SP) = u1*h^2 (u2 is no longer needed)
2042  	MOVQ    R11, 328(SP)
2043  	MOVQ    R12, 336(SP)
2044  	MOVQ    R13, 344(SP)
	// t = 2*acc mod p: double acc with the carry-out in AX, copy to t,
	// subtract p from t, and restore the doubled value if that borrowed.
2045  	XORQ    AX, AX
2046  	ADDQ    R10, R10
2047  	ADCQ    R11, R11
2048  	ADCQ    R12, R12
2049  	ADCQ    R13, R13
2050  	ADCQ    $+0, AX
2051  	MOVQ    R10, R14
2052  	MOVQ    R11, R15
2053  	MOVQ    R12, DI
2054  	MOVQ    R13, SI
2055  	SUBQ    $-1, R14
2056  	SBBQ    p256const0<>+0(SB), R15
2057  	SBBQ    $+0, DI
2058  	SBBQ    p256const1<>+0(SB), SI
2059  	SBBQ    $+0, AX
2060  	CMOVQCS R10, R14
2061  	CMOVQCS R11, R15
2062  	CMOVQCS R12, DI
2063  	CMOVQCS R13, SI
2064  	MOVQ    576(SP), R10
2065  	MOVQ    584(SP), R11
2066  	MOVQ    592(SP), R12
2067  	MOVQ    600(SP), R13
2068  	CALL    p256SubInternal(SB) // acc = r^2 - 2*u1*h^2
2069  	MOVQ    608(SP), R14
2070  	MOVQ    616(SP), R15
2071  	MOVQ    624(SP), DI
2072  	MOVQ    632(SP), SI
2073  	CALL    p256SubInternal(SB) // acc = x3 = r^2 - 2*u1*h^2 - h^3
2074  	MOVQ    R10, 192(SP) // 192(SP) = x3
2075  	MOVQ    R11, 200(SP)
2076  	MOVQ    R12, 208(SP)
2077  	MOVQ    R13, 216(SP)
2078  	MOVQ    R10, R14 // t = x3
2079  	MOVQ    R11, R15
2080  	MOVQ    R12, DI
2081  	MOVQ    R13, SI
2082  	MOVQ    320(SP), R10
2083  	MOVQ    328(SP), R11
2084  	MOVQ    336(SP), R12
2085  	MOVQ    344(SP), R13
2086  	CALL    p256SubInternal(SB) // acc = u1*h^2 - x3
2087  	MOVQ    512(SP), R14
2088  	MOVQ    520(SP), R15
2089  	MOVQ    528(SP), DI
2090  	MOVQ    536(SP), SI
2091  	CALL    p256MulInternal(SB) // acc = r * (u1*h^2 - x3)
2092  	MOVQ    384(SP), R14
2093  	MOVQ    392(SP), R15
2094  	MOVQ    400(SP), DI
2095  	MOVQ    408(SP), SI
2096  	CALL    p256SubInternal(SB) // acc = y3 = r*(u1*h^2 - x3) - s1*h^3
2097  	MOVQ    R10, 224(SP) // 224(SP) = y3
2098  	MOVQ    R11, 232(SP)
2099  	MOVQ    R12, 240(SP)
2100  	MOVQ    R13, 248(SP)
	// Load the 96-byte result (x3, y3, z3) from 192(SP)..287(SP).
2101  	MOVOU   192(SP), X0
2102  	MOVOU   208(SP), X1
2103  	MOVOU   224(SP), X2
2104  	MOVOU   240(SP), X3
2105  	MOVOU   256(SP), X4
2106  	MOVOU   272(SP), X5
2107  
2108  	// Finally output the result
2109  	MOVQ  640(SP), AX
2110  	MOVQ  $0x00000000, 640(SP) // overwrite the spilled res pointer with 0
2111  	MOVOU X0, (AX)
2112  	MOVOU X1, 16(AX)
2113  	MOVOU X2, 32(AX)
2114  	MOVOU X3, 48(AX)
2115  	MOVOU X4, 64(AX)
2116  	MOVOU X5, 80(AX)
2117  	MOVQ  648(SP), AX // return the combined "r and h are both zero" flag
2118  	MOVQ  AX, ret+24(FP)
2119  	RET
2120  
2121  // func p256PointDoubleAsm(res *P256Point, in *P256Point)
2122  // Requires: CMOV, SSE2
//
// Computes the double of the 96-byte point in (x at offset 0, y at 32,
// z at 64) and writes the three result coordinates to res.
//
// in is copied to the stack first and never read through its pointer again.
// res is written in three stages (z, then x, then y) while the computation
// is still in progress.
//
// Frame layout (offsets from SP):
//     0.. 31  x
//    32.. 63  y, later reused for 8*y^4
//    64.. 95  z
//    96..127  4*y^2, later s = 4*x*y^2
//   128..159  x + z^2, later m = 3*(x - z^2)*(x + z^2)
//   160..191  z^2
//   192..223  2*s
//   224       res pointer
//
// NOTE(review): the step comments below read the helpers as
//   p256SqrInternal: acc = acc^2, p256MulInternal: acc = acc * t,
//   p256SubInternal: acc = acc - t   (all in the field, acc = R10..R13,
//   t = R14, R15, DI, SI). The helpers are not defined in this part of the
//   file - TODO confirm.
// The inline "add/double then conditionally subtract p" sequences all
// follow one pattern: the sum is formed in acc with the carry-out in AX,
// copied to t, p is subtracted from t, and CMOVQCS restores the unreduced
// sum if that subtraction borrowed. The reduced value ends up in t.
2123  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
2124  	MOVQ  res+0(FP), AX
2125  	MOVQ  in+8(FP), BX
2126  	MOVOU (BX), X0
2127  	MOVOU 16(BX), X1
2128  	MOVOU 32(BX), X2
2129  	MOVOU 48(BX), X3
2130  	MOVOU 64(BX), X4
2131  	MOVOU 80(BX), X5
2132  	MOVOU X0, (SP)
2133  	MOVOU X1, 16(SP)
2134  	MOVOU X2, 32(SP)
2135  	MOVOU X3, 48(SP)
2136  	MOVOU X4, 64(SP)
2137  	MOVOU X5, 80(SP)
2138  
2139  	// Store pointer to result
2140  	MOVQ AX, 224(SP)
2141  
2142  	// Begin point double
2143  	MOVQ    64(SP), R10
2144  	MOVQ    72(SP), R11
2145  	MOVQ    80(SP), R12
2146  	MOVQ    88(SP), R13
2147  	CALL    p256SqrInternal(SB) // acc = z^2
2148  	MOVQ    R10, 160(SP) // 160(SP) = z^2
2149  	MOVQ    R11, 168(SP)
2150  	MOVQ    R12, 176(SP)
2151  	MOVQ    R13, 184(SP)
2152  	MOVQ    (SP), R14
2153  	MOVQ    8(SP), R15
2154  	MOVQ    16(SP), DI
2155  	MOVQ    24(SP), SI
	// t = x + z^2 mod p.
2156  	XORQ    AX, AX
2157  	ADDQ    R14, R10
2158  	ADCQ    R15, R11
2159  	ADCQ    DI, R12
2160  	ADCQ    SI, R13
2161  	ADCQ    $+0, AX
2162  	MOVQ    R10, R14
2163  	MOVQ    R11, R15
2164  	MOVQ    R12, DI
2165  	MOVQ    R13, SI
2166  	SUBQ    $-1, R14
2167  	SBBQ    p256const0<>+0(SB), R15
2168  	SBBQ    $+0, DI
2169  	SBBQ    p256const1<>+0(SB), SI
2170  	SBBQ    $+0, AX
2171  	CMOVQCS R10, R14
2172  	CMOVQCS R11, R15
2173  	CMOVQCS R12, DI
2174  	CMOVQCS R13, SI
2175  	MOVQ    R14, 128(SP) // 128(SP) = x + z^2
2176  	MOVQ    R15, 136(SP)
2177  	MOVQ    DI, 144(SP)
2178  	MOVQ    SI, 152(SP)
2179  	MOVQ    64(SP), R10
2180  	MOVQ    72(SP), R11
2181  	MOVQ    80(SP), R12
2182  	MOVQ    88(SP), R13
2183  	MOVQ    32(SP), R14
2184  	MOVQ    40(SP), R15
2185  	MOVQ    48(SP), DI
2186  	MOVQ    56(SP), SI
2187  	CALL    p256MulInternal(SB) // acc = z * y
	// t = 2*y*z mod p.
2188  	XORQ    AX, AX
2189  	ADDQ    R10, R10
2190  	ADCQ    R11, R11
2191  	ADCQ    R12, R12
2192  	ADCQ    R13, R13
2193  	ADCQ    $+0, AX
2194  	MOVQ    R10, R14
2195  	MOVQ    R11, R15
2196  	MOVQ    R12, DI
2197  	MOVQ    R13, SI
2198  	SUBQ    $-1, R14
2199  	SBBQ    p256const0<>+0(SB), R15
2200  	SBBQ    $+0, DI
2201  	SBBQ    p256const1<>+0(SB), SI
2202  	SBBQ    $+0, AX
2203  	CMOVQCS R10, R14
2204  	CMOVQCS R11, R15
2205  	CMOVQCS R12, DI
2206  	CMOVQCS R13, SI
2207  	MOVQ    224(SP), AX
2208  
2209  	// Store z
	// res.z = 2*y*z.
2210  	MOVQ R14, 64(AX)
2211  	MOVQ R15, 72(AX)
2212  	MOVQ DI, 80(AX)
2213  	MOVQ SI, 88(AX)
2214  	MOVQ (SP), R10
2215  	MOVQ 8(SP), R11
2216  	MOVQ 16(SP), R12
2217  	MOVQ 24(SP), R13
2218  	MOVQ 160(SP), R14
2219  	MOVQ 168(SP), R15
2220  	MOVQ 176(SP), DI
2221  	MOVQ 184(SP), SI
2222  	CALL p256SubInternal(SB) // acc = x - z^2
2223  	MOVQ 128(SP), R14
2224  	MOVQ 136(SP), R15
2225  	MOVQ 144(SP), DI
2226  	MOVQ 152(SP), SI
2227  	CALL p256MulInternal(SB) // acc = (x - z^2) * (x + z^2)
2228  	MOVQ R10, 128(SP)
2229  	MOVQ R11, 136(SP)
2230  	MOVQ R12, 144(SP)
2231  	MOVQ R13, 152(SP)
2232  
2233  	// Multiply by 3
	// First t = 2*acc mod p, then acc is reloaded from 128(SP) and
	// t = acc + t mod p, giving three times the product.
2234  	XORQ    AX, AX
2235  	ADDQ    R10, R10
2236  	ADCQ    R11, R11
2237  	ADCQ    R12, R12
2238  	ADCQ    R13, R13
2239  	ADCQ    $+0, AX
2240  	MOVQ    R10, R14
2241  	MOVQ    R11, R15
2242  	MOVQ    R12, DI
2243  	MOVQ    R13, SI
2244  	SUBQ    $-1, R14
2245  	SBBQ    p256const0<>+0(SB), R15
2246  	SBBQ    $+0, DI
2247  	SBBQ    p256const1<>+0(SB), SI
2248  	SBBQ    $+0, AX
2249  	CMOVQCS R10, R14
2250  	CMOVQCS R11, R15
2251  	CMOVQCS R12, DI
2252  	CMOVQCS R13, SI
2253  	MOVQ    128(SP), R10
2254  	MOVQ    136(SP), R11
2255  	MOVQ    144(SP), R12
2256  	MOVQ    152(SP), R13
2257  	XORQ    AX, AX
2258  	ADDQ    R14, R10
2259  	ADCQ    R15, R11
2260  	ADCQ    DI, R12
2261  	ADCQ    SI, R13
2262  	ADCQ    $+0, AX
2263  	MOVQ    R10, R14
2264  	MOVQ    R11, R15
2265  	MOVQ    R12, DI
2266  	MOVQ    R13, SI
2267  	SUBQ    $-1, R14
2268  	SBBQ    p256const0<>+0(SB), R15
2269  	SBBQ    $+0, DI
2270  	SBBQ    p256const1<>+0(SB), SI
2271  	SBBQ    $+0, AX
2272  	CMOVQCS R10, R14
2273  	CMOVQCS R11, R15
2274  	CMOVQCS R12, DI
2275  	CMOVQCS R13, SI
2276  	MOVQ    R14, 128(SP) // 128(SP) = m = 3*(x - z^2)*(x + z^2)
2277  	MOVQ    R15, 136(SP)
2278  	MOVQ    DI, 144(SP)
2279  	MOVQ    SI, 152(SP)
2280  
2281  	// ////////////////////////
	// Compute 4*y^2 (kept at 96(SP)) and 8*y^4 (kept at 32(SP)).
	// Start with t = 2*y mod p.
2282  	MOVQ    32(SP), R10
2283  	MOVQ    40(SP), R11
2284  	MOVQ    48(SP), R12
2285  	MOVQ    56(SP), R13
2286  	XORQ    AX, AX
2287  	ADDQ    R10, R10
2288  	ADCQ    R11, R11
2289  	ADCQ    R12, R12
2290  	ADCQ    R13, R13
2291  	ADCQ    $+0, AX
2292  	MOVQ    R10, R14
2293  	MOVQ    R11, R15
2294  	MOVQ    R12, DI
2295  	MOVQ    R13, SI
2296  	SUBQ    $-1, R14
2297  	SBBQ    p256const0<>+0(SB), R15
2298  	SBBQ    $+0, DI
2299  	SBBQ    p256const1<>+0(SB), SI
2300  	SBBQ    $+0, AX
2301  	CMOVQCS R10, R14
2302  	CMOVQCS R11, R15
2303  	CMOVQCS R12, DI
2304  	CMOVQCS R13, SI
2305  	MOVQ    R14, R10 // acc = 2*y
2306  	MOVQ    R15, R11
2307  	MOVQ    DI, R12
2308  	MOVQ    SI, R13
2309  	CALL    p256SqrInternal(SB) // acc = (2*y)^2 = 4*y^2
2310  	MOVQ    R10, 96(SP) // 96(SP) = 4*y^2
2311  	MOVQ    R11, 104(SP)
2312  	MOVQ    R12, 112(SP)
2313  	MOVQ    R13, 120(SP)
2314  	CALL    p256SqrInternal(SB) // acc = 16*y^4
2315  
2316  	// Divide by 2
	// acc + p is computed with the carry-out in AX. If the original value
	// (saved in t) is even, the original is restored and the carry is
	// cleared by the ANDQ (which keeps only bit 0 of R14 in AX). The
	// 257-bit value AX:acc is then shifted right by one bit.
2317  	XORQ    AX, AX
2318  	MOVQ    R10, R14
2319  	MOVQ    R11, R15
2320  	MOVQ    R12, DI
2321  	MOVQ    R13, SI
2322  	ADDQ    $-1, R10
2323  	ADCQ    p256const0<>+0(SB), R11
2324  	ADCQ    $0x00, R12
2325  	ADCQ    p256const1<>+0(SB), R13
2326  	ADCQ    $0x00, AX
2327  	TESTQ   $0x00000001, R14
2328  	CMOVQEQ R14, R10
2329  	CMOVQEQ R15, R11
2330  	CMOVQEQ DI, R12
2331  	CMOVQEQ SI, R13
2332  	ANDQ    R14, AX
2333  	SHRQ    $0x01, R11, R10
2334  	SHRQ    $0x01, R12, R11
2335  	SHRQ    $0x01, R13, R12
2336  	SHRQ    $0x01, AX, R13
2337  	MOVQ    R10, 32(SP) // 32(SP) = 8*y^4 (the copy of y is no longer needed)
2338  	MOVQ    R11, 40(SP)
2339  	MOVQ    R12, 48(SP)
2340  	MOVQ    R13, 56(SP)
2341  
2342  	// /////////////////////////
	// Compute s = 4*x*y^2 and x3 = m^2 - 2*s.
2343  	MOVQ    (SP), R10
2344  	MOVQ    8(SP), R11
2345  	MOVQ    16(SP), R12
2346  	MOVQ    24(SP), R13
2347  	MOVQ    96(SP), R14
2348  	MOVQ    104(SP), R15
2349  	MOVQ    112(SP), DI
2350  	MOVQ    120(SP), SI
2351  	CALL    p256MulInternal(SB) // acc = s = x * 4*y^2
2352  	MOVQ    R10, 96(SP) // 96(SP) = s
2353  	MOVQ    R11, 104(SP)
2354  	MOVQ    R12, 112(SP)
2355  	MOVQ    R13, 120(SP)
	// t = 2*s mod p.
2356  	XORQ    AX, AX
2357  	ADDQ    R10, R10
2358  	ADCQ    R11, R11
2359  	ADCQ    R12, R12
2360  	ADCQ    R13, R13
2361  	ADCQ    $+0, AX
2362  	MOVQ    R10, R14
2363  	MOVQ    R11, R15
2364  	MOVQ    R12, DI
2365  	MOVQ    R13, SI
2366  	SUBQ    $-1, R14
2367  	SBBQ    p256const0<>+0(SB), R15
2368  	SBBQ    $+0, DI
2369  	SBBQ    p256const1<>+0(SB), SI
2370  	SBBQ    $+0, AX
2371  	CMOVQCS R10, R14
2372  	CMOVQCS R11, R15
2373  	CMOVQCS R12, DI
2374  	CMOVQCS R13, SI
2375  	MOVQ    R14, 192(SP) // 192(SP) = 2*s
2376  	MOVQ    R15, 200(SP)
2377  	MOVQ    DI, 208(SP)
2378  	MOVQ    SI, 216(SP)
2379  	MOVQ    128(SP), R10
2380  	MOVQ    136(SP), R11
2381  	MOVQ    144(SP), R12
2382  	MOVQ    152(SP), R13
2383  	CALL    p256SqrInternal(SB) // acc = m^2
2384  	MOVQ    192(SP), R14
2385  	MOVQ    200(SP), R15
2386  	MOVQ    208(SP), DI
2387  	MOVQ    216(SP), SI
2388  	CALL    p256SubInternal(SB) // acc = x3 = m^2 - 2*s
2389  	MOVQ    224(SP), AX
2390  
2391  	// Store x
2392  	MOVQ R10, (AX)
2393  	MOVQ R11, 8(AX)
2394  	MOVQ R12, 16(AX)
2395  	MOVQ R13, 24(AX)
2396  	MOVQ R10, R14 // t = x3
2397  	MOVQ R11, R15
2398  	MOVQ R12, DI
2399  	MOVQ R13, SI
2400  	MOVQ 96(SP), R10
2401  	MOVQ 104(SP), R11
2402  	MOVQ 112(SP), R12
2403  	MOVQ 120(SP), R13
2404  	CALL p256SubInternal(SB) // acc = s - x3
2405  	MOVQ 128(SP), R14
2406  	MOVQ 136(SP), R15
2407  	MOVQ 144(SP), DI
2408  	MOVQ 152(SP), SI
2409  	CALL p256MulInternal(SB) // acc = m * (s - x3)
2410  	MOVQ 32(SP), R14
2411  	MOVQ 40(SP), R15
2412  	MOVQ 48(SP), DI
2413  	MOVQ 56(SP), SI
2414  	CALL p256SubInternal(SB) // acc = y3 = m*(s - x3) - 8*y^4
2415  	MOVQ 224(SP), AX
2416  
2417  	// Store y
2418  	MOVQ R10, 32(AX)
2419  	MOVQ R11, 40(AX)
2420  	MOVQ R12, 48(AX)
2421  	MOVQ R13, 56(AX)
2422  
2423  	// ///////////////////////
	// Overwrite the spilled res pointer with 0 before returning.
2424  	MOVQ $0x00000000, 224(SP)
2425  	RET
2426