curve_amd64.s raw

   1  //go:build amd64 && !purego
   2  // +build amd64,!purego
   3  
   4  #include "textflag.h"
   5  
   6  // Depends on circl/math/fp448 package
   7  #include "../../math/fp448/fp_amd64.h"
   8  #include "curve_amd64.h"
   9  
  10  // CTE_A24 is (A+2)/4 from Curve448
  11  #define CTE_A24 39082
  12  
  13  #define Size 56
  14  
  15  // multiplyA24Leg computes z = x * CTE_A24 and stores the folded result in z.
      // This is the variant built on the legacy one-operand MULQ multiply.
      //   z, x: memory operands such as 0(DI); seven 64-bit limbs are accessed
      //         at byte offsets 0, 8, ..., 48, least-significant limb first.
      // Steps:
      //   1. R15 holds the constant. Each limb of x is multiplied by R15; the low
      //      half of every product is added to the high half of the previous one,
      //      building R8..R14. The high half of the last product (plus carry) is
      //      left in DX; call it c.
      //   2. c is folded back into the seven limbs by adding c to limb 0 (R8) and
      //      c<<32 to limb 3 (R11), i.e. by adding c*(2^224 + 1). That matches
      //      2^448 = 2^224 + 1 modulo 2^448 - 2^224 - 1 (presumably the fp448
      //      modulus -- confirm in the fp448 package).
      //      "MOVQ $0, DX" clears DX without touching the flags, so the carry of
      //      the preceding ADDQ survives into the ADCQ chain.
      //   3. The carry out of step 2 is collected in DX (0 or 1) and folded once
      //      more in the same way. The carry flag left by the final ADCQ is not
      //      consumed, and no conditional subtraction of the modulus follows.
      //   4. R8..R14 are written to z. All loads from x happen before the first
      //      store to z, so a store cannot clobber a limb of x that is still
      //      to be read.
  16  // Uses: AX, DX, R8-R15, FLAGS
  17  // Instr: x86_64 only (MOVQ, MULQ, ADDQ, ADCQ, SHLQ); no cmov, bmi2 or adx
  18  #define multiplyA24Leg(z,x) \
  19      MOVQ $CTE_A24, R15; \
  20      MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;;  MOVQ DX,  R9; \
  21      MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX;  MOVQ DX, R10; \
  22      MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX;  MOVQ DX, R11; \
  23      MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX;  MOVQ DX, R12; \
  24      MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX;  MOVQ DX, R13; \
  25      MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX;  MOVQ DX, R14; \
  26      MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
  27      MOVQ DX,  AX; \
  28      SHLQ $32, AX; \
  29      ADDQ DX,  R8; MOVQ $0, DX; \
  30      ADCQ $0,  R9; \
  31      ADCQ $0, R10; \
  32      ADCQ AX, R11; \
  33      ADCQ $0, R12; \
  34      ADCQ $0, R13; \
  35      ADCQ $0, R14; \
  36      ADCQ $0,  DX; \
  37      MOVQ DX,  AX; \
  38      SHLQ $32, AX; \
  39      ADDQ DX,  R8; \
  40      ADCQ $0,  R9; \
  41      ADCQ $0, R10; \
  42      ADCQ AX, R11; \
  43      ADCQ $0, R12; \
  44      ADCQ $0, R13; \
  45      ADCQ $0, R14; \
  46      MOVQ  R8,  0+z; \
  47      MOVQ  R9,  8+z; \
  48      MOVQ R10, 16+z; \
  49      MOVQ R11, 24+z; \
  50      MOVQ R12, 32+z; \
  51      MOVQ R13, 40+z; \
  52      MOVQ R14, 48+z;
  53  
  54  // multiplyA24Adx computes z = x * CTE_A24 and stores the folded result in z.
      // This is the variant built on the three-operand MULXQ multiply.
      //   z, x: memory operands such as 0(DI); seven 64-bit limbs are accessed
      //         at byte offsets 0, 8, ..., 48, least-significant limb first.
      // Steps:
      //   1. DX holds the constant (the implicit MULX source operand). Each
      //      MULXQ writes the low half of a product to its second operand and the
      //      high half to its third. MULXQ does not modify the flags, so a single
      //      ADDQ/ADCQ carry chain runs across the interleaved multiplies while
      //      building R8..R14. The last MULXQ overwrites DX with the top high
      //      half; after the trailing ADCQ $0, DX holds the overflow word c.
      //   2. c is folded back by adding c to limb 0 (R8) and c<<32 to limb 3
      //      (R11), i.e. by adding c*(2^224 + 1). That matches
      //      2^448 = 2^224 + 1 modulo 2^448 - 2^224 - 1 (presumably the fp448
      //      modulus -- confirm in the fp448 package).
      //      "MOVQ $0, DX" clears DX without touching the flags, so the carry of
      //      the preceding ADDQ survives into the ADCQ chain.
      //   3. The carry out of step 2 is collected in DX (0 or 1) and folded once
      //      more in the same way. The carry flag left by the final ADCQ is not
      //      consumed, and no conditional subtraction of the modulus follows.
      //   4. R8..R14 are written to z. All loads from x happen before the first
      //      store to z.
      // Despite the "Adx" suffix, no ADCX/ADOX instruction appears in this macro;
      // MULXQ (BMI2) is the only instruction beyond the x86_64 baseline.
  55  // Uses: AX, DX, R8-R14, FLAGS
  56  // Instr: x86_64, bmi2
  57  #define multiplyA24Adx(z,x) \
  58      MOVQ $CTE_A24, DX; \
  59      MULXQ  0+x, R8,  R9; \
  60      MULXQ  8+x, AX, R10;  ADDQ AX,  R9; \
  61      MULXQ 16+x, AX, R11;  ADCQ AX, R10; \
  62      MULXQ 24+x, AX, R12;  ADCQ AX, R11; \
  63      MULXQ 32+x, AX, R13;  ADCQ AX, R12; \
  64      MULXQ 40+x, AX, R14;  ADCQ AX, R13; \
  65      MULXQ 48+x, AX,  DX;  ADCQ AX, R14; \
  66      ;;;;;;;;;;;;;;;;;;;;  ADCQ $0,  DX; \
  67      MOVQ DX,  AX; \
  68      SHLQ $32, AX; \
  69      ADDQ DX,  R8; MOVQ $0, DX; \
  70      ADCQ $0,  R9; \
  71      ADCQ $0, R10; \
  72      ADCQ AX, R11; \
  73      ADCQ $0, R12; \
  74      ADCQ $0, R13; \
  75      ADCQ $0, R14; \
  76      ADCQ $0,  DX; \
  77      MOVQ DX,  AX; \
  78      SHLQ $32, AX; \
  79      ADDQ DX,  R8; \
  80      ADCQ $0,  R9; \
  81      ADCQ $0, R10; \
  82      ADCQ AX, R11; \
  83      ADCQ $0, R12; \
  84      ADCQ $0, R13; \
  85      ADCQ $0, R14; \
  86      MOVQ  R8,  0+z; \
  87      MOVQ  R9,  8+z; \
  88      MOVQ R10, 16+z; \
  89      MOVQ R11, 24+z; \
  90      MOVQ R12, 32+z; \
  91      MOVQ R13, 40+z; \
  92      MOVQ R14, 48+z;
  93  
  94  #define mulA24Legacy \
  95      multiplyA24Leg(0(DI),0(SI))
  96  #define mulA24Bmi2Adx \
  97      multiplyA24Adx(0(DI),0(SI))
  98  
      // mulA24Legacy and mulA24Bmi2Adx (the two macros directly above) bind the
      // multiplyA24Leg / multiplyA24Adx bodies to z = 0(DI) and x = 0(SI), which
      // are the registers loaded by mulA24Amd64 below.
      //
  99  // func mulA24Amd64(z, x *fp448.Elt)
      // mulA24Amd64 stores x * CTE_A24 in z.
      //   z: destination pointer, loaded into DI.
      //   x: source pointer, loaded into SI.
      // Frame $0-16: no locals, two 8-byte pointer arguments.
      // Clobbers whatever the selected macro uses (AX, DX, R8-R15, FLAGS at
      // most) in addition to DI and SI.
      // NOTE(review): CHECK_BMI2ADX is not defined in this file (presumably it
      // comes from fp_amd64.h). It presumably selects one of the two macro bodies
      // at run time, uses LMA24 as its local label, and emits the RET -- this
      // function has no explicit RET of its own. Confirm against the macro.
 100  TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
 101      MOVQ z+0(FP), DI
 102      MOVQ x+8(FP), SI
 103      CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
 104  
 105  // func ladderStepAmd64(w *[5]fp448.Elt, b uint)
 106  // ladderStepAmd64 calculates a point addition and doubling as follows:
 107  // (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
 108  //    w    = {x1,x2,z2,x3,z3} are five fp448.Elt of 56 bytes.
 109  //  stack  = (t0,t1) are two fp.Elt of fp.Size bytes, and
 110  //           (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
      //  frame  = $336 = 2*Size + 2*(2*Size) bytes with Size = 56:
      //           t0 at 0, t1 at 56, b0 at 112..223, b1 at 224..335.
      // NOTE(review): ladderStepLeg, ladderStepBmi2Adx and CHECK_BMI2ADX are not
      // defined in this file (presumably curve_amd64.h and fp_amd64.h). The names
      // defined below are presumably what those macros operate on, so they must
      // stay in sync with the headers. CHECK_BMI2ADX presumably also emits the
      // RET, since none appears here. Confirm against the macro definitions.
 111  TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
 112      // Parameters: regWork = w (base of the five elements), regMove = b
 113      #define regWork DI
 114      #define regMove SI
          // Element i of w lives at byte offset i*Size from regWork.
 115      #define x1 0*Size(regWork)
 116      #define x2 1*Size(regWork)
 117      #define z2 2*Size(regWork)
 118      #define x3 3*Size(regWork)
 119      #define z3 4*Size(regWork)
 120      // Local variables: t0,t1 take one Size slot each; b0,b1 take two each
 121      #define t0 0*Size(SP)
 122      #define t1 1*Size(SP)
 123      #define b0 2*Size(SP)
 124      #define b1 4*Size(SP)
 125      MOVQ w+0(FP), regWork
 126      MOVQ b+8(FP), regMove
 127      CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
          // Release the names: regWork, x1, x2, z2, b0 and b1 are defined again,
          // with different meanings, by diffAddAmd64 further down this file.
 128      #undef regWork
 129      #undef regMove
 130      #undef x1
 131      #undef x2
 132      #undef z2
 133      #undef x3
 134      #undef z3
 135      #undef t0
 136      #undef t1
 137      #undef b0
 138      #undef b1
 139  
 140  // func diffAddAmd64(work *[5]fp.Elt, swap uint)
 141  // diffAddAmd64 calculates a differential point addition using a precomputed point.
 142  // (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
 143  //    work = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
 144  //   stack = (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
 145  // This is Equation 7 at https://eprint.iacr.org/2017/264.
      //   frame = $224 = 2*(2*Size) bytes with Size = 56: b0 at 0..111, b1 at 112..223.
      // Before the addition, (x1,x2) and (z1,z2) are passed to cswap together
      // with the second argument.
      // NOTE(review): the frame references below use the argument names w and b,
      // while the prototype in this comment says work and swap; Go's vet asmdecl
      // check expects the names to match the Go declaration -- confirm which is
      // right. Slot 0, called "mu" above, is named ui in the code.
      // NOTE(review): cswap, difAddLeg, difAddBmi2Adx and CHECK_BMI2ADX are not
      // defined in this file (presumably curve_amd64.h and fp_amd64.h). cswap is
      // presumably a conditional swap controlled by regSwap, and CHECK_BMI2ADX
      // presumably emits the RET, since none appears here. Confirm against the
      // macro definitions.
 146  TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
 147      // Parameters: regWork = first argument (base of the five elements),
          //             regSwap = second argument
 148      #define regWork DI
 149      #define regSwap SI
          // Element i of the work array lives at byte offset i*Size from regWork.
 150      #define ui 0*Size(regWork)
 151      #define x1 1*Size(regWork)
 152      #define z1 2*Size(regWork)
 153      #define x2 3*Size(regWork)
 154      #define z2 4*Size(regWork)
 155      // Local variables: b0 and b1 take two Size slots each
 156      #define b0 0*Size(SP)
 157      #define b1 2*Size(SP)
 158      MOVQ w+0(FP), regWork
 159      MOVQ b+8(FP), regSwap
 160      cswap(x1,x2,regSwap)
 161      cswap(z1,z2,regSwap)
 162      CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
          // Release the names: x1, z1, b0 and b1 are defined again, with different
          // meanings, by doubleAmd64 further down this file.
 163      #undef regWork
 164      #undef regSwap
 165      #undef ui
 166      #undef x1
 167      #undef z1
 168      #undef x2
 169      #undef z2
 170      #undef b0
 171      #undef b1
 172  
 173  // func doubleAmd64(x, z *fp448.Elt)
 174  // doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
      //      x = pointer to x1, loaded into DI; z = pointer to z1, loaded into SI.
 175  //  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
 176  //          (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
      //  frame = $336 = 2*Size + 2*(2*Size) bytes with Size = 56:
      //          t0 at 0, t1 at 56, b0 at 112..223, b1 at 224..335.
      // NOTE(review): doubleLeg, doubleBmi2Adx and CHECK_BMI2ADX are not defined
      // in this file (presumably curve_amd64.h and fp_amd64.h). The names defined
      // below are presumably what those macros operate on, and CHECK_BMI2ADX
      // presumably emits the RET, since none appears here. Confirm against the
      // macro definitions.
 177  TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
 178      // Parameters: both coordinates are addressed directly through DI and SI
 179      #define x1 0(DI)
 180      #define z1 0(SI)
 181      // Local variables: t0,t1 take one Size slot each; b0,b1 take two each
 182      #define t0 0*Size(SP)
 183      #define t1 1*Size(SP)
 184      #define b0 2*Size(SP)
 185      #define b1 4*Size(SP)
 186      MOVQ x+0(FP), DI
 187      MOVQ z+8(FP), SI
 188      CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
          // Release the names so they do not leak past this function.
 189      #undef x1
 190      #undef z1
 191      #undef t0
 192      #undef t1
 193      #undef b0
 194      #undef b1
 195