fp_amd64.h raw

   1  // This code was imported from https://github.com/armfazh/rfc7748_precomputed
   2  
   3  // CHECK_BMI2ADX selects at run time between two implementations of the
   4  // same routine: bmi2adx runs when the byte ·hasBmi2Adx is nonzero,
      // otherwise the macro falls back to legacy.
      //   label:   name of the label the macro defines in front of legacy
      //            (it is the JE target).
      //   legacy:  instructions run when ·hasBmi2Adx == 0.
      //   bmi2adx: instructions run when ·hasBmi2Adx != 0.
      // Each path is followed by its own RET, so nothing placed after the
      // macro is reached.
   5  #define CHECK_BMI2ADX(label, legacy, bmi2adx) \
   6      CMPB ·hasBmi2Adx(SB), $0  \
   7      JE label                  \
   8      bmi2adx                   \
   9      RET                       \
  10      label:                    \
  11      legacy                    \
  12      RET
  13  
  14  // cselect is a branch-free conditional move of a 7-word (56-byte) value
  15  // if b=1: it copies y into x;
  16  // if b=0: x remains with the same value;
  17  // if b<> 0,1: undefined.
      // x and y are operands that take a byte-offset prefix (0+x, ..., 48+x).
      // b is tested once with TESTQ b, b; each word is then selected with
      // CMOVQNE, so every word of x is loaded and stored back whichever
      // value b has.
  18  // Uses: AX, DX, FLAGS
  19  // Instr: x86_64, cmov
  20  #define cselect(x,y,b) \
  21      TESTQ b, b \
  22      MOVQ  0+x, AX; MOVQ  0+y, DX; CMOVQNE DX, AX; MOVQ AX,  0+x; \
  23      MOVQ  8+x, AX; MOVQ  8+y, DX; CMOVQNE DX, AX; MOVQ AX,  8+x; \
  24      MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
  25      MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
  26      MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
  27      MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
  28      MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;
  29  
  30  // cswap is a branch-free conditional swap of two 7-word (56-byte) values
  31  // if b=1: x,y <- y,x;
  32  // if b=0: x,y remain with the same values;
  33  // if b<> 0,1: undefined.
      // x and y are operands that take a byte-offset prefix (0+x, ..., 48+x).
      // b is tested once with TESTQ b, b. For each word, R8 keeps a copy of
      // the word of x so that the two CMOVQNE can exchange AX and DX; both
      // words are stored back whichever value b has.
  34  // Uses: AX, DX, R8, FLAGS
  35  // Instr: x86_64, cmov
  36  #define cswap(x,y,b) \
  37      TESTQ b, b \
  38      MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  0+x; MOVQ DX,  0+y; \
  39      MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  8+x; MOVQ DX,  8+y; \
  40      MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
  41      MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
  42      MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
  43      MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
  44      MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;
  45  
  46  // additionLeg adds x and y and stores in z
      // x, y and z are 7-word (56-byte) operands addressed as 0+x, ..., 48+x.
      // The carry out of the 7-word sum is folded back in: it is added to
      // word 0 and, shifted left by 32 bits, to word 3, i.e. carry*(2^224+1)
      // is added. The carry out of that fold is captured in AX and folded in
      // the same way a second time while the result is stored to z.
      // NOTE(review): this matches a reduction modulo p = 2^448 - 2^224 - 1;
      // p is not defined in this file - confirm against the package.
  47  // Uses: AX, DX, R8-R14, FLAGS
  48  // Instr: x86_64
  49  #define additionLeg(z,x,y) \
  50      MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
  51      MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
  52      MOVQ 16+x, R10;  ADCQ 16+y, R10; \
  53      MOVQ 24+x, R11;  ADCQ 24+y, R11; \
  54      MOVQ 32+x, R12;  ADCQ 32+y, R12; \
  55      MOVQ 40+x, R13;  ADCQ 40+y, R13; \
  56      MOVQ 48+x, R14;  ADCQ 48+y, R14; \
  57      MOVQ   $0,  AX;  ADCQ   $0,  AX; \
  58      MOVQ AX,  DX; \
  59      SHLQ $32, DX; \
  60      ADDQ AX,  R8; MOVQ  $0, AX; \
  61      ADCQ $0,  R9; \
  62      ADCQ $0, R10; \
  63      ADCQ DX, R11; \
  64      ADCQ $0, R12; \
  65      ADCQ $0, R13; \
  66      ADCQ $0, R14; \
  67      ADCQ $0,  AX; \
  68      MOVQ AX,  DX; \
  69      SHLQ $32, DX; \
  70      ADDQ AX,  R8;  MOVQ  R8,  0+z; \
  71      ADCQ $0,  R9;  MOVQ  R9,  8+z; \
  72      ADCQ $0, R10;  MOVQ R10, 16+z; \
  73      ADCQ DX, R11;  MOVQ R11, 24+z; \
  74      ADCQ $0, R12;  MOVQ R12, 32+z; \
  75      ADCQ $0, R13;  MOVQ R13, 40+z; \
  76      ADCQ $0, R14;  MOVQ R14, 48+z;
  77  
  78  
  79  // additionAdx adds x and y and stores in z
      // x, y and z are 7-word (56-byte) operands addressed as 0+x, ..., 48+x.
      // Same computation as the legacy addition, written with ADCXQ: the
      // carry out of the 7-word sum is added to word 0 and, shifted left by
      // 32 bits, to word 3; the carry out of that fold is folded in the same
      // way a second time while the result is stored to z.
      // R15 holds the shift count 32 for SHLXQ; each XORL zeroes a register
      // and clears the carry flag before an ADCXQ chain starts.
  80  // Uses: AX, DX, R8-R15, FLAGS
  81  // Instr: x86_64, bmi2 (SHLXQ), adx
  82  #define additionAdx(z,x,y) \
  83      MOVL $32, R15; \
  84      XORL DX, DX; \
  85      MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
  86      MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
  87      MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
  88      MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
  89      MOVQ 32+x, R12;  ADCXQ 32+y, R12; \
  90      MOVQ 40+x, R13;  ADCXQ 40+y, R13; \
  91      MOVQ 48+x, R14;  ADCXQ 48+y, R14; \
  92      ;;;;;;;;;;;;;;;  ADCXQ   DX,  DX; \
  93      XORL AX, AX; \
  94      ADCXQ DX,  R8; SHLXQ R15, DX, DX; \
  95      ADCXQ AX,  R9; \
  96      ADCXQ AX, R10; \
  97      ADCXQ DX, R11; \
  98      ADCXQ AX, R12; \
  99      ADCXQ AX, R13; \
 100      ADCXQ AX, R14; \
 101      ADCXQ AX,  AX; \
 102      XORL  DX,  DX; \
 103      ADCXQ AX,  R8;  MOVQ  R8,  0+z; SHLXQ R15, AX, AX; \
 104      ADCXQ DX,  R9;  MOVQ  R9,  8+z; \
 105      ADCXQ DX, R10;  MOVQ R10, 16+z; \
 106      ADCXQ AX, R11;  MOVQ R11, 24+z; \
 107      ADCXQ DX, R12;  MOVQ R12, 32+z; \
 108      ADCXQ DX, R13;  MOVQ R13, 40+z; \
 109      ADCXQ DX, R14;  MOVQ R14, 48+z;
 110  
 111  // subtraction subtracts y from x and stores in z
      // x, y and z are 7-word (56-byte) operands addressed as 0+x, ..., 48+x.
      // The borrow out of the 7-word difference is folded back in: it is
      // subtracted from word 0 and, shifted left by 32 bits, from word 3,
      // i.e. borrow*(2^224+1) is subtracted. The borrow out of that fold is
      // captured with SETCS and folded in the same way a second time while
      // the result is stored to z.
      // AX is zeroed with MOVQ before the first SETCS, which writes only its
      // low byte.
 112  // Uses: AX, DX, R8-R14, FLAGS
 113  // Instr: x86_64
 114  #define subtraction(z,x,y) \
 115      MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
 116      MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
 117      MOVQ 16+x, R10;  SBBQ 16+y, R10; \
 118      MOVQ 24+x, R11;  SBBQ 24+y, R11; \
 119      MOVQ 32+x, R12;  SBBQ 32+y, R12; \
 120      MOVQ 40+x, R13;  SBBQ 40+y, R13; \
 121      MOVQ 48+x, R14;  SBBQ 48+y, R14; \
 122      MOVQ   $0,  AX;  SETCS AX; \
 123      MOVQ AX,  DX; \
 124      SHLQ $32, DX; \
 125      SUBQ AX,  R8; MOVQ  $0, AX; \
 126      SBBQ $0,  R9; \
 127      SBBQ $0, R10; \
 128      SBBQ DX, R11; \
 129      SBBQ $0, R12; \
 130      SBBQ $0, R13; \
 131      SBBQ $0, R14; \
 132      SETCS AX; \
 133      MOVQ AX,  DX; \
 134      SHLQ $32, DX; \
 135      SUBQ AX,  R8;  MOVQ  R8,  0+z; \
 136      SBBQ $0,  R9;  MOVQ  R9,  8+z; \
 137      SBBQ $0, R10;  MOVQ R10, 16+z; \
 138      SBBQ DX, R11;  MOVQ R11, 24+z; \
 139      SBBQ $0, R12;  MOVQ R12, 32+z; \
 140      SBBQ $0, R13;  MOVQ R13, 40+z; \
 141      SBBQ $0, R14;  MOVQ R14, 48+z;
 142  
 143  // maddBmi2Adx multiplies x by the word at i+y and adds the product to the
      // 7-word accumulator held in registers r0,...,r6 (r0 least significant).
      //   i:     byte offset; selects the word of y and the word of z stored.
      //   r0-r6: accumulator registers; they must differ from AX, DX and R8,
      //          which the macro overwrites.
      // The completed low word is stored at i+z. After that r0 is reused to
      // receive high halves, so on exit the accumulator is r1,...,r6,r0 with
      // r0 as its most significant word.
      // Low halves are accumulated with ADOXQ and high halves with ADCXQ;
      // XORL AX, AX clears both flags before the two chains start.
 144  // Uses: AX, DX, R8, r0-r6, FLAGS
 145  // Instr: x86_64, bmi2, adx
 146  #define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
 147      MOVQ   i+y, DX; XORL AX, AX; \
 148      MULXQ  0+x, AX, R8;  ADOXQ AX, r0;  ADCXQ R8, r1; MOVQ r0,i+z; \
 149      MULXQ  8+x, AX, r0;  ADOXQ AX, r1;  ADCXQ r0, r2; MOVQ $0, R8; \
 150      MULXQ 16+x, AX, r0;  ADOXQ AX, r2;  ADCXQ r0, r3; \
 151      MULXQ 24+x, AX, r0;  ADOXQ AX, r3;  ADCXQ r0, r4; \
 152      MULXQ 32+x, AX, r0;  ADOXQ AX, r4;  ADCXQ r0, r5; \
 153      MULXQ 40+x, AX, r0;  ADOXQ AX, r5;  ADCXQ r0, r6; \
 154      MULXQ 48+x, AX, r0;  ADOXQ AX, r6;  ADCXQ R8, r0; \
 155      ;;;;;;;;;;;;;;;;;;;  ADOXQ R8, r0;
 156  
 157  // integerMulAdx multiplies x and y and stores in z
      // x and y are 7-word operands; z receives the 14-word product
      // (0+z to 104+z). z is written while x and y are still being read, so
      // it must not overlap them.
      // The first row, x times the word at 0+y, is computed inline into
      // 0+z and R9-R15. The other six rows are added with maddBmi2Adx, whose
      // register arguments are rotated by one position on each call to follow
      // the accumulator. The seven words left in registers are stored at
      // 56+z to 104+z.
 158  // Uses: AX, DX, R8-R15, FLAGS
 159  // Instr: x86_64, bmi2, adx
 160  #define integerMulAdx(z,x,y) \
 161      MOVL    $0,R15; \
 162      MOVQ   0+y, DX;  XORL AX, AX;  MOVQ $0, R8; \
 163      MULXQ  0+x, AX,  R9;  MOVQ  AX, 0+z; \
 164      MULXQ  8+x, AX, R10;  ADCXQ AX,  R9; \
 165      MULXQ 16+x, AX, R11;  ADCXQ AX, R10; \
 166      MULXQ 24+x, AX, R12;  ADCXQ AX, R11; \
 167      MULXQ 32+x, AX, R13;  ADCXQ AX, R12; \
 168      MULXQ 40+x, AX, R14;  ADCXQ AX, R13; \
 169      MULXQ 48+x, AX, R15;  ADCXQ AX, R14; \
 170      ;;;;;;;;;;;;;;;;;;;;  ADCXQ R8, R15; \
 171      maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
 172      maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
 173      maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
 174      maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
 175      maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
 176      maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
 177      MOVQ R15,  56+z; \
 178      MOVQ  R9,  64+z; \
 179      MOVQ R10,  72+z; \
 180      MOVQ R11,  80+z; \
 181      MOVQ R12,  88+z; \
 182      MOVQ R13,  96+z; \
 183      MOVQ R14, 104+z;
 184  
 185  // maddLegacy multiplies x by the word at i+y and accumulates in z
      //   i: byte offset; selects the word of y and the position in z.
      // The 8-word product is built in R8-R14 and DX. Its seven low words are
      // added to the words at 0+i+z to 48+i+z and stored back; the eighth
      // word plus the final carry is stored (not added) at 56+i+z.
 186  // Uses: AX, DX, R8-R15, FLAGS
 187  // Instr: x86_64
 188  #define maddLegacy(z,x,y,i) \
 189      MOVQ  i+y, R15; \
 190      MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;; MOVQ DX,  R9; \
 191      MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
 192      MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
 193      MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
 194      MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
 195      MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
 196      MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
 197      ADDQ  0+i+z,  R8; MOVQ  R8,  0+i+z; \
 198      ADCQ  8+i+z,  R9; MOVQ  R9,  8+i+z; \
 199      ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
 200      ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
 201      ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
 202      ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
 203      ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
 204      ADCQ     $0,  DX; MOVQ  DX, 56+i+z;
 205  
 206  // integerMulLeg multiplies x and y and stores in z
      // x and y are 7-word operands; z receives the 14-word product
      // (0+z to 104+z). z is written while x and y are still being read, so
      // it must not overlap them.
      // The first row, x times the word at 0+y, is written to 0+z to 56+z.
      // The other six rows are accumulated with maddLegacy at byte offsets
      // 8, 16, ..., 48, each call also writing one new top word of z.
 207  // Uses: AX, DX, R8-R15, FLAGS
 208  // Instr: x86_64
 209  #define integerMulLeg(z,x,y) \
 210      MOVQ  0+y, R15; \
 211      MOVQ  0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX,  R8; \
 212      MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ  R8,  8+z; \
 213      MOVQ 16+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ  R9, 16+z; \
 214      MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
 215      MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
 216      MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
 217      MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
 218      maddLegacy(z,x,y, 8) \
 219      maddLegacy(z,x,y,16) \
 220      maddLegacy(z,x,y,24) \
 221      maddLegacy(z,x,y,32) \
 222      maddLegacy(z,x,y,40) \
 223      maddLegacy(z,x,y,48)
 224  
 225  // integerSqrLeg squares x and stores in z
      // x is a 7-word operand; z receives the 14-word square (0+z to 104+z).
      // z is written while x is still being read, so it must not overlap x.
      // The macro is organised as seven rows, one per word of x. Each row
      // stores two words of z.
      // NOTE(review): each row appears to multiply the remaining words of x
      // by a doubled copy of the current word kept in CX, with R15 carrying
      // the bit shifted out of the doubling into the next row, so that every
      // cross product is counted twice - confirm before reordering anything;
      // the statement order is tied to the carry flag.
 226  // Uses: AX, CX, DX, R8-R15, FLAGS
 227  // Instr: x86_64
 228  #define integerSqrLeg(z,x) \
 229      XORL R15, R15; \
 230      MOVQ  0+x, CX; \
 231      MOVQ   CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
 232      ADDQ   CX, CX; ADCQ $0, R15; \
 233      MOVQ  8+x, AX; MULQ CX; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ R8, 8+z; \
 234      MOVQ 16+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
 235      MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
 236      MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
 237      MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
 238      MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
 239      \
 240      MOVQ  8+x, CX; \
 241      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 242      ;;;;;;;;;;;;;; MULQ CX; ADDQ  AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
 243      MOVQ  R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
 244      ADDQ  8+x, CX; ADCQ $0, R15; \
 245      MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
 246      MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
 247      MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
 248      MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
 249      MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
 250      \
 251      MOVQ 16+x, CX; \
 252      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 253      ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
 254      MOVQ  R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
 255      ADDQ 16+x, CX; ADCQ $0, R15; \
 256      MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
 257      MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
 258      MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
 259      MOVQ 48+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX,R10; \
 260      \
 261      MOVQ 24+x, CX; \
 262      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 263      ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
 264      MOVQ  R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0,  R9; MOVQ DX, R8; \
 265      ADDQ 24+x, CX; ADCQ $0, R15; \
 266      MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
 267      MOVQ 40+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX, R8; \
 268      MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
 269      \
 270      MOVQ 32+x, CX; \
 271      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 272      ;;;;;;;;;;;;;; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ R9, 64+z; \
 273      MOVQ  R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
 274      ADDQ 32+x, CX; ADCQ $0, R15; \
 275      MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
 276      MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
 277      \
 278      XORL R13, R13; \
 279      XORL R14, R14; \
 280      MOVQ 40+x, CX; \
 281      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 282      ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
 283      MOVQ  R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
 284      ADDQ 40+x, CX; ADCQ $0, R15; \
 285      MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
 286      ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
 287      \
 288      XORL   R9, R9; \
 289      MOVQ 48+x, CX; \
 290      MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 291      ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
 292      MOVQ  R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
 293      ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;
 294  
 295  
 296  // integerSqrAdx squares x and stores in z
      // x is a 7-word operand; z receives the 14-word square (0+z to 104+z).
      // z is written while x is still being read, so it must not overlap x.
      // The macro is organised as seven rows, one per word of x. Each row
      // stores two words of z. The square term of a row is accumulated with
      // ADDQ/ADCQ; the remaining products use MULXQ with separate ADCXQ (low
      // halves) and ADOXQ (high halves) carry chains, started by an XORL that
      // clears both flags.
      // NOTE(review): each row appears to multiply the remaining words of x
      // by a doubled copy of the current word kept in DX, with R15 carrying
      // the bit shifted out of the doubling into the next row, so that every
      // cross product is counted twice - confirm before reordering anything;
      // the statement order is tied to the carry and overflow flags.
 297  // Uses: AX, CX, DX, R8-R15, FLAGS
 298  // Instr: x86_64, bmi2, adx
 299  #define integerSqrAdx(z,x) \
 300      XORL R15, R15; \
 301      MOVQ  0+x, DX; \
 302      ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
 303      ADDQ   DX, DX; ADCQ $0, R15; CLC; \
 304      MULXQ  8+x, AX,  R9; ADCXQ AX,  R8; MOVQ R8, 8+z; \
 305      MULXQ 16+x, AX, R10; ADCXQ AX,  R9; MOVQ $0, R8;\
 306      MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
 307      MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
 308      MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
 309      MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
 310      ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
 311      \
 312      MOVQ  8+x, DX; \
 313      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 314      MULXQ AX,  AX, CX; \
 315      MOVQ R15,  R8; NEGQ R8; ANDQ 8+x, R8; \
 316      ADDQ AX,  R9; MOVQ R9, 16+z; \
 317      ADCQ CX,  R8; \
 318      ADCQ $0, R11; \
 319      ADDQ  8+x,  DX; \
 320      ADCQ   $0, R15; \
 321      XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
 322      MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
 323      MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ  $0, R10; \
 324      MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
 325      MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
 326      MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
 327      ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
 328      \
 329      MOVQ 16+x, DX; \
 330      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 331      MULXQ AX,  AX, CX; \
 332      MOVQ R15,  R8; NEGQ R8; ANDQ 16+x, R8; \
 333      ADDQ AX, R11; MOVQ R11, 32+z; \
 334      ADCQ CX,  R8; \
 335      ADCQ $0, R13; \
 336      ADDQ 16+x,  DX; \
 337      ADCQ   $0, R15; \
 338      XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
 339      MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
 340      MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ  $0, R12; \
 341      MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
 342      MULXQ 48+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; \
 343      ;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
 344      \
 345      MOVQ 24+x, DX; \
 346      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 347      MULXQ AX,  AX, CX; \
 348      MOVQ R15,  R8; NEGQ R8; ANDQ 24+x, R8; \
 349      ADDQ AX, R13; MOVQ R13, 48+z; \
 350      ADCQ CX,  R8; \
 351      ADCQ $0,  R9; \
 352      ADDQ 24+x,  DX; \
 353      ADCQ   $0, R15; \
 354      XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
 355      MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; MOVQ R14, 56+z; \
 356      MULXQ 40+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; MOVQ  $0, R14; \
 357      MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
 358      ;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
 359      \
 360      MOVQ 32+x, DX; \
 361      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 362      MULXQ AX,  AX, CX; \
 363      MOVQ R15,  R8; NEGQ R8; ANDQ 32+x, R8; \
 364      ADDQ AX,  R9; MOVQ R9, 64+z; \
 365      ADCQ CX,  R8; \
 366      ADCQ $0, R11; \
 367      ADDQ 32+x,  DX; \
 368      ADCQ   $0, R15; \
 369      XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
 370      MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
 371      MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
 372      ;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
 373      \
 374      MOVQ 40+x, DX; \
 375      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 376      MULXQ AX,  AX, CX; \
 377      MOVQ R15,  R8; NEGQ R8; ANDQ 40+x, R8; \
 378      ADDQ AX, R11; MOVQ R11, 80+z; \
 379      ADCQ CX,  R8; \
 380      ADCQ $0, R13; \
 381      ADDQ 40+x,  DX; \
 382      ADCQ   $0, R15; \
 383      XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
 384      MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
 385      ;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
 386      \
 387      MOVQ 48+x, DX; \
 388      MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
 389      MULXQ AX,  AX, CX; \
 390      MOVQ R15,  R8; NEGQ R8; ANDQ 48+x, R8; \
 391      XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
 392      ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
 393      ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;
 394  
 395  // reduceFromDoubleLeg finds a z=x modulo p such that z<2^448 and stores in z
      // x is a 14-word operand (0+x to 104+x); z is a 7-word operand.
      // The high words C7..C13 are folded onto the low words C0..C6 in two
      // 7-word additions; the register layouts are given in the comments
      // inside the macro. The first sum is parked in 0+z to 48+z and read
      // back by the second addition, so z doubles as scratch space.
      // The carries of both additions are collected in R14, which is then
      // folded in twice: added to word 0 and, shifted left by 32 bits, to
      // word 3. The result is stored to z during the second fold.
 396  // Uses: AX, R8-R15, FLAGS
 397  // Instr: x86_64
 398  #define reduceFromDoubleLeg(z,x) \
 399      /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
 400      /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
 401      MOVQ 80+x,AX; MOVQ AX,R10; \
 402      MOVQ $0xFFFFFFFF00000000, R8; \
 403      ANDQ R8,R10; \
 404      \
 405      MOVQ $0,R14; \
 406      MOVQ 104+x,R13; SHLQ $1,R13,R14; \
 407      MOVQ  96+x,R12; SHLQ $1,R12,R13; \
 408      MOVQ  88+x,R11; SHLQ $1,R11,R12; \
 409      MOVQ  72+x, R9; SHLQ $1,R10,R11; \
 410      MOVQ  64+x, R8; SHLQ $1,R10; \
 411      MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
 412      MOVQ  56+x,R15; \
 413      \
 414      ADDQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
 415      ADCQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
 416      ADCQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
 417      ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
 418      ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
 419      ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
 420      ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
 421      ADCQ   $0,R14; \
 422      /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
 423      /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
 424      MOVQ R10, AX; \
 425      SHRQ $32,R11,R10; \
 426      SHRQ $32,R12,R11; \
 427      SHRQ $32,R13,R12; \
 428      SHRQ $32,R15,R13; \
 429      SHRQ $32, R8,R15; \
 430      SHRQ $32, R9, R8; \
 431      SHRQ $32, AX, R9; \
 432      \
 433      ADDQ  0+z,R10; \
 434      ADCQ  8+z,R11; \
 435      ADCQ 16+z,R12; \
 436      ADCQ 24+z,R13; \
 437      ADCQ 32+z,R15; \
 438      ADCQ 40+z, R8; \
 439      ADCQ 48+z, R9; \
 440      ADCQ   $0,R14; \
 441      /* ( c7) + (c6,...,c0) */ \
 442      /* (r14) */ \
 443      MOVQ R14, AX; SHLQ $32, AX; \
 444      ADDQ R14,R10; MOVQ  $0,R14; \
 445      ADCQ  $0,R11; \
 446      ADCQ  $0,R12; \
 447      ADCQ  AX,R13; \
 448      ADCQ  $0,R15; \
 449      ADCQ  $0, R8; \
 450      ADCQ  $0, R9; \
 451      ADCQ  $0,R14; \
 452      /* ( c7) + (c6,...,c0) */ \
 453      /* (r14) */ \
 454      MOVQ R14, AX; SHLQ $32,AX; \
 455      ADDQ R14,R10; MOVQ R10, 0+z; \
 456      ADCQ  $0,R11; MOVQ R11, 8+z; \
 457      ADCQ  $0,R12; MOVQ R12,16+z; \
 458      ADCQ  AX,R13; MOVQ R13,24+z; \
 459      ADCQ  $0,R15; MOVQ R15,32+z; \
 460      ADCQ  $0, R8; MOVQ  R8,40+z; \
 461      ADCQ  $0, R9; MOVQ  R9,48+z;
 462  
 463  // reduceFromDoubleAdx finds a z=x modulo p such that z<2^448 and stores in z
      // x is a 14-word operand (0+x to 104+x); z is a 7-word operand.
      // Same computation as the legacy reduction, with the additions written
      // as ADCXQ chains; the carry flag is cleared before each chain with
      // XORL AX,AX or CLC.
      // The high words C7..C13 are folded onto the low words C0..C6 in two
      // 7-word additions; the register layouts are given in the comments
      // inside the macro. The first sum is parked in 0+z to 48+z and read
      // back by the second addition, so z doubles as scratch space.
      // The carries of both additions are collected in R14, which is then
      // folded in twice: added to word 0 and, shifted left by 32 bits, to
      // word 3. The result is stored to z during the second fold.
 464  // Uses: AX, R8-R15, FLAGS
 465  // Instr: x86_64, adx
 466  #define reduceFromDoubleAdx(z,x) \
 467      /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
 468      /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
 469      MOVQ 80+x,AX; MOVQ AX,R10; \
 470      MOVQ $0xFFFFFFFF00000000, R8; \
 471      ANDQ R8,R10; \
 472      \
 473      MOVQ $0,R14; \
 474      MOVQ 104+x,R13; SHLQ $1,R13,R14; \
 475      MOVQ  96+x,R12; SHLQ $1,R12,R13; \
 476      MOVQ  88+x,R11; SHLQ $1,R11,R12; \
 477      MOVQ  72+x, R9; SHLQ $1,R10,R11; \
 478      MOVQ  64+x, R8; SHLQ $1,R10; \
 479      MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
 480      MOVQ  56+x,R15; \
 481      \
 482      XORL AX,AX; \
 483      ADCXQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
 484      ADCXQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
 485      ADCXQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
 486      ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
 487      ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
 488      ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
 489      ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
 490      ADCXQ   AX,R14; \
 491      /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
 492      /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
 493      MOVQ R10, AX; \
 494      SHRQ $32,R11,R10; \
 495      SHRQ $32,R12,R11; \
 496      SHRQ $32,R13,R12; \
 497      SHRQ $32,R15,R13; \
 498      SHRQ $32, R8,R15; \
 499      SHRQ $32, R9, R8; \
 500      SHRQ $32, AX, R9; \
 501      \
 502      XORL AX,AX; \
 503      ADCXQ  0+z,R10; \
 504      ADCXQ  8+z,R11; \
 505      ADCXQ 16+z,R12; \
 506      ADCXQ 24+z,R13; \
 507      ADCXQ 32+z,R15; \
 508      ADCXQ 40+z, R8; \
 509      ADCXQ 48+z, R9; \
 510      ADCXQ   AX,R14; \
 511      /* ( c7) + (c6,...,c0) */ \
 512      /* (r14) */ \
 513      MOVQ R14, AX; SHLQ $32, AX; \
 514      CLC; \
 515      ADCXQ R14,R10; MOVQ $0,R14; \
 516      ADCXQ R14,R11; \
 517      ADCXQ R14,R12; \
 518      ADCXQ  AX,R13; \
 519      ADCXQ R14,R15; \
 520      ADCXQ R14, R8; \
 521      ADCXQ R14, R9; \
 522      ADCXQ R14,R14; \
 523      /* ( c7) + (c6,...,c0) */ \
 524      /* (r14) */ \
 525      MOVQ R14, AX; SHLQ $32, AX; \
 526      CLC; \
 527      ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
 528      ADCXQ R14,R11; MOVQ R11, 8+z; \
 529      ADCXQ R14,R12; MOVQ R12,16+z; \
 530      ADCXQ  AX,R13; MOVQ R13,24+z; \
 531      ADCXQ R14,R15; MOVQ R15,32+z; \
 532      ADCXQ R14, R8; MOVQ  R8,40+z; \
 533      ADCXQ R14, R9; MOVQ  R9,48+z;
 534  
 535  // addSub calculates two operations: x,y = x+y,x-y
      // x and y are 7-word (56-byte) operands; both are updated in place.
      // The sum is computed first. While each word of the sum is stored to x,
      // the original word of x is read into AX beforehand and moved back into
      // R8-R14, so the subtraction starts from the original x. The difference
      // is stored to y at the end.
      // The carry of the sum and the borrow of the difference are each folded
      // back in twice: at word 0 and, shifted left by 32 bits, at word 3.
 536  // Uses: AX, DX, R8-R14, FLAGS
      // Instr: x86_64
 537  #define addSub(x,y) \
 538      MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
 539      MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
 540      MOVQ 16+x, R10;  ADCQ 16+y, R10; \
 541      MOVQ 24+x, R11;  ADCQ 24+y, R11; \
 542      MOVQ 32+x, R12;  ADCQ 32+y, R12; \
 543      MOVQ 40+x, R13;  ADCQ 40+y, R13; \
 544      MOVQ 48+x, R14;  ADCQ 48+y, R14; \
 545      MOVQ   $0,  AX;  ADCQ   $0,  AX; \
 546      MOVQ AX,  DX; \
 547      SHLQ $32, DX; \
 548      ADDQ AX,  R8; MOVQ  $0, AX; \
 549      ADCQ $0,  R9; \
 550      ADCQ $0, R10; \
 551      ADCQ DX, R11; \
 552      ADCQ $0, R12; \
 553      ADCQ $0, R13; \
 554      ADCQ $0, R14; \
 555      ADCQ $0,  AX; \
 556      MOVQ AX,  DX; \
 557      SHLQ $32, DX; \
 558      ADDQ AX,  R8;  MOVQ  0+x,AX; MOVQ  R8,  0+x; MOVQ AX,  R8; \
 559      ADCQ $0,  R9;  MOVQ  8+x,AX; MOVQ  R9,  8+x; MOVQ AX,  R9; \
 560      ADCQ $0, R10;  MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
 561      ADCQ DX, R11;  MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
 562      ADCQ $0, R12;  MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
 563      ADCQ $0, R13;  MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
 564      ADCQ $0, R14;  MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
 565      SUBQ  0+y,  R8; \
 566      SBBQ  8+y,  R9; \
 567      SBBQ 16+y, R10; \
 568      SBBQ 24+y, R11; \
 569      SBBQ 32+y, R12; \
 570      SBBQ 40+y, R13; \
 571      SBBQ 48+y, R14; \
 572      MOVQ   $0,  AX;  SETCS AX; \
 573      MOVQ AX,  DX; \
 574      SHLQ $32, DX; \
 575      SUBQ AX,  R8; MOVQ  $0, AX; \
 576      SBBQ $0,  R9; \
 577      SBBQ $0, R10; \
 578      SBBQ DX, R11; \
 579      SBBQ $0, R12; \
 580      SBBQ $0, R13; \
 581      SBBQ $0, R14; \
 582      SETCS AX; \
 583      MOVQ AX,  DX; \
 584      SHLQ $32, DX; \
 585      SUBQ AX,  R8;  MOVQ  R8,  0+y; \
 586      SBBQ $0,  R9;  MOVQ  R9,  8+y; \
 587      SBBQ $0, R10;  MOVQ R10, 16+y; \
 588      SBBQ DX, R11;  MOVQ R11, 24+y; \
 589      SBBQ $0, R12;  MOVQ R12, 32+y; \
 590      SBBQ $0, R13;  MOVQ R13, 40+y; \
 591      SBBQ $0, R14;  MOVQ R14, 48+y;
 592