p256_asm_s390x.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
   7  #include "textflag.h"
   8  #include "go_asm.h"
   9  
  10  DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 prime, bits 255..192 (offset 0: high half, loaded as PH)
  11  DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 prime, bits 191..128
  12  DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 prime, bits 127..64 (offset 16: low half, loaded as PL)
  13  DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 prime, bits 63..0
      // 16-byte VPERM index vectors. p256FromMont loads offset 48 as SEL2 and
      // offset 64 as SEL1. The vector at offset 0x20 is not referenced in the
      // part of the file reviewed here.
  14  DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
  15  DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
  16  DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
  17  DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
  18  DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
  19  DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
      // Byte-reversal index vector; p256SelectAffine loads it from offset 80.
  20  DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
  21  DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
      // Second pool, used by p256NegCond, p256Mul, p256Sqr and p256MulInternal.
      // It starts with the same prime, laid out the same way (high half first).
  22  DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
  23  DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
  24  DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
  25  DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
      // VPERM index vectors for p256MulInternal, which loads offsets
      // 32/48/64/80/96/112 as SEL1/SEL2/SEL3/SEL4/SEL5/SEL6 respectively.
  26  DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
  27  DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
  28  DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
  29  DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
  30  DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
  31  DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
  32  DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
  33  DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
  34  DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
  35  DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
  36  DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
  37  DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
      // 2^256 mod P256, most significant doubleword first. Not referenced in the
      // part of the file reviewed here.
  38  DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
  39  DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
  40  DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
  41  DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
  42  GLOBL p256<>(SB), 8, $96 // 96 bytes (offsets 0x00-0x5f); flag 8 is RODATA in textflag.h
  43  GLOBL p256mul<>(SB), 8, $160 // 160 bytes (offsets 0x00-0x9f); flag 8 is RODATA in textflag.h
  44  
  45  // ---------------------------------------
  46  // iff cond == 1  val <- -val
  47  // func p256NegCond(val *p256Element, cond int)
      //
      // Replaces *val with P - val when cond is non-zero and leaves it unchanged
      // when cond is zero. Both candidates are always computed and the result is
      // picked with VSEL, so there is no branch on cond. The select mask is built
      // from "cond == 0", so any non-zero cond (not only 1) selects the negation.
      // NOTE(review): val == 0 is not special-cased here; the negated candidate
      // is then P itself rather than 0.
      // Uses R1, R4, V0-V6, V30 and V31.
  48  #define P1ptr   R1
  49  #define CPOOL   R4
  50  
  51  #define Y1L   V0
  52  #define Y1H   V1
  53  #define T1L   V2
  54  #define T1H   V3
  55  
  56  #define PL    V30
  57  #define PH    V31
  58  
  59  #define ZER   V4
  60  #define SEL1  V5
  61  #define CAR1  V6
  62  TEXT ·p256NegCond(SB), NOSPLIT, $0
  63  	MOVD val+0(FP), P1ptr
  64  
      	// PH:PL = the prime P (low 128 bits at pool offset 16, high 128 bits at 0).
  65  	MOVD $p256mul<>+0x00(SB), CPOOL
  66  	VL   16(CPOOL), PL
  67  	VL   0(CPOOL), PH
  68  
      	// Load val as two 128-bit halves. VPDI $0x4 swaps the two doublewords of
      	// each vector after the load (and again before the store below).
  69  	VL   16(P1ptr), Y1H
  70  	VPDI $0x4, Y1H, Y1H, Y1H
  71  	VL   0(P1ptr), Y1L
  72  	VPDI $0x4, Y1L, Y1L, Y1L
  73  
      	// SEL1 = all ones in both doublewords when cond == 0, all zeros otherwise.
  74  	VLREPG cond+8(FP), SEL1
  75  	VZERO  ZER
  76  	VCEQG  SEL1, ZER, SEL1
  77  
      	// T1H:T1L = PH:PL - Y1H:Y1L, a 256-bit subtraction with the borrow
      	// indication of the low half passed to the high half through CAR1.
  78  	VSCBIQ Y1L, PL, CAR1
  79  	VSQ    Y1L, PL, T1L
  80  	VSBIQ  PH, Y1H, CAR1, T1H
  81  
      	// Keep the original value where SEL1 is set (cond == 0), else take P - val.
  82  	VSEL Y1L, T1L, SEL1, Y1L
  83  	VSEL Y1H, T1H, SEL1, Y1H
  84  
      	// Undo the doubleword swap and write the result back over *val.
  85  	VPDI $0x4, Y1H, Y1H, Y1H
  86  	VST  Y1H, 16(P1ptr)
  87  	VPDI $0x4, Y1L, Y1L, Y1L
  88  	VST  Y1L, 0(P1ptr)
  89  	RET
  90  
  91  #undef P1ptr
  92  #undef CPOOL
  93  #undef Y1L
  94  #undef Y1H
  95  #undef T1L
  96  #undef T1H
  97  #undef PL
  98  #undef PH
  99  #undef ZER
 100  #undef SEL1
 101  #undef CAR1
 102  
 103  // ---------------------------------------
 104  // if cond == 0 res <- b; else res <- a
 105  // func p256MovCond(res, a, b *P256Point, cond int)
      //
      // Copies 96 bytes (six 16-byte vectors) to res, taken from b when cond is
      // zero and from a otherwise. Both inputs are always loaded in full and the
      // choice is made with VSEL, so there is no branch on cond. All loads happen
      // before the first store.
      // Uses R1-R3, V0-V11, V18 and V19.
 106  #define P3ptr   R1
 107  #define P1ptr   R2
 108  #define P2ptr   R3
 109  
 110  #define X1L    V0
 111  #define X1H    V1
 112  #define Y1L    V2
 113  #define Y1H    V3
 114  #define Z1L    V4
 115  #define Z1H    V5
 116  #define X2L    V6
 117  #define X2H    V7
 118  #define Y2L    V8
 119  #define Y2H    V9
 120  #define Z2L    V10
 121  #define Z2H    V11
 122  
 123  #define ZER   V18
 124  #define SEL1  V19
 125  TEXT ·p256MovCond(SB), NOSPLIT, $0
 126  	MOVD   res+0(FP), P3ptr
 127  	MOVD   a+8(FP), P1ptr
 128  	MOVD   b+16(FP), P2ptr
      	// SEL1 = all ones in both doublewords when cond == 0, all zeros otherwise.
 129  	VLREPG cond+24(FP), SEL1
 130  	VZERO  ZER
 131  	VCEQG  SEL1, ZER, SEL1
 132  
      	// Load a. The bytes are moved as-is; no doubleword swap is applied here.
 133  	VL 0(P1ptr), X1H
 134  	VL 16(P1ptr), X1L
 135  	VL 32(P1ptr), Y1H
 136  	VL 48(P1ptr), Y1L
 137  	VL 64(P1ptr), Z1H
 138  	VL 80(P1ptr), Z1L
 139  
      	// Load b.
 140  	VL 0(P2ptr), X2H
 141  	VL 16(P2ptr), X2L
 142  	VL 32(P2ptr), Y2H
 143  	VL 48(P2ptr), Y2L
 144  	VL 64(P2ptr), Z2H
 145  	VL 80(P2ptr), Z2L
 146  
      	// Where SEL1 is set (cond == 0) take b's vector, otherwise keep a's.
 147  	VSEL X2L, X1L, SEL1, X1L
 148  	VSEL X2H, X1H, SEL1, X1H
 149  	VSEL Y2L, Y1L, SEL1, Y1L
 150  	VSEL Y2H, Y1H, SEL1, Y1H
 151  	VSEL Z2L, Z1L, SEL1, Z1L
 152  	VSEL Z2H, Z1H, SEL1, Z1H
 153  
      	// Store the selected vectors at the same offsets they were loaded from.
 154  	VST X1H, 0(P3ptr)
 155  	VST X1L, 16(P3ptr)
 156  	VST Y1H, 32(P3ptr)
 157  	VST Y1L, 48(P3ptr)
 158  	VST Z1H, 64(P3ptr)
 159  	VST Z1L, 80(P3ptr)
 160  
 161  	RET
 162  
 163  #undef P3ptr
 164  #undef P1ptr
 165  #undef P2ptr
 166  #undef X1L
 167  #undef X1H
 168  #undef Y1L
 169  #undef Y1H
 170  #undef Z1L
 171  #undef Z1H
 172  #undef X2L
 173  #undef X2H
 174  #undef Y2L
 175  #undef Y2H
 176  #undef Z2L
 177  #undef Z2H
 178  #undef ZER
 179  #undef SEL1
 180  
 181  // ---------------------------------------
 182  // Constant time table access
 183  // Indexed from 1 to 16, with -1 offset: idx == k selects the k-th 96-byte
      // entry, stored at byte offset 96*(k-1) of table.
 184  // (index 0 is implicitly point at infinity: nothing matches, res is all zero)
 185  // func p256Select(res *P256Point, table *p256Table, idx int)
      //
      // The loop always visits all 16 entries; the only branch depends on the loop
      // counter, never on idx. Only the low-order byte of idx is examined.
      // Uses R1, R2, R4, V0-V11 and V18-V21. P1ptr is advanced past the table.
 186  #define P3ptr   R1
 187  #define P1ptr   R2
 188  #define COUNT   R4
 189  
 190  #define X1L    V0
 191  #define X1H    V1
 192  #define Y1L    V2
 193  #define Y1H    V3
 194  #define Z1L    V4
 195  #define Z1H    V5
 196  #define X2L    V6
 197  #define X2H    V7
 198  #define Y2L    V8
 199  #define Y2H    V9
 200  #define Z2L    V10
 201  #define Z2H    V11
 202  
 203  #define ONE   V18
 204  #define IDX   V19
 205  #define SEL1  V20
 206  #define SEL2  V21
 207  TEXT ·p256Select(SB), NOSPLIT, $0
 208  	MOVD   res+0(FP), P3ptr
 209  	MOVD   table+8(FP), P1ptr
      	// Replicate the last byte of the 8-byte idx argument (its low-order byte
      	// on big-endian s390x) into every byte of IDX.
 210  	VLREPB idx+(16+7)(FP), IDX
 211  	VREPIB $1, ONE
 212  	VREPIB $1, SEL2 // SEL2 = per-byte entry counter, starts at 1
 213  	MOVD   $1, COUNT
 214  
      	// Accumulator starts as all zeros; it stays zero if no entry matches.
 215  	VZERO X1H
 216  	VZERO X1L
 217  	VZERO Y1H
 218  	VZERO Y1L
 219  	VZERO Z1H
 220  	VZERO Z1L
 221  
 222  loop_select:
      	// Load the current 96-byte table entry.
 223  	VL 0(P1ptr), X2H
 224  	VL 16(P1ptr), X2L
 225  	VL 32(P1ptr), Y2H
 226  	VL 48(P1ptr), Y2L
 227  	VL 64(P1ptr), Z2H
 228  	VL 80(P1ptr), Z2L
 229  
      	// Both operands hold one byte value replicated 16 times, so the
      	// doubleword compare makes SEL1 all ones exactly when counter == idx byte.
 230  	VCEQG SEL2, IDX, SEL1
 231  
      	// Take the entry where SEL1 is set, otherwise keep the accumulator.
 232  	VSEL X2L, X1L, SEL1, X1L
 233  	VSEL X2H, X1H, SEL1, X1H
 234  	VSEL Y2L, Y1L, SEL1, Y1L
 235  	VSEL Y2H, Y1H, SEL1, Y1H
 236  	VSEL Z2L, Z1L, SEL1, Z1L
 237  	VSEL Z2H, Z1H, SEL1, Z1H
 238  
 239  	VAB  SEL2, ONE, SEL2 // byte-wise counter += 1
 240  	ADDW $1, COUNT
 241  	ADD  $96, P1ptr      // next table entry
 242  	CMPW COUNT, $17      // COUNT runs 1..16, i.e. 16 iterations
 243  	BLT  loop_select
 244  
 245  	VST X1H, 0(P3ptr)
 246  	VST X1L, 16(P3ptr)
 247  	VST Y1H, 32(P3ptr)
 248  	VST Y1L, 48(P3ptr)
 249  	VST Z1H, 64(P3ptr)
 250  	VST Z1L, 80(P3ptr)
 251  	RET
 252  
 253  #undef P3ptr
 254  #undef P1ptr
 255  #undef COUNT
 256  #undef X1L
 257  #undef X1H
 258  #undef Y1L
 259  #undef Y1H
 260  #undef Z1L
 261  #undef Z1H
 262  #undef X2L
 263  #undef X2H
 264  #undef Y2L
 265  #undef Y2H
 266  #undef Z2L
 267  #undef Z2H
 268  #undef ONE
 269  #undef IDX
 270  #undef SEL1
 271  #undef SEL2
 272  
 273  // ---------------------------------------
 274  
 275  //  func p256FromMont(res, in *p256Element)
      //
      // Reads the 256-bit value at in, runs four identical reduction rounds and a
      // final conditional subtraction of P, and stores the result at res.
      // Each round builds RED2:RED1 from the current low doubleword of T (the
      // "d1 d0" words named in the VPERM comments), shifts T2:T1:T0 right by
      // 8 bytes with VSLDB, and adds RED2:RED1 with full carry propagation.
      // The code is straight-line: no branches, no data-dependent addressing.
      // Uses R1, R2, R4, V0-V4 and V6-V14.
 276  #define res_ptr R1
 277  #define x_ptr   R2
 278  #define CPOOL   R4
 279  
 280  #define T0   V0
 281  #define T1   V1
 282  #define T2   V2
 283  #define TT0  V3
 284  #define TT1  V4
 285  
 286  #define ZER   V6
 287  #define SEL1  V7
 288  #define SEL2  V8
 289  #define CAR1  V9
 290  #define CAR2  V10
 291  #define RED1  V11
 292  #define RED2  V12
 293  #define PL    V13
 294  #define PH    V14
 295  
 296  TEXT ·p256FromMont(SB), NOSPLIT, $0
 297  	MOVD res+0(FP), res_ptr
 298  	MOVD in+8(FP), x_ptr
 299  
 300  	VZERO T2
 301  	VZERO ZER
      	// Constants from the p256<> pool: the prime (PH:PL) and two VPERM masks.
 302  	MOVD  $p256<>+0x00(SB), CPOOL
 303  	VL    16(CPOOL), PL
 304  	VL    0(CPOOL), PH
 305  	VL    48(CPOOL), SEL2
 306  	VL    64(CPOOL), SEL1
 307  
      	// T1:T0 = input; VPDI $0x4 swaps the two doublewords of each half.
 308  	VL   (0*16)(x_ptr), T0
 309  	VPDI $0x4, T0, T0, T0
 310  	VL   (1*16)(x_ptr), T1
 311  	VPDI $0x4, T1, T1, T1
 312  
 313  	// First round
 314  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
 315  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
 316  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
 317  
      	// Shift T2:T1:T0 right by 8 bytes (drops the low doubleword of T0).
 318  	VSLDB $8, T1, T0, T0
 319  	VSLDB $8, T2, T1, T1
 320  
      	// T2:T1:T0 += RED2:RED1, carries chained through CAR1 and CAR2.
 321  	VACCQ  T0, RED1, CAR1
 322  	VAQ    T0, RED1, T0
 323  	VACCCQ T1, RED2, CAR1, CAR2
 324  	VACQ   T1, RED2, CAR1, T1
 325  	VAQ    T2, CAR2, T2
 326  
 327  	// Second round
 328  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
 329  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
 330  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
 331  
 332  	VSLDB $8, T1, T0, T0
 333  	VSLDB $8, T2, T1, T1
 334  
 335  	VACCQ  T0, RED1, CAR1
 336  	VAQ    T0, RED1, T0
 337  	VACCCQ T1, RED2, CAR1, CAR2
 338  	VACQ   T1, RED2, CAR1, T1
 339  	VAQ    T2, CAR2, T2
 340  
 341  	// Third round
 342  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
 343  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
 344  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
 345  
 346  	VSLDB $8, T1, T0, T0
 347  	VSLDB $8, T2, T1, T1
 348  
 349  	VACCQ  T0, RED1, CAR1
 350  	VAQ    T0, RED1, T0
 351  	VACCCQ T1, RED2, CAR1, CAR2
 352  	VACQ   T1, RED2, CAR1, T1
 353  	VAQ    T2, CAR2, T2
 354  
 355  	// Last round
 356  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
 357  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
 358  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
 359  
 360  	VSLDB $8, T1, T0, T0
 361  	VSLDB $8, T2, T1, T1
 362  
 363  	VACCQ  T0, RED1, CAR1
 364  	VAQ    T0, RED1, T0
 365  	VACCCQ T1, RED2, CAR1, CAR2
 366  	VACQ   T1, RED2, CAR1, T1
 367  	VAQ    T2, CAR2, T2
 368  
 369  	// ---------------------------------------------------
 370  
      	// TT1:TT0 = T1:T0 - PH:PL. The borrow chain is continued into T2, which
      	// afterwards serves as the VSEL mask below.
 371  	VSCBIQ  PL, T0, CAR1
 372  	VSQ     PL, T0, TT0
 373  	VSBCBIQ T1, PH, CAR1, CAR2
 374  	VSBIQ   T1, PH, CAR1, TT1
 375  	VSBIQ   T2, ZER, CAR2, T2
 376  
 377  	// what output to use, TT1||TT0 or T1||T0?
      	// Mask bits set in T2 keep T1||T0; cleared bits take TT1||TT0.
 378  	VSEL T0, TT0, T2, T0
 379  	VSEL T1, TT1, T2, T1
 380  
      	// Swap the doublewords back and store the result.
 381  	VPDI $0x4, T0, T0, TT0
 382  	VST  TT0, (0*16)(res_ptr)
 383  	VPDI $0x4, T1, T1, TT1
 384  	VST  TT1, (1*16)(res_ptr)
 385  	RET
 386  
 387  #undef res_ptr
 388  #undef x_ptr
 389  #undef CPOOL
 390  #undef T0
 391  #undef T1
 392  #undef T2
 393  #undef TT0
 394  #undef TT1
 395  #undef ZER
 396  #undef SEL1
 397  #undef SEL2
 398  #undef CAR1
 399  #undef CAR2
 400  #undef RED1
 401  #undef RED2
 402  #undef PL
 403  #undef PH
 404  
 405  // Constant time table access
 406  // Indexed from 1 to 32, with -1 offset: idx == k selects the k-th 64-byte
      // entry, stored at byte offset 64*(k-1) of table.
 407  // (index 0 is implicitly point at infinity: nothing matches, res is all zero)
 408  // Formerly: func p256SelectBase(point *p256Point, table []p256Point, idx int)
 409  // Current : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      //
      // The loop always visits all 32 entries; the only branch depends on the loop
      // counter, never on idx. Only the low-order byte of idx is examined.
      // Uses R1, R2, R4, R5, V0-V3, V6-V9, V12 and V18-V21. P1ptr is advanced
      // past the table.
      // NOTE(review): LE2BE is loaded but never used in this function, and the
      // Z1L/Z1H/Z2L/Z2H names are defined but unused. LE2BE is also not
      // #undef'd below; confirm nothing later in the file depends on that.
 410  
 411  #define P3ptr   R1
 412  #define P1ptr   R2
 413  #define COUNT   R4
 414  #define CPOOL   R5
 415  
 416  #define X1L    V0
 417  #define X1H    V1
 418  #define Y1L    V2
 419  #define Y1H    V3
 420  #define Z1L    V4
 421  #define Z1H    V5
 422  #define X2L    V6
 423  #define X2H    V7
 424  #define Y2L    V8
 425  #define Y2H    V9
 426  #define Z2L    V10
 427  #define Z2H    V11
 428  #define LE2BE  V12
 429  
 430  #define ONE   V18
 431  #define IDX   V19
 432  #define SEL1  V20
 433  #define SEL2  V21
 434  
 435  TEXT ·p256SelectAffine(SB), NOSPLIT, $0
 436  	MOVD   res+0(FP), P3ptr
 437  	MOVD   table+8(FP), P1ptr
 438  	MOVD   $p256<>+0x00(SB), CPOOL
      	// Replicate the last byte of the 8-byte idx argument (its low-order byte
      	// on big-endian s390x) into every byte of IDX.
 439  	VLREPB idx+(16+7)(FP), IDX
 440  	VREPIB $1, ONE
 441  	VREPIB $1, SEL2 // SEL2 = per-byte entry counter, starts at 1
 442  	MOVD   $1, COUNT
 443  	VL     80(CPOOL), LE2BE // not read again in this function
 444  
      	// Accumulator starts as all zeros; it stays zero if no entry matches.
 445  	VZERO X1H
 446  	VZERO X1L
 447  	VZERO Y1H
 448  	VZERO Y1L
 449  
 450  loop_select:
      	// Load the current 64-byte table entry.
 451  	VL 0(P1ptr), X2H
 452  	VL 16(P1ptr), X2L
 453  	VL 32(P1ptr), Y2H
 454  	VL 48(P1ptr), Y2L
 455  
      	// Both operands hold one byte value replicated 16 times, so the
      	// doubleword compare makes SEL1 all ones exactly when counter == idx byte.
 456  	VCEQG SEL2, IDX, SEL1
 457  
      	// Take the entry where SEL1 is set, otherwise keep the accumulator.
 458  	VSEL X2L, X1L, SEL1, X1L
 459  	VSEL X2H, X1H, SEL1, X1H
 460  	VSEL Y2L, Y1L, SEL1, Y1L
 461  	VSEL Y2H, Y1H, SEL1, Y1H
 462  
 463  	VAB  SEL2, ONE, SEL2 // byte-wise counter += 1
 464  	ADDW $1, COUNT
 465  	ADD  $64, P1ptr      // next table entry
 466  	CMPW COUNT, $33 // len(p256AffineTable) + 1
 467  	BLT  loop_select
 468  	VST  X1H, 0(P3ptr)
 469  	VST  X1L, 16(P3ptr)
 470  	VST  Y1H, 32(P3ptr)
 471  	VST  Y1L, 48(P3ptr)
 472  
 473  	RET
 474  
 475  #undef P3ptr
 476  #undef P1ptr
 477  #undef COUNT
 478  #undef X1L
 479  #undef X1H
 480  #undef Y1L
 481  #undef Y1H
 482  #undef Z1L
 483  #undef Z1H
 484  #undef X2L
 485  #undef X2H
 486  #undef Y2L
 487  #undef Y2H
 488  #undef Z2L
 489  #undef Z2H
 490  #undef ONE
 491  #undef IDX
 492  #undef SEL1
 493  #undef SEL2
 494  #undef CPOOL
 495  
 496  // ---------------------------------------
 497  // p256MulInternal
 498  // V0-V3,V30,V31 - Not Modified
 499  // V4-V15 - Volatile
      //
      // Register contract, as used by the callers in this file:
      //   in:  X1:X0 (V1:V0) and Y1:Y0 (V3:V2) - the two 256-bit operands
      //        P1:P0 (V31:V30)                 - the prime, high:low halves
      //        CPOOL (R4)                      - address of p256mul<>
      //   out: T1:T0 (V5:V4)                   - the product after reduction
      // Several names below deliberately alias one vector register ("Overloaded
      // with ..."): a name is only valid between the instruction that writes it
      // and the point where an aliasing name is written.
 500  
 501  #define CPOOL   R4
 502  
 503  // Parameters
 504  #define X0    V0 // Not modified
 505  #define X1    V1 // Not modified
 506  #define Y0    V2 // Not modified
 507  #define Y1    V3 // Not modified
 508  #define T0    V4
 509  #define T1    V5
 510  #define P0    V30 // Not modified
 511  #define P1    V31 // Not modified
 512  
 513  // Temporaries
 514  #define YDIG  V6 // Overloaded with CAR2, ZER
 515  #define ADD1H V7 // Overloaded with ADD3H
 516  #define ADD2H V8 // Overloaded with ADD4H
 517  #define ADD3  V9 // Overloaded with SEL2,SEL5
 518  #define ADD4  V10 // Overloaded with SEL3,SEL6
 519  #define RED1  V11 // Overloaded with CAR2
 520  #define RED2  V12
 521  #define RED3  V13 // Overloaded with SEL1
 522  #define T2    V14
 523  // Overloaded temporaries
 524  #define ADD1  V4 // Overloaded with T0
 525  #define ADD2  V5 // Overloaded with T1
 526  #define ADD3H V7 // Overloaded with ADD1H
 527  #define ADD4H V8 // Overloaded with ADD2H
 528  #define ZER   V6 // Overloaded with YDIG, CAR2
 529  #define CAR1  V6 // Overloaded with YDIG, ZER
 530  #define CAR2  V11 // Overloaded with RED1
 531  // Constant Selects
 532  #define SEL1  V13 // Overloaded with RED3
 533  #define SEL2  V9 // Overloaded with ADD3,SEL5
 534  #define SEL3  V10 // Overloaded with ADD4,SEL6
 535  #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
 536  #define SEL5  V9 // Overloaded with ADD3,SEL2
 537  #define SEL6  V10 // Overloaded with ADD4,SEL3
 538  
 539  /* *
 540   * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 541   * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 542   * With you, SIMD be...
 543   *
 544   *                                           +--------+--------+
 545   *                                  +--------|  RED2  |  RED1  |
 546   *                                  |        +--------+--------+
 547   *                                  |       ---+--------+--------+
 548   *                                  |  +---- T2|   T1   |   T0   |--+
 549   *                                  |  |    ---+--------+--------+  |
 550   *                                  |  |                            |
 551   *                                  |  |    ======================= |
 552   *                                  |  |                            |
 553   *                                  |  |       +--------+--------+<-+
 554   *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
 555   *                                  |  |       +--------+--------+  |     |
 556   *                                  |  |     +--------+--------+<---+     |
 557   *                                  |  |     | ADD2H  | ADD1H  |--+       |
 558   *                                  |  |     +--------+--------+  |       |
 559   *                                  |  |     +--------+--------+<-+       |
 560   *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
 561   *                                  |  |     +--------+--------+  | |     |
 562   *                                  |  |   +--------+--------+<---+ |     |
 563   *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 564   *                                  |  |   +--------+--------+      | |   V
 565   *                                  |  | ------------------------   | | +--------+
 566   *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
 567   *                                  |  |                            | | +--------+
 568   *                                  |  +---->+--------+--------+    | |   |
 569   *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
 570   *                                  |        +--------+--------+    | |   |
 571   *                                  +---->---+--------+--------+    | |   |
 572   *                                         T2|   T1   |   T0   |----+ |   |
 573   *                                        ---+--------+--------+    | |   |
 574   *                                        ---+--------+--------+<---+ |   |
 575   *                                    +--- T2|   T1   |   T0   |----------+
 576   *                                    |   ---+--------+--------+      |   |
 577   *                                    |  +--------+--------+<-------------+
 578   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 579   *                                    |  +--------+--------+     |    |   |
 580   *                                    |  +--------+<----------------------+
 581   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
 582   *                                    |  +--------+              |    |
 583   *                                    +--->+--------+--------+   |    |
 584   *                                         |   T1   |   T0   |--------+
 585   *                                         +--------+--------+   |    |
 586   *                                   --------------------------- |    |
 587   *                                                               |    |
 588   *                                       +--------+--------+<----+    |
 589   *                                       |  RED2  |  RED1  |          |
 590   *                                       +--------+--------+          |
 591   *                                      ---+--------+--------+<-------+
 592   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 593   *                                      ---+--------+--------+
 594   *
 595   *                                                                *Mi obra de arte de siglo XXI @vpaprots
 596   *
 597   *
 598   * First group is special, doesn't get the two inputs:
 599   *                                             +--------+--------+<-+
 600   *                                     +-------|  ADD2  |  ADD1  |--|-----+
 601   *                                     |       +--------+--------+  |     |
 602   *                                     |     +--------+--------+<---+     |
 603   *                                     |     | ADD2H  | ADD1H  |--+       |
 604   *                                     |     +--------+--------+  |       |
 605   *                                     |     +--------+--------+<-+       |
 606   *                                     |     |  ADD4  |  ADD3  |--|-+     |
 607   *                                     |     +--------+--------+  | |     |
 608   *                                     |   +--------+--------+<---+ |     |
 609   *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 610   *                                     |   +--------+--------+      | |   V
 611   *                                     | ------------------------   | | +--------+
 612   *                                     |                            | | |  RED3  |  [d0 0 0 d0]
 613   *                                     |                            | | +--------+
 614   *                                     +---->+--------+--------+    | |   |
 615   *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
 616   *                                           +--------+--------+    | |   |
 617   *                                        ---+--------+--------+<---+ |   |
 618   *                                    +--- T2|   T1   |   T0   |----------+
 619   *                                    |   ---+--------+--------+      |   |
 620   *                                    |  +--------+--------+<-------------+
 621   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 622   *                                    |  +--------+--------+     |    |   |
 623   *                                    |  +--------+<----------------------+
 624   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
 625   *                                    |  +--------+              |    |
 626   *                                    +--->+--------+--------+   |    |
 627   *                                         |   T1   |   T0   |--------+
 628   *                                         +--------+--------+   |    |
 629   *                                   --------------------------- |    |
 630   *                                                               |    |
 631   *                                       +--------+--------+<----+    |
 632   *                                       |  RED2  |  RED1  |          |
 633   *                                       +--------+--------+          |
 634   *                                      ---+--------+--------+<-------+
 635   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 636   *                                      ---+--------+--------+
 637   *
 638   * Last 'group' needs RED2||RED1 to be shifted less
 639   */
 640  TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
      	// NOTE(review): these four loads look redundant. V6 (SEL4/YDIG), V9
      	// (SEL2/ADD3) and V10 (SEL3/ADD4) are overwritten by the first multiply
      	// group below, and SEL1..SEL4 are each reloaded before their first use.
 641  	VL 32(CPOOL), SEL1
 642  	VL 48(CPOOL), SEL2
 643  	VL 64(CPOOL), SEL3
 644  	VL 80(CPOOL), SEL4
 645  
 646  	// ---------------------------------------------------
 647  
      	// Group 1: multiply X1:X0 by word elements 3 and 2 of Y0. This first
      	// group has no running sum to add in, so the first digit uses the plain
      	// multiplies (VMLF/VMLHF: low/high 32 bits of each word product).
 648  	VREPF $3, Y0, YDIG
 649  	VMLHF X0, YDIG, ADD1H
 650  	VMLHF X1, YDIG, ADD2H
 651  	VMLF  X0, YDIG, ADD1
 652  	VMLF  X1, YDIG, ADD2
 653  
      	// Second digit: multiply-and-add (VMALF/VMALHF) on top of the high parts
      	// of the first digit's products.
 654  	VREPF  $2, Y0, YDIG
 655  	VMALF  X0, YDIG, ADD1H, ADD3
 656  	VMALF  X1, YDIG, ADD2H, ADD4
 657  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
 658  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
 659  
      	// ZER shares V6 with YDIG, so the digit is gone from here on.
 660  	VZERO ZER
 661  	VL    32(CPOOL), SEL1
 662  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 663  
      	// Shift ADD2:ADD1 right by one 32-bit word (VSLDB $12) into T1:T0.
 664  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
 665  	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
 666  
      	// T2:T1:T0 = T1:T0 + ADD4:ADD3, carry out captured in T2.
 667  	VACCQ  T0, ADD3, CAR1
 668  	VAQ    T0, ADD3, T0       // ADD3 Free
 669  	VACCCQ T1, ADD4, CAR1, T2
 670  	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
 671  
      	// Build the reduction terms RED2:RED1 for this group from RED3 and T0.
      	// The SEL loads overwrite V9/V10/V6 (ADD3/ADD4/CAR1), which are free here.
 672  	VL    48(CPOOL), SEL2
 673  	VL    64(CPOOL), SEL3
 674  	VL    80(CPOOL), SEL4
 675  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 676  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 677  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 678  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
 679  
      	// Shift T2:T1:T0 right by another word, then add ADD4H:ADD3H.
      	// RED2:RED1 is carried over and added in the next group.
 680  	VSLDB $12, T1, T0, T0
 681  	VSLDB $12, T2, T1, T1
 682  
 683  	VACCQ  T0, ADD3H, CAR1
 684  	VAQ    T0, ADD3H, T0
 685  	VACCCQ T1, ADD4H, CAR1, T2
 686  	VACQ   T1, ADD4H, CAR1, T1
 687  
 688  	// ---------------------------------------------------
 689  
      	// Group 2: word elements 1 and 0 of Y0. From here on the first digit is
      	// multiply-and-add on top of the running sum T1:T0.
 690  	VREPF  $1, Y0, YDIG
 691  	VMALHF X0, YDIG, T0, ADD1H
 692  	VMALHF X1, YDIG, T1, ADD2H
 693  	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
 694  	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
 695  
 696  	VREPF  $0, Y0, YDIG
 697  	VMALF  X0, YDIG, ADD1H, ADD3
 698  	VMALF  X1, YDIG, ADD2H, ADD4
 699  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
 700  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
 701  
 702  	VZERO ZER
 703  	VL    32(CPOOL), SEL1
 704  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 705  
 706  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
 707  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
 708  
      	// Add the previous group's reduction terms RED2:RED1 ...
 709  	VACCQ  T0, RED1, CAR1
 710  	VAQ    T0, RED1, T0
 711  	VACCCQ T1, RED2, CAR1, T2
 712  	VACQ   T1, RED2, CAR1, T1
 713  
      	// ... then ADD4:ADD3, accumulating both carries into T2.
 714  	VACCQ  T0, ADD3, CAR1
 715  	VAQ    T0, ADD3, T0
 716  	VACCCQ T1, ADD4, CAR1, CAR2
 717  	VACQ   T1, ADD4, CAR1, T1
 718  	VAQ    T2, CAR2, T2
 719  
 720  	VL    48(CPOOL), SEL2
 721  	VL    64(CPOOL), SEL3
 722  	VL    80(CPOOL), SEL4
 723  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 724  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 725  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 726  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
 727  
 728  	VSLDB $12, T1, T0, T0
 729  	VSLDB $12, T2, T1, T1
 730  
 731  	VACCQ  T0, ADD3H, CAR1
 732  	VAQ    T0, ADD3H, T0
 733  	VACCCQ T1, ADD4H, CAR1, T2
 734  	VACQ   T1, ADD4H, CAR1, T1
 735  
 736  	// ---------------------------------------------------
 737  
      	// Group 3: word elements 3 and 2 of Y1; same instruction sequence as group 2.
 738  	VREPF  $3, Y1, YDIG
 739  	VMALHF X0, YDIG, T0, ADD1H
 740  	VMALHF X1, YDIG, T1, ADD2H
 741  	VMALF  X0, YDIG, T0, ADD1
 742  	VMALF  X1, YDIG, T1, ADD2
 743  
 744  	VREPF  $2, Y1, YDIG
 745  	VMALF  X0, YDIG, ADD1H, ADD3
 746  	VMALF  X1, YDIG, ADD2H, ADD4
 747  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
 748  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
 749  
 750  	VZERO ZER
 751  	VL    32(CPOOL), SEL1
 752  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 753  
 754  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
 755  	VSLDB $12, T2, ADD2, T1   // ADD2 Free
 756  
 757  	VACCQ  T0, RED1, CAR1
 758  	VAQ    T0, RED1, T0
 759  	VACCCQ T1, RED2, CAR1, T2
 760  	VACQ   T1, RED2, CAR1, T1
 761  
 762  	VACCQ  T0, ADD3, CAR1
 763  	VAQ    T0, ADD3, T0
 764  	VACCCQ T1, ADD4, CAR1, CAR2
 765  	VACQ   T1, ADD4, CAR1, T1
 766  	VAQ    T2, CAR2, T2
 767  
 768  	VL    48(CPOOL), SEL2
 769  	VL    64(CPOOL), SEL3
 770  	VL    80(CPOOL), SEL4
 771  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 772  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 773  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 774  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
 775  
 776  	VSLDB $12, T1, T0, T0
 777  	VSLDB $12, T2, T1, T1
 778  
 779  	VACCQ  T0, ADD3H, CAR1
 780  	VAQ    T0, ADD3H, T0
 781  	VACCCQ T1, ADD4H, CAR1, T2
 782  	VACQ   T1, ADD4H, CAR1, T1
 783  
 784  	// ---------------------------------------------------
 785  
      	// Group 4 (last): word elements 1 and 0 of Y1. The multiply/accumulate
      	// part matches groups 2 and 3; the reduction at the end differs.
 786  	VREPF  $1, Y1, YDIG
 787  	VMALHF X0, YDIG, T0, ADD1H
 788  	VMALHF X1, YDIG, T1, ADD2H
 789  	VMALF  X0, YDIG, T0, ADD1
 790  	VMALF  X1, YDIG, T1, ADD2
 791  
 792  	VREPF  $0, Y1, YDIG
 793  	VMALF  X0, YDIG, ADD1H, ADD3
 794  	VMALF  X1, YDIG, ADD2H, ADD4
 795  	VMALHF X0, YDIG, ADD1H, ADD3H
 796  	VMALHF X1, YDIG, ADD2H, ADD4H
 797  
 798  	VZERO ZER
 799  	VL    32(CPOOL), SEL1
 800  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 801  
 802  	VSLDB $12, ADD2, ADD1, T0
 803  	VSLDB $12, T2, ADD2, T1
 804  
 805  	VACCQ  T0, RED1, CAR1
 806  	VAQ    T0, RED1, T0
 807  	VACCCQ T1, RED2, CAR1, T2
 808  	VACQ   T1, RED2, CAR1, T1
 809  
 810  	VACCQ  T0, ADD3, CAR1
 811  	VAQ    T0, ADD3, T0
 812  	VACCCQ T1, ADD4, CAR1, CAR2
 813  	VACQ   T1, ADD4, CAR1, T1
 814  	VAQ    T2, CAR2, T2
 815  
      	// Last group: different masks (SEL5/SEL6) and swapped VPERM sources; the
      	// resulting RED2:RED1 is added within this group, after the ADD4H:ADD3H
      	// addition, since there is no following group to carry it into.
 816  	VL    96(CPOOL), SEL5
 817  	VL    112(CPOOL), SEL6
 818  	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
 819  	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
 820  	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
 821  
 822  	VSLDB $12, T1, T0, T0
 823  	VSLDB $12, T2, T1, T1
 824  
 825  	VACCQ  T0, ADD3H, CAR1
 826  	VAQ    T0, ADD3H, T0
 827  	VACCCQ T1, ADD4H, CAR1, T2
 828  	VACQ   T1, ADD4H, CAR1, T1
 829  
 830  	VACCQ  T0, RED1, CAR1
 831  	VAQ    T0, RED1, T0
 832  	VACCCQ T1, RED2, CAR1, CAR2
 833  	VACQ   T1, RED2, CAR1, T1
 834  	VAQ    T2, CAR2, T2
 835  
 836  	// ---------------------------------------------------
 837  
      	// ADD2H:ADD1H = T1:T0 - P1:P0. The borrow chain is continued into T2
      	// (RED3 is used as a zero operand), which then serves as the VSEL mask.
 838  	VZERO   RED3
 839  	VSCBIQ  P0, T0, CAR1
 840  	VSQ     P0, T0, ADD1H
 841  	VSBCBIQ T1, P1, CAR1, CAR2
 842  	VSBIQ   T1, P1, CAR1, ADD2H
 843  	VSBIQ   T2, RED3, CAR2, T2
 844  
 845  	// what output to use, ADD2H||ADD1H or T1||T0?
      	// Mask bits set in T2 keep T1||T0; cleared bits take ADD2H||ADD1H.
 846  	VSEL T0, ADD1H, T2, T0
 847  	VSEL T1, ADD2H, T2, T1
 848  	RET
 849  
 850  #undef CPOOL
 851  
 852  #undef X0
 853  #undef X1
 854  #undef Y0
 855  #undef Y1
 856  #undef T0
 857  #undef T1
 858  #undef P0
 859  #undef P1
 860  
 861  #undef SEL1
 862  #undef SEL2
 863  #undef SEL3
 864  #undef SEL4
 865  #undef SEL5
 866  #undef SEL6
 867  
 868  #undef YDIG
 869  #undef ADD1H
 870  #undef ADD2H
 871  #undef ADD3
 872  #undef ADD4
 873  #undef RED1
 874  #undef RED2
 875  #undef RED3
 876  #undef T2
 877  #undef ADD1
 878  #undef ADD2
 879  #undef ADD3H
 880  #undef ADD4H
 881  #undef ZER
 882  #undef CAR1
 883  #undef CAR2
 884  
 885  // ---------------------------------------
      // p256SqrInternal
      // Squares X1:X0 by copying it into Y1:Y0 and branching (BR, not CALL) to
      // p256MulInternal, so that routine's RET returns straight to the caller of
      // p256SqrInternal. Inputs, outputs and clobbers are those of
      // p256MulInternal, except that Y1:Y0 (V3:V2) is overwritten with X1:X0.
 886  
 887  // Parameters
 888  #define X0    V0
 889  #define X1    V1
 890  #define Y0    V2
 891  #define Y1    V3
 892  
 893  TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
 894  	VLR X0, Y0 // Y0 = X0
 895  	VLR X1, Y1 // Y1 = X1
 896  	BR  p256MulInternal<>(SB) // tail call
 897  
 898  #undef X0
 899  #undef X1
 900  #undef Y0
 901  #undef Y1
 902  
      // p256SubInternal(T1, T0, X1, X0, Y1, Y0)
      // Computes T1:T0 = X1:X0 - Y1:Y0, also computes that difference plus PH:PL
      // into TT1:TT0, and picks one of the two with VSEL. The select mask SEL1
      // is derived from the borrow indication of the high-half subtraction, so
      // there is no branch.
      // Besides its arguments the macro uses ZER, CAR1, SEL1, PL, PH, TT0 and
      // TT1; these must be #defined where the macro is expanded (no expansion
      // site is in the part of the file reviewed here).
      // NOTE(review): the last line of the macro ends in a backslash, so the
      // blank line that follows it is part of the definition.
 903  #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
 904  	VZERO   ZER                \
 905  	VSCBIQ  Y0, X0, CAR1       \
 906  	VSQ     Y0, X0, T0         \
 907  	VSBCBIQ X1, Y1, CAR1, SEL1 \
 908  	VSBIQ   X1, Y1, CAR1, T1   \
 909  	VSQ     SEL1, ZER, SEL1    \
 910  	                           \
 911  	VACCQ   T0, PL, CAR1       \
 912  	VAQ     T0, PL, TT0        \
 913  	VACQ    T1, PH, CAR1, TT1  \
 914  	                           \
 915  	VSEL    T0, TT0, SEL1, T0  \
 916  	VSEL    T1, TT1, SEL1, T1  \
 917  
      // p256AddInternal(T1, T0, X1, X0, Y1, Y0)
      // Computes T2:T1:T0 = X1:X0 + Y1:Y0 (T2 receives the carry out), then
      // TT1:TT0 = T1:T0 - PH:PL with the borrow chain continued through T2 into
      // SEL1, and finally picks T1:T0 or TT1:TT0 with VSEL. No branch is used.
      // Besides its arguments the macro uses CAR1, CAR2, T2, ZER, PL, PH, TT0,
      // TT1 and SEL1; these must be #defined where the macro is expanded (no
      // expansion site is in the part of the file reviewed here).
 918  #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
 919  	VACCQ   X0, Y0, CAR1        \
 920  	VAQ     X0, Y0, T0          \
 921  	VACCCQ  X1, Y1, CAR1, T2    \
 922  	VACQ    X1, Y1, CAR1, T1    \
 923  	                            \
 924  	VZERO   ZER                 \
 925  	VSCBIQ  PL, T0, CAR1        \
 926  	VSQ     PL, T0, TT0         \
 927  	VSBCBIQ T1, PH, CAR1, CAR2  \
 928  	VSBIQ   T1, PH, CAR1, TT1   \
 929  	VSBIQ   T2, ZER, CAR2, SEL1 \
 930  	                            \
 931  	VSEL    T0, TT0, SEL1, T0   \
 932  	VSEL    T1, TT1, SEL1, T1
 933  
      // p256HalfInternal(T1, T0, X1, X0)
      // Halves X1:X0 modulo the prime in PH:PL without branching:
      //   1. SEL1 is a mask derived from X0 by the VSBIQ (presumably from its
      //      lowest bit, i.e. the parity of X - confirm against the VSBIQ
      //      definition).
      //   2. T2:T1:T0 = X1:X0 + PH:PL is computed unconditionally, and VSEL
      //      picks either X (with T2 = 0) or the sum according to SEL1.
      //   3. The selected value is shifted right by one bit: T0 and T1 are each
      //      shifted right by 1, and the bit that falls out of the register
      //      above is moved into the top bit via VSLDB $15, VSL by 7 and VO.
      // Besides its arguments the macro uses ZER, SEL1, CAR1, PL, PH, T2, TT0
      // and TT1; these must be #defined where the macro is expanded (no
      // expansion site is in the part of the file reviewed here).
 934  #define p256HalfInternal(T1, T0, X1, X0) \
 935  	VZERO  ZER                \
 936  	VSBIQ  ZER, ZER, X0, SEL1 \
 937  	                          \
 938  	VACCQ  X0, PL, CAR1       \
 939  	VAQ    X0, PL, T0         \
 940  	VACCCQ X1, PH, CAR1, T2   \
 941  	VACQ   X1, PH, CAR1, T1   \
 942  	                          \
 943  	VSEL   X0, T0, SEL1, T0   \
 944  	VSEL   X1, T1, SEL1, T1   \
 945  	VSEL   ZER, T2, SEL1, T2  \
 946  	                          \
 947  	VSLDB  $15, T2, ZER, TT1  \
 948  	VSLDB  $15, T1, ZER, TT0  \
 949  	VREPIB $1, SEL1           \
 950  	VSRL   SEL1, T0, T0       \
 951  	VSRL   SEL1, T1, T1       \
 952  	VREPIB $7, SEL1           \
 953  	VSL    SEL1, TT0, TT0     \
 954  	VSL    SEL1, TT1, TT1     \
 955  	VO     T0, TT0, T0        \
 956  	VO     T1, TT1, T1
 957  
 958  // ---------------------------------------
 959  // func p256Mul(res, in1, in2 *p256Element)
      //
      // Go-callable wrapper around p256MulInternal: loads in1 into X1:X0 and in2
      // into Y1:Y0, loads the prime from p256mul<> into P1:P0, calls
      // p256MulInternal and stores its T1:T0 result at res.
      // Uses R1-R4, V0-V5, V30 and V31 directly, plus whatever p256MulInternal
      // clobbers.
 960  #define res_ptr R1
 961  #define x_ptr   R2
 962  #define y_ptr   R3
 963  #define CPOOL   R4
 964  
 965  // Parameters
 966  #define X0    V0
 967  #define X1    V1
 968  #define Y0    V2
 969  #define Y1    V3
 970  #define T0    V4
 971  #define T1    V5
 972  
 973  // Constants
 974  #define P0    V30
 975  #define P1    V31
 976  TEXT ·p256Mul(SB), NOSPLIT, $0
 977  	MOVD res+0(FP), res_ptr
 978  	MOVD in1+8(FP), x_ptr
 979  	MOVD in2+16(FP), y_ptr
 980  
      	// Load both operands; VPDI $0x4 swaps the two doublewords of each half.
 981  	VL   (0*16)(x_ptr), X0
 982  	VPDI $0x4, X0, X0, X0
 983  	VL   (1*16)(x_ptr), X1
 984  	VPDI $0x4, X1, X1, X1
 985  	VL   (0*16)(y_ptr), Y0
 986  	VPDI $0x4, Y0, Y0, Y0
 987  	VL   (1*16)(y_ptr), Y1
 988  	VPDI $0x4, Y1, Y1, Y1
 989  
      	// CPOOL must stay pointing at p256mul<> for p256MulInternal, which reads
      	// its VPERM masks relative to it. P1:P0 = the prime (high:low halves).
 990  	MOVD $p256mul<>+0x00(SB), CPOOL
 991  	VL   16(CPOOL), P0
 992  	VL   0(CPOOL), P1
 993  
 994  	CALL p256MulInternal<>(SB)
 995  
      	// Swap the doublewords back and store the result.
 996  	VPDI $0x4, T0, T0, T0
 997  	VST  T0, (0*16)(res_ptr)
 998  	VPDI $0x4, T1, T1, T1
 999  	VST  T1, (1*16)(res_ptr)
1000  	RET
1001  
1002  #undef res_ptr
1003  #undef x_ptr
1004  #undef y_ptr
1005  #undef CPOOL
1006  
1007  #undef X0
1008  #undef X1
1009  #undef Y0
1010  #undef Y1
1011  #undef T0
1012  #undef T1
1013  #undef P0
1014  #undef P1
1015  
1016  // ---------------------------------------
1017  //  func p256Sqr(res, in *p256Element, n int)
      //
      // Repeatedly squares in: every loop iteration calls p256SqrInternal<> on X1:X0 and copies the
      // result T1:T0 back into X1:X0 as the next input. The last result is stored to res.
      // NOTE(review): p256SqrInternal<> is defined outside this excerpt; it is presumably a
      // Montgomery squaring modulo P - confirm against its definition.
      //
      // NOTE(review): the counter is incremented and compared only after the call, so the body runs
      // once even when n <= 0; callers presumably always pass n >= 1 - confirm.
      // NOTE(review): n is loaded with a 64-bit MOVD but counted/compared with the 32-bit ADDW/CMPW,
      // so only the low 32 bits of n appear to take part in the loop test.
1018  #define res_ptr R1
1019  #define x_ptr   R2
      // y_ptr is defined here (and undefined below) but is not referenced by this function.
1020  #define y_ptr   R3
1021  #define CPOOL   R4
      // COUNT = number of squarings done so far, N = requested number of squarings.
1022  #define COUNT   R5
1023  #define N       R6
1024  
1025  // Parameters
      // X1:X0 is the value being squared, T1:T0 receives the result of each squaring.
1026  #define X0    V0
1027  #define X1    V1
1028  #define T0    V4
1029  #define T1    V5
1030  
1031  // Constants
      // P0 / P1 = low / high 128 bits of P, loaded from the start of the p256mul<> constant pool.
1032  #define P0    V30
1033  #define P1    V31
1034  TEXT ·p256Sqr(SB), NOSPLIT, $0
1035  	MOVD res+0(FP), res_ptr
1036  	MOVD in+8(FP), x_ptr
1037  
      	// Load the operand; VPDI $0x4 swaps the two 64-bit halves of each vector.
1038  	VL   (0*16)(x_ptr), X0
1039  	VPDI $0x4, X0, X0, X0
1040  	VL   (1*16)(x_ptr), X1
1041  	VPDI $0x4, X1, X1, X1
1042  
1043  	MOVD $p256mul<>+0x00(SB), CPOOL
1044  	MOVD $0, COUNT
1045  	MOVD n+16(FP), N
1046  	VL   16(CPOOL), P0
1047  	VL   0(CPOOL), P1
1048  
1049  loop:
1050  	CALL p256SqrInternal<>(SB)
      	// Feed the result back in as the operand of the next squaring.
1051  	VLR  T0, X0
1052  	VLR  T1, X1
1053  	ADDW $1, COUNT
1054  	CMPW COUNT, N
1055  	BLT  loop
1056  
      	// Undo the half swap and write the final result out.
1057  	VPDI $0x4, T0, T0, T0
1058  	VST  T0, (0*16)(res_ptr)
1059  	VPDI $0x4, T1, T1, T1
1060  	VST  T1, (1*16)(res_ptr)
1061  	RET
1062  
1063  #undef res_ptr
1064  #undef x_ptr
1065  #undef y_ptr
1066  #undef CPOOL
1067  #undef COUNT
1068  #undef N
1069  
1070  #undef X0
1071  #undef X1
1072  #undef T0
1073  #undef T1
1074  #undef P0
1075  #undef P1
1076  
1077  // Point add with P2 being affine point
1078  // If sign == 1 -> P2 = -P2
1079  // If sel == 0 -> P3 = P1
1080  // if zero == 0 -> P3 = P2
1081  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
      //
      // Layout used by the loads/stores below: x at byte offset 0, y at 32, z at 64, each coordinate
      // moved as two 16-byte vectors (low half at the lower offset) with a VPDI $0x4 half swap.
      // Only offsets 0..63 of in2 are read.
      //
      // sign, sel and zero are only ever compared against 0 (VLREPG + VCEQG), and the resulting
      // all-ones/all-zeros mask drives VSEL, so there is no data-dependent branch:
      //   - sign: Y2 is kept when sign == 0, otherwise P - Y2 is used (any non-zero value negates).
      //   - sel:  when sel == 0 the computed P3 is replaced by P1.
      //   - zero: when zero == 0 P3 is replaced by (X2, Y2, constant Z2); this select runs after the
      //           sel select, so it wins when both are 0.
      //
      // All three coordinates of res are stored only at the very end, after the last load from in1/in2.
1082  #define P3ptr   R1
1083  #define P1ptr   R2
1084  #define P2ptr   R3
1085  #define CPOOL   R4
1086  
1087  // Temporaries in REGs
1088  #define Y2L    V15
1089  #define Y2H    V16
1090  #define T1L    V17
1091  #define T1H    V18
1092  #define T2L    V19
1093  #define T2H    V20
1094  #define T3L    V21
1095  #define T3H    V22
1096  #define T4L    V23
1097  #define T4H    V24
1098  
1099  // Temps for Sub and Add
      // (names referenced from inside the p256SubInternal / p256AddInternal macro bodies)
1100  #define TT0  V11
1101  #define TT1  V12
1102  #define T2   V13
1103  
1104  // p256MulAsm Parameters
      // X1:X0 and Y1:Y0 are the operands of p256MulInternal<> / p256SqrInternal<>, T1:T0 the result.
1105  #define X0    V0
1106  #define X1    V1
1107  #define Y0    V2
1108  #define Y1    V3
1109  #define T0    V4
1110  #define T1    V5
1111  
      // PL / PH = low / high 128 bits of P.
1112  #define PL    V30
1113  #define PH    V31
1114  
1115  // Names for zero/sel selects
      // These deliberately alias registers defined above: X1L/X2L = X0, X1H/X2H = X1, Y1L/Y1H = Y0/Y1,
      // Z1L/Z2L = T0, Z1H/Z2H = T1, X3L/X3H = T1L/T1H, Y3L/Y3H = T3L/T3H.
1116  #define X1L    V0
1117  #define X1H    V1
1118  #define Y1L    V2 // p256MulAsmParmY
1119  #define Y1H    V3 // p256MulAsmParmY
1120  #define Z1L    V4
1121  #define Z1H    V5
1122  #define X2L    V0
1123  #define X2H    V1
1124  #define Z2L    V4
1125  #define Z2H    V5
1126  #define X3L    V17 // T1L
1127  #define X3H    V18 // T1H
1128  #define Y3L    V21 // T3L
1129  #define Y3H    V22 // T3H
1130  #define Z3L    V28
1131  #define Z3H    V29
1132  
      // Scratch names referenced from inside the p256SubInternal / p256AddInternal macro bodies.
1133  #define ZER   V6
1134  #define SEL1  V7
1135  #define CAR1  V8
1136  #define CAR2  V9
1137  /* *
1138   * Three operand formula:
1139   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1140   * T1 = Z1²
1141   * T2 = T1*Z1
1142   * T1 = T1*X2
1143   * T2 = T2*Y2
1144   * T1 = T1-X1
1145   * T2 = T2-Y1
1146   * Z3 = Z1*T1
1147   * T3 = T1²
1148   * T4 = T3*T1
1149   * T3 = T3*X1
1150   * T1 = 2*T3
1151   * X3 = T2²
1152   * X3 = X3-T1
1153   * X3 = X3-T4
1154   * T3 = T3-X3
1155   * T3 = T3*T2
1156   * T4 = T4*Y1
1157   * Y3 = T3-T4
1158  
1159   * Three operand formulas, but with MulInternal X,Y used to store temps
       * Notation: "X=..; Y=..; MUL; ..=T" names what is placed in the multiplier inputs X/Y and where
       * the result T is copied; "-" means the register keeps its previous content. The trailing
       * columns list the temporaries that are live after the step.
1160  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
1161  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
1162  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
1163  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
1164  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
1165  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
1166  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
1167  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
1168  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
1169  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
1170  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
1171  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
1172  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
1173  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
1174  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
1175  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
1176  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
1177  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
1178  
1179  	*/
1180  TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1181  	MOVD res+0(FP), P3ptr
1182  	MOVD in1+8(FP), P1ptr
1183  	MOVD in2+16(FP), P2ptr
1184  
      	// P is stored high half first in the pool, hence offset 16 -> PL and offset 0 -> PH.
1185  	MOVD $p256mul<>+0x00(SB), CPOOL
1186  	VL   16(CPOOL), PL
1187  	VL   0(CPOOL), PH
1188  
1189  	//	if (sign == 1) {
1190  	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
1191  	//	}
1192  
1193  	VL   48(P2ptr), Y2H
1194  	VPDI $0x4, Y2H, Y2H, Y2H
1195  	VL   32(P2ptr), Y2L
1196  	VPDI $0x4, Y2L, Y2L, Y2L
1197  
      	// SEL1 = all ones when sign == 0, all zeros otherwise (the 64-bit argument is replicated
      	// into both halves before the compare).
1198  	VLREPG sign+24(FP), SEL1
1199  	VZERO  ZER
1200  	VCEQG  SEL1, ZER, SEL1
1201  
      	// (T1H:T1L) = P - Y2, computed unconditionally.
1202  	VSCBIQ Y2L, PL, CAR1
1203  	VSQ    Y2L, PL, T1L
1204  	VSBIQ  PH, Y2H, CAR1, T1H
1205  
      	// Keep Y2 where SEL1 is set (sign == 0), otherwise take P - Y2.
1206  	VSEL Y2L, T1L, SEL1, Y2L
1207  	VSEL Y2H, T1H, SEL1, Y2H
1208  
1209  /* *
1210   * Three operand formula:
1211   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1212   */
1213  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
1214  	VL   80(P1ptr), X1       // Z1H
1215  	VPDI $0x4, X1, X1, X1
1216  	VL   64(P1ptr), X0       // Z1L
1217  	VPDI $0x4, X0, X0, X0
1218  	VLR  X0, Y0
1219  	VLR  X1, Y1
1220  	CALL p256SqrInternal<>(SB)
1221  
1222  	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
1223  	VLR  T0, X0
1224  	VLR  T1, X1
1225  	CALL p256MulInternal<>(SB)
1226  	VLR  T0, T2L
1227  	VLR  T1, T2H
1228  
1229  	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
1230  	VL   16(P2ptr), Y1       // X2H
1231  	VPDI $0x4, Y1, Y1, Y1
1232  	VL   0(P2ptr), Y0        // X2L
1233  	VPDI $0x4, Y0, Y0, Y0
1234  	CALL p256MulInternal<>(SB)
1235  	VLR  T0, T1L
1236  	VLR  T1, T1H
1237  
1238  	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
1239  	VLR  T2L, X0
1240  	VLR  T2H, X1
1241  	VLR  Y2L, Y0
1242  	VLR  Y2H, Y1
1243  	CALL p256MulInternal<>(SB)
1244  
1245  	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
1246  	VL   48(P1ptr), Y1H
1247  	VPDI $0x4, Y1H, Y1H, Y1H
1248  	VL   32(P1ptr), Y1L
1249  	VPDI $0x4, Y1L, Y1L, Y1L
1250  	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1251  
1252  	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
1253  	VL   16(P1ptr), X1H
1254  	VPDI $0x4, X1H, X1H, X1H
1255  	VL   0(P1ptr), X1L
1256  	VPDI $0x4, X1L, X1L, X1L
1257  	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1258  
1259  	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
1260  	VL   80(P1ptr), X1       // Z1H
1261  	VPDI $0x4, X1, X1, X1
1262  	VL   64(P1ptr), X0       // Z1L
1263  	VPDI $0x4, X0, X0, X0
1264  	CALL p256MulInternal<>(SB)
1265  
      	// Z3 is not stored here (the two VST lines below are commented out); it is parked in
      	// Z3L/Z3H and written out at the end, after the sel/zero selects.
1266  	// VST T1, 64(P3ptr)
1267  	// VST T0, 80(P3ptr)
1268  	VLR T0, Z3L
1269  	VLR T1, Z3H
1270  
1271  	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
1272  	VLR  Y0, X0
1273  	VLR  Y1, X1
1274  	CALL p256SqrInternal<>(SB)
1275  	VLR  T0, X0
1276  	VLR  T1, X1
1277  
1278  	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
1279  	CALL p256MulInternal<>(SB)
1280  	VLR  T0, T4L
1281  	VLR  T1, T4H
1282  
1283  	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
1284  	VL   16(P1ptr), Y1       // X1H
1285  	VPDI $0x4, Y1, Y1, Y1
1286  	VL   0(P1ptr), Y0        // X1L
1287  	VPDI $0x4, Y0, Y0, Y0
1288  	CALL p256MulInternal<>(SB)
1289  	VLR  T0, T3L
1290  	VLR  T1, T3H
1291  
1292  	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
1293  	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1294  
1295  	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
1296  	VLR  T2L, X0
1297  	VLR  T2H, X1
1298  	VLR  T2L, Y0
1299  	VLR  T2H, Y1
1300  	CALL p256SqrInternal<>(SB)
1301  
1302  	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
1303  	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1304  
1305  	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
1306  	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1307  	VLR T0, X3L
1308  	VLR T1, X3H
1309  
1310  	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
1311  	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1312  
1313  	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
      	// Y1:Y0 still holds T2 from the squaring above.
1314  	CALL p256MulInternal<>(SB)
1315  	VLR  T0, T3L
1316  	VLR  T1, T3H
1317  
1318  	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
1319  	VLR  T4L, X0
1320  	VLR  T4H, X1
1321  	VL   48(P1ptr), Y1       // Y1H
1322  	VPDI $0x4, Y1, Y1, Y1
1323  	VL   32(P1ptr), Y0       // Y1L
1324  	VPDI $0x4, Y0, Y0, Y0
1325  	CALL p256MulInternal<>(SB)
1326  
1327  	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
1328  	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1329  
1330  	//	if (sel == 0) {
1331  	//		copy(P3.x[:], X1)
1332  	//		copy(P3.y[:], Y1)
1333  	//		copy(P3.z[:], Z1)
1334  	//	}
1335  
1336  	VL   16(P1ptr), X1H
1337  	VPDI $0x4, X1H, X1H, X1H
1338  	VL   0(P1ptr), X1L
1339  	VPDI $0x4, X1L, X1L, X1L
1340  
1341  	// Y1 already loaded, left over from addition
      	// (Y1L/Y1H are the same registers as Y0/Y1, which still hold Y1 from the last multiplication.)
1342  	VL   80(P1ptr), Z1H
1343  	VPDI $0x4, Z1H, Z1H, Z1H
1344  	VL   64(P1ptr), Z1L
1345  	VPDI $0x4, Z1L, Z1L, Z1L
1346  
      	// SEL1 = all ones when sel == 0; VSEL then takes the P1 coordinate instead of the computed one.
1347  	VLREPG sel+32(FP), SEL1
1348  	VZERO  ZER
1349  	VCEQG  SEL1, ZER, SEL1
1350  
1351  	VSEL X1L, X3L, SEL1, X3L
1352  	VSEL X1H, X3H, SEL1, X3H
1353  	VSEL Y1L, Y3L, SEL1, Y3L
1354  	VSEL Y1H, Y3H, SEL1, Y3H
1355  	VSEL Z1L, Z3L, SEL1, Z3L
1356  	VSEL Z1H, Z3H, SEL1, Z3H
1357  
1358  	//	if (zero == 0) {
1359  	//		copy(P3.x[:], X2)
1360  	//		copy(P3.y[:], Y2)
1361  	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1362  	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
1363  	//	}
1364  	VL   16(P2ptr), X2H
1365  	VPDI $0x4, X2H, X2H, X2H
1366  	VL   0(P2ptr), X2L
1367  	VPDI $0x4, X2L, X2L, X2L
1368  
1369  	// Y2 already loaded
      	// Z2 is a constant read from p256mul<>+0x80/0x90 and used without a VPDI half swap.
      	// NOTE(review): those pool entries are outside this excerpt; presumably they hold the byte
      	// string quoted in the comment above - confirm against the DATA table.
1370  	VL 128(CPOOL), Z2H
1371  	VL 144(CPOOL), Z2L
1372  
      	// SEL1 = all ones when zero == 0; this select runs after the sel select and overrides it.
1373  	VLREPG zero+40(FP), SEL1
1374  	VZERO  ZER
1375  	VCEQG  SEL1, ZER, SEL1
1376  
1377  	VSEL X2L, X3L, SEL1, X3L
1378  	VSEL X2H, X3H, SEL1, X3H
1379  	VSEL Y2L, Y3L, SEL1, Y3L
1380  	VSEL Y2H, Y3H, SEL1, Y3H
1381  	VSEL Z2L, Z3L, SEL1, Z3L
1382  	VSEL Z2H, Z3H, SEL1, Z3H
1383  
1384  	// All done, store out the result!!!
1385  	VPDI $0x4, X3H, X3H, X3H
1386  	VST  X3H, 16(P3ptr)
1387  	VPDI $0x4, X3L, X3L, X3L
1388  	VST  X3L, 0(P3ptr)
1389  	VPDI $0x4, Y3H, Y3H, Y3H
1390  	VST  Y3H, 48(P3ptr)
1391  	VPDI $0x4, Y3L, Y3L, Y3L
1392  	VST  Y3L, 32(P3ptr)
1393  	VPDI $0x4, Z3H, Z3H, Z3H
1394  	VST  Z3H, 80(P3ptr)
1395  	VPDI $0x4, Z3L, Z3L, Z3L
1396  	VST  Z3L, 64(P3ptr)
1397  
1398  	RET
1399  
1400  #undef P3ptr
1401  #undef P1ptr
1402  #undef P2ptr
1403  #undef CPOOL
1404  
1405  #undef Y2L
1406  #undef Y2H
1407  #undef T1L
1408  #undef T1H
1409  #undef T2L
1410  #undef T2H
1411  #undef T3L
1412  #undef T3H
1413  #undef T4L
1414  #undef T4H
1415  
1416  #undef TT0
1417  #undef TT1
1418  #undef T2
1419  
1420  #undef X0
1421  #undef X1
1422  #undef Y0
1423  #undef Y1
1424  #undef T0
1425  #undef T1
1426  
1427  #undef PL
1428  #undef PH
1429  
1430  #undef X1L
1431  #undef X1H
1432  #undef Y1L
1433  #undef Y1H
1434  #undef Z1L
1435  #undef Z1H
1436  #undef X2L
1437  #undef X2H
1438  #undef Z2L
1439  #undef Z2H
1440  #undef X3L
1441  #undef X3H
1442  #undef Y3L
1443  #undef Y3H
1444  #undef Z3L
1445  #undef Z3H
1446  
1447  #undef ZER
1448  #undef SEL1
1449  #undef CAR1
1450  #undef CAR2
1451  
1452  // func p256PointDoubleAsm(res, in *P256Point)
1453  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1454  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1455  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
      //
      // Stores the double of in into res, following the three-operand schedule in the block comment below.
      // Layout used by the loads/stores: x at byte offset 0, y at 32, z at 64, each coordinate moved
      // as two 16-byte vectors (low half at the lower offset) with a VPDI $0x4 half swap.
      //
      // res is written in three stages: Z3 (offsets 64/80) right after Z3 = Y3*Z1, X3 (0/16) right
      // after X3 = X3-T1, and Y3 (32/48) last. Every load from in at a given offset comes before the
      // store to the same offset of res, so res == in looks safe.
      // NOTE(review): a partially overlapping res/in would not be - confirm callers never do that.
1456  #define P3ptr   R1
1457  #define P1ptr   R2
1458  #define CPOOL   R4
1459  
1460  // Temporaries in REGs
1461  #define X3L    V15
1462  #define X3H    V16
1463  #define Y3L    V17
1464  #define Y3H    V18
1465  #define T1L    V19
1466  #define T1H    V20
1467  #define T2L    V21
1468  #define T2H    V22
1469  #define T3L    V23
1470  #define T3H    V24
1471  
      // X1L..Y1H hold coordinates loaded from in. Z1L/Z1H are defined but not referenced by the
      // function body; Z1 is loaded straight into the multiplier inputs instead.
1472  #define X1L    V6
1473  #define X1H    V7
1474  #define Y1L    V8
1475  #define Y1H    V9
1476  #define Z1L    V10
1477  #define Z1H    V11
1478  
1479  // Temps for Sub and Add
      // (names referenced from inside the p256SubInternal / p256AddInternal / p256HalfInternal macro
      // bodies; TT0 shares V11 with the unused Z1H)
1480  #define TT0  V11
1481  #define TT1  V12
1482  #define T2   V13
1483  
1484  // p256MulAsm Parameters
      // X1:X0 and Y1:Y0 are the operands of p256MulInternal<> / p256SqrInternal<>, T1:T0 the result.
1485  #define X0    V0
1486  #define X1    V1
1487  #define Y0    V2
1488  #define Y1    V3
1489  #define T0    V4
1490  #define T1    V5
1491  
      // PL / PH = low / high 128 bits of P.
1492  #define PL    V30
1493  #define PH    V31
1494  
      // Z3L/Z3H share V23/V24 with T3L/T3H and are not referenced by the function body; Z3 is stored
      // directly from T1:T0 via TT1/TT0.
1495  #define Z3L    V23
1496  #define Z3H    V24
1497  
      // Scratch names referenced from inside the Sub/Add/Half macro bodies.
1498  #define ZER   V26
1499  #define SEL1  V27
1500  #define CAR1  V28
1501  #define CAR2  V29
1502  /*
1503   * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1504   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1505   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1506   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
1507   * 	B  = 2Y₁
1508   * 	Z₃ = B×Z₁
1509   * 	C  = B²
1510   * 	D  = C×X₁
1511   * 	X₃ = A²-2D
1512   * 	Y₃ = (D-X₃)×A-C²/2
1513   *
1514   * Three-operand formula:
1515   *       T1 = Z1²
1516   *       T2 = X1-T1
1517   *       T1 = X1+T1
1518   *       T2 = T2*T1
1519   *       T2 = 3*T2
1520   *       Y3 = 2*Y1
1521   *       Z3 = Y3*Z1
1522   *       Y3 = Y3²
1523   *       T3 = Y3*X1
1524   *       Y3 = Y3²
1525   *       Y3 = half*Y3
1526   *       X3 = T2²
1527   *       T1 = 2*T3
1528   *       X3 = X3-T1
1529   *       T1 = T3-X3
1530   *       T1 = T1*T2
1531   *       Y3 = T1-Y3
1532   */
1533  
1534  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
1535  	MOVD res+0(FP), P3ptr
1536  	MOVD in+8(FP), P1ptr
1537  
      	// P is stored high half first in the pool, hence offset 16 -> PL and offset 0 -> PH.
1538  	MOVD $p256mul<>+0x00(SB), CPOOL
1539  	VL   16(CPOOL), PL
1540  	VL   0(CPOOL), PH
1541  
      	// In the step comments below, "X=..; Y=..; MUL; ..=T" names what is placed in the multiplier
      	// inputs X/Y and where the result T goes; "-" means the register keeps its previous content.
1542  	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
1543  	VL   80(P1ptr), X1        // Z1H
1544  	VPDI $0x4, X1, X1, X1
1545  	VL   64(P1ptr), X0        // Z1L
1546  	VPDI $0x4, X0, X0, X0
1547  	VLR  X0, Y0
1548  	VLR  X1, Y1
1549  	CALL p256SqrInternal<>(SB)
1550  
1551  	// SUB(X<X1-T)            // T2 = X1-T1
1552  	VL   16(P1ptr), X1H
1553  	VPDI $0x4, X1H, X1H, X1H
1554  	VL   0(P1ptr), X1L
1555  	VPDI $0x4, X1L, X1L, X1L
1556  	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1557  
1558  	// ADD(Y<X1+T)            // T1 = X1+T1
1559  	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1560  
1561  	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
1562  	CALL p256MulInternal<>(SB)
1563  
1564  	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
1565  	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1566  	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1567  
1568  	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
1569  	VL   48(P1ptr), Y1H
1570  	VPDI $0x4, Y1H, Y1H, Y1H
1571  	VL   32(P1ptr), Y1L
1572  	VPDI $0x4, Y1L, Y1L, Y1L
1573  	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
1574  
1575  	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
1576  	VL   80(P1ptr), Y1        // Z1H
1577  	VPDI $0x4, Y1, Y1, Y1
1578  	VL   64(P1ptr), Y0        // Z1L
1579  	VPDI $0x4, Y0, Y0, Y0
1580  	CALL p256MulInternal<>(SB)
      	// Store Z3 now; the half swap goes through TT1/TT0 so T1:T0 itself is left untouched.
1581  	VPDI $0x4, T1, T1, TT1
1582  	VST  TT1, 80(P3ptr)
1583  	VPDI $0x4, T0, T0, TT0
1584  	VST  TT0, 64(P3ptr)
1585  
1586  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
      	// X1:X0 still holds 2*Y1 from the addition above.
1587  	VLR  X0, Y0
1588  	VLR  X1, Y1
1589  	CALL p256SqrInternal<>(SB)
1590  
1591  	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
1592  	VLR  T0, X0
1593  	VLR  T1, X1
1594  	VL   16(P1ptr), Y1
1595  	VPDI $0x4, Y1, Y1, Y1
1596  	VL   0(P1ptr), Y0
1597  	VPDI $0x4, Y0, Y0, Y0
1598  	CALL p256MulInternal<>(SB)
1599  	VLR  T0, T3L
1600  	VLR  T1, T3H
1601  
1602  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
1603  	VLR  X0, Y0
1604  	VLR  X1, Y1
1605  	CALL p256SqrInternal<>(SB)
1606  
1607  	// HAL(Y3<T)              // Y3 = half*Y3
1608  	p256HalfInternal(Y3H,Y3L, T1,T0)
1609  
1610  	// X=T2; Y=T2; MUL; T-    // X3 = T2²
1611  	VLR  T2L, X0
1612  	VLR  T2H, X1
1613  	VLR  T2L, Y0
1614  	VLR  T2H, Y1
1615  	CALL p256SqrInternal<>(SB)
1616  
1617  	// ADD(T1<T3+T3)          // T1 = 2*T3
1618  	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
1619  
1620  	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
1621  	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
      	// Store X3 now; the half swap goes through TT1/TT0 so X3H:X3L stays usable below.
1622  	VPDI $0x4, X3H, X3H, TT1
1623  	VST  TT1, 16(P3ptr)
1624  	VPDI $0x4, X3L, X3L, TT0
1625  	VST  TT0, 0(P3ptr)
1626  
1627  	// SUB(X<T3-X3)           // T1 = T3-X3
1628  	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
1629  
1630  	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
      	// Y1:Y0 still holds T2 from the squaring above.
1631  	CALL p256MulInternal<>(SB)
1632  
1633  	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
1634  	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
1635  
1636  	VPDI $0x4, Y3H, Y3H, Y3H
1637  	VST  Y3H, 48(P3ptr)
1638  	VPDI $0x4, Y3L, Y3L, Y3L
1639  	VST  Y3L, 32(P3ptr)
1640  	RET
1641  
1642  #undef P3ptr
1643  #undef P1ptr
1644  #undef CPOOL
1645  #undef X3L
1646  #undef X3H
1647  #undef Y3L
1648  #undef Y3H
1649  #undef T1L
1650  #undef T1H
1651  #undef T2L
1652  #undef T2H
1653  #undef T3L
1654  #undef T3H
1655  #undef X1L
1656  #undef X1H
1657  #undef Y1L
1658  #undef Y1H
1659  #undef Z1L
1660  #undef Z1H
1661  #undef TT0
1662  #undef TT1
1663  #undef T2
1664  #undef X0
1665  #undef X1
1666  #undef Y0
1667  #undef Y1
1668  #undef T0
1669  #undef T1
1670  #undef PL
1671  #undef PH
1672  #undef Z3L
1673  #undef Z3H
1674  #undef ZER
1675  #undef SEL1
1676  #undef CAR1
1677  #undef CAR2
1678  
1679  // func p256PointAddAsm(res, in1, in2 *P256Point) int
      //
      // Stores in1 + in2 into res, following the schedule in the block comment below, and returns a flag.
      // Layout used by the loads/stores: x at byte offset 0, y at 32, z at 64, each coordinate moved
      // as two 16-byte vectors (low half at the lower offset) with a VPDI $0x4 half swap.
      //
      // Return value: 1 when both H and R are zero modulo P (each is tested against 0 and against P),
      // otherwise 0. The flag is written to the result slot after the H test and AND-ed with the R test.
      // NOTE(review): presumably callers use 1 to detect in1 == in2 and fall back to doubling - confirm.
      //
      // res is written in three stages: Z3 (offsets 64/80), later X3 (0/16), and Y3 (32/48) last.
      // Every load from in1/in2 at a given offset comes before the store to the same offset of res.
1680  #define P3ptr  R1
1681  #define P1ptr  R2
1682  #define P2ptr  R3
1683  #define CPOOL  R4
      // ISZERO accumulates the result of a zero test; TRUE holds the constant 1 for the conditional move.
1684  #define ISZERO R5
1685  #define TRUE   R6
1686  
1687  // Temporaries in REGs
      // T1L/T1H double as scratch for the two zero tests; T2, U1, S1, H and R follow the formula names.
1688  #define T1L   V16
1689  #define T1H   V17
1690  #define T2L   V18
1691  #define T2H   V19
1692  #define U1L   V20
1693  #define U1H   V21
1694  #define S1L   V22
1695  #define S1H   V23
1696  #define HL    V24
1697  #define HH    V25
1698  #define RL    V26
1699  #define RH    V27
1700  
1701  // Temps for Sub and Add
      // (names referenced from inside the p256SubInternal / p256AddInternal macro bodies)
1702  #define ZER   V6
1703  #define SEL1  V7
1704  #define CAR1  V8
1705  #define CAR2  V9
1706  #define TT0  V11
1707  #define TT1  V12
1708  #define T2   V13
1709  
1710  // p256MulAsm Parameters
      // X1:X0 and Y1:Y0 are the operands of p256MulInternal<> / p256SqrInternal<>, T1:T0 the result.
1711  #define X0    V0
1712  #define X1    V1
1713  #define Y0    V2
1714  #define Y1    V3
1715  #define T0    V4
1716  #define T1    V5
1717  
      // PL / PH = low / high 128 bits of P.
1718  #define PL    V30
1719  #define PH    V31
1720  /*
1721   * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1722   *
1723   * A = X₁×Z₂²
1724   * B = Y₁×Z₂³
1725   * C = X₂×Z₁²-A
1726   * D = Y₂×Z₁³-B
1727   * X₃ = D² - 2A×C² - C³
1728   * Y₃ = D×(A×C² - X₃) - B×C³
1729   * Z₃ = Z₁×Z₂×C
1730   *
1731   * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1732   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1733   *
1734   * T1 = Z1*Z1
1735   * T2 = Z2*Z2
1736   * U1 = X1*T2
1737   * H  = X2*T1
1738   * H  = H-U1
1739   * Z3 = Z1*Z2
1740   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1741   *
1742   * S1 = Z2*T2
1743   * S1 = Y1*S1
1744   * R  = Z1*T1
1745   * R  = Y2*R
1746   * R  = R-S1
1747   *
1748   * T1 = H*H
1749   * T2 = H*T1
1750   * U1 = U1*T1
1751   *
1752   * X3 = R*R
1753   * X3 = X3-T2
1754   * T1 = 2*U1
1755   * X3 = X3-T1 << store-out X3 result reg
1756   *
1757   * T2 = S1*T2
1758   * Y3 = U1-X3
1759   * Y3 = R*Y3
1760   * Y3 = Y3-T2 << store-out Y3 result reg
1761  
       * The list below is the same computation reordered for the multiplier registers.
       * Notation: "X=..; Y=..; MUL; ..=T" names what is placed in the multiplier inputs X/Y and where
       * the result T is copied; "-" means the register keeps its previous content.
1762   	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
1763  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
1764  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
1765  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
1766  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
1767  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
1768  	// SUB(H<H-T)            // H  = H-U1
1769  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
1770  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1771  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1772  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
1773  	// SUB(R<T-S1)           // R  = R-S1
1774  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
1775  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
1776  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
1777  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
1778  	// SUB(T<T-T2)           // X3 = X3-T2
1779  	// ADD(X<U1+U1)          // T1 = 2*U1
1780  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
1781  	// SUB(Y<U1-T)           // Y3 = U1-X3
1782  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
1783  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
1784  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
1785  	*/
1786  TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
1787  	MOVD res+0(FP), P3ptr
1788  	MOVD in1+8(FP), P1ptr
1789  	MOVD in2+16(FP), P2ptr
1790  
      	// P is stored high half first in the pool, hence offset 16 -> PL and offset 0 -> PH.
1791  	MOVD $p256mul<>+0x00(SB), CPOOL
1792  	VL   16(CPOOL), PL
1793  	VL   0(CPOOL), PH
1794  
1795  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
1796  	VL   80(P1ptr), X1       // Z1H
1797  	VPDI $0x4, X1, X1, X1
1798  	VL   64(P1ptr), X0       // Z1L
1799  	VPDI $0x4, X0, X0, X0
1800  	VLR  X0, Y0
1801  	VLR  X1, Y1
1802  	CALL p256SqrInternal<>(SB)
1803  
1804  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
1805  	VLR  T0, Y0
1806  	VLR  T1, Y1
1807  	CALL p256MulInternal<>(SB)
1808  	VLR  T0, RL
1809  	VLR  T1, RH
1810  
1811  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
1812  	VL   16(P2ptr), X1       // X2H
1813  	VPDI $0x4, X1, X1, X1
1814  	VL   0(P2ptr), X0        // X2L
1815  	VPDI $0x4, X0, X0, X0
1816  	CALL p256MulInternal<>(SB)
1817  	VLR  T0, HL
1818  	VLR  T1, HH
1819  
1820  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
1821  	VL   80(P2ptr), X1       // Z2H
1822  	VPDI $0x4, X1, X1, X1
1823  	VL   64(P2ptr), X0       // Z2L
1824  	VPDI $0x4, X0, X0, X0
1825  	VLR  X0, Y0
1826  	VLR  X1, Y1
1827  	CALL p256SqrInternal<>(SB)
1828  
1829  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
1830  	VLR  T0, Y0
1831  	VLR  T1, Y1
1832  	CALL p256MulInternal<>(SB)
1833  	VLR  T0, S1L
1834  	VLR  T1, S1H
1835  
1836  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
1837  	VL   16(P1ptr), X1       // X1H
1838  	VPDI $0x4, X1, X1, X1
1839  	VL   0(P1ptr), X0        // X1L
1840  	VPDI $0x4, X0, X0, X0
1841  	CALL p256MulInternal<>(SB)
1842  	VLR  T0, U1L
1843  	VLR  T1, U1H
1844  
1845  	// SUB(H<H-T)            // H  = H-U1
1846  	p256SubInternal(HH,HL,HH,HL,T1,T0)
1847  
1848  	// if H == 0 or H^P == 0 then ret=1 else ret=0
1849  	// clobbers T1H and T1L
      	// Each test ORs the two halves together and compares the result with zero; VCEQGS sets the
      	// condition code and MOVDEQ copies TRUE into ISZERO only when the compare reports equality.
      	// The XOR with PL/PH makes the second test succeed when H equals P.
1850  	MOVD   $0, ISZERO
1851  	MOVD   $1, TRUE
1852  	VZERO  ZER
1853  	VO     HL, HH, T1H
1854  	VCEQGS ZER, T1H, T1H
1855  	MOVDEQ TRUE, ISZERO
1856  	VX     HL, PL, T1L
1857  	VX     HH, PH, T1H
1858  	VO     T1L, T1H, T1H
1859  	VCEQGS ZER, T1H, T1H
1860  	MOVDEQ TRUE, ISZERO
1861  	MOVD   ISZERO, ret+24(FP)
1862  
1863  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
1864  	VL   80(P1ptr), X1       // Z1H
1865  	VPDI $0x4, X1, X1, X1
1866  	VL   64(P1ptr), X0       // Z1L
1867  	VPDI $0x4, X0, X0, X0
1868  	VL   80(P2ptr), Y1       // Z2H
1869  	VPDI $0x4, Y1, Y1, Y1
1870  	VL   64(P2ptr), Y0       // Z2L
1871  	VPDI $0x4, Y0, Y0, Y0
1872  	CALL p256MulInternal<>(SB)
1873  
1874  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
1875  	VLR  T0, X0
1876  	VLR  T1, X1
1877  	VLR  HL, Y0
1878  	VLR  HH, Y1
1879  	CALL p256MulInternal<>(SB)
      	// Store Z3 now. Z1 and Z2 are not loaded again after this point, so overwriting the z
      	// coordinate of an aliased input is harmless here.
1880  	VPDI $0x4, T1, T1, TT1
1881  	VST  TT1, 80(P3ptr)
1882  	VPDI $0x4, T0, T0, TT0
1883  	VST  TT0, 64(P3ptr)
1884  
1885  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1886  	VL   48(P1ptr), X1
1887  	VPDI $0x4, X1, X1, X1
1888  	VL   32(P1ptr), X0
1889  	VPDI $0x4, X0, X0, X0
1890  	VLR  S1L, Y0
1891  	VLR  S1H, Y1
1892  	CALL p256MulInternal<>(SB)
1893  	VLR  T0, S1L
1894  	VLR  T1, S1H
1895  
1896  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
1897  	VL   48(P2ptr), X1
1898  	VPDI $0x4, X1, X1, X1
1899  	VL   32(P2ptr), X0
1900  	VPDI $0x4, X0, X0, X0
1901  	VLR  RL, Y0
1902  	VLR  RH, Y1
1903  	CALL p256MulInternal<>(SB)
1904  
1905  	// SUB(R<T-S1)           // R  = T-S1
1906  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
1907  
1908  	// if R == 0 or R^P == 0 then ret=ret else ret=0
1909  	// clobbers T1H and T1L
      	// Same two-step test as for H above; the outcome is AND-ed with the flag already stored in
      	// the result slot, so the final return value is 1 only if both H and R passed.
1910  	MOVD   $0, ISZERO
1911  	MOVD   $1, TRUE
1912  	VZERO  ZER
1913  	VO     RL, RH, T1H
1914  	VCEQGS ZER, T1H, T1H
1915  	MOVDEQ TRUE, ISZERO
1916  	VX     RL, PL, T1L
1917  	VX     RH, PH, T1H
1918  	VO     T1L, T1H, T1H
1919  	VCEQGS ZER, T1H, T1H
1920  	MOVDEQ TRUE, ISZERO
1921  	AND    ret+24(FP), ISZERO
1922  	MOVD   ISZERO, ret+24(FP)
1923  
1924  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
1925  	VLR  HL, X0
1926  	VLR  HH, X1
1927  	VLR  HL, Y0
1928  	VLR  HH, Y1
1929  	CALL p256SqrInternal<>(SB)
1930  
1931  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
1932  	VLR  T0, Y0
1933  	VLR  T1, Y1
1934  	CALL p256MulInternal<>(SB)
1935  	VLR  T0, T2L
1936  	VLR  T1, T2H
1937  
1938  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
1939  	VLR  U1L, X0
1940  	VLR  U1H, X1
1941  	CALL p256MulInternal<>(SB)
1942  	VLR  T0, U1L
1943  	VLR  T1, U1H
1944  
1945  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
1946  	VLR  RL, X0
1947  	VLR  RH, X1
1948  	VLR  RL, Y0
1949  	VLR  RH, Y1
1950  	CALL p256SqrInternal<>(SB)
1951  
1952  	// SUB(T<T-T2)           // X3 = X3-T2
1953  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
1954  
1955  	// ADD(X<U1+U1)          // T1 = 2*U1
1956  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
1957  
1958  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
1959  	p256SubInternal(T1,T0,T1,T0,X1,X0)
      	// Store X3 via TT1/TT0 so that T1:T0 (X3) stays available for the next subtraction.
1960  	VPDI $0x4, T1, T1, TT1
1961  	VST  TT1, 16(P3ptr)
1962  	VPDI $0x4, T0, T0, TT0
1963  	VST  TT0, 0(P3ptr)
1964  
1965  	// SUB(Y<U1-T)           // Y3 = U1-X3
1966  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
1967  
1968  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
1969  	VLR  RL, X0
1970  	VLR  RH, X1
1971  	CALL p256MulInternal<>(SB)
1972  	VLR  T0, U1L
1973  	VLR  T1, U1H
1974  
1975  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
1976  	VLR  S1L, X0
1977  	VLR  S1H, X1
1978  	VLR  T2L, Y0
1979  	VLR  T2H, Y1
1980  	CALL p256MulInternal<>(SB)
1981  
1982  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
1983  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
1984  	VPDI $0x4, T1, T1, T1
1985  	VST  T1, 48(P3ptr)
1986  	VPDI $0x4, T0, T0, T0
1987  	VST  T0, 32(P3ptr)
1988  
1989  	RET
1990