// p256_asm_ppc64le.s

   1  // Copyright 2019 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
   7  #include "textflag.h"
   8  
   9  // This is a port of the s390x asm implementation.
  10  // to ppc64le.
  11  
  12  // Some changes were needed due to differences in
  13  // the Go opcodes and/or available instructions
  14  // between s390x and ppc64le.
  15  
  16  // 1. There were operand order differences in the
  17  // VSUBUQM, VSUBCUQ, and VSEL instructions.
  18  
  19  // 2. ppc64 does not have a multiply high and low
  20  // like s390x, so those were implemented using
  21  // macros to compute the equivalent values.
  22  
  23  // 3. The LVX, STVX instructions on ppc64 require
  24  // 16 byte alignment of the data.  To avoid that
  25  // requirement, data is loaded using LXVD2X and
  26  // STXVD2X with VPERM to reorder bytes correctly.
  27  
  28  // I have identified some areas where I believe
  29  // changes would be needed to make this work for big
  30  // endian; however additional changes beyond what I
  31  // have noted are most likely needed to make it work.
  32  // - The string used with VPERM to swap the byte order
  33  //   for loads and stores.
  34  // - The constants that are loaded from CPOOL.
  35  //
  36  
  37  // The following constants are defined in an order
  38  // that is correct for use with LXVD2X/STXVD2X
  39  // on little endian.
// p256<>: the P-256 prime (in the limb order expected by the
// LXVD2X loads below) followed by VPERM byte-selection masks
// used by the reduction rounds in p256FromMont.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
// p256mul<>: the prime in its "original" limb order, the VPERM
// selectors used by p256MulInternal, and 2^256 mod P256 (the
// Montgomery representation of 1).
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants
// The flag value 8 is RODATA (see textflag.h): read-only data.
// NOTE(review): no DATA entries for p256ord<> appear in this chunk,
// so as declared it is all zeros — confirm it is initialized or
// only referenced elsewhere.
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
  75  
  76  // The following macros are used to implement the ppc64le
  77  // equivalent function from the corresponding s390x
  78  // instruction for vector multiply high, low, and add,
  79  // since there aren't exact equivalent instructions.
  80  // The corresponding s390x instructions appear in the
  81  // comments.
  82  // Implementation for big endian would have to be
  83  // investigated, I think it would be different.
  84  //
  85  //
  86  // Vector multiply word
  87  //
  88  //	VMLF  x0, x1, out_low
  89  //	VMLHF x0, x1, out_hi
// VMULT(x1, x2, out_low, out_hi):
// Per 32-bit word lane, out_low = low 32 bits and out_hi = high
// 32 bits of x1*x2. VMULEUW/VMULOUW form the full 64-bit products
// of the even/odd word lanes; VMRGEW/VMRGOW re-interleave their
// high and low halves back into word lanes.
// Clobbers TMP1 and TMP2.
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
  95  
  96  //
  97  // Vector multiply add word
  98  //
  99  //	VMALF  x0, x1, y, out_low
 100  //	VMALHF x0, x1, y, out_hi
// VMULT_ADD(x1, x2, y, one, out_low, out_hi):
// Per 32-bit word lane, out_low/out_hi = low/high halves of
// x1*x2 + y. The addend y is first multiplied by `one` (a vector
// of word 1s) so it is widened into the same even/odd 64-bit lane
// layout as the VMULEUW/VMULOUW products before the doubleword
// adds; VMRGOW/VMRGEW then re-interleave the sums into word lanes.
// Clobbers TMP1 and TMP2.
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW  y, one, TMP2; \
	VMULOUW  y, one, TMP1; \
	VMULEUW  x1, x2, out_low; \
	VMULOUW  x1, x2, out_hi; \
	VADDUDM  TMP2, out_low, TMP2; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VMRGOW   TMP2, TMP1, out_low; \
	VMRGEW   TMP2, TMP1, out_hi
 110  
// NOTE(review): leftover aliases from the s390x port — defined and
// immediately undefined, so they have no effect.
#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

// Register aliases for p256NegCond.
#define P1ptr   R3
#define CPOOL   R7

#define Y1L   V0
#define Y1H   V1
#define T1L   V2
#define T1H   V3

// PL/PH: low/high 128-bit halves of the P-256 prime.
#define PL    V30
#define PH    V31

#define CAR1  V6

#define SEL    V8
#define ZER    V9

// func p256NegCond(val *p256Point, cond int)
//
// Constant-time conditional negation of the 256-bit element at
// *val: if cond != 0 then *val = P256 - *val, otherwise *val is
// unchanged. Both the original and the negated value are always
// computed; VSEL picks one, so no branch depends on cond.
// NOTE(review): the comment's pointee type (*p256Point) is taken
// from the source; the code only touches the 32 bytes at val —
// confirm against the Go declaration.
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	// Copy cond into SEL (cond is R1 + 8 (cond offset) + 32)
	MOVD $40, R17
	LXVDSX (R1)(R17), SEL
	// Zeroize ZER
	VSPLTISB $0, ZER
	// SEL controls whether to return the original value (Y1H/Y1L)
	// or the negated value (T1H/T1L).
	// After the compare, SEL is all-ones iff cond == 0.
	VCMPEQUD SEL, ZER, SEL

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Load the two 16-byte halves of *val.
	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	// Swap doublewords so the limbs are in true little-endian order.
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	// PL/PH = P256 from the constant pool.
	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	// 256-bit subtract: T1H||T1L = P256 - (Y1H||Y1L).
	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	// Keep negated value when cond != 0 (SEL zeros), original otherwise.
	VSEL T1H, Y1H, SEL, T1H
	VSEL T1L, Y1L, SEL, T1L

	// Restore memory byte order before storing.
	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef CAR1
#undef SEL
#undef ZER
 183  
// Register aliases for p256MovCond.
#define P3ptr   R3
#define P1ptr   R4
#define P2ptr   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11
#define SEL    V12
#define ZER    V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
//
// Constant-time conditional copy of a full 96-byte point:
// *res = *b when cond == 0, *res = *a otherwise. Both a and b
// are always loaded and the choice is made with VSEL, so no
// branch or memory access pattern depends on cond.
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20
	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b:
	// all-ones iff cond == 0, which selects the b inputs below.
	VCMPEQUD SEL, ZER, SEL

	// Load the three coordinates of a (X, Y, Z; 32 bytes each).
	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	// Load the three coordinates of b.
	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	// Merge: each result register gets b's bytes where SEL is set
	// (cond == 0) and a's bytes elsewhere.
	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	// Store the selected point to *res.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER
 272  
// Register aliases for p256Select.
#define P3ptr   R3
#define P1ptr   R4
#define COUNT   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
// func p256Select(res *p256Point, table *p256Table, idx int)
// NOTE(review): comment previously said `point *p256Point`; the
// code reads res+0(FP), so the first parameter is named res.
//
// Constant-time table lookup: scans all 16 entries of the table
// (96 bytes each) and accumulates, via VSEL, the entry whose
// 1-based position equals idx. Every entry is read regardless of
// idx; if idx == 0 no entry matches and *res is left all zeros.
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	// Splat the low byte of idx across IDX.
	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX    // splat byte
	VSPLTISB $1, ONE          // VREPIB $1, ONE
	VSPLTISB $1, SEL2         // VREPIB $1, SEL2 (current entry number, starts at 1)
	MOVD     $16, COUNT	  // len(p256Table)
	MOVD     COUNT, CTR       // set up ctr

	// Accumulators start at zero; they only change on an idx match.
	VSPLTISB $0, X1H // VZERO  X1H
	VSPLTISB $0, X1L // VZERO  X1L
	VSPLTISB $0, Y1H // VZERO  Y1H
	VSPLTISB $0, Y1L // VZERO  Y1L
	VSPLTISB $0, Z1H // VZERO  Z1H
	VSPLTISB $0, Z1L // VZERO  Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2    // VAB  SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr         // advance by sizeof(p256Point)
	BDNZ    loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
 378  
// Register aliases for p256SelectAffine.
#define P3ptr   R3
#define P1ptr   R4
#define COUNT   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
// NOTE(review): the Z aliases above/below are defined and undefined
// for symmetry with p256Select but are not used in this function.
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Constant-time lookup of an affine point (X and Y only, 64 bytes
// per entry) from a 32-entry table. All 32 entries are read; VSEL
// accumulates the entry whose 1-based position equals idx, and
// idx == 0 leaves *res all zeros. No branch depends on idx.
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	// Splat the low byte of idx across IDX.
	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX    // splat byte

	VSPLTISB $1, ONE    // Vector with byte 1s
	VSPLTISB $1, SEL2   // Vector with byte 1s (current entry number)
	MOVD     $32, COUNT // len(p256AffineTable)
	MOVD     COUNT, CTR // loop count

	// Accumulators start at zero; only an idx match overwrites them.
	VSPLTISB $0, X1H // VZERO  X1H
	VSPLTISB $0, X1L // VZERO  X1L
	VSPLTISB $0, Y1H // VZERO  Y1H
	VSPLTISB $0, Y1L // VZERO  Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2    // Increment SEL2 bytes by 1
	ADD     $64, P1ptr         // Next chunk (sizeof affine entry)
	BDNZ	loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
 464  
// Register aliases for p256FromMont.
#define res_ptr R3
#define x_ptr   R4
#define CPOOL   R7

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

// func p256FromMont(res, in *p256Element)
//
// Converts in out of Montgomery form: res = in * 2^-256 mod P256.
// Four identical reduction rounds each fold one 64-bit limb: a
// reduction value is built from the low limb with VPERM selectors
// (exploiting the special form of the P-256 prime), the 256-bit
// accumulator T2||T1||T0 is shifted right 64 bits with VSLDOI, and
// the reduction value is added back with full carry propagation.
// A final conditional subtraction of P brings the result into
// [0, P). Straight-line code: no branches, constant time.
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	// Load the input element (two 16-byte halves).
	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	// Shift the accumulator right one 64-bit limb.
	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	// Add the reduction value with carry propagation.
	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------
	// Final conditional subtraction: compute T - P with borrow
	// into T2, then use T2 as the VSEL mask to keep either the
	// subtracted or the original value (branch-free).

	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
 608  
 609  // ---------------------------------------
 610  // p256MulInternal
 611  // V0-V3 V30,V31 - Not Modified
 612  // V4-V15 V27-V29 - Volatile
 613  
 614  #define CPOOL   R7
 615  
 616  // Parameters
 617  #define X0    V0 // Not modified
 618  #define X1    V1 // Not modified
 619  #define Y0    V2 // Not modified
 620  #define Y1    V3 // Not modified
 621  #define T0    V4 // Result
 622  #define T1    V5 // Result
 623  #define P0    V30 // Not modified
 624  #define P1    V31 // Not modified
 625  
 626  // Temporaries: lots of reused vector regs
 627  #define YDIG  V6 // Overloaded with CAR2
 628  #define ADD1H V7 // Overloaded with ADD3H
 629  #define ADD2H V8 // Overloaded with ADD4H
 630  #define ADD3  V9 // Overloaded with SEL2,SEL5
 631  #define ADD4  V10 // Overloaded with SEL3,SEL6
 632  #define RED1  V11 // Overloaded with CAR2
 633  #define RED2  V12
 634  #define RED3  V13 // Overloaded with SEL1
 635  #define T2    V14
 636  // Overloaded temporaries
 637  #define ADD1  V4 // Overloaded with T0
 638  #define ADD2  V5 // Overloaded with T1
 639  #define ADD3H V7 // Overloaded with ADD1H
 640  #define ADD4H V8 // Overloaded with ADD2H
 641  #define ZER   V28 // Overloaded with TMP1
 642  #define CAR1  V6 // Overloaded with YDIG
 643  #define CAR2  V11 // Overloaded with RED1
 644  // Constant Selects
 645  #define SEL1  V13 // Overloaded with RED3
 646  #define SEL2  V9 // Overloaded with ADD3,SEL5
 647  #define SEL3  V10 // Overloaded with ADD4,SEL6
 648  #define SEL4  V6 // Overloaded with YDIG,CAR1
 649  #define SEL5  V9 // Overloaded with ADD3,SEL2
 650  #define SEL6  V10 // Overloaded with ADD4,SEL3
 651  
 652  // TMP1, TMP2 used in
 653  // VMULT macros
 654  #define TMP1  V13 // Overloaded with RED3
 655  #define TMP2  V27
 656  #define ONE   V29 // 1s splatted by word
 657  
 658  /* *
 659   * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 660   * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 661   * With you, SIMD be...
 662   *
 663   *                                           +--------+--------+
 664   *                                  +--------|  RED2  |  RED1  |
 665   *                                  |        +--------+--------+
 666   *                                  |       ---+--------+--------+
 667   *                                  |  +---- T2|   T1   |   T0   |--+
 668   *                                  |  |    ---+--------+--------+  |
 669   *                                  |  |                            |
 670   *                                  |  |    ======================= |
 671   *                                  |  |                            |
 672   *                                  |  |       +--------+--------+<-+
 673   *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
 674   *                                  |  |       +--------+--------+  |     |
 675   *                                  |  |     +--------+--------+<---+     |
 676   *                                  |  |     | ADD2H  | ADD1H  |--+       |
 677   *                                  |  |     +--------+--------+  |       |
 678   *                                  |  |     +--------+--------+<-+       |
 679   *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
 680   *                                  |  |     +--------+--------+  | |     |
 681   *                                  |  |   +--------+--------+<---+ |     |
 682   *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 683   *                                  |  |   +--------+--------+      | |   V
 684   *                                  |  | ------------------------   | | +--------+
 685   *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
 686   *                                  |  |                            | | +--------+
 687   *                                  |  +---->+--------+--------+    | |   |
 688   *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
 689   *                                  |        +--------+--------+    | |   |
 690   *                                  +---->---+--------+--------+    | |   |
 691   *                                         T2|   T1   |   T0   |----+ |   |
 692   *                                        ---+--------+--------+    | |   |
 693   *                                        ---+--------+--------+<---+ |   |
 694   *                                    +--- T2|   T1   |   T0   |----------+
 695   *                                    |   ---+--------+--------+      |   |
 696   *                                    |  +--------+--------+<-------------+
 697   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 698   *                                    |  +--------+--------+     |    |   |
 699   *                                    |  +--------+<----------------------+
 700   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
 701   *                                    |  +--------+              |    |
 702   *                                    +--->+--------+--------+   |    |
 703   *                                         |   T1   |   T0   |--------+
 704   *                                         +--------+--------+   |    |
 705   *                                   --------------------------- |    |
 706   *                                                               |    |
 707   *                                       +--------+--------+<----+    |
 708   *                                       |  RED2  |  RED1  |          |
 709   *                                       +--------+--------+          |
 710   *                                      ---+--------+--------+<-------+
 711   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 712   *                                      ---+--------+--------+
 713   *
 714   *                                                                *Mi obra de arte de siglo XXI @vpaprots
 715   *
 716   *
 717   * First group is special, doesn't get the two inputs:
 718   *                                             +--------+--------+<-+
 719   *                                     +-------|  ADD2  |  ADD1  |--|-----+
 720   *                                     |       +--------+--------+  |     |
 721   *                                     |     +--------+--------+<---+     |
 722   *                                     |     | ADD2H  | ADD1H  |--+       |
 723   *                                     |     +--------+--------+  |       |
 724   *                                     |     +--------+--------+<-+       |
 725   *                                     |     |  ADD4  |  ADD3  |--|-+     |
 726   *                                     |     +--------+--------+  | |     |
 727   *                                     |   +--------+--------+<---+ |     |
 728   *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
 729   *                                     |   +--------+--------+      | |   V
 730   *                                     | ------------------------   | | +--------+
 731   *                                     |                            | | |  RED3  |  [d0 0 0 d0]
 732   *                                     |                            | | +--------+
 733   *                                     +---->+--------+--------+    | |   |
 734   *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
 735   *                                           +--------+--------+    | |   |
 736   *                                        ---+--------+--------+<---+ |   |
 737   *                                    +--- T2|   T1   |   T0   |----------+
 738   *                                    |   ---+--------+--------+      |   |
 739   *                                    |  +--------+--------+<-------------+
 740   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
 741   *                                    |  +--------+--------+     |    |   |
 742   *                                    |  +--------+<----------------------+
 743   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
 744   *                                    |  +--------+              |    |
 745   *                                    +--->+--------+--------+   |    |
 746   *                                         |   T1   |   T0   |--------+
 747   *                                         +--------+--------+   |    |
 748   *                                   --------------------------- |    |
 749   *                                                               |    |
 750   *                                       +--------+--------+<----+    |
 751   *                                       |  RED2  |  RED1  |          |
 752   *                                       +--------+--------+          |
 753   *                                      ---+--------+--------+<-------+
 754   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
 755   *                                      ---+--------+--------+
 756   *
 757   * Last 'group' needs to RED2||RED1 shifted less
 758   */
 759  TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
 760  	// CPOOL loaded from caller
 761  	MOVD $16, R16
 762  	MOVD $32, R17
 763  	MOVD $48, R18
 764  	MOVD $64, R19
 765  	MOVD $80, R20
 766  	MOVD $96, R21
 767  	MOVD $112, R22
 768  
 769  	// ---------------------------------------------------
 770  
 771  	VSPLTW $3, Y0, YDIG // VREPF Y0 is input
 772  
 773  	//	VMLHF X0, YDIG, ADD1H
 774  	//	VMLHF X1, YDIG, ADD2H
 775  	//	VMLF  X0, YDIG, ADD1
 776  	//	VMLF  X1, YDIG, ADD2
 777  	//
 778  	VMULT(X0, YDIG, ADD1, ADD1H)
 779  	VMULT(X1, YDIG, ADD2, ADD2H)
 780  
 781  	VSPLTISW $1, ONE
 782  	VSPLTW $2, Y0, YDIG // VREPF
 783  
 784  	//	VMALF  X0, YDIG, ADD1H, ADD3
 785  	//	VMALF  X1, YDIG, ADD2H, ADD4
 786  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
 787  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
 788  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
 789  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
 790  
 791  	LXVD2X   (R17)(CPOOL), SEL1
 792  	VSPLTISB $0, ZER               // VZERO ZER
 793  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 794  
 795  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free	// VSLDB
 796  	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free	// VSLDB
 797  
 798  	VADDCUQ  T0, ADD3, CAR1     // VACCQ
 799  	VADDUQM  T0, ADD3, T0       // ADD3 Free	// VAQ
 800  	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
 801  	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free	// VACQ
 802  
 803  	LXVD2X  (R18)(CPOOL), SEL2
 804  	LXVD2X  (R19)(CPOOL), SEL3
 805  	LXVD2X  (R20)(CPOOL), SEL4
 806  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 807  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 808  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 809  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ
 810  
 811  	VSLDOI $12, T1, T0, T0 // VSLDB
 812  	VSLDOI $12, T2, T1, T1 // VSLDB
 813  
 814  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
 815  	VADDUQM  T0, ADD3H, T0       // VAQ
 816  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
 817  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
 818  
 819  	// ---------------------------------------------------
 820  
 821  	VSPLTW $1, Y0, YDIG                // VREPF
 822  
 823  	//	VMALHF X0, YDIG, T0, ADD1H
 824  	//	VMALHF X1, YDIG, T1, ADD2H
 825  	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
 826  	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
 827  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
 828  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
 829  
 830  	VSPLTW $0, Y0, YDIG // VREPF
 831  
 832  	//	VMALF  X0, YDIG, ADD1H, ADD3
 833  	//	VMALF  X1, YDIG, ADD2H, ADD4
 834  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
 835  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
 836  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
 837  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
 838  
 839  	VSPLTISB $0, ZER               // VZERO ZER
 840  	LXVD2X   (R17)(CPOOL), SEL1
 841  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 842  
 843  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0		// VSLDB
 844  	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free	// VSLDB
 845  
 846  	VADDCUQ  T0, RED1, CAR1     // VACCQ
 847  	VADDUQM  T0, RED1, T0       // VAQ
 848  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
 849  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
 850  
 851  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
 852  	VADDUQM  T0, ADD3, T0         // VAQ
 853  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
 854  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
 855  	VADDUQM  T2, CAR2, T2         // VAQ
 856  
 857  	LXVD2X  (R18)(CPOOL), SEL2
 858  	LXVD2X  (R19)(CPOOL), SEL3
 859  	LXVD2X  (R20)(CPOOL), SEL4
 860  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 861  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 862  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 863  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
 864  
 865  	VSLDOI $12, T1, T0, T0 // VSLDB
 866  	VSLDOI $12, T2, T1, T1 // VSLDB
 867  
 868  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
 869  	VADDUQM  T0, ADD3H, T0       // VAQ
 870  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
 871  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
 872  
 873  	// ---------------------------------------------------
 874  
 875  	VSPLTW $3, Y1, YDIG                // VREPF
 876  
 877  	//	VMALHF X0, YDIG, T0, ADD1H
 878  	//	VMALHF X1, YDIG, T1, ADD2H
 879  	//	VMALF  X0, YDIG, T0, ADD1
 880  	//	VMALF  X1, YDIG, T1, ADD2
 881  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
 882  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
 883  
 884  	VSPLTW $2, Y1, YDIG // VREPF
 885  
 886  	//	VMALF  X0, YDIG, ADD1H, ADD3
 887  	//	VMALF  X1, YDIG, ADD2H, ADD4
 888  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
 889  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
 890  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
 891  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
 892  
 893  	LXVD2X   (R17)(CPOOL), SEL1
 894  	VSPLTISB $0, ZER               // VZERO ZER
 895  	LXVD2X   (R17)(CPOOL), SEL1
 896  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 897  
 898  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free		// VSLDB
 899  	VSLDOI $12, T2, ADD2, T1   // ADD2 Free		// VSLDB
 900  
 901  	VADDCUQ  T0, RED1, CAR1     // VACCQ
 902  	VADDUQM  T0, RED1, T0       // VAQ
 903  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
 904  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
 905  
 906  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
 907  	VADDUQM  T0, ADD3, T0         // VAQ
 908  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
 909  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
 910  	VADDUQM  T2, CAR2, T2         // VAQ
 911  
 912  	LXVD2X  (R18)(CPOOL), SEL2
 913  	LXVD2X  (R19)(CPOOL), SEL3
 914  	LXVD2X  (R20)(CPOOL), SEL4
 915  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
 916  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
 917  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
 918  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
 919  
 920  	VSLDOI $12, T1, T0, T0 // VSLDB
 921  	VSLDOI $12, T2, T1, T1 // VSLDB
 922  
 923  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
 924  	VADDUQM  T0, ADD3H, T0       // VAQ
 925  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
 926  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
 927  
 928  	// ---------------------------------------------------
 929  
 930  	VSPLTW $1, Y1, YDIG                // VREPF
 931  
 932  	//	VMALHF X0, YDIG, T0, ADD1H
 933  	//	VMALHF X1, YDIG, T1, ADD2H
 934  	//	VMALF  X0, YDIG, T0, ADD1
 935  	//	VMALF  X1, YDIG, T1, ADD2
 936  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
 937  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
 938  
 939  	VSPLTW $0, Y1, YDIG // VREPF
 940  
 941  	//	VMALF  X0, YDIG, ADD1H, ADD3
 942  	//	VMALF  X1, YDIG, ADD2H, ADD4
 943  	//	VMALHF X0, YDIG, ADD1H, ADD3H
 944  	//	VMALHF X1, YDIG, ADD2H, ADD4H
 945  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
 946  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
 947  
 948  	VSPLTISB $0, ZER               // VZERO ZER
 949  	LXVD2X   (R17)(CPOOL), SEL1
 950  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
 951  
 952  	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
 953  	VSLDOI $12, T2, ADD2, T1   // VSLDB
 954  
 955  	VADDCUQ  T0, RED1, CAR1     // VACCQ
 956  	VADDUQM  T0, RED1, T0       // VAQ
 957  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
 958  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
 959  
 960  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
 961  	VADDUQM  T0, ADD3, T0         // VAQ
 962  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
 963  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
 964  	VADDUQM  T2, CAR2, T2         // VAQ
 965  
 966  	LXVD2X  (R21)(CPOOL), SEL5
 967  	LXVD2X  (R22)(CPOOL), SEL6
 968  	VPERM   T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
 969  	VPERM   T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
 970  	VSUBUQM RED2, RED1, RED2     // Guaranteed not to underflow	// VSQ
 971  
 972  	VSLDOI $12, T1, T0, T0 // VSLDB
 973  	VSLDOI $12, T2, T1, T1 // VSLDB
 974  
 975  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
 976  	VADDUQM  T0, ADD3H, T0       // VAQ
 977  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
 978  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
 979  
 980  	VADDCUQ  T0, RED1, CAR1       // VACCQ
 981  	VADDUQM  T0, RED1, T0         // VAQ
 982  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
 983  	VADDEUQM T1, RED2, CAR1, T1   // VACQ
 984  	VADDUQM  T2, CAR2, T2         // VAQ
 985  
 986  	// ---------------------------------------------------
 987  
 988  	VSPLTISB $0, RED3            // VZERO   RED3
 989  	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
 990  	VSUBUQM  T0, P0, ADD1H       // VSQ
 991  	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
 992  	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
 993  	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ
 994  
 995  	// what output to use, ADD2H||ADD1H or T1||T0?
 996  	VSEL ADD1H, T0, T2, T0
 997  	VSEL ADD2H, T1, T2, T1
 998  	RET
 999  
1000  #undef CPOOL
1001  
1002  #undef X0
1003  #undef X1
1004  #undef Y0
1005  #undef Y1
1006  #undef T0
1007  #undef T1
1008  #undef P0
1009  #undef P1
1010  
1011  #undef SEL1
1012  #undef SEL2
1013  #undef SEL3
1014  #undef SEL4
1015  #undef SEL5
1016  #undef SEL6
1017  
1018  #undef YDIG
1019  #undef ADD1H
1020  #undef ADD2H
1021  #undef ADD3
1022  #undef ADD4
1023  #undef RED1
1024  #undef RED2
1025  #undef RED3
1026  #undef T2
1027  #undef ADD1
1028  #undef ADD2
1029  #undef ADD3H
1030  #undef ADD4H
1031  #undef ZER
1032  #undef CAR1
1033  #undef CAR2
1034  
1035  #undef TMP1
1036  #undef TMP2
1037  
// p256SubInternal computes (T1:T0) = (X1:X0) - (Y1:Y0) mod P256.
// The raw 256-bit difference is computed with the vector quadword
// subtract/borrow chain; SEL1 is then derived from the final borrow
// and turned into a select mask, P (PH:PL) is added back into a
// temporary (TT1:TT0), and VSEL keeps whichever result is the
// correctly reduced one.
// Clobbers: ZER, SEL1, CAR1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \ // borrow out of low quadword
	VSUBUQM  X0, Y0, T0         \ // low 128 bits of difference
	VSUBECUQ X1, Y1, CAR1, SEL1 \ // final borrow of the 256-bit subtract
	VSUBEUQM X1, Y1, CAR1, T1   \ // high 128 bits of difference
	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ: turn borrow into a select mask
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ: add P back in case of underflow
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL     TT0, T0, SEL1, T0  \ // choose raw vs. P-corrected result
	VSEL     TT1, T1, SEL1, T1  \
1052  
// p256AddInternal computes (T1:T0) = (X1:X0) + (Y1:Y0) mod P256.
// The full 256-bit sum is formed first, with the carry out of the top
// captured in T2.  P (PH:PL) is then subtracted into temporaries
// (TT1:TT0); SEL1 is derived from the overall carry/borrow chain and
// used with VSEL to keep the sum if it was already < P, or sum-P
// otherwise.
// Clobbers: ZER, SEL1, CAR1, CAR2, T2, TT0, TT1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1        \ // carry out of low quadword
	VADDUQM  X0, Y0, T0          \ // low 128 bits of sum
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ: carry out of the 256-bit sum
	VADDEUQM X1, Y1, CAR1, T1    \ // high 128 bits of sum
	                             \
	VSPLTISB $0, ZER             \
	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ: trial subtraction of P
	VSUBUQM  T0, PL, TT0         \
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \ // fold top carry into the select mask
	                             \
	VSEL     TT0, T0, SEL1, T0   \ // keep sum or sum-P, whichever is reduced
	VSEL     TT1, T1, SEL1, T1
1068  
// p256HalfInternal computes (T1:T0) = (X1:X0) / 2 mod P256.
// SEL1 becomes a mask derived from the low bit of X0 (odd/even).  P is
// conditionally added (so an odd value becomes even, with the carry out
// of the top kept in T2), then the 257-bit result is shifted right one
// bit: each half is shifted right with VSR while the bit that crosses a
// 128-bit boundary is extracted with VSLDOI, shifted left 7 with VSL,
// and merged back with VOR.
// Clobbers: ZER, SEL1, CAR1, T2, TT0, TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \
	VSUBEUQM ZER, ZER, X0, SEL1 \ // mask from the parity (low bit) of X0
	                            \
	VADDCUQ  X0, PL, CAR1       \ // X + P, making the odd case even
	VADDUQM  X0, PL, T0         \
	VADDECUQ X1, PH, CAR1, T2   \ // T2 holds the carry out of the top
	VADDEUQM X1, PH, CAR1, T1   \
	                            \
	VSEL     T0, X0, SEL1, T0   \ // keep X or X+P depending on parity
	VSEL     T1, X1, SEL1, T1   \
	VSEL     T2, ZER, SEL1, T2  \
	                            \
	VSLDOI   $15, T2, ZER, TT1  \ // bits crossing the 128-bit boundaries
	VSLDOI   $15, T1, ZER, TT0  \
	VSPLTISB $1, SEL1           \
	VSR      T0, SEL1, T0       \ // VSRL: shift each half right by one
	VSR      T1, SEL1, T1       \
	VSPLTISB $7, SEL1           \ // VREPIB
	VSL      TT0, SEL1, TT0     \ // position the carried-in bits at the top
	VSL      TT1, SEL1, TT1     \
	VOR      T0, TT0, T0        \ // merge shifted halves with carried bits
	VOR      T1, TT1, T1
1092  
1093  #define res_ptr R3
1094  #define x_ptr   R4
1095  #define y_ptr   R5
1096  #define CPOOL   R7
1097  #define TEMP    R8
1098  #define N       R9
1099  
1100  // Parameters
1101  #define X0    V0
1102  #define X1    V1
1103  #define Y0    V2
1104  #define Y1    V3
1105  #define T0    V4
1106  #define T1    V5
1107  
1108  // Constants
1109  #define P0    V30
1110  #define P1    V31
// func p256Mul(res, in1, in2 *p256Element)
//
// Montgomery multiplication mod P256: loads the two 256-bit operands,
// swaps doublewords with XXPERMDI so LXVD2X yields the layout
// p256MulInternal expects on little endian, loads the P256 constants
// from the p256mul pool, and stores the product from T1:T0.
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL


	// Load first operand into X1:X0.
	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Swap doublewords for little-endian use.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	// Load second operand into Y1:Y0.
	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	// Load the P256 modulus constants used by p256MulInternal.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	// Reload the pool pointer; CPOOL may not survive the call.
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap doublewords back and store the result T1:T0.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1146  
// func p256Sqr(res, in *p256Element, n int)
//
// Repeated Montgomery squaring mod P256: squares the input n times by
// feeding each product back as the next multiplicand.  Squaring is done
// by calling p256MulInternal with both operands equal.  The loop
// counter is kept in the n argument slot on the stack because N (R9)
// is not preserved across the CALL.
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Load the operand and swap doublewords for little endian.
	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR	X0, X0, Y0
	VOR	X1, X1, Y1

	// Reload the modulus constants each iteration; the call clobbers
	// vector state.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	// Decrement the iteration count; fall out when it reaches zero.
	MOVD	n+16(FP), N
	ADD	$-1, N
	CMP	$0, N
	BEQ	done
	MOVD	N, n+16(FP)	// Save counter to avoid clobber
	VOR	T0, T0, X0	// feed the square back in as the next input
	VOR	T1, T1, X1
	BR	sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap doublewords back and store the final square.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1190  
1191  #undef res_ptr
1192  #undef x_ptr
1193  #undef y_ptr
1194  #undef CPOOL
1195  
1196  #undef X0
1197  #undef X1
1198  #undef Y0
1199  #undef Y1
1200  #undef T0
1201  #undef T1
1202  #undef P0
1203  #undef P1
1204  
1205  #define P3ptr   R3
1206  #define P1ptr   R4
1207  #define P2ptr   R5
1208  #define CPOOL   R7
1209  
1210  // Temporaries in REGs
1211  #define Y2L    V15
1212  #define Y2H    V16
1213  #define T1L    V17
1214  #define T1H    V18
1215  #define T2L    V19
1216  #define T2H    V20
1217  #define T3L    V21
1218  #define T3H    V22
1219  #define T4L    V23
1220  #define T4H    V24
1221  
1222  // Temps for Sub and Add
1223  #define TT0  V11
1224  #define TT1  V12
1225  #define T2   V13
1226  
1227  // p256MulAsm Parameters
1228  #define X0    V0
1229  #define X1    V1
1230  #define Y0    V2
1231  #define Y1    V3
1232  #define T0    V4
1233  #define T1    V5
1234  
1235  #define PL    V30
1236  #define PH    V31
1237  
1238  // Names for zero/sel selects
1239  #define X1L    V0
1240  #define X1H    V1
1241  #define Y1L    V2 // p256MulAsmParmY
1242  #define Y1H    V3 // p256MulAsmParmY
1243  #define Z1L    V4
1244  #define Z1H    V5
1245  #define X2L    V0
1246  #define X2H    V1
1247  #define Z2L    V4
1248  #define Z2H    V5
1249  #define X3L    V17 // T1L
1250  #define X3H    V18 // T1H
1251  #define Y3L    V21 // T3L
1252  #define Y3H    V22 // T3H
1253  #define Z3L    V25
1254  #define Z3H    V26
1255  
1256  #define ZER   V6
1257  #define SEL1  V7
1258  #define CAR1  V8
1259  #define CAR2  V9
1260  /* *
1261   * Three operand formula:
1262   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1263   * T1 = Z1²
1264   * T2 = T1*Z1
1265   * T1 = T1*X2
1266   * T2 = T2*Y2
1267   * T1 = T1-X1
1268   * T2 = T2-Y1
1269   * Z3 = Z1*T1
1270   * T3 = T1²
1271   * T4 = T3*T1
1272   * T3 = T3*X1
1273   * T1 = 2*T3
1274   * X3 = T2²
1275   * X3 = X3-T1
1276   * X3 = X3-T4
1277   * T3 = T3-X3
1278   * T3 = T3*T2
1279   * T4 = T4*Y1
1280   * Y3 = T3-T4
1281  
1282   * Three operand formulas, but with MulInternal X,Y used to store temps
1283  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
1284  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
1285  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
1286  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
1287  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
1288  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
1289  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
1290  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
1291  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
1292  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
1293  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
1294  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
1295  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
1296  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
1297  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
1298  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
1299  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
1300  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
1301  
1302  	*/
1303  //
1304  // V27 is clobbered by p256MulInternal so must be
1305  // saved in a temp.
1306  //
1307  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets used as LXVD2X/STXVD2X indices throughout.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// Load Y2 and swap doublewords for little endian.
	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: sign == 0

	// Conditionally negate Y2 (compute P - Y2) according to sign,
	// selected below via the SEL1 mask. Constant time.
	VSUBCUQ  PL, Y2L, CAR1
	VSUBUQM  PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 */
	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0     // Z1H
	LXVD2X (R20)(P1ptr), X1     // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VOR  T0, T0, X0
	VOR  T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	MOVD   in2+16(FP), P2ptr // reload; pointer regs not preserved across CALL
	LXVD2X (R0)(P2ptr), Y0      // X2H
	LXVD2X (R16)(P2ptr), Y1     // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T1L
	VOR    T1, T1, T1H

	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  Y2L, Y2L, Y0
	VOR  Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
	LXVD2X (R19)(P1ptr), X0     // Z1H
	LXVD2X (R20)(P1ptr), X1     // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
	VOR  Y0, Y0, X0
	VOR  Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, X0
	VOR  T1, T1, X1

	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T4L
	VOR  T1, T1, T4H

	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0      // X1H
	LXVD2X (R16)(P1ptr), Y1     // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T3L
	VOR    T1, T1, T3H

	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T3L
	VOR  T1, T1, T3H

	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
	VOR    T4L, T4L, X0
	VOR    T4H, T4H, X1
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0     // Y1H
	LXVD2X (R18)(P1ptr), Y1     // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// Constant-time point selection using the sel and zero arguments:
	//	if (sel == 0) {
	//		copy(P3.x[:], X1)
	//		copy(P3.y[:], Y1)
	//		copy(P3.z[:], Z1)
	//	}

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD     $112, R26        // Get offset to sel+32
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: sel == 0 -> keep P1

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD   in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD     $120, R26        // Get the value from zero+40(FP)
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: zero == 0 -> keep P2 (affine)

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD    res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET
1537  
1538  #undef P3ptr
1539  #undef P1ptr
1540  #undef P2ptr
1541  #undef CPOOL
1542  
1543  #undef Y2L
1544  #undef Y2H
1545  #undef T1L
1546  #undef T1H
1547  #undef T2L
1548  #undef T2H
1549  #undef T3L
1550  #undef T3H
1551  #undef T4L
1552  #undef T4H
1553  
1554  #undef TT0
1555  #undef TT1
1556  #undef T2
1557  
1558  #undef X0
1559  #undef X1
1560  #undef Y0
1561  #undef Y1
1562  #undef T0
1563  #undef T1
1564  
1565  #undef PL
1566  #undef PH
1567  
1568  #undef X1L
1569  #undef X1H
1570  #undef Y1L
1571  #undef Y1H
1572  #undef Z1L
1573  #undef Z1H
1574  #undef X2L
1575  #undef X2H
1576  #undef Z2L
1577  #undef Z2H
1578  #undef X3L
1579  #undef X3H
1580  #undef Y3L
1581  #undef Y3H
1582  #undef Z3L
1583  #undef Z3H
1584  
1585  #undef ZER
1586  #undef SEL1
1587  #undef CAR1
1588  #undef CAR2
1589  
1590  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1591  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1592  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1593  #define P3ptr   R3
1594  #define P1ptr   R4
1595  #define CPOOL   R7
1596  
1597  // Temporaries in REGs
1598  #define X3L    V15
1599  #define X3H    V16
1600  #define Y3L    V17
1601  #define Y3H    V18
1602  #define T1L    V19
1603  #define T1H    V20
1604  #define T2L    V21
1605  #define T2H    V22
1606  #define T3L    V23
1607  #define T3H    V24
1608  
1609  #define X1L    V6
1610  #define X1H    V7
1611  #define Y1L    V8
1612  #define Y1H    V9
1613  #define Z1L    V10
1614  #define Z1H    V11
1615  
1616  // Temps for Sub and Add
1617  #define TT0  V11
1618  #define TT1  V12
1619  #define T2   V13
1620  
1621  // p256MulAsm Parameters
1622  #define X0    V0
1623  #define X1    V1
1624  #define Y0    V2
1625  #define Y1    V3
1626  #define T0    V4
1627  #define T1    V5
1628  
1629  #define PL    V30
1630  #define PH    V31
1631  
1632  #define Z3L    V23
1633  #define Z3H    V24
1634  
1635  #define ZER   V26
1636  #define SEL1  V27
1637  #define CAR1  V28
1638  #define CAR2  V29
1639  /*
1640   * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1641   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1642   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1643   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
1644   * 	B  = 2Y₁
1645   * 	Z₃ = B×Z₁
1646   * 	C  = B²
1647   * 	D  = C×X₁
1648   * 	X₃ = A²-2D
1649   * 	Y₃ = (D-X₃)×A-C²/2
1650   *
1651   * Three-operand formula:
1652   *       T1 = Z1²
1653   *       T2 = X1-T1
1654   *       T1 = X1+T1
1655   *       T2 = T2*T1
1656   *       T2 = 3*T2
1657   *       Y3 = 2*Y1
1658   *       Z3 = Y3*Z1
1659   *       Y3 = Y3²
1660   *       T3 = Y3*X1
1661   *       Y3 = Y3²
1662   *       Y3 = half*Y3
1663   *       X3 = T2²
1664   *       T1 = 2*T3
1665   *       X3 = X3-T1
1666   *       T1 = T3-X3
1667   *       T1 = T1*T2
1668   *       Y3 = T1-Y3
1669   */
// p256PointDoubleAsm(res, in *p256Point)
//
// Jacobian point doubling following the three-operand schedule in the
// comment block above; field operations are Montgomery mul/sqr via
// p256MulInternal plus the add/sub/half macros.  Each result coordinate
// is stored as soon as it is final to free registers.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets of the X/Y/Z coordinate halves within a point.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	// Swap doublewords for little endian.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Z3 is final: store it now (via TT0/TT1 so T0/T1 stay intact).
	// Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VOR    T0, T0, X0
	VOR    T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T3L
	VOR    T1, T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	// X3 is final: swap doublewords back and store.
	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	// Y3 is final: swap doublewords back and store.
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET
1793  
1794  #undef P3ptr
1795  #undef P1ptr
1796  #undef CPOOL
1797  #undef X3L
1798  #undef X3H
1799  #undef Y3L
1800  #undef Y3H
1801  #undef T1L
1802  #undef T1H
1803  #undef T2L
1804  #undef T2H
1805  #undef T3L
1806  #undef T3H
1807  #undef X1L
1808  #undef X1H
1809  #undef Y1L
1810  #undef Y1H
1811  #undef Z1L
1812  #undef Z1H
1813  #undef TT0
1814  #undef TT1
1815  #undef T2
1816  #undef X0
1817  #undef X1
1818  #undef Y0
1819  #undef Y1
1820  #undef T0
1821  #undef T1
1822  #undef PL
1823  #undef PH
1824  #undef Z3L
1825  #undef Z3H
1826  #undef ZER
1827  #undef SEL1
1828  #undef CAR1
1829  #undef CAR2
1830  
1831  #define P3ptr  R3
1832  #define P1ptr  R4
1833  #define P2ptr  R5
1834  #define CPOOL  R7
1835  #define TRUE   R14
1836  #define RES1   R9
1837  #define RES2   R10
1838  
1839  // Temporaries in REGs
1840  #define T1L   V16
1841  #define T1H   V17
1842  #define T2L   V18
1843  #define T2H   V19
1844  #define U1L   V20
1845  #define U1H   V21
1846  #define S1L   V22
1847  #define S1H   V23
1848  #define HL    V24
1849  #define HH    V25
1850  #define RL    V26
1851  #define RH    V27
1852  
1853  // Temps for Sub and Add
1854  #define ZER   V6
1855  #define SEL1  V7
1856  #define CAR1  V8
1857  #define CAR2  V9
1858  #define TT0  V11
1859  #define TT1  V12
1860  #define T2   V13
1861  
1862  // p256MulAsm Parameters
1863  #define X0    V0
1864  #define X1    V1
1865  #define Y0    V2
1866  #define Y1    V3
1867  #define T0    V4
1868  #define T1    V5
1869  
1870  #define PL    V30
1871  #define PH    V31
1872  /*
1873   * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1874   *
1875   * A = X₁×Z₂²
1876   * B = Y₁×Z₂³
1877   * C = X₂×Z₁²-A
1878   * D = Y₂×Z₁³-B
1879   * X₃ = D² - 2A×C² - C³
1880   * Y₃ = D×(A×C² - X₃) - B×C³
1881   * Z₃ = Z₁×Z₂×C
1882   *
1883   * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1884   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1885   *
1886   * T1 = Z1*Z1
1887   * T2 = Z2*Z2
1888   * U1 = X1*T2
1889   * H  = X2*T1
1890   * H  = H-U1
1891   * Z3 = Z1*Z2
1892   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1893   *
1894   * S1 = Z2*T2
1895   * S1 = Y1*S1
1896   * R  = Z1*T1
1897   * R  = Y2*R
1898   * R  = R-S1
1899   *
1900   * T1 = H*H
1901   * T2 = H*T1
1902   * U1 = U1*T1
1903   *
1904   * X3 = R*R
1905   * X3 = X3-T2
1906   * T1 = 2*U1
1907   * X3 = X3-T1 << store-out X3 result reg
1908   *
1909   * T2 = S1*T2
1910   * Y3 = U1-X3
1911   * Y3 = R*Y3
1912   * Y3 = Y3-T2 << store-out Y3 result reg
1913  
1914  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
1915  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
1916  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
1917  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
1918  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
1919  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
1920  	// SUB(H<H-T)            // H  = H-U1
1921  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
1922  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1923  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1924  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
1925  	// SUB(R<T-S1)           // R  = R-S1
1926  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
1927  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
1928  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
1929  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
1930  	// SUB(T<T-T2)           // X3 = X3-T2
1931  	// ADD(X<U1+U1)          // T1 = 2*U1
1932  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
1933  	// SUB(Y<U1-T)           // Y3 = U1-X3
1934  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
1935  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
1936  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
1937  	*/
1938  // p256PointAddAsm(res, in1, in2 *p256Point)
//
// Jacobian point addition, following the pseudo-code plan in the block
// comment above:
//   H = X2*Z1^2 - X1*Z2^2,  R = Y2*Z1^3 - Y1*Z2^3
// then X3, Y3, Z3 are derived and stored to *res.
//
// Return (ret+24(FP)): 1 when both H and R reduce to zero mod P (the two
// inputs have equal affine coordinates, so the caller must use point
// doubling instead); 0 otherwise. See the two compare/ISEL blocks below.
//
// NOTE(review): the register aliases used here (P1ptr/P2ptr/P3ptr, CPOOL,
// X0/X1, Y0/Y1, T0/T1, HL/HH, RL/RH, S1L/S1H, U1L/U1H, T2L/T2H, TT0/TT1,
// PL/PH, ZER, TRUE, RES1/RES2) are #defines earlier in this file;
// p256MulInternal multiplies X by Y and leaves the result mod P in T0/T1.
1939  TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
1940  	MOVD res+0(FP), P3ptr
1941  	MOVD in1+8(FP), P1ptr
1942  	MOVD $p256mul<>+0x00(SB), CPOOL
	// Offsets into a p256Point (two 16-byte vector halves per coordinate):
	// X at 0/16, Y at 32/48, Z at 64/80.
1943  	MOVD $16, R16
1944  	MOVD $32, R17
1945  	MOVD $48, R18
1946  	MOVD $64, R19
1947  	MOVD $80, R20
1948  
	// PL/PH = the modulus P from the constant pool; used below for the
	// H^P and R^P zero checks.
1949  	LXVD2X (R16)(CPOOL), PH
1950  	LXVD2X (R0)(CPOOL), PL
1951  
1952  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
1953  	LXVD2X (R19)(P1ptr), X0     // Z1L
1954  	LXVD2X (R20)(P1ptr), X1     // Z1H
	// XXPERMDI with $2 swaps the two 64-bit doublewords of the vector
	// (endian fixup after LXVD2X on little endian); same idiom throughout.
1955  	XXPERMDI X0, X0, $2, X0
1956  	XXPERMDI X1, X1, $2, X1
1957  	VOR    X0, X0, Y0
1958  	VOR    X1, X1, Y1
1959  	CALL   p256MulInternal<>(SB)
1960  
1961  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
1962  	VOR  T0, T0, Y0
1963  	VOR  T1, T1, Y1
1964  	CALL p256MulInternal<>(SB)
1965  	VOR  T0, T0, RL            // SAVE: RL
1966  	VOR  T1, T1, RH            // SAVE: RH
1967  
	// Spill RH to the local frame: V27 is clobbered by p256MulInternal,
	// and the value is stored in D2X (unswapped) byte order.
1968  	STXVD2X RH, (R1)(R17) // V27 has to be saved
1969  
1970  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
1971  	MOVD   in2+16(FP), P2ptr
1972  	LXVD2X (R0)(P2ptr), X0      // X2L
1973  	LXVD2X (R16)(P2ptr), X1     // X2H
1974  	XXPERMDI X0, X0, $2, X0
1975  	XXPERMDI X1, X1, $2, X1
1976  	CALL   p256MulInternal<>(SB)
1977  	VOR    T0, T0, HL            // SAVE: HL
1978  	VOR    T1, T1, HH            // SAVE: HH
1979  
1980  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
1981  	MOVD   in2+16(FP), P2ptr
1982  	LXVD2X (R19)(P2ptr), X0     // Z2L
1983  	LXVD2X (R20)(P2ptr), X1     // Z2H
1984  	XXPERMDI X0, X0, $2, X0
1985  	XXPERMDI X1, X1, $2, X1
1986  	VOR    X0, X0, Y0
1987  	VOR    X1, X1, Y1
1988  	CALL   p256MulInternal<>(SB)
1989  
1990  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
1991  	VOR  T0, T0, Y0
1992  	VOR  T1, T1, Y1
1993  	CALL p256MulInternal<>(SB)
1994  	VOR  T0, T0, S1L           // SAVE: S1L
1995  	VOR  T1, T1, S1H           // SAVE: S1H
1996  
1997  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
1998  	MOVD   in1+8(FP), P1ptr
1999  	LXVD2X (R0)(P1ptr), X0      // X1L
2000  	LXVD2X (R16)(P1ptr), X1     // X1H
2001  	XXPERMDI X0, X0, $2, X0
2002  	XXPERMDI X1, X1, $2, X1
2003  	CALL   p256MulInternal<>(SB)
2004  	VOR    T0, T0, U1L           // SAVE: U1L
2005  	VOR    T1, T1, U1H           // SAVE: U1H
2006  
2007  	// SUB(H<H-T)            // H  = H-U1
2008  	p256SubInternal(HH,HL,HH,HL,T1,T0)
2009  
2010  	// if H == 0 or H^P == 0 then ret=1 else ret=0
2011  	// clobbers T1H and T1L
2012  	MOVD       $1, TRUE
2013  	VSPLTISB   $0, ZER
2014  	VOR        HL, HH, T1H
2015  	VCMPEQUDCC ZER, T1H, T1H
2016  
2017  	// 26 = CR6 NE
2018  	ISEL       $26, R0, TRUE, RES1
2019  	VXOR       HL, PL, T1L         // SAVE: T1L
2020  	VXOR       HH, PH, T1H         // SAVE: T1H
2021  	VOR        T1L, T1H, T1H
2022  	VCMPEQUDCC ZER, T1H, T1H
2023  
2024  	// 26 = CR6 NE
2025  	ISEL $26, R0, TRUE, RES2
2026  	OR   RES2, RES1, RES1
2027  	MOVD RES1, ret+24(FP)
2028  
2029  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
2030  	MOVD   in1+8(FP), P1ptr
2031  	MOVD   in2+16(FP), P2ptr
2032  	LXVD2X (R19)(P1ptr), X0        // Z1L
2033  	LXVD2X (R20)(P1ptr), X1        // Z1H
2034  	XXPERMDI X0, X0, $2, X0
2035  	XXPERMDI X1, X1, $2, X1
2036  	LXVD2X (R19)(P2ptr), Y0        // Z2L
2037  	LXVD2X (R20)(P2ptr), Y1        // Z2H
2038  	XXPERMDI Y0, Y0, $2, Y0
2039  	XXPERMDI Y1, Y1, $2, Y1
2040  	CALL   p256MulInternal<>(SB)
2041  
2042  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
2043  	VOR     T0, T0, X0
2044  	VOR     T1, T1, X1
2045  	VOR     HL, HL, Y0
2046  	VOR     HH, HH, Y1
2047  	CALL    p256MulInternal<>(SB)
2048  	MOVD    res+0(FP), P3ptr
	// Swap doublewords back before the store (inverse of the load fixup).
2049  	XXPERMDI T1, T1, $2, TT1
2050  	XXPERMDI T0, T0, $2, TT0
2051  	STXVD2X TT0, (R19)(P3ptr)
2052  	STXVD2X TT1, (R20)(P3ptr)
2053  
2054  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2055  	MOVD   in1+8(FP), P1ptr
2056  	LXVD2X (R17)(P1ptr), X0
2057  	LXVD2X (R18)(P1ptr), X1
2058  	XXPERMDI X0, X0, $2, X0
2059  	XXPERMDI X1, X1, $2, X1
2060  	VOR    S1L, S1L, Y0
2061  	VOR    S1H, S1H, Y1
2062  	CALL   p256MulInternal<>(SB)
2063  	VOR    T0, T0, S1L
2064  	VOR    T1, T1, S1H
2065  
2066  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
2067  	MOVD   in2+16(FP), P2ptr
2068  	LXVD2X (R17)(P2ptr), X0
2069  	LXVD2X (R18)(P2ptr), X1
2070  	XXPERMDI X0, X0, $2, X0
2071  	XXPERMDI X1, X1, $2, X1
2072  	VOR    RL, RL, Y0
2073  
2074  	// VOR RH, RH, Y1   RH was saved above in D2X format
2075  	LXVD2X (R1)(R17), Y1
2076  	CALL   p256MulInternal<>(SB)
2077  
2078  	// SUB(R<T-S1)           // R  = T-S1
2079  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
2080  
	// Re-spill RH: it is needed again after further p256MulInternal calls.
2081  	STXVD2X RH, (R1)(R17) // Save RH
2082  
2083  	// if R == 0 or R^P == 0 then ret=ret else ret=0
2084  	// clobbers T1H and T1L
2085  	// Redo this using ISEL??
2086  	MOVD       $1, TRUE
2087  	VSPLTISB   $0, ZER
2088  	VOR        RL, RH, T1H
2089  	VCMPEQUDCC ZER, T1H, T1H
2090  
2091  	// 26 = CR6 NE
2092  	ISEL       $26, R0, TRUE, RES1
2093  	VXOR       RL, PL, T1L
2094  	VXOR       RH, PH, T1H         // SAVE: T1H
2095  	VOR        T1L, T1H, T1H
2096  	VCMPEQUDCC ZER, T1H, T1H
2097  
2098  	// 26 = CR6 NE
2099  	ISEL $26, R0, TRUE, RES2
2100  	OR   RES2, RES1, RES1
	// ret = (H condition) AND (R condition): 1 only when both H and R
	// reduced to zero mod P.
2101  	MOVD ret+24(FP), RES2
2102  	AND  RES2, RES1, RES1
2103  	MOVD RES1, ret+24(FP)
2104  
2105  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
2106  	VOR  HL, HL, X0
2107  	VOR  HH, HH, X1
2108  	VOR  HL, HL, Y0
2109  	VOR  HH, HH, Y1
2110  	CALL p256MulInternal<>(SB)
2111  
2112  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
2113  	VOR  T0, T0, Y0
2114  	VOR  T1, T1, Y1
2115  	CALL p256MulInternal<>(SB)
2116  	VOR  T0, T0, T2L
2117  	VOR  T1, T1, T2H
2118  
2119  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
2120  	VOR  U1L, U1L, X0
2121  	VOR  U1H, U1H, X1
2122  	CALL p256MulInternal<>(SB)
2123  	VOR  T0, T0, U1L
2124  	VOR  T1, T1, U1H
2125  
2126  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
2127  	VOR RL, RL, X0
2128  
2129  	// VOR  RH, RH, X1
2130  	VOR RL, RL, Y0
2131  
2132  	// RH was saved above using STXVD2X
2133  	LXVD2X (R1)(R17), X1
2134  	VOR    X1, X1, Y1
2135  
2136  	// VOR  RH, RH, Y1
2137  	CALL p256MulInternal<>(SB)
2138  
2139  	// SUB(T<T-T2)           // X3 = X3-T2
2140  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
2141  
2142  	// ADD(X<U1+U1)          // T1 = 2*U1
2143  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
2144  
2145  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
2146  	p256SubInternal(T1,T0,T1,T0,X1,X0)
2147  	MOVD    res+0(FP), P3ptr
2148  	XXPERMDI T1, T1, $2, TT1
2149  	XXPERMDI T0, T0, $2, TT0
2150  	STXVD2X TT0, (R0)(P3ptr)
2151  	STXVD2X TT1, (R16)(P3ptr)
2152  
2153  	// SUB(Y<U1-T)           // Y3 = U1-X3
2154  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
2155  
2156  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
2157  	VOR RL, RL, X0
2158  
2159  	// VOR  RH, RH, X1
2160  	LXVD2X (R1)(R17), X1
2161  	CALL   p256MulInternal<>(SB)
2162  	VOR    T0, T0, U1L
2163  	VOR    T1, T1, U1H
2164  
2165  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
2166  	VOR  S1L, S1L, X0
2167  	VOR  S1H, S1H, X1
2168  	VOR  T2L, T2L, Y0
2169  	VOR  T2H, T2H, Y1
2170  	CALL p256MulInternal<>(SB)
2171  
2172  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
2173  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
2174  	MOVD    res+0(FP), P3ptr
2175  	XXPERMDI T1, T1, $2, TT1
2176  	XXPERMDI T0, T0, $2, TT0
2177  	STXVD2X TT0, (R17)(P3ptr)
2178  	STXVD2X TT1, (R18)(P3ptr)
2179  
2180  	RET
2181