p256_asm_arm64.s raw

   1  // Copyright 2018 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
   7  // This file contains a constant-time, 64-bit assembly implementation of
   8  // P256. The optimizations performed here are described in detail in:
   9  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
  10  //                          256-bit primes"
  11  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
  12  // https://eprint.iacr.org/2013/816.pdf
  13  
  14  #include "textflag.h"
  15  
      // Register aliases used throughout this file.
      // Pointer arguments. p256Sqr and p256OrdSqr also keep their iteration
      // count in b_ptr.
  16  #define res_ptr R0
  17  #define a_ptr R1
  18  #define b_ptr R2
  19  
      // acc0-acc3: low four accumulator words; they also carry the reduced result.
  20  #define acc0 R3
  21  #define acc1 R4
  22  #define acc2 R5
  23  #define acc3 R6
  24  
      // acc4-acc7: high four accumulator words of a wide product.
      // t0-t3: scratch.
      // const0/const1: hold p256const0/p256const1 in the field routines and the
      // two low words of p256ord in the Ord routines (p256Select reuses them as
      // index and counter).
  25  #define acc4 R7
  26  #define acc5 R8
  27  #define acc6 R9
  28  #define acc7 R10
  29  #define t0 R11
  30  #define t1 R12
  31  #define t2 R13
  32  #define t3 R14
  33  #define const0 R15
  34  #define const1 R16
  35  
      // hlp0 is scratch. hlp1 aliases res_ptr (R0): routines that use hlp1 load
      // the result pointer from the frame only at the point where they store.
  36  #define hlp0 R17
  37  #define hlp1 res_ptr
  38  
      // x0-x3 and y0-y3: 4x64-bit operands and results of the *Internal routines.
  39  #define x0 R19
  40  #define x1 R20
  41  #define x2 R21
  42  #define x3 R22
  43  #define y0 R23
  44  #define y1 R24
  45  #define y2 R25
  46  #define y3 R26
  47  
      // const2/const3 alias t2/t3. The Ord routines keep the two high words of
      // p256ord here, so t2/t3 must not be clobbered while those are live.
  48  #define const2 t2
  49  #define const3 t3
  50  
      // Constants.
      // p256const0/p256const1: 64-bit words 1 and 3 of the field prime p; the
      // code supplies words 0 and 2 as the immediates $-1 and $0.
      // p256ordK0: per-step multiplier used by the reduction in p256OrdSqr and
      // p256OrdMul.
      // p256ord: the four words, least-significant first, of the modulus those
      // two routines reduce by.
      // p256one: the value stored as Z when p256PointAddAffineAsm selects
      // "z3 = 1" (presumably 1 in Montgomery form - confirm).
  51  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
  52  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
  53  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
  54  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
  55  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
  56  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
  57  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
  58  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
  59  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
  60  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
  61  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
      // The flag value 8 is presumably RODATA from textflag.h - confirm.
  62  GLOBL p256const0<>(SB), 8, $8
  63  GLOBL p256const1<>(SB), 8, $8
  64  GLOBL p256ordK0<>(SB), 8, $8
  65  GLOBL p256ord<>(SB), 8, $32
  66  GLOBL p256one<>(SB), 8, $32
  67  
  68  /* ---------------------------------------*/
  69  // func p256MovCond(res, a, b *P256Point, cond int)
  70  // If cond == 0 res=b, else res=a
      //
      // Copies 96 bytes (six 16-byte pairs) to res in two halves of 48 bytes.
      // Both a and b are always loaded in full; the choice is made per word with
      // CSEL, so there is no branch on cond.
      // Scratch registers: R3-R9, R16, R17, R19-R22.
  71  TEXT ·p256MovCond(SB),NOSPLIT,$0
  72  	MOVD	res+0(FP), res_ptr
  73  	MOVD	a+8(FP), a_ptr
  74  	MOVD	b+16(FP), b_ptr
  75  	MOVD	cond+24(FP), R3
  76  
      	// The flags set here are consumed by all twelve CSELs below; none of the
      	// LDP/CSEL/STP instructions in between writes the flags.
  77  	CMP	$0, R3
  78  	// Two remarks:
  79  	// 1) Will want to revisit NEON, when support is better
  80  	// 2) CSEL might not be constant time on all ARM processors
      	// First 48 bytes: a in R4-R9, b in R16-R22.
  81  	LDP	0*16(a_ptr), (R4, R5)
  82  	LDP	1*16(a_ptr), (R6, R7)
  83  	LDP	2*16(a_ptr), (R8, R9)
  84  	LDP	0*16(b_ptr), (R16, R17)
  85  	LDP	1*16(b_ptr), (R19, R20)
  86  	LDP	2*16(b_ptr), (R21, R22)
      	// EQ (cond == 0) takes the word from b, otherwise the word from a is kept.
  87  	CSEL	EQ, R16, R4, R4
  88  	CSEL	EQ, R17, R5, R5
  89  	CSEL	EQ, R19, R6, R6
  90  	CSEL	EQ, R20, R7, R7
  91  	CSEL	EQ, R21, R8, R8
  92  	CSEL	EQ, R22, R9, R9
  93  	STP	(R4, R5), 0*16(res_ptr)
  94  	STP	(R6, R7), 1*16(res_ptr)
  95  	STP	(R8, R9), 2*16(res_ptr)
  96  
      	// Second 48 bytes, same pattern.
  97  	LDP	3*16(a_ptr), (R4, R5)
  98  	LDP	4*16(a_ptr), (R6, R7)
  99  	LDP	5*16(a_ptr), (R8, R9)
 100  	LDP	3*16(b_ptr), (R16, R17)
 101  	LDP	4*16(b_ptr), (R19, R20)
 102  	LDP	5*16(b_ptr), (R21, R22)
 103  	CSEL	EQ, R16, R4, R4
 104  	CSEL	EQ, R17, R5, R5
 105  	CSEL	EQ, R19, R6, R6
 106  	CSEL	EQ, R20, R7, R7
 107  	CSEL	EQ, R21, R8, R8
 108  	CSEL	EQ, R22, R9, R9
 109  	STP	(R4, R5), 3*16(res_ptr)
 110  	STP	(R6, R7), 4*16(res_ptr)
 111  	STP	(R8, R9), 5*16(res_ptr)
 112  
 113  	RET
 114  /* ---------------------------------------*/
 115  // func p256NegCond(val *p256Element, cond int)
      //
      // Replaces *val with p - *val when cond != 0 and leaves it unchanged when
      // cond == 0. The subtraction is always performed; CSEL picks the result.
      // Clobbers acc0-acc3, t0-t3 and hlp0.
      // NOTE(review): for *val == 0 with cond != 0 the value stored is p itself,
      // not 0 - confirm callers accept that representation.
 116  TEXT ·p256NegCond(SB),NOSPLIT,$0
 117  	MOVD	val+0(FP), a_ptr
 118  	MOVD	cond+8(FP), hlp0
      	// The element is updated in place: the result pointer is the input pointer.
 119  	MOVD	a_ptr, res_ptr
 120  	// acc = poly
 121  	MOVD	$-1, acc0
 122  	MOVD	p256const0<>(SB), acc1
 123  	MOVD	$0, acc2
 124  	MOVD	p256const1<>(SB), acc3
 125  	// Load the original value
 126  	LDP	0*16(a_ptr), (t0, t1)
 127  	LDP	1*16(a_ptr), (t2, t3)
 128  	// Speculatively subtract
      	// acc = poly - val, with the borrow chained through the four words.
 129  	SUBS	t0, acc0
 130  	SBCS	t1, acc1
 131  	SBCS	t2, acc2
 132  	SBC	t3, acc3
 133  	// If condition is 0, keep original value
 134  	CMP	$0, hlp0
 135  	CSEL	EQ, t0, acc0, acc0
 136  	CSEL	EQ, t1, acc1, acc1
 137  	CSEL	EQ, t2, acc2, acc2
 138  	CSEL	EQ, t3, acc3, acc3
 139  	// Store result
 140  	STP	(acc0, acc1), 0*16(res_ptr)
 141  	STP	(acc2, acc3), 1*16(res_ptr)
 142  
 143  	RET
 144  /* ---------------------------------------*/
 145  // func p256Sqr(res, in *p256Element, n int)
      //
      // Applies p256SqrInternal n times to *in and stores the final value in *res.
      // NOTE(review): the counter is decremented before it is tested, so n == 0
      // would wrap instead of skipping the loop - callers presumably pass n >= 1.
 146  TEXT ·p256Sqr(SB),NOSPLIT,$0
 147  	MOVD	res+0(FP), res_ptr
 148  	MOVD	in+8(FP), a_ptr
      	// b_ptr is used as the iteration counter here, not as a pointer.
 149  	MOVD	n+16(FP), b_ptr
 150  
      	// p256SqrInternal reads const0/const1 during its final subtraction.
 151  	MOVD	p256const0<>(SB), const0
 152  	MOVD	p256const1<>(SB), const1
 153  
 154  	LDP	0*16(a_ptr), (x0, x1)
 155  	LDP	1*16(a_ptr), (x2, x3)
 156  
 157  sqrLoop:
 158  	SUB	$1, b_ptr
 159  	CALL	p256SqrInternal<>(SB)
      	// The square comes back in y0-y3; feed it back as the next input.
 160  	MOVD	y0, x0
 161  	MOVD	y1, x1
 162  	MOVD	y2, x2
 163  	MOVD	y3, x3
 164  	CBNZ	b_ptr, sqrLoop
 165  
 166  	STP	(y0, y1), 0*16(res_ptr)
 167  	STP	(y2, y3), 1*16(res_ptr)
 168  	RET
 169  /* ---------------------------------------*/
 170  // func p256Mul(res, in1, in2 *p256Element)
      //
      // Loads in1 into x0-x3 and in2 into y0-y3, calls p256MulInternal and
      // stores the product it leaves in y0-y3 to *res.
 171  TEXT ·p256Mul(SB),NOSPLIT,$0
 172  	MOVD	res+0(FP), res_ptr
 173  	MOVD	in1+8(FP), a_ptr
 174  	MOVD	in2+16(FP), b_ptr
 175  
      	// p256MulInternal expects const0/const1 to be set up by its caller.
 176  	MOVD	p256const0<>(SB), const0
 177  	MOVD	p256const1<>(SB), const1
 178  
 179  	LDP	0*16(a_ptr), (x0, x1)
 180  	LDP	1*16(a_ptr), (x2, x3)
 181  
 182  	LDP	0*16(b_ptr), (y0, y1)
 183  	LDP	1*16(b_ptr), (y2, y3)
 184  
 185  	CALL	p256MulInternal<>(SB)
 186  
 187  	STP	(y0, y1), 0*16(res_ptr)
 188  	STP	(y2, y3), 1*16(res_ptr)
 189  	RET
 190  /* ---------------------------------------*/
 191  // func p256FromMont(res, in *p256Element)
      //
      // Runs the four word-wise reduction steps on *in without any preceding
      // multiplication, then subtracts p once if the result is not below p, and
      // stores the four result words in *res.
      //
      // Each step folds the lowest live word w into the three words above it:
      // w<<32 is added to the next word, w>>32 to the one after, the low half of
      // w*const1 to the third, and the high half of w*const1 (plus carry) becomes
      // the new top word, written back into w's register.
 192  TEXT ·p256FromMont(SB),NOSPLIT,$0
 193  	MOVD	res+0(FP), res_ptr
 194  	MOVD	in+8(FP), a_ptr
 195  
 196  	MOVD	p256const0<>(SB), const0
 197  	MOVD	p256const1<>(SB), const1
 198  
 199  	LDP	0*16(a_ptr), (acc0, acc1)
 200  	LDP	1*16(a_ptr), (acc2, acc3)
 201  	// Only reduce, no multiplications are needed
 202  	// First reduction step
 203  	ADDS	acc0<<32, acc1, acc1
 204  	LSR	$32, acc0, t0
 205  	MUL	acc0, const1, t1
 206  	UMULH	acc0, const1, acc0
 207  	ADCS	t0, acc2
 208  	ADCS	t1, acc3
 209  	ADC	$0, acc0
 210  	// Second reduction step
 211  	ADDS	acc1<<32, acc2, acc2
 212  	LSR	$32, acc1, t0
 213  	MUL	acc1, const1, t1
 214  	UMULH	acc1, const1, acc1
 215  	ADCS	t0, acc3
 216  	ADCS	t1, acc0
 217  	ADC	$0, acc1
 218  	// Third reduction step
 219  	ADDS	acc2<<32, acc3, acc3
 220  	LSR	$32, acc2, t0
 221  	MUL	acc2, const1, t1
 222  	UMULH	acc2, const1, acc2
 223  	ADCS	t0, acc0
 224  	ADCS	t1, acc1
 225  	ADC	$0, acc2
 226  	// Last reduction step
 227  	ADDS	acc3<<32, acc0, acc0
 228  	LSR	$32, acc3, t0
 229  	MUL	acc3, const1, t1
 230  	UMULH	acc3, const1, acc3
 231  	ADCS	t0, acc1
 232  	ADCS	t1, acc2
 233  	ADC	$0, acc3
 234  
      	// Trial subtraction of p = {-1, const0, 0, const1} into t0-t3.
 235  	SUBS	$-1, acc0, t0
 236  	SBCS	const0, acc1, t1
 237  	SBCS	$0, acc2, t2
 238  	SBCS	const1, acc3, t3
 239  
      	// CS means the subtraction did not borrow, so the difference is kept;
      	// otherwise the unsubtracted value is kept.
 240  	CSEL	CS, t0, acc0, acc0
 241  	CSEL	CS, t1, acc1, acc1
 242  	CSEL	CS, t2, acc2, acc2
 243  	CSEL	CS, t3, acc3, acc3
 244  
 245  	STP	(acc0, acc1), 0*16(res_ptr)
 246  	STP	(acc2, acc3), 1*16(res_ptr)
 247  
 248  	RET
 249  /* ---------------------------------------*/
 250  // func p256Select(res *P256Point, table *p256Table, idx int)
      //
      // Walks all 16 table entries of 96 bytes each and keeps, via CSEL, the one
      // whose 1-based position equals idx. Every entry is loaded whatever idx is.
      // If idx matches no position (for example idx == 0) the result is all zero.
      // const0 holds idx and const1 the running position; neither is used as a
      // field constant in this routine.
 251  TEXT ·p256Select(SB),NOSPLIT,$0
 252  	MOVD	idx+16(FP), const0
 253  	MOVD	table+8(FP), b_ptr
 254  	MOVD	res+0(FP), res_ptr
 255  
      	// Clear the twelve result words (x, y and t register groups).
 256  	EOR	x0, x0, x0
 257  	EOR	x1, x1, x1
 258  	EOR	x2, x2, x2
 259  	EOR	x3, x3, x3
 260  	EOR	y0, y0, y0
 261  	EOR	y1, y1, y1
 262  	EOR	y2, y2, y2
 263  	EOR	y3, y3, y3
 264  	EOR	t0, t0, t0
 265  	EOR	t1, t1, t1
 266  	EOR	t2, t2, t2
 267  	EOR	t3, t3, t3
 268  
 269  	MOVD	$0, const1
 270  
 271  loop_select:
 272  		ADD	$1, const1
      		// The flags from this compare stay live for the twelve CSELs below;
      		// LDP.P post-increments b_ptr by 16 and does not write the flags.
 273  		CMP	const0, const1
 274  		LDP.P	16(b_ptr), (acc0, acc1)
 275  		CSEL	EQ, acc0, x0, x0
 276  		CSEL	EQ, acc1, x1, x1
 277  		LDP.P	16(b_ptr), (acc2, acc3)
 278  		CSEL	EQ, acc2, x2, x2
 279  		CSEL	EQ, acc3, x3, x3
 280  		LDP.P	16(b_ptr), (acc4, acc5)
 281  		CSEL	EQ, acc4, y0, y0
 282  		CSEL	EQ, acc5, y1, y1
 283  		LDP.P	16(b_ptr), (acc6, acc7)
 284  		CSEL	EQ, acc6, y2, y2
 285  		CSEL	EQ, acc7, y3, y3
 286  		LDP.P	16(b_ptr), (acc0, acc1)
 287  		CSEL	EQ, acc0, t0, t0
 288  		CSEL	EQ, acc1, t1, t1
 289  		LDP.P	16(b_ptr), (acc2, acc3)
 290  		CSEL	EQ, acc2, t2, t2
 291  		CSEL	EQ, acc3, t3, t3
 292  
      		// Stop after the 16th entry.
 293  		CMP	$16, const1
 294  		BNE	loop_select
 295  
 296  	STP	(x0, x1), 0*16(res_ptr)
 297  	STP	(x2, x3), 1*16(res_ptr)
 298  	STP	(y0, y1), 2*16(res_ptr)
 299  	STP	(y2, y3), 3*16(res_ptr)
 300  	STP	(t0, t1), 4*16(res_ptr)
 301  	STP	(t2, t3), 5*16(res_ptr)
 302  	RET
 303  /* ---------------------------------------*/
 304  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      //
      // Same scheme as p256Select, for 32 entries of 64 bytes each: every entry
      // is loaded, and the one whose 1-based position equals idx is kept via CSEL.
      // If idx matches no position the result is all zero.
      // t0 holds idx, t1 the table cursor and t2 the running position.
 305  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
 306  	MOVD	idx+16(FP), t0
 307  	MOVD	table+8(FP), t1
 308  	MOVD	res+0(FP), res_ptr
 309  
      	// Clear the eight result words.
 310  	EOR	x0, x0, x0
 311  	EOR	x1, x1, x1
 312  	EOR	x2, x2, x2
 313  	EOR	x3, x3, x3
 314  	EOR	y0, y0, y0
 315  	EOR	y1, y1, y1
 316  	EOR	y2, y2, y2
 317  	EOR	y3, y3, y3
 318  
 319  	MOVD	$0, t2
 320  
 321  loop_select:
 322  		ADD	$1, t2
      		// One compare per entry; the eight CSELs below all use its flags.
 323  		CMP	t0, t2
 324  		LDP.P	16(t1), (acc0, acc1)
 325  		CSEL	EQ, acc0, x0, x0
 326  		CSEL	EQ, acc1, x1, x1
 327  		LDP.P	16(t1), (acc2, acc3)
 328  		CSEL	EQ, acc2, x2, x2
 329  		CSEL	EQ, acc3, x3, x3
 330  		LDP.P	16(t1), (acc4, acc5)
 331  		CSEL	EQ, acc4, y0, y0
 332  		CSEL	EQ, acc5, y1, y1
 333  		LDP.P	16(t1), (acc6, acc7)
 334  		CSEL	EQ, acc6, y2, y2
 335  		CSEL	EQ, acc7, y3, y3
 336  
      		// Stop after the 32nd entry.
 337  		CMP	$32, t2
 338  		BNE	loop_select
 339  
 340  	STP	(x0, x1), 0*16(res_ptr)
 341  	STP	(x2, x3), 1*16(res_ptr)
 342  	STP	(y0, y1), 2*16(res_ptr)
 343  	STP	(y2, y3), 3*16(res_ptr)
 344  	RET
 345  /* ---------------------------------------*/
 346  // func p256OrdSqr(res, in *p256OrdElement, n int)
      //
      // Squares *in n times with a Montgomery-style word-by-word reduction by
      // p256ord (multiplier p256ordK0) and stores the final value in *res.
      //
      // Register use: hlp1 (= R0) holds p256ordK0, const0-const3 hold p256ord
      // (const2/const3 are t2/t3), x0-x3 hold the running value, and y0-y3 are
      // scratch. Because hlp1 aliases res_ptr, res is loaded only after the loop.
      // NOTE(review): as in p256Sqr, n == 0 would wrap the counter - callers
      // presumably pass n >= 1.
 347  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
 348  	MOVD	in+8(FP), a_ptr
 349  	MOVD	n+16(FP), b_ptr
 350  
 351  	MOVD	p256ordK0<>(SB), hlp1
 352  	LDP	p256ord<>+0x00(SB), (const0, const1)
 353  	LDP	p256ord<>+0x10(SB), (const2, const3)
 354  
 355  	LDP	0*16(a_ptr), (x0, x1)
 356  	LDP	1*16(a_ptr), (x2, x3)
 357  
 358  ordSqrLoop:
 359  	SUB	$1, b_ptr
 360  
      	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1-acc6.
 361  	// x[1:] * x[0]
 362  	MUL	x0, x1, acc1
 363  	UMULH	x0, x1, acc2
 364  
 365  	MUL	x0, x2, t0
 366  	ADDS	t0, acc2, acc2
 367  	UMULH	x0, x2, acc3
 368  
 369  	MUL	x0, x3, t0
 370  	ADCS	t0, acc3, acc3
 371  	UMULH	x0, x3, acc4
 372  	ADC	$0, acc4, acc4
 373  	// x[2:] * x[1]
 374  	MUL	x1, x2, t0
 375  	ADDS	t0, acc3
 376  	UMULH	x1, x2, t1
 377  	ADCS	t1, acc4
 378  	ADC	$0, ZR, acc5
 379  
 380  	MUL	x1, x3, t0
 381  	ADDS	t0, acc4
 382  	UMULH	x1, x3, t1
 383  	ADC	t1, acc5
 384  	// x[3] * x[2]
 385  	MUL	x2, x3, t0
 386  	ADDS	t0, acc5
 387  	UMULH	x2, x3, acc6
 388  	ADC	$0, acc6
 389  
 390  	MOVD	$0, acc7
      	// Double the off-diagonal sum; the bit shifted out lands in acc7.
 391  	// *2
 392  	ADDS	acc1, acc1
 393  	ADCS	acc2, acc2
 394  	ADCS	acc3, acc3
 395  	ADCS	acc4, acc4
 396  	ADCS	acc5, acc5
 397  	ADCS	acc6, acc6
 398  	ADC	$0, acc7
      	// Add the diagonal products x[i]*x[i].
 399  	// Missing products
 400  	MUL	x0, x0, acc0
 401  	UMULH	x0, x0, t0
 402  	ADDS	t0, acc1, acc1
 403  
 404  	MUL	x1, x1, t0
 405  	ADCS	t0, acc2, acc2
 406  	UMULH	x1, x1, t1
 407  	ADCS	t1, acc3, acc3
 408  
 409  	MUL	x2, x2, t0
 410  	ADCS	t0, acc4, acc4
 411  	UMULH	x2, x2, t1
 412  	ADCS	t1, acc5, acc5
 413  
 414  	MUL	x3, x3, t0
 415  	ADCS	t0, acc6, acc6
 416  	UMULH	x3, x3, t1
 417  	ADC	t1, acc7, acc7
      	// Each reduction step computes hlp0 = (lowest live word) * K0 and adds
      	// hlp0 * p256ord to the accumulator. The register that held the lowest
      	// word is reused first for a high product and then for the new top word.
      	//
      	// Note that the first MUL of each step multiplies const0 by hlp1 (K0),
      	// not by hlp0. The sum written by the following ADDS is dead (the register
      	// is overwritten before it is read); only the carry flag is consumed.
      	// Presumably const0*K0 == -1 mod 2^64, so that carry equals the carry of
      	// adding the true low product - confirm.
 418  	// First reduction step
 419  	MUL	acc0, hlp1, hlp0
 420  
 421  	MUL	const0, hlp1, t0
 422  	ADDS	t0, acc0, acc0
 423  	UMULH	const0, hlp0, t1
 424  
 425  	MUL	const1, hlp0, t0
 426  	ADCS	t0, acc1, acc1
 427  	UMULH	const1, hlp0, y0
 428  
 429  	MUL	const2, hlp0, t0
 430  	ADCS	t0, acc2, acc2
 431  	UMULH	const2, hlp0, acc0
 432  
 433  	MUL	const3, hlp0, t0
 434  	ADCS	t0, acc3, acc3
 435  
 436  	UMULH	const3, hlp0, hlp0
 437  	ADC	$0, hlp0
 438  
      	// Add the high halves of the four products, one word higher.
 439  	ADDS	t1, acc1, acc1
 440  	ADCS	y0, acc2, acc2
 441  	ADCS	acc0, acc3, acc3
 442  	ADC	$0, hlp0, acc0
 443  	// Second reduction step
 444  	MUL	acc1, hlp1, hlp0
 445  
 446  	MUL	const0, hlp1, t0
 447  	ADDS	t0, acc1, acc1
 448  	UMULH	const0, hlp0, t1
 449  
 450  	MUL	const1, hlp0, t0
 451  	ADCS	t0, acc2, acc2
 452  	UMULH	const1, hlp0, y0
 453  
 454  	MUL	const2, hlp0, t0
 455  	ADCS	t0, acc3, acc3
 456  	UMULH	const2, hlp0, acc1
 457  
 458  	MUL	const3, hlp0, t0
 459  	ADCS	t0, acc0, acc0
 460  
 461  	UMULH	const3, hlp0, hlp0
 462  	ADC	$0, hlp0
 463  
 464  	ADDS	t1, acc2, acc2
 465  	ADCS	y0, acc3, acc3
 466  	ADCS	acc1, acc0, acc0
 467  	ADC	$0, hlp0, acc1
 468  	// Third reduction step
 469  	MUL	acc2, hlp1, hlp0
 470  
 471  	MUL	const0, hlp1, t0
 472  	ADDS	t0, acc2, acc2
 473  	UMULH	const0, hlp0, t1
 474  
 475  	MUL	const1, hlp0, t0
 476  	ADCS	t0, acc3, acc3
 477  	UMULH	const1, hlp0, y0
 478  
 479  	MUL	const2, hlp0, t0
 480  	ADCS	t0, acc0, acc0
 481  	UMULH	const2, hlp0, acc2
 482  
 483  	MUL	const3, hlp0, t0
 484  	ADCS	t0, acc1, acc1
 485  
 486  	UMULH	const3, hlp0, hlp0
 487  	ADC	$0, hlp0
 488  
 489  	ADDS	t1, acc3, acc3
 490  	ADCS	y0, acc0, acc0
 491  	ADCS	acc2, acc1, acc1
 492  	ADC	$0, hlp0, acc2
 493  
 494  	// Last reduction step
 495  	MUL	acc3, hlp1, hlp0
 496  
 497  	MUL	const0, hlp1, t0
 498  	ADDS	t0, acc3, acc3
 499  	UMULH	const0, hlp0, t1
 500  
 501  	MUL	const1, hlp0, t0
 502  	ADCS	t0, acc0, acc0
 503  	UMULH	const1, hlp0, y0
 504  
 505  	MUL	const2, hlp0, t0
 506  	ADCS	t0, acc1, acc1
 507  	UMULH	const2, hlp0, acc3
 508  
 509  	MUL	const3, hlp0, t0
 510  	ADCS	t0, acc2, acc2
 511  
 512  	UMULH	const3, hlp0, hlp0
      	// Unlike the earlier steps, the carry goes into acc7 rather than hlp0;
      	// acc7 is added to the same result word (acc3) further down.
 513  	ADC	$0, acc7
 514  
 515  	ADDS	t1, acc0, acc0
 516  	ADCS	y0, acc1, acc1
 517  	ADCS	acc3, acc2, acc2
 518  	ADC	$0, hlp0, acc3
 519  
      	// Add the upper half of the square (acc4-acc7); the carry out becomes a
      	// fifth word in acc4.
 520  	ADDS	acc4, acc0, acc0
 521  	ADCS	acc5, acc1, acc1
 522  	ADCS	acc6, acc2, acc2
 523  	ADCS	acc7, acc3, acc3
 524  	ADC	$0, ZR, acc4
 525  
      	// Trial subtraction of p256ord from the five-word value.
 526  	SUBS	const0, acc0, y0
 527  	SBCS	const1, acc1, y1
 528  	SBCS	const2, acc2, y2
 529  	SBCS	const3, acc3, y3
 530  	SBCS	$0, acc4, acc4
 531  
      	// CS (no borrow) keeps the difference; the result goes straight into
      	// x0-x3, ready for the next iteration.
 532  	CSEL	CS, y0, acc0, x0
 533  	CSEL	CS, y1, acc1, x1
 534  	CSEL	CS, y2, acc2, x2
 535  	CSEL	CS, y3, acc3, x3
 536  
 537  	CBNZ	b_ptr, ordSqrLoop
 538  
      	// hlp1 (K0) is no longer needed, so R0 can take the result pointer.
 539  	MOVD	res+0(FP), res_ptr
 540  	STP	(x0, x1), 0*16(res_ptr)
 541  	STP	(x2, x3), 1*16(res_ptr)
 542  
 543  	RET
 544  /* ---------------------------------------*/
 545  // func p256OrdMul(res, in1, in2 *p256OrdElement)
      //
      // Multiplies *in1 by *in2 with a Montgomery-style reduction by p256ord
      // (multiplier p256ordK0) and stores the result in *res. One row of the
      // product (y[i] * x) alternates with one reduction step.
      //
      // Register use: hlp1 (= R0) holds p256ordK0, const0-const3 hold p256ord
      // (const2/const3 are t2/t3). y0, y1 and hlp0 are reused as scratch for high
      // product words. Because hlp1 aliases res_ptr, res is loaded only at the
      // end.
      //
      // In every reduction step the first MUL multiplies const0 by hlp1 (K0), not
      // by hlp0; the sum written by the following ADDS is dead and only its carry
      // is consumed. Presumably const0*K0 == -1 mod 2^64, which makes that carry
      // equal to the carry of adding the true low product - confirm.
 546  TEXT ·p256OrdMul(SB),NOSPLIT,$0
 547  	MOVD	in1+8(FP), a_ptr
 548  	MOVD	in2+16(FP), b_ptr
 549  
 550  	MOVD	p256ordK0<>(SB), hlp1
 551  	LDP	p256ord<>+0x00(SB), (const0, const1)
 552  	LDP	p256ord<>+0x10(SB), (const2, const3)
 553  
 554  	LDP	0*16(a_ptr), (x0, x1)
 555  	LDP	1*16(a_ptr), (x2, x3)
 556  	LDP	0*16(b_ptr), (y0, y1)
 557  	LDP	1*16(b_ptr), (y2, y3)
 558  
 559  	// y[0] * x
 560  	MUL	y0, x0, acc0
 561  	UMULH	y0, x0, acc1
 562  
 563  	MUL	y0, x1, t0
 564  	ADDS	t0, acc1
 565  	UMULH	y0, x1, acc2
 566  
 567  	MUL	y0, x2, t0
 568  	ADCS	t0, acc2
 569  	UMULH	y0, x2, acc3
 570  
 571  	MUL	y0, x3, t0
 572  	ADCS	t0, acc3
 573  	UMULH	y0, x3, acc4
 574  	ADC	$0, acc4
 575  	// First reduction step
 576  	MUL	acc0, hlp1, hlp0
 577  
 578  	MUL	const0, hlp1, t0
 579  	ADDS	t0, acc0, acc0
 580  	UMULH	const0, hlp0, t1
 581  
 582  	MUL	const1, hlp0, t0
 583  	ADCS	t0, acc1, acc1
      	// y0 has been fully consumed by the first row, so it is free as scratch.
 584  	UMULH	const1, hlp0, y0
 585  
 586  	MUL	const2, hlp0, t0
 587  	ADCS	t0, acc2, acc2
 588  	UMULH	const2, hlp0, acc0
 589  
 590  	MUL	const3, hlp0, t0
 591  	ADCS	t0, acc3, acc3
 592  
 593  	UMULH	const3, hlp0, hlp0
      	// The carry of the low-product chain is folded into acc4.
 594  	ADC	$0, acc4
 595  
 596  	ADDS	t1, acc1, acc1
 597  	ADCS	y0, acc2, acc2
 598  	ADCS	acc0, acc3, acc3
 599  	ADC	$0, hlp0, acc0
 600  	// y[1] * x
 601  	MUL	y1, x0, t0
 602  	ADDS	t0, acc1
 603  	UMULH	y1, x0, t1
 604  
 605  	MUL	y1, x1, t0
 606  	ADCS	t0, acc2
 607  	UMULH	y1, x1, hlp0
 608  
 609  	MUL	y1, x2, t0
 610  	ADCS	t0, acc3
 611  	UMULH	y1, x2, y0
 612  
 613  	MUL	y1, x3, t0
 614  	ADCS	t0, acc4
      	// Last use of y1 as an operand; it is overwritten with the high product.
 615  	UMULH	y1, x3, y1
 616  	ADC	$0, ZR, acc5
 617  
 618  	ADDS	t1, acc2
 619  	ADCS	hlp0, acc3
 620  	ADCS	y0, acc4
 621  	ADC	y1, acc5
 622  	// Second reduction step
 623  	MUL	acc1, hlp1, hlp0
 624  
 625  	MUL	const0, hlp1, t0
 626  	ADDS	t0, acc1, acc1
 627  	UMULH	const0, hlp0, t1
 628  
 629  	MUL	const1, hlp0, t0
 630  	ADCS	t0, acc2, acc2
 631  	UMULH	const1, hlp0, y0
 632  
 633  	MUL	const2, hlp0, t0
 634  	ADCS	t0, acc3, acc3
 635  	UMULH	const2, hlp0, acc1
 636  
 637  	MUL	const3, hlp0, t0
 638  	ADCS	t0, acc0, acc0
 639  
 640  	UMULH	const3, hlp0, hlp0
 641  	ADC	$0, acc5
 642  
 643  	ADDS	t1, acc2, acc2
 644  	ADCS	y0, acc3, acc3
 645  	ADCS	acc1, acc0, acc0
 646  	ADC	$0, hlp0, acc1
 647  	// y[2] * x
 648  	MUL	y2, x0, t0
 649  	ADDS	t0, acc2
 650  	UMULH	y2, x0, t1
 651  
 652  	MUL	y2, x1, t0
 653  	ADCS	t0, acc3
 654  	UMULH	y2, x1, hlp0
 655  
 656  	MUL	y2, x2, t0
 657  	ADCS	t0, acc4
 658  	UMULH	y2, x2, y0
 659  
 660  	MUL	y2, x3, t0
 661  	ADCS	t0, acc5
 662  	UMULH	y2, x3, y1
 663  	ADC	$0, ZR, acc6
 664  
 665  	ADDS	t1, acc3
 666  	ADCS	hlp0, acc4
 667  	ADCS	y0, acc5
 668  	ADC	y1, acc6
 669  	// Third reduction step
 670  	MUL	acc2, hlp1, hlp0
 671  
 672  	MUL	const0, hlp1, t0
 673  	ADDS	t0, acc2, acc2
 674  	UMULH	const0, hlp0, t1
 675  
 676  	MUL	const1, hlp0, t0
 677  	ADCS	t0, acc3, acc3
 678  	UMULH	const1, hlp0, y0
 679  
 680  	MUL	const2, hlp0, t0
 681  	ADCS	t0, acc0, acc0
 682  	UMULH	const2, hlp0, acc2
 683  
 684  	MUL	const3, hlp0, t0
 685  	ADCS	t0, acc1, acc1
 686  
 687  	UMULH	const3, hlp0, hlp0
 688  	ADC	$0, acc6
 689  
 690  	ADDS	t1, acc3, acc3
 691  	ADCS	y0, acc0, acc0
 692  	ADCS	acc2, acc1, acc1
 693  	ADC	$0, hlp0, acc2
 694  	// y[3] * x
 695  	MUL	y3, x0, t0
 696  	ADDS	t0, acc3
 697  	UMULH	y3, x0, t1
 698  
 699  	MUL	y3, x1, t0
 700  	ADCS	t0, acc4
 701  	UMULH	y3, x1, hlp0
 702  
 703  	MUL	y3, x2, t0
 704  	ADCS	t0, acc5
 705  	UMULH	y3, x2, y0
 706  
 707  	MUL	y3, x3, t0
 708  	ADCS	t0, acc6
 709  	UMULH	y3, x3, y1
 710  	ADC	$0, ZR, acc7
 711  
 712  	ADDS	t1, acc4
 713  	ADCS	hlp0, acc5
 714  	ADCS	y0, acc6
 715  	ADC	y1, acc7
 716  	// Last reduction step
 717  	MUL	acc3, hlp1, hlp0
 718  
 719  	MUL	const0, hlp1, t0
 720  	ADDS	t0, acc3, acc3
 721  	UMULH	const0, hlp0, t1
 722  
 723  	MUL	const1, hlp0, t0
 724  	ADCS	t0, acc0, acc0
 725  	UMULH	const1, hlp0, y0
 726  
 727  	MUL	const2, hlp0, t0
 728  	ADCS	t0, acc1, acc1
 729  	UMULH	const2, hlp0, acc3
 730  
 731  	MUL	const3, hlp0, t0
 732  	ADCS	t0, acc2, acc2
 733  
 734  	UMULH	const3, hlp0, hlp0
 735  	ADC	$0, acc7
 736  
 737  	ADDS	t1, acc0, acc0
 738  	ADCS	y0, acc1, acc1
 739  	ADCS	acc3, acc2, acc2
 740  	ADC	$0, hlp0, acc3
 741  
      	// Add the upper product words (acc4-acc7); the carry out becomes a fifth
      	// word in acc4.
 742  	ADDS	acc4, acc0, acc0
 743  	ADCS	acc5, acc1, acc1
 744  	ADCS	acc6, acc2, acc2
 745  	ADCS	acc7, acc3, acc3
 746  	ADC	$0, ZR, acc4
 747  
      	// Trial subtraction of p256ord. t2/t3 are const2/const3: each of those
      	// instructions reads the constant and overwrites it, which is safe because
      	// the constants are not needed afterwards.
 748  	SUBS	const0, acc0, t0
 749  	SBCS	const1, acc1, t1
 750  	SBCS	const2, acc2, t2
 751  	SBCS	const3, acc3, t3
 752  	SBCS	$0, acc4, acc4
 753  
      	// CS (no borrow) keeps the difference, otherwise the unsubtracted value.
 754  	CSEL	CS, t0, acc0, acc0
 755  	CSEL	CS, t1, acc1, acc1
 756  	CSEL	CS, t2, acc2, acc2
 757  	CSEL	CS, t3, acc3, acc3
 758  
      	// hlp1 (K0) is no longer needed, so R0 can take the result pointer.
 759  	MOVD	res+0(FP), res_ptr
 760  	STP	(acc0, acc1), 0*16(res_ptr)
 761  	STP	(acc2, acc3), 1*16(res_ptr)
 762  
 763  	RET
 764  /* ---------------------------------------*/
      // p256SubInternal: x = y - x, with p added back if the subtraction borrowed.
      // In:       x0-x3, y0-y3, const0, const1
      // Out:      x0-x3
      // Clobbers: acc0-acc7, t0, flags. y0-y3 are left unchanged.
 765  TEXT p256SubInternal<>(SB),NOSPLIT,$0
 766  	SUBS	x0, y0, acc0
 767  	SBCS	x1, y1, acc1
 768  	SBCS	x2, y2, acc2
 769  	SBCS	x3, y3, acc3
      	// t0 = 0 if there was no borrow, all ones if there was.
 770  	SBC	$0, ZR, t0
 771  
      	// Speculatively add p = {-1, const0, 0, const1} into acc4-acc7.
 772  	ADDS	$-1, acc0, acc4
 773  	ADCS	const0, acc1, acc5
 774  	ADCS	$0, acc2, acc6
 775  	ADC	const1, acc3, acc7
 776  
      	// EQ (no borrow) keeps the plain difference, otherwise difference + p.
 777  	ANDS	$1, t0
 778  	CSEL	EQ, acc0, acc4, x0
 779  	CSEL	EQ, acc1, acc5, x1
 780  	CSEL	EQ, acc2, acc6, x2
 781  	CSEL	EQ, acc3, acc7, x3
 782  
 783  	RET
 784  /* ---------------------------------------*/
      // p256SqrInternal: y = x * x, followed by four word-wise reduction steps
      // and one conditional subtraction of p.
      // In:       x0-x3, const0, const1
      // Out:      y0-y3
      // Clobbers: acc0-acc7, t0-t3, flags. x0-x3 are left unchanged.
 785  TEXT p256SqrInternal<>(SB),NOSPLIT,$0
      	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1-acc6.
 786  	// x[1:] * x[0]
 787  	MUL	x0, x1, acc1
 788  	UMULH	x0, x1, acc2
 789  
 790  	MUL	x0, x2, t0
 791  	ADDS	t0, acc2, acc2
 792  	UMULH	x0, x2, acc3
 793  
 794  	MUL	x0, x3, t0
 795  	ADCS	t0, acc3, acc3
 796  	UMULH	x0, x3, acc4
 797  	ADC	$0, acc4, acc4
 798  	// x[2:] * x[1]
 799  	MUL	x1, x2, t0
 800  	ADDS	t0, acc3
 801  	UMULH	x1, x2, t1
 802  	ADCS	t1, acc4
 803  	ADC	$0, ZR, acc5
 804  
 805  	MUL	x1, x3, t0
 806  	ADDS	t0, acc4
 807  	UMULH	x1, x3, t1
 808  	ADC	t1, acc5
 809  	// x[3] * x[2]
 810  	MUL	x2, x3, t0
 811  	ADDS	t0, acc5
 812  	UMULH	x2, x3, acc6
 813  	ADC	$0, acc6
 814  
 815  	MOVD	$0, acc7
      	// Double the off-diagonal sum; the bit shifted out lands in acc7.
 816  	// *2
 817  	ADDS	acc1, acc1
 818  	ADCS	acc2, acc2
 819  	ADCS	acc3, acc3
 820  	ADCS	acc4, acc4
 821  	ADCS	acc5, acc5
 822  	ADCS	acc6, acc6
 823  	ADC	$0, acc7
      	// Add the diagonal products x[i]*x[i].
 824  	// Missing products
 825  	MUL	x0, x0, acc0
 826  	UMULH	x0, x0, t0
 827  	ADDS	t0, acc1, acc1
 828  
 829  	MUL	x1, x1, t0
 830  	ADCS	t0, acc2, acc2
 831  	UMULH	x1, x1, t1
 832  	ADCS	t1, acc3, acc3
 833  
 834  	MUL	x2, x2, t0
 835  	ADCS	t0, acc4, acc4
 836  	UMULH	x2, x2, t1
 837  	ADCS	t1, acc5, acc5
 838  
 839  	MUL	x3, x3, t0
 840  	ADCS	t0, acc6, acc6
 841  	UMULH	x3, x3, t1
 842  	ADCS	t1, acc7, acc7
      	// Each reduction step folds the lowest live word w into the words above:
      	// w<<32 into the next word, w>>32 into the one after, the low half of
      	// w*const1 into the third, and the high half (plus carry) becomes the new
      	// top word, written back into w's register.
 843  	// First reduction step
 844  	ADDS	acc0<<32, acc1, acc1
 845  	LSR	$32, acc0, t0
 846  	MUL	acc0, const1, t1
 847  	UMULH	acc0, const1, acc0
 848  	ADCS	t0, acc2, acc2
 849  	ADCS	t1, acc3, acc3
 850  	ADC	$0, acc0, acc0
 851  	// Second reduction step
 852  	ADDS	acc1<<32, acc2, acc2
 853  	LSR	$32, acc1, t0
 854  	MUL	acc1, const1, t1
 855  	UMULH	acc1, const1, acc1
 856  	ADCS	t0, acc3, acc3
 857  	ADCS	t1, acc0, acc0
 858  	ADC	$0, acc1, acc1
 859  	// Third reduction step
 860  	ADDS	acc2<<32, acc3, acc3
 861  	LSR	$32, acc2, t0
 862  	MUL	acc2, const1, t1
 863  	UMULH	acc2, const1, acc2
 864  	ADCS	t0, acc0, acc0
 865  	ADCS	t1, acc1, acc1
 866  	ADC	$0, acc2, acc2
 867  	// Last reduction step
 868  	ADDS	acc3<<32, acc0, acc0
 869  	LSR	$32, acc3, t0
 870  	MUL	acc3, const1, t1
 871  	UMULH	acc3, const1, acc3
 872  	ADCS	t0, acc1, acc1
 873  	ADCS	t1, acc2, acc2
 874  	ADC	$0, acc3, acc3
 875  	// Add bits [511:256] of the sqr result
 876  	ADDS	acc4, acc0, acc0
 877  	ADCS	acc5, acc1, acc1
 878  	ADCS	acc6, acc2, acc2
 879  	ADCS	acc7, acc3, acc3
      	// The carry out becomes a fifth word in acc4.
 880  	ADC	$0, ZR, acc4
 881  
      	// Trial subtraction of p = {-1, const0, 0, const1} from the five words.
 882  	SUBS	$-1, acc0, t0
 883  	SBCS	const0, acc1, t1
 884  	SBCS	$0, acc2, t2
 885  	SBCS	const1, acc3, t3
 886  	SBCS	$0, acc4, acc4
 887  
      	// CS (no borrow) keeps the difference, otherwise the unsubtracted value.
 888  	CSEL	CS, t0, acc0, y0
 889  	CSEL	CS, t1, acc1, y1
 890  	CSEL	CS, t2, acc2, y2
 891  	CSEL	CS, t3, acc3, y3
 892  	RET
 893  /* ---------------------------------------*/
      // p256MulInternal: y = x * y, with one word-wise reduction step after each
      // row of the product and one conditional subtraction of p at the end.
      // In:       x0-x3, y0-y3, const0, const1
      // Out:      y0-y3
      // Clobbers: acc0-acc7, t0-t3, hlp0, flags. x0-x3 are left unchanged.
 894  TEXT p256MulInternal<>(SB),NOSPLIT,$0
 895  	// y[0] * x
 896  	MUL	y0, x0, acc0
 897  	UMULH	y0, x0, acc1
 898  
 899  	MUL	y0, x1, t0
 900  	ADDS	t0, acc1
 901  	UMULH	y0, x1, acc2
 902  
 903  	MUL	y0, x2, t0
 904  	ADCS	t0, acc2
 905  	UMULH	y0, x2, acc3
 906  
 907  	MUL	y0, x3, t0
 908  	ADCS	t0, acc3
 909  	UMULH	y0, x3, acc4
 910  	ADC	$0, acc4
      	// Each reduction step folds the lowest live word w into the words above:
      	// w<<32 into the next word, w>>32 into the one after, the low half of
      	// w*const1 into the third, and the high half (plus carry) becomes the new
      	// top word, written back into w's register.
 911  	// First reduction step
 912  	ADDS	acc0<<32, acc1, acc1
 913  	LSR	$32, acc0, t0
 914  	MUL	acc0, const1, t1
 915  	UMULH	acc0, const1, acc0
 916  	ADCS	t0, acc2
 917  	ADCS	t1, acc3
 918  	ADC	$0, acc0
      	// Rows 1-3: low halves are added as they are produced; the high halves
      	// are parked in t1, t2, t3 and hlp0 and added one word higher afterwards.
 919  	// y[1] * x
 920  	MUL	y1, x0, t0
 921  	ADDS	t0, acc1
 922  	UMULH	y1, x0, t1
 923  
 924  	MUL	y1, x1, t0
 925  	ADCS	t0, acc2
 926  	UMULH	y1, x1, t2
 927  
 928  	MUL	y1, x2, t0
 929  	ADCS	t0, acc3
 930  	UMULH	y1, x2, t3
 931  
 932  	MUL	y1, x3, t0
 933  	ADCS	t0, acc4
 934  	UMULH	y1, x3, hlp0
 935  	ADC	$0, ZR, acc5
 936  
 937  	ADDS	t1, acc2
 938  	ADCS	t2, acc3
 939  	ADCS	t3, acc4
 940  	ADC	hlp0, acc5
 941  	// Second reduction step
 942  	ADDS	acc1<<32, acc2, acc2
 943  	LSR	$32, acc1, t0
 944  	MUL	acc1, const1, t1
 945  	UMULH	acc1, const1, acc1
 946  	ADCS	t0, acc3
 947  	ADCS	t1, acc0
 948  	ADC	$0, acc1
 949  	// y[2] * x
 950  	MUL	y2, x0, t0
 951  	ADDS	t0, acc2
 952  	UMULH	y2, x0, t1
 953  
 954  	MUL	y2, x1, t0
 955  	ADCS	t0, acc3
 956  	UMULH	y2, x1, t2
 957  
 958  	MUL	y2, x2, t0
 959  	ADCS	t0, acc4
 960  	UMULH	y2, x2, t3
 961  
 962  	MUL	y2, x3, t0
 963  	ADCS	t0, acc5
 964  	UMULH	y2, x3, hlp0
 965  	ADC	$0, ZR, acc6
 966  
 967  	ADDS	t1, acc3
 968  	ADCS	t2, acc4
 969  	ADCS	t3, acc5
 970  	ADC	hlp0, acc6
 971  	// Third reduction step
 972  	ADDS	acc2<<32, acc3, acc3
 973  	LSR	$32, acc2, t0
 974  	MUL	acc2, const1, t1
 975  	UMULH	acc2, const1, acc2
 976  	ADCS	t0, acc0
 977  	ADCS	t1, acc1
 978  	ADC	$0, acc2
 979  	// y[3] * x
 980  	MUL	y3, x0, t0
 981  	ADDS	t0, acc3
 982  	UMULH	y3, x0, t1
 983  
 984  	MUL	y3, x1, t0
 985  	ADCS	t0, acc4
 986  	UMULH	y3, x1, t2
 987  
 988  	MUL	y3, x2, t0
 989  	ADCS	t0, acc5
 990  	UMULH	y3, x2, t3
 991  
 992  	MUL	y3, x3, t0
 993  	ADCS	t0, acc6
 994  	UMULH	y3, x3, hlp0
 995  	ADC	$0, ZR, acc7
 996  
 997  	ADDS	t1, acc4
 998  	ADCS	t2, acc5
 999  	ADCS	t3, acc6
1000  	ADC	hlp0, acc7
1001  	// Last reduction step
1002  	ADDS	acc3<<32, acc0, acc0
1003  	LSR	$32, acc3, t0
1004  	MUL	acc3, const1, t1
1005  	UMULH	acc3, const1, acc3
1006  	ADCS	t0, acc1
1007  	ADCS	t1, acc2
1008  	ADC	$0, acc3
1009  	// Add bits [511:256] of the mul result
1010  	ADDS	acc4, acc0, acc0
1011  	ADCS	acc5, acc1, acc1
1012  	ADCS	acc6, acc2, acc2
1013  	ADCS	acc7, acc3, acc3
      	// The carry out becomes a fifth word in acc4.
1014  	ADC	$0, ZR, acc4
1015  
      	// Trial subtraction of p = {-1, const0, 0, const1} from the five words.
1016  	SUBS	$-1, acc0, t0
1017  	SBCS	const0, acc1, t1
1018  	SBCS	$0, acc2, t2
1019  	SBCS	const1, acc3, t3
1020  	SBCS	$0, acc4, acc4
1021  
      	// CS (no borrow) keeps the difference, otherwise the unsubtracted value.
1022  	CSEL	CS, t0, acc0, y0
1023  	CSEL	CS, t1, acc1, y1
1024  	CSEL	CS, t2, acc2, y2
1025  	CSEL	CS, t3, acc3, y3
1026  	RET
1027  /* ---------------------------------------*/
      // p256MulBy2Inline: x = y + y, then p is subtracted once unless that
      // subtraction would borrow.
      // In:       y0-y3, const0, const1
      // Out:      x0-x3
      // Clobbers: hlp0, t0-t3, flags. y0-y3 are left unchanged.
      // hlp0 receives the carry out of the doubling and takes part in the trial
      // subtraction as a fifth word. CC (borrow) keeps the plain sum in x;
      // otherwise the difference in t0-t3 is taken.
1028  #define p256MulBy2Inline       \
1029  	ADDS	y0, y0, x0;    \
1030  	ADCS	y1, y1, x1;    \
1031  	ADCS	y2, y2, x2;    \
1032  	ADCS	y3, y3, x3;    \
1033  	ADC	$0, ZR, hlp0;  \
1034  	SUBS	$-1, x0, t0;   \
1035  	SBCS	const0, x1, t1;\
1036  	SBCS	$0, x2, t2;    \
1037  	SBCS	const1, x3, t3;\
1038  	SBCS	$0, hlp0, hlp0;\
1039  	CSEL	CC, x0, t0, x0;\
1040  	CSEL	CC, x1, t1, x1;\
1041  	CSEL	CC, x2, t2, x2;\
1042  	CSEL	CC, x3, t3, x3;
1043  /* ---------------------------------------*/
      // Address macros for point coordinates: byte offset off within the field at
      // +0, +32 or +64 of the point addressed by a_ptr (x1in/y1in/z1in), b_ptr
      // (x2in/z2in) or res_ptr (x3out/y3out/z3out).
1044  #define x1in(off) (off)(a_ptr)
1045  #define y1in(off) (off + 32)(a_ptr)
1046  #define z1in(off) (off + 64)(a_ptr)
1047  #define x2in(off) (off)(b_ptr)
1048  #define z2in(off) (off + 64)(b_ptr)
1049  #define x3out(off) (off)(res_ptr)
1050  #define y3out(off) (off + 32)(res_ptr)
1051  #define z3out(off) (off + 64)(res_ptr)
      // LDx/LDy load and STx/STy store 32 bytes between the x0-x3 / y0-y3
      // register groups and memory. The argument is the NAME of an address macro
      // (one of the above or a stack-slot macro); it is expanded with offsets 0
      // and 16. For STx/STy the parameter called src is the store destination.
1052  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
1053  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
1054  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
1055  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
1056  /* ---------------------------------------*/
      // Stack temporaries for point addition: slot k is the 32 bytes at
      // 8 + 32*k from RSP. Slots 0-7 end at byte 264, which is the frame size
      // declared by p256PointAddAffineAsm.
1057  #define y2in(off)  (32*0 + 8 + off)(RSP)
1058  #define s2(off)    (32*1 + 8 + off)(RSP)
1059  #define z1sqr(off) (32*2 + 8 + off)(RSP)
1060  #define h(off)	   (32*3 + 8 + off)(RSP)
1061  #define r(off)	   (32*4 + 8 + off)(RSP)
1062  #define hsqr(off)  (32*5 + 8 + off)(RSP)
1063  #define rsqr(off)  (32*6 + 8 + off)(RSP)
1064  #define hcub(off)  (32*7 + 8 + off)(RSP)
1065  
      // Slots 8-11 lie beyond a 264-byte frame and are not referenced by
      // p256PointAddAffineAsm; presumably they serve a routine with a larger
      // frame elsewhere in the file - confirm.
1066  #define z2sqr(off) (32*8 + 8 + off)(RSP)
1067  #define s1(off) (32*9 + 8 + off)(RSP)
1068  #define u1(off) (32*10 + 8 + off)(RSP)
1069  #define u2(off) (32*11 + 8 + off)(RSP)
1070  
1070  
1071  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
      //
      // Adds the affine point in2 to in1 and writes the result to res.
      //   sign != 0: the y coordinate of in2 is negated (p - y2) before use.
      //   sel  == 0: the result is in1 itself (x1, y1, z1).
      //   zero == 0: the result is in2 (x2, possibly negated y2) with z = p256one.
      // Both overrides are applied with CSEL after the full addition has been
      // computed; when sel == 0 and zero == 0 together, the zero override is
      // applied last and wins.
      //
      // Registers live across the internal calls: a_ptr (in1), b_ptr (in2) and
      // hlp1 (R0, the two select bits). Since hlp1 occupies R0, res is reloaded
      // from the frame into t0 before each store. The frame holds eight 32-byte
      // temporaries (y2in, s2, z1sqr, h, r, hsqr, rsqr, hcub).
1072  TEXT ·p256PointAddAffineAsm(SB),0,$264-48
1073  	MOVD	in1+8(FP), a_ptr
1074  	MOVD	in2+16(FP), b_ptr
1075  	MOVD	sign+24(FP), hlp0
1076  	MOVD	sel+32(FP), hlp1
1077  	MOVD	zero+40(FP), t2
1078  
      	// Normalize both flags to 0/1: t2 = (zero != 0), hlp1 = (sel != 0).
1079  	MOVD	$1, t0
1080  	CMP	$0, t2
1081  	CSEL	EQ, ZR, t0, t2
1082  	CMP	$0, hlp1
1083  	CSEL	EQ, ZR, t0, hlp1
1084  
1085  	MOVD	p256const0<>(SB), const0
1086  	MOVD	p256const1<>(SB), const1
      	// hlp1 bit 0 = (sel != 0), bit 1 = (zero != 0); called select[] below.
1087  	EOR	t2<<1, hlp1
1088  
1089  	// Negate y2in based on sign
1090  	LDP	2*16(b_ptr), (y0, y1)
1091  	LDP	3*16(b_ptr), (y2, y3)
1092  	MOVD	$-1, acc0
1093  
      	// acc0-acc3 = p - y2; t0 = 0, or all ones if the subtraction borrowed.
1094  	SUBS	y0, acc0, acc0
1095  	SBCS	y1, const0, acc1
1096  	SBCS	y2, ZR, acc2
1097  	SBCS	y3, const1, acc3
1098  	SBC	$0, ZR, t0
1099  
      	// acc4-acc7 = (p - y2) + p, with the carry out added into t0.
1100  	ADDS	$-1, acc0, acc4
1101  	ADCS	const0, acc1, acc5
1102  	ADCS	$0, acc2, acc6
1103  	ADCS	const1, acc3, acc7
1104  	ADC	$0, t0, t0
1105  
      	// t0 == 0 selects the sum with p, otherwise p - y2 is kept.
      	// NOTE(review): t0 is 0 not only for "borrow then carry" but also when
      	// neither occurred (p - y2 small enough that adding p does not carry); in
      	// that case the value kept is 2p - y2, which is congruent to -y2 but not
      	// below p - confirm this is acceptable to the code that consumes it.
1106  	CMP	$0, t0
1107  	CSEL	EQ, acc4, acc0, acc0
1108  	CSEL	EQ, acc5, acc1, acc1
1109  	CSEL	EQ, acc6, acc2, acc2
1110  	CSEL	EQ, acc7, acc3, acc3
1111  	// If condition is 0, keep original value
1112  	CMP	$0, hlp0
1113  	CSEL	EQ, y0, acc0, y0
1114  	CSEL	EQ, y1, acc1, y1
1115  	CSEL	EQ, y2, acc2, y2
1116  	CSEL	EQ, y3, acc3, y3
1117  	// Store result
1118  	STy(y2in)
      	// sign (hlp0) is not needed past this point; p256MulInternal clobbers hlp0.
1119  	// Begin point add
1120  	LDx(z1in)
1121  	CALL	p256SqrInternal<>(SB)    // z1^2
1122  	STy(z1sqr)
1123  
1124  	LDx(x2in)
1125  	CALL	p256MulInternal<>(SB)    // x2 * z1^2
1126  
1127  	LDx(x1in)
1128  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
1129  	STx(h)
1130  
1131  	LDy(z1in)
1132  	CALL	p256MulInternal<>(SB)    // z3 = h * z1
1133  
1134  	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
1135  	LDP	5*16(a_ptr), (acc2, acc3)
1136  	ANDS	$1, hlp1, ZR
1137  	CSEL	EQ, acc0, y0, y0
1138  	CSEL	EQ, acc1, y1, y1
1139  	CSEL	EQ, acc2, y2, y2
1140  	CSEL	EQ, acc3, y3, y3
1141  	LDP	p256one<>+0x00(SB), (acc0, acc1)
1142  	LDP	p256one<>+0x10(SB), (acc2, acc3)
1143  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
1144  	CSEL	EQ, acc0, y0, y0
1145  	CSEL	EQ, acc1, y1, y1
1146  	CSEL	EQ, acc2, y2, y2
1147  	CSEL	EQ, acc3, y3, y3
      	// z1 is read into x0-x3 before z3 is stored (presumably so that res may
      	// alias in1 - confirm).
1148  	LDx(z1in)
1149  	MOVD	res+0(FP), t0
1150  	STP	(y0, y1), 4*16(t0)
1151  	STP	(y2, y3), 5*16(t0)
1152  
1153  	LDy(z1sqr)
1154  	CALL	p256MulInternal<>(SB)    // z1 ^ 3
1155  
1156  	LDx(y2in)
1157  	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
1158  	STy(s2)
1159  
1160  	LDx(y1in)
1161  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
1162  	STx(r)
1163  
      	// r is still in x0-x3, which is the input of p256SqrInternal.
1164  	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
1165  	STy	(rsqr)
1166  
1167  	LDx(h)
1168  	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
1169  	STy(hsqr)
1170  
      	// x still holds h and y holds h^2.
1171  	CALL	p256MulInternal<>(SB)    // hcub = h^3
1172  	STy(hcub)
1173  
1174  	LDx(y1in)
1175  	CALL	p256MulInternal<>(SB)    // y1 * h^3
      	// The s2 slot is reused for y1 * h^3.
1176  	STy(s2)
1177  
1178  	LDP	hsqr(0*8), (x0, x1)
1179  	LDP	hsqr(2*8), (x2, x3)
1180  	LDP	0*16(a_ptr), (y0, y1)
1181  	LDP	1*16(a_ptr), (y2, y3)
1182  	CALL	p256MulInternal<>(SB)    // u1 * h^2
      	// The h slot is reused for u1 * h^2.
1183  	STP	(y0, y1), h(0*8)
1184  	STP	(y2, y3), h(2*8)
1185  
1186  	p256MulBy2Inline               // u1 * h^2 * 2, inline
1187  
1188  	LDy(rsqr)
1189  	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
1190  
1191  	MOVD	x0, y0
1192  	MOVD	x1, y1
1193  	MOVD	x2, y2
1194  	MOVD	x3, y3
1195  	LDx(hcub)
      	// x3 = r^2 - 2 * u1 * h^2 - h^3
1196  	CALL	p256SubInternal<>(SB)
1197  
1198  	LDP	0*16(a_ptr), (acc0, acc1)
1199  	LDP	1*16(a_ptr), (acc2, acc3)
1200  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
1201  	CSEL	EQ, acc0, x0, x0
1202  	CSEL	EQ, acc1, x1, x1
1203  	CSEL	EQ, acc2, x2, x2
1204  	CSEL	EQ, acc3, x3, x3
1205  	LDP	0*16(b_ptr), (acc0, acc1)
1206  	LDP	1*16(b_ptr), (acc2, acc3)
1207  	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
1208  	CSEL	EQ, acc0, x0, x0
1209  	CSEL	EQ, acc1, x1, x1
1210  	CSEL	EQ, acc2, x2, x2
1211  	CSEL	EQ, acc3, x3, x3
1212  	MOVD	res+0(FP), t0
1213  	STP	(x0, x1), 0*16(t0)
1214  	STP	(x2, x3), 1*16(t0)
1215  
      	// x = (u1 * h^2) - x, where x is the value just stored as x3 (after the
      	// select overrides above).
1216  	LDP	h(0*8), (y0, y1)
1217  	LDP	h(2*8), (y2, y3)
1218  	CALL	p256SubInternal<>(SB)
1219  
      	// y = r * x
1220  	LDP	r(0*8), (y0, y1)
1221  	LDP	r(2*8), (y2, y3)
1222  	CALL	p256MulInternal<>(SB)
1223  
      	// x = y - (y1 * h^3), the candidate y3.
1224  	LDP	s2(0*8), (x0, x1)
1225  	LDP	s2(2*8), (x2, x3)
1226  	CALL	p256SubInternal<>(SB)
1227  	LDP	2*16(a_ptr), (acc0, acc1)
1228  	LDP	3*16(a_ptr), (acc2, acc3)
1229  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
1230  	CSEL	EQ, acc0, x0, x0
1231  	CSEL	EQ, acc1, x1, x1
1232  	CSEL	EQ, acc2, x2, x2
1233  	CSEL	EQ, acc3, x3, x3
1234  	LDP	y2in(0*8), (acc0, acc1)
1235  	LDP	y2in(2*8), (acc2, acc3)
1236  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
1237  	CSEL	EQ, acc0, x0, x0
1238  	CSEL	EQ, acc1, x1, x1
1239  	CSEL	EQ, acc2, x2, x2
1240  	CSEL	EQ, acc3, x3, x3
1241  	MOVD	res+0(FP), t0
1242  	STP	(x0, x1), 2*16(t0)
1243  	STP	(x2, x3), 3*16(t0)
1244  
1245  	RET
1246  
// p256AddInline: x = x + y, followed by one conditional subtraction of p.
//
// Inputs:   x0..x3 and y0..y3, four 64-bit limbs each, least significant
//           limb first (the carry chain runs x0 -> x3).
//           const0 and const1 must already hold the values the callers load
//           from p256const0<> and p256const1<>.
// Output:   x0..x3.
// Clobbers: t0..t3 (t2/t3 are the same registers as const2/const3), hlp0 and
//           the condition flags.
//
// Steps:
//   1. Add y into x; the carry out of the top limb is captured in hlp0.
//   2. Subtract the 4-limb constant (-1, const0, 0, const1) from the sum into
//      t0..t3, carrying the borrow through hlp0.
//   3. If step 2 borrowed (carry clear, CC) keep the plain sum in x; otherwise
//      replace x with the subtracted value in t.
// The selection uses CSEL, so the macro contains no branches.
1247  #define p256AddInline          \
1248  	ADDS	y0, x0, x0;    \
1249  	ADCS	y1, x1, x1;    \
1250  	ADCS	y2, x2, x2;    \
1251  	ADCS	y3, x3, x3;    \
1252  	ADC	$0, ZR, hlp0;  \
1253  	SUBS	$-1, x0, t0;   \
1254  	SBCS	const0, x1, t1;\
1255  	SBCS	$0, x2, t2;    \
1256  	SBCS	const1, x3, t3;\
1257  	SBCS	$0, hlp0, hlp0;\
1258  	CSEL	CC, x0, t0, x0;\
1259  	CSEL	CC, x1, t1, x1;\
1260  	CSEL	CC, x2, t2, x2;\
1261  	CSEL	CC, x3, t3, x3;
1262  
// Stack temporaries used by p256PointDoubleAsm. Each names a 32-byte slot
// (four 64-bit limbs); slot i starts at byte 32*i + 8 above RSP, and `off`
// is the byte offset inside the slot. Four slots plus the 8-byte base offset
// account for that function's 136-byte frame.
1263  #define s(off)	(32*0 + 8 + off)(RSP)
1264  #define m(off)	(32*1 + 8 + off)(RSP)
1265  #define zsqr(off) (32*2 + 8 + off)(RSP)
1266  #define tmp(off)  (32*3 + 8 + off)(RSP)
1267  
// p256PointDoubleAsm doubles the point at `in` and writes the result to `res`.
//
// Frame: $136-16 = 136 bytes of locals (the s, m, zsqr and tmp slots) and
// 16 bytes of arguments (two pointers).
//
// Layout of `in` as read directly here: the first coordinate (x) is at byte
// offset 0 and the third (z) at byte offset 64, 32 bytes each.
//
// With x, y, z the input coordinates, the sequence below computes:
//     zsqr = z^2
//     m    = 3 * (x - zsqr) * (x + zsqr)
//     z3   = 2 * y * z
//     s    = x * (2*y)^2
//     x3   = m^2 - 2*s
//     y3   = m * (s - x3) - (2*y)^4 / 2
//
// NOTE(review): p256SqrInternal, p256MulInternal, p256SubInternal,
// p256MulBy2Inline, LDx/LDy/STx/STy and the x1in/y1in/z1in/x3out/y3out/z3out
// operand macros are defined outside this section. The step comments assume
// the conventions implied by the annotated call sites in p256PointAddAsm:
// Sqr and Mul leave their result in y0..y3, Sub computes x = y - x, and
// p256MulBy2Inline computes x = 2*y. Confirm against those definitions.
1268  //func p256PointDoubleAsm(res, in *P256Point)
1269  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
1270  	MOVD	res+0(FP), res_ptr
1271  	MOVD	in+8(FP), a_ptr
1272  
1273  	MOVD	p256const0<>(SB), const0   // constants used by p256AddInline and the halving step below
1274  	MOVD	p256const1<>(SB), const1
1275  
1276  	// Begin point double
1277  	LDP	4*16(a_ptr), (x0, x1)      // x = z (bytes 64..95 of in)
1278  	LDP	5*16(a_ptr), (x2, x3)
1279  	CALL	p256SqrInternal<>(SB)      // y = z^2
1280  	STP	(y0, y1), zsqr(0*8)        // zsqr = z^2
1281  	STP	(y2, y3), zsqr(2*8)
1282  
1283  	LDP	0*16(a_ptr), (x0, x1)      // x = x coordinate (bytes 0..31 of in)
1284  	LDP	1*16(a_ptr), (x2, x3)
1285  	p256AddInline                  // x = x + z^2 (y0..y3 still hold z^2)
1286  	STx(m)                         // m = x + z^2
1287  
1288  	LDx(z1in)
1289  	LDy(y1in)
1290  	CALL	p256MulInternal<>(SB)      // y = z * y
1291  	p256MulBy2Inline               // x = 2 * y * z
1292  	STx(z3out)                     // z3 = 2 * y * z
1293  
1294  	LDy(x1in)
1295  	LDx(zsqr)
1296  	CALL	p256SubInternal<>(SB)      // x = x - z^2
1297  	LDy(m)
1298  	CALL	p256MulInternal<>(SB)      // y = (x - z^2) * (x + z^2)
1299  
1300  	// Multiply by 3
1301  	p256MulBy2Inline               // x = 2 * y
1302  	p256AddInline                  // x = 2 * y + y
1303  	STx(m)                         // m = 3 * (x - z^2) * (x + z^2)
1304  
1305  	LDy(y1in)
1306  	p256MulBy2Inline               // x = 2 * y
1307  	CALL	p256SqrInternal<>(SB)      // y = (2*y)^2
1308  	STy(s)                         // s = (2*y)^2 for now; multiplied by x further down
1309  	MOVD	y0, x0                     // copy the result into x so it can be squared again
1310  	MOVD	y1, x1
1311  	MOVD	y2, x2
1312  	MOVD	y3, x3
1313  	CALL	p256SqrInternal<>(SB)      // y = (2*y)^4
1314  
1315  	// Divide by 2
1316  	ADDS	$-1, y0, t0                // t = y + (-1, const0, 0, const1), i.e. y + p
1317  	ADCS	const0, y1, t1
1318  	ADCS	$0, y2, t2
1319  	ADCS	const1, y3, t3
1320  	ADC	$0, ZR, hlp0               // hlp0 = carry out of that addition
1321  
1322  	ANDS	$1, y0, ZR                 // EQ when y is even
1323  	CSEL	EQ, y0, t0, t0             // even: halve y itself; odd: halve y + p
1324  	CSEL	EQ, y1, t1, t1
1325  	CSEL	EQ, y2, t2, t2
1326  	CSEL	EQ, y3, t3, t3
1327  	AND	y0, hlp0, hlp0             // keep the carry bit only when y was odd
1328  
1329  	EXTR	$1, t0, t1, y0             // shift hlp0:t3:t2:t1:t0 right by one bit into y
1330  	EXTR	$1, t1, t2, y1
1331  	EXTR	$1, t2, t3, y2
1332  	EXTR	$1, t3, hlp0, y3
1333  	STy(y3out)                     // y3out temporarily holds (2*y)^4 / 2
1334  
1335  	LDx(x1in)
1336  	LDy(s)
1337  	CALL	p256MulInternal<>(SB)      // y = x * (2*y)^2
1338  	STy(s)                         // s = x * (2*y)^2
1339  	p256MulBy2Inline               // x = 2 * s
1340  	STx(tmp)                       // tmp = 2 * s
1341  
1342  	LDx(m)
1343  	CALL	p256SqrInternal<>(SB)      // y = m^2
1344  	LDx(tmp)
1345  	CALL	p256SubInternal<>(SB)      // x = m^2 - 2*s
1346  
1347  	STx(x3out)                     // x3 = m^2 - 2*s
1348  
1349  	LDy(s)
1350  	CALL	p256SubInternal<>(SB)      // x = s - x3
1351  
1352  	LDy(m)
1353  	CALL	p256MulInternal<>(SB)      // y = m * (s - x3)
1354  
1355  	LDx(y3out)                     // x = (2*y)^4 / 2 saved earlier
1356  	CALL	p256SubInternal<>(SB)      // x = m * (s - x3) - (2*y)^4 / 2
1357  	STx(y3out)                     // y3
1358  	RET
1359  /* ---------------------------------------*/
// Rebind the operand macros used by p256PointAddAsm so they are relative to
// b_ptr. That function first points b_ptr at in2 (so y2in reads in2's second
// coordinate at byte offset 32) and later reloads b_ptr with res before the
// first store through z3out/x3out/y3out, so the output macros (byte offsets
// 0, 32 and 64) address the result. `off` is a byte offset inside the
// 32-byte coordinate.
1360  #undef y2in
1361  #undef x3out
1362  #undef y3out
1363  #undef z3out
1364  #define y2in(off) (off + 32)(b_ptr)
1365  #define x3out(off) (off)(b_ptr)
1366  #define y3out(off) (off + 32)(b_ptr)
1367  #define z3out(off) (off + 64)(b_ptr)
// p256PointAddAsm adds the points in1 and in2 and writes the sum to res.
//
// Frame: $392-32 = 392 bytes of locals and 32 bytes of arguments plus result
// (three pointers and the int result at ret+24(FP)).
//
// Result: 1 if both r = s2 - s1 and h = u2 - u1 are zero mod p, otherwise 0.
// (Presumably the Go caller uses this to detect in1 == in2, a case these
// formulas do not handle -- confirm against the caller.)
//
// Values computed, with (x1, y1, z1) = in1 and (x2, y2, z2) = in2:
//     u1 = x1 * z2^2        u2 = x2 * z1^2        h = u2 - u1
//     s1 = y1 * z2^3        s2 = y2 * z1^3        r = s2 - s1
//     x3 = r^2 - h^3 - 2 * u1 * h^2
//     y3 = r * (u1 * h^2 - x3) - s1 * h^3
//     z3 = z1 * z2 * h
//
// Register use visible here: a_ptr = in1; b_ptr = in2 until just before z3 is
// stored, then b_ptr = res. hlp1 (the same register as res_ptr, R0) first
// holds the "r is zero" flag and finally the returned value; hlp0 holds the
// "h is zero" flag.
//
// NOTE(review): p256SqrInternal, p256MulInternal, p256SubInternal,
// p256MulBy2Inline, LDx/LDy/STx/STy and the stack-slot macros (s1, s2, u1, u2,
// h, r, z1sqr, z2sqr, rsqr, hsqr, hcub) are defined outside this section. The
// step comments follow the conventions the annotations imply: Sqr and Mul
// leave their result in y0..y3 and leave x0..x3 usable as the next operand,
// Sub computes x = y - x, and p256MulBy2Inline computes x = 2*y.
1368  // func p256PointAddAsm(res, in1, in2 *P256Point) int
1369  TEXT ·p256PointAddAsm(SB),0,$392-32
1370  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
1371  	// Load the input pointers. NOTE(review): this comment used to read "Move input to stack in order to free registers", but only the two pointer loads are visible here -- confirm and reword.
1372  	MOVD	in1+8(FP), a_ptr
1373  	MOVD	in2+16(FP), b_ptr
1374  
1375  	MOVD	p256const0<>(SB), const0   // constants used by the "equals p" comparisons below
1376  	MOVD	p256const1<>(SB), const1
1377  
1378  	// Begin point add
1379  	LDx(z2in)
1380  	CALL	p256SqrInternal<>(SB)    // z2^2
1381  	STy(z2sqr)
1382  
1383  	CALL	p256MulInternal<>(SB)    // z2^3 (x still z2, y = z2^2)
1384  
1385  	LDx(y1in)
1386  	CALL	p256MulInternal<>(SB)    // s1 = z2^3 * y1
1387  	STy(s1)
1388  
1389  	LDx(z1in)
1390  	CALL	p256SqrInternal<>(SB)    // z1^2
1391  	STy(z1sqr)
1392  
1393  	CALL	p256MulInternal<>(SB)    // z1^3 (x still z1, y = z1^2)
1394  
1395  	LDx(y2in)
1396  	CALL	p256MulInternal<>(SB)    // s2 = z1^3 * y2 (kept in y, not stored)
1397  
1398  	LDx(s1)
1399  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
1400  	STx(r)
1401  
1402  	MOVD	$1, t2                 // t2 = 1, the value selected when a test matches
1403  	ORR	x0, x1, t0             // Check if zero mod p256: first, are all four limbs 0?
1404  	ORR	x2, x3, t1
1405  	ORR	t1, t0, t0
1406  	CMP	$0, t0
1407  	CSEL	EQ, t2, ZR, hlp1       // hlp1 = 1 if r == 0, else 0
1408  
1409  	EOR	$-1, x0, t0            // second, does r equal the limbs (-1, const0, 0, const1), i.e. p?
1410  	EOR	const0, x1, t1
1411  	EOR	const1, x3, t3
1412  
1413  	ORR	t0, t1, t0
1414  	ORR	x2, t3, t1             // x2 is compared against 0, so it is ORed in unchanged
1415  	ORR	t1, t0, t0
1416  	CMP	$0, t0
1417  	CSEL	EQ, t2, hlp1, hlp1     // hlp1 = 1 if r == p, otherwise keep the previous flag
1418  
1419  	LDx(z2sqr)
1420  	LDy(x1in)
1421  	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
1422  	STy(u1)
1423  
1424  	LDx(z1sqr)
1425  	LDy(x2in)
1426  	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
1427  	STy(u2)
1428  
1429  	LDx(u1)
1430  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
1431  	STx(h)
1432  
1433  	MOVD	$1, t2                 // t2 is reloaded before it is used again
1434  	ORR	x0, x1, t0             // Check if zero mod p256 (same two tests as for r, applied to h)
1435  	ORR	x2, x3, t1
1436  	ORR	t1, t0, t0
1437  	CMP	$0, t0
1438  	CSEL	EQ, t2, ZR, hlp0       // hlp0 = 1 if h == 0, else 0
1439  
1440  	EOR	$-1, x0, t0
1441  	EOR	const0, x1, t1
1442  	EOR	const1, x3, t3
1443  
1444  	ORR	t0, t1, t0
1445  	ORR	x2, t3, t1
1446  	ORR	t1, t0, t0
1447  	CMP	$0, t0
1448  	CSEL	EQ, t2, hlp0, hlp0     // hlp0 = 1 if h == p, otherwise keep the previous flag
1449  
1450  	AND	hlp0, hlp1, hlp1       // hlp1 = (r is zero) AND (h is zero); returned at the end
1451  
1452  	LDx(r)
1453  	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
1454  	STy(rsqr)
1455  
1456  	LDx(h)
1457  	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
1458  	STy(hsqr)
1459  
1460  	LDx(h)
1461  	CALL	p256MulInternal<>(SB)    // hcub = h^3
1462  	STy(hcub)
1463  
1464  	LDx(s1)
1465  	CALL	p256MulInternal<>(SB)    // s1 * h^3
1466  	STy(s2)                        // the s2 slot is used here to hold s1 * h^3
1467  
1468  	LDx(z1in)
1469  	LDy(z2in)
1470  	CALL	p256MulInternal<>(SB)    // z1 * z2
1471  	LDx(h)
1472  	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
1473  	MOVD	res+0(FP), b_ptr         // b_ptr now addresses res; in2 is not read through b_ptr after this
1474  	STy(z3out)
1475  
1476  	LDx(hsqr)
1477  	LDy(u1)
1478  	CALL	p256MulInternal<>(SB)    // h^2 * u1
1479  	STy(u2)                        // the u2 slot is reused to hold u1 * h^2
1480  
1481  	p256MulBy2Inline               // u1 * h^2 * 2, inline
1482  	LDy(rsqr)
1483  	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
1484  
1485  	MOVD	x0, y0                   // move the difference into y so h^3 can be subtracted from it
1486  	MOVD	x1, y1
1487  	MOVD	x2, y2
1488  	MOVD	x3, y3
1489  	LDx(hcub)
1490  	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
1491  	STx(x3out)
1492  
1493  	LDy(u2)
1494  	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
1495  
1496  	LDy(r)
1497  	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
1498  
1499  	LDx(s2)
1500  	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
1501  	STx(y3out)
1502  
1503  	MOVD	hlp1, R0                 // hlp1 is already R0, so this move leaves the value unchanged
1504  	MOVD	R0, ret+24(FP)           // return the "r and h both zero" flag
1505  
1506  	RET
1507