p256_asm.mx raw

   1  // Copyright 2015 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
   6  // implementation of P256. The optimizations performed here are described in
   7  // detail in:
   8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
   9  //                          256-bit primes"
  10  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
  11  // https://eprint.iacr.org/2013/816.pdf
  12  
  13  //go:build (amd64 || arm64 || ppc64le || s390x) && !purego
  14  
  15  package nistec
  16  
  17  import (
  18  	"crypto/internal/fips140deps/byteorder"
  19  	"errors"
  20  	"math/bits"
  21  	"runtime"
  22  	"unsafe"
  23  )
  24  
  25  // p256Element is a P-256 base field element in [0, P-1] in the Montgomery
  26  // domain (with R 2²⁵⁶) as four limbs in little-endian order value.
  27  type p256Element [4]uint64
  28  
  29  // p256One is one in the Montgomery domain.
  30  var p256One = p256Element{0x0000000000000001, 0xffffffff00000000,
  31  	0xffffffffffffffff, 0x00000000fffffffe}
  32  
  33  var p256Zero = p256Element{}
  34  
  35  // p256P is 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1 in the Montgomery domain.
  36  var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff,
  37  	0x0000000000000000, 0xffffffff00000001}
  38  
  39  // P256Point is a P-256 point. The zero value should not be assumed to be valid
  40  // (although it is in this implementation).
  41  type P256Point struct {
  42  	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
  43  	// at infinity can be represented by any set of coordinates with Z = 0.
  44  	x, y, z p256Element
  45  }
  46  
  47  // NewP256Point returns a new P256Point representing the point at infinity.
  48  func NewP256Point() *P256Point {
  49  	return &P256Point{
  50  		x: p256One, y: p256One, z: p256Zero,
  51  	}
  52  }
  53  
  54  // SetGenerator sets p to the canonical generator and returns p.
  55  func (p *P256Point) SetGenerator() *P256Point {
  56  	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
  57  		0x79fb732b77622510, 0x18905f76a53755c6}
  58  	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
  59  		0xd2e88688dd21f325, 0x8571ff1825885d85}
  60  	p.z = p256One
  61  	return p
  62  }
  63  
  64  // Set sets p = q and returns p.
  65  func (p *P256Point) Set(q *P256Point) *P256Point {
  66  	p.x, p.y, p.z = q.x, q.y, q.z
  67  	return p
  68  }
  69  
  70  const p256ElementLength = 32
  71  const p256UncompressedLength = 1 + 2*p256ElementLength
  72  const p256CompressedLength = 1 + p256ElementLength
  73  
  74  // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
  75  // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
  76  // the curve, it returns nil and an error, and the receiver is unchanged.
  77  // Otherwise, it returns p.
  78  func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
  79  	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
  80  	// here is R in the Montgomery domain, or R×R mod p. See comment in
  81  	// P256OrdInverse about how this is used.
  82  	rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
  83  		0xfffffffffffffffe, 0x00000004fffffffd}
  84  
  85  	switch {
  86  	// Point at infinity.
  87  	case len(b) == 1 && b[0] == 0:
  88  		return p.Set(NewP256Point()), nil
  89  
  90  	// Uncompressed form.
  91  	case len(b) == p256UncompressedLength && b[0] == 4:
  92  		var r P256Point
  93  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
  94  		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
  95  		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
  96  			return nil, errors.New("invalid P256 element encoding")
  97  		}
  98  		p256Mul(&r.x, &r.x, &rr)
  99  		p256Mul(&r.y, &r.y, &rr)
 100  		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
 101  			return nil, err
 102  		}
 103  		r.z = p256One
 104  		return p.Set(&r), nil
 105  
 106  	// Compressed form.
 107  	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
 108  		var r P256Point
 109  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
 110  		if p256LessThanP(&r.x) == 0 {
 111  			return nil, errors.New("invalid P256 element encoding")
 112  		}
 113  		p256Mul(&r.x, &r.x, &rr)
 114  
 115  		// y² = x³ - 3x + b
 116  		p256Polynomial(&r.y, &r.x)
 117  		if !p256Sqrt(&r.y, &r.y) {
 118  			return nil, errors.New("invalid P256 compressed point encoding")
 119  		}
 120  
 121  		// Select the positive or negative root, as indicated by the least
 122  		// significant bit, based on the encoding type byte.
 123  		yy := &p256Element{}
 124  		p256FromMont(yy, &r.y)
 125  		cond := int(yy[0]&1) ^ int(b[0]&1)
 126  		p256NegCond(&r.y, cond)
 127  
 128  		r.z = p256One
 129  		return p.Set(&r), nil
 130  
 131  	default:
 132  		return nil, errors.New("invalid P256 point encoding")
 133  	}
 134  }
 135  
 136  // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
 137  func p256Polynomial(y2, x *p256Element) *p256Element {
 138  	x3 := &p256Element{}
 139  	p256Sqr(x3, x, 1)
 140  	p256Mul(x3, x3, x)
 141  
 142  	threeX := &p256Element{}
 143  	p256Add(threeX, x, x)
 144  	p256Add(threeX, threeX, x)
 145  	p256NegCond(threeX, 1)
 146  
 147  	p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090,
 148  		0xe5a220abf7212ed6, 0xdc30061d04874834}
 149  
 150  	p256Add(x3, x3, threeX)
 151  	p256Add(x3, x3, p256B)
 152  
 153  	*y2 = *x3
 154  	return y2
 155  }
 156  
 157  func p256CheckOnCurve(x, y *p256Element) error {
 158  	// y² = x³ - 3x + b
 159  	rhs := p256Polynomial(&p256Element{}, x)
 160  	lhs := &p256Element{}
 161  	p256Sqr(lhs, y, 1)
 162  	if p256Equal(lhs, rhs) != 1 {
 163  		return errors.New("P256 point not on curve")
 164  	}
 165  	return nil
 166  }
 167  
 168  // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
 169  // not allowed to be equal to or greater than p, so if this function returns 0
 170  // then x is invalid.
 171  func p256LessThanP(x *p256Element) int {
 172  	var b uint64
 173  	_, b = bits.Sub64(x[0], p256P[0], b)
 174  	_, b = bits.Sub64(x[1], p256P[1], b)
 175  	_, b = bits.Sub64(x[2], p256P[2], b)
 176  	_, b = bits.Sub64(x[3], p256P[3], b)
 177  	return int(b)
 178  }
 179  
 180  func p256BigToLittle(l *p256Element, b *[32]byte) {
 181  	bytesToLimbs((*[4]uint64)(l), b)
 182  }
 183  
 184  func bytesToLimbs(l *[4]uint64, b *[32]byte) {
 185  	l[0] = byteorder.BEUint64(b[24:])
 186  	l[1] = byteorder.BEUint64(b[16:])
 187  	l[2] = byteorder.BEUint64(b[8:])
 188  	l[3] = byteorder.BEUint64(b[:])
 189  }
 190  
 191  func p256LittleToBig(b *[32]byte, l *p256Element) {
 192  	limbsToBytes(b, (*[4]uint64)(l))
 193  }
 194  
 195  func limbsToBytes(b *[32]byte, l *[4]uint64) {
 196  	byteorder.BEPutUint64(b[24:], l[0])
 197  	byteorder.BEPutUint64(b[16:], l[1])
 198  	byteorder.BEPutUint64(b[8:], l[2])
 199  	byteorder.BEPutUint64(b[:], l[3])
 200  }
 201  
 202  // p256Add sets res = x + y.
 203  func p256Add(res, x, y *p256Element) {
 204  	var c, b uint64
 205  	t1 := []uint64{:4}
 206  	t1[0], c = bits.Add64(x[0], y[0], 0)
 207  	t1[1], c = bits.Add64(x[1], y[1], c)
 208  	t1[2], c = bits.Add64(x[2], y[2], c)
 209  	t1[3], c = bits.Add64(x[3], y[3], c)
 210  	t2 := []uint64{:4}
 211  	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
 212  	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
 213  	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
 214  	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
 215  	// Three options:
 216  	//   - a+b < p
 217  	//     then c is 0, b is 1, and t1 is correct
 218  	//   - p <= a+b < 2^256
 219  	//     then c is 0, b is 0, and t2 is correct
 220  	//   - 2^256 <= a+b
 221  	//     then c is 1, b is 1, and t2 is correct
 222  	t2Mask := (c ^ b) - 1
 223  	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
 224  	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
 225  	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
 226  	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
 227  }
 228  
 229  // p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
 230  // false and e is unchanged. e and x can overlap.
 231  func p256Sqrt(e, x *p256Element) (isSquare bool) {
 232  	t0, t1 := &p256Element{}, &p256Element{}
 233  
 234  	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
 235  	//
 236  	// The sequence of 7 multiplications and 253 squarings is derived from the
 237  	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
 238  	//
 239  	//	_10       = 2*1
 240  	//	_11       = 1 + _10
 241  	//	_1100     = _11 << 2
 242  	//	_1111     = _11 + _1100
 243  	//	_11110000 = _1111 << 4
 244  	//	_11111111 = _1111 + _11110000
 245  	//	x16       = _11111111 << 8 + _11111111
 246  	//	x32       = x16 << 16 + x16
 247  	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
 248  	//
 249  	p256Sqr(t0, x, 1)
 250  	p256Mul(t0, x, t0)
 251  	p256Sqr(t1, t0, 2)
 252  	p256Mul(t0, t0, t1)
 253  	p256Sqr(t1, t0, 4)
 254  	p256Mul(t0, t0, t1)
 255  	p256Sqr(t1, t0, 8)
 256  	p256Mul(t0, t0, t1)
 257  	p256Sqr(t1, t0, 16)
 258  	p256Mul(t0, t0, t1)
 259  	p256Sqr(t0, t0, 32)
 260  	p256Mul(t0, x, t0)
 261  	p256Sqr(t0, t0, 96)
 262  	p256Mul(t0, x, t0)
 263  	p256Sqr(t0, t0, 94)
 264  
 265  	p256Sqr(t1, t0, 1)
 266  	if p256Equal(t1, x) != 1 {
 267  		return false
 268  	}
 269  	*e = *t0
 270  	return true
 271  }
 272  
 273  // The following assembly functions are implemented in p256_asm_*.s
 274  
 275  // Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
 276  //
 277  //go:noescape
 278  func p256Mul(res, in1, in2 *p256Element)
 279  
 280  // Montgomery square, repeated n times (n >= 1).
 281  //
 282  //go:noescape
 283  func p256Sqr(res, in *p256Element, n int)
 284  
 285  // Montgomery multiplication by R⁻¹, or 1 outside the domain.
 286  // Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
 287  //
 288  //go:noescape
 289  func p256FromMont(res, in *p256Element)
 290  
 291  // If cond is not 0, sets val = -val mod p.
 292  //
 293  //go:noescape
 294  func p256NegCond(val *p256Element, cond int)
 295  
 296  // If cond is 0, sets res = b, otherwise sets res = a.
 297  //
 298  //go:noescape
 299  func p256MovCond(res, a, b *P256Point, cond int)
 300  
 301  // p256Table is a table of the first 16 multiples of a point. Points are stored
 302  // at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
 303  // [0]P is the point at infinity and it's not stored.
 304  type p256Table [16]P256Point
 305  
 306  // p256Select sets res to the point at index idx in the table.
 307  // idx must be in [0, 15]. It executes in constant time.
 308  //
 309  //go:noescape
 310  func p256Select(res *P256Point, table *p256Table, idx int)
 311  
 312  // p256AffinePoint is a point in affine coordinates (x, y). x and y are still
 313  // Montgomery domain elements. The point can't be the point at infinity.
 314  type p256AffinePoint struct {
 315  	x, y p256Element
 316  }
 317  
 318  // p256AffineTable is a table of the first 32 multiples of a point. Points are
 319  // stored at an index offset of -1 like in p256Table, and [0]P is not stored.
 320  type p256AffineTable [32]p256AffinePoint
 321  
 322  // p256Precomputed is a series of precomputed multiples of G, the canonical
 323  // generator. The first p256AffineTable contains multiples of G. The second one
 324  // multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
 325  // table is the previous table doubled six times. Six is the width of the
 326  // sliding window used in p256ScalarBaseMult, and having each table already
 327  // pre-doubled lets us avoid the doublings between windows entirely. This table
 328  // aliases into p256PrecomputedEmbed.
 329  var p256Precomputed *[43]p256AffineTable
 330  
 331  func init() {
 332  	p256PrecomputedPtr := unsafe.Pointer(&p256PrecomputedEmbed)
 333  	if runtime.GOARCH == "s390x" {
 334  		var newTable [43 * 32 * 2 * 4]uint64
 335  		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(p256PrecomputedPtr) {
 336  			newTable[i] = byteorder.LEUint64(x[:])
 337  		}
 338  		p256PrecomputedPtr = unsafe.Pointer(&newTable)
 339  	}
 340  	p256Precomputed = (*[43]p256AffineTable)(p256PrecomputedPtr)
 341  }
 342  
 343  // p256SelectAffine sets res to the point at index idx in the table.
 344  // idx must be in [0, 31]. It executes in constant time.
 345  //
 346  //go:noescape
 347  func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
 348  
 349  // Point addition with an affine point and constant time conditions.
 350  // If zero is 0, sets res = in2. If sel is 0, sets res = in1.
 351  // If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2
 352  //
 353  //go:noescape
 354  func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
 355  
 356  // Point addition. Sets res = in1 + in2. Returns one if the two input points
 357  // were equal and zero otherwise. If in1 or in2 are the point at infinity, res
 358  // and the return value are undefined.
 359  //
 360  //go:noescape
 361  func p256PointAddAsm(res, in1, in2 *P256Point) int
 362  
 363  // Point doubling. Sets res = in + in. in can be the point at infinity.
 364  //
 365  //go:noescape
 366  func p256PointDoubleAsm(res, in *P256Point)
 367  
 368  // p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
 369  // Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
 370  type p256OrdElement [4]uint64
 371  
 372  // p256OrdReduce ensures s is in the range [0, ord(G)-1].
 373  func p256OrdReduce(s *p256OrdElement) {
 374  	// Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G),
 375  	// keeping the result if it doesn't underflow.
 376  	t0, b := bits.Sub64(s[0], 0xf3b9cac2fc632551, 0)
 377  	t1, b := bits.Sub64(s[1], 0xbce6faada7179e84, b)
 378  	t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b)
 379  	t3, b := bits.Sub64(s[3], 0xffffffff00000000, b)
 380  	tMask := b - 1 // zero if subtraction underflowed
 381  	s[0] ^= (t0 ^ s[0]) & tMask
 382  	s[1] ^= (t1 ^ s[1]) & tMask
 383  	s[2] ^= (t2 ^ s[2]) & tMask
 384  	s[3] ^= (t3 ^ s[3]) & tMask
 385  }
 386  
 387  func p256OrdLittleToBig(b *[32]byte, l *p256OrdElement) {
 388  	limbsToBytes(b, (*[4]uint64)(l))
 389  }
 390  
 391  func p256OrdBigToLittle(l *p256OrdElement, b *[32]byte) {
 392  	bytesToLimbs((*[4]uint64)(l), b)
 393  }
 394  
 395  // Add sets q = p1 + p2, and returns q. The points may overlap.
 396  func (q *P256Point) Add(r1, r2 *P256Point) *P256Point {
 397  	var sum, double P256Point
 398  	r1IsInfinity := r1.isInfinity()
 399  	r2IsInfinity := r2.isInfinity()
 400  	pointsEqual := p256PointAddAsm(&sum, r1, r2)
 401  	p256PointDoubleAsm(&double, r1)
 402  	p256MovCond(&sum, &double, &sum, pointsEqual)
 403  	p256MovCond(&sum, r1, &sum, r2IsInfinity)
 404  	p256MovCond(&sum, r2, &sum, r1IsInfinity)
 405  	return q.Set(&sum)
 406  }
 407  
 408  // Double sets q = p + p, and returns q. The points may overlap.
 409  func (q *P256Point) Double(p *P256Point) *P256Point {
 410  	var double P256Point
 411  	p256PointDoubleAsm(&double, p)
 412  	return q.Set(&double)
 413  }
 414  
 415  // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
 416  // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
 417  // returns an error and the receiver is unchanged.
 418  func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
 419  	if len(scalar) != 32 {
 420  		return nil, errors.New("invalid scalar length")
 421  	}
 422  	scalarReversed := &p256OrdElement{}
 423  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
 424  	p256OrdReduce(scalarReversed)
 425  
 426  	r.p256BaseMult(scalarReversed)
 427  	return r, nil
 428  }
 429  
 430  // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
 431  // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
 432  // error and the receiver is unchanged.
 433  func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
 434  	if len(scalar) != 32 {
 435  		return nil, errors.New("invalid scalar length")
 436  	}
 437  	scalarReversed := &p256OrdElement{}
 438  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
 439  	p256OrdReduce(scalarReversed)
 440  
 441  	r.Set(q).p256ScalarMult(scalarReversed)
 442  	return r, nil
 443  }
 444  
 445  // uint64IsZero returns 1 if x is zero and zero otherwise.
 446  func uint64IsZero(x uint64) int {
 447  	x = ^x
 448  	x &= x >> 32
 449  	x &= x >> 16
 450  	x &= x >> 8
 451  	x &= x >> 4
 452  	x &= x >> 2
 453  	x &= x >> 1
 454  	return int(x & 1)
 455  }
 456  
 457  // p256Equal returns 1 if a and b are equal and 0 otherwise.
 458  func p256Equal(a, b *p256Element) int {
 459  	var acc uint64
 460  	for i := range a {
 461  		acc |= a[i] ^ b[i]
 462  	}
 463  	return uint64IsZero(acc)
 464  }
 465  
 466  // isInfinity returns 1 if p is the point at infinity and 0 otherwise.
 467  func (p *P256Point) isInfinity() int {
 468  	return p256Equal(&p.z, &p256Zero)
 469  }
 470  
 471  // Bytes returns the uncompressed or infinity encoding of p, as specified in
 472  // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
 473  // infinity is shorter than all other encodings.
 474  func (p *P256Point) Bytes() []byte {
 475  	// This function is outlined to make the allocations inline in the caller
 476  	// rather than happen on the heap.
 477  	var out [p256UncompressedLength]byte
 478  	return p.bytes(&out)
 479  }
 480  
 481  func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
 482  	// The proper representation of the point at infinity is a single zero byte.
 483  	if p.isInfinity() == 1 {
 484  		return append(out[:0], 0)
 485  	}
 486  
 487  	x, y := &p256Element{}, &p256Element{}
 488  	p.affineFromMont(x, y)
 489  
 490  	out[0] = 4 // Uncompressed form.
 491  	p256LittleToBig((*[32]byte)(out[1:33]), x)
 492  	p256LittleToBig((*[32]byte)(out[33:65]), y)
 493  
 494  	return out[:]
 495  }
 496  
 497  // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
 498  // Montgomery domain.
 499  func (p *P256Point) affineFromMont(x, y *p256Element) {
 500  	p256Inverse(y, &p.z)
 501  	p256Sqr(x, y, 1)
 502  	p256Mul(y, y, x)
 503  
 504  	p256Mul(x, &p.x, x)
 505  	p256Mul(y, &p.y, y)
 506  
 507  	p256FromMont(x, x)
 508  	p256FromMont(y, y)
 509  }
 510  
 511  // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
 512  // Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
 513  func (p *P256Point) BytesX() ([]byte, error) {
 514  	// This function is outlined to make the allocations inline in the caller
 515  	// rather than happen on the heap.
 516  	var out [p256ElementLength]byte
 517  	return p.bytesX(&out)
 518  }
 519  
 520  func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
 521  	if p.isInfinity() == 1 {
 522  		return nil, errors.New("P256 point is the point at infinity")
 523  	}
 524  
 525  	x := &p256Element{}
 526  	p256Inverse(x, &p.z)
 527  	p256Sqr(x, x, 1)
 528  	p256Mul(x, &p.x, x)
 529  	p256FromMont(x, x)
 530  	p256LittleToBig((*[32]byte)(out[:]), x)
 531  
 532  	return out[:], nil
 533  }
 534  
 535  // BytesCompressed returns the compressed or infinity encoding of p, as
 536  // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
 537  // point at infinity is shorter than all other encodings.
 538  func (p *P256Point) BytesCompressed() []byte {
 539  	// This function is outlined to make the allocations inline in the caller
 540  	// rather than happen on the heap.
 541  	var out [p256CompressedLength]byte
 542  	return p.bytesCompressed(&out)
 543  }
 544  
 545  func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
 546  	if p.isInfinity() == 1 {
 547  		return append(out[:0], 0)
 548  	}
 549  
 550  	x, y := &p256Element{}, &p256Element{}
 551  	p.affineFromMont(x, y)
 552  
 553  	out[0] = 2 | byte(y[0]&1)
 554  	p256LittleToBig((*[32]byte)(out[1:33]), x)
 555  
 556  	return out[:]
 557  }
 558  
 559  // Select sets q to p1 if cond == 1, and to p2 if cond == 0.
 560  func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
 561  	p256MovCond(q, p1, p2, cond)
 562  	return q
 563  }
 564  
 565  // p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
 566  func p256Inverse(out, in *p256Element) {
 567  	// Inversion is calculated through exponentiation by p - 2, per Fermat's
 568  	// little theorem.
 569  	//
 570  	// The sequence of 12 multiplications and 255 squarings is derived from the
 571  	// following addition chain generated with github.com/mmcloughlin/addchain
 572  	// v0.4.0.
 573  	//
 574  	//  _10     = 2*1
 575  	//  _11     = 1 + _10
 576  	//  _110    = 2*_11
 577  	//  _111    = 1 + _110
 578  	//  _111000 = _111 << 3
 579  	//  _111111 = _111 + _111000
 580  	//  x12     = _111111 << 6 + _111111
 581  	//  x15     = x12 << 3 + _111
 582  	//  x16     = 2*x15 + 1
 583  	//  x32     = x16 << 16 + x16
 584  	//  i53     = x32 << 15
 585  	//  x47     = x15 + i53
 586  	//  i263    = ((i53 << 17 + 1) << 143 + x47) << 47
 587  	//  return    (x47 + i263) << 2 + 1
 588  	//
 589  	var z = &p256Element{}
 590  	var t0 = &p256Element{}
 591  	var t1 = &p256Element{}
 592  
 593  	p256Sqr(z, in, 1)
 594  	p256Mul(z, in, z)
 595  	p256Sqr(z, z, 1)
 596  	p256Mul(z, in, z)
 597  	p256Sqr(t0, z, 3)
 598  	p256Mul(t0, z, t0)
 599  	p256Sqr(t1, t0, 6)
 600  	p256Mul(t0, t0, t1)
 601  	p256Sqr(t0, t0, 3)
 602  	p256Mul(z, z, t0)
 603  	p256Sqr(t0, z, 1)
 604  	p256Mul(t0, in, t0)
 605  	p256Sqr(t1, t0, 16)
 606  	p256Mul(t0, t0, t1)
 607  	p256Sqr(t0, t0, 15)
 608  	p256Mul(z, z, t0)
 609  	p256Sqr(t0, t0, 17)
 610  	p256Mul(t0, in, t0)
 611  	p256Sqr(t0, t0, 143)
 612  	p256Mul(t0, z, t0)
 613  	p256Sqr(t0, t0, 47)
 614  	p256Mul(z, z, t0)
 615  	p256Sqr(z, z, 2)
 616  	p256Mul(out, in, z)
 617  }
 618  
 619  func boothW5(in uint) (int, int) {
 620  	var s uint = ^((in >> 5) - 1)
 621  	var d uint = (1 << 6) - in - 1
 622  	d = (d & s) | (in & (^s))
 623  	d = (d >> 1) + (d & 1)
 624  	return int(d), int(s & 1)
 625  }
 626  
 627  func boothW6(in uint) (int, int) {
 628  	var s uint = ^((in >> 6) - 1)
 629  	var d uint = (1 << 7) - in - 1
 630  	d = (d & s) | (in & (^s))
 631  	d = (d >> 1) + (d & 1)
 632  	return int(d), int(s & 1)
 633  }
 634  
 635  func (p *P256Point) p256BaseMult(scalar *p256OrdElement) {
 636  	var t0 p256AffinePoint
 637  
 638  	wvalue := (scalar[0] << 1) & 0x7f
 639  	sel, sign := boothW6(uint(wvalue))
 640  	p256SelectAffine(&t0, &p256Precomputed[0], sel)
 641  	p.x, p.y, p.z = t0.x, t0.y, p256One
 642  	p256NegCond(&p.y, sign)
 643  
 644  	index := uint(5)
 645  	zero := sel
 646  
 647  	for i := 1; i < 43; i++ {
 648  		if index < 192 {
 649  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
 650  		} else {
 651  			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
 652  		}
 653  		index += 6
 654  		sel, sign = boothW6(uint(wvalue))
 655  		p256SelectAffine(&t0, &p256Precomputed[i], sel)
 656  		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
 657  		zero |= sel
 658  	}
 659  
 660  	// If the whole scalar was zero, set to the point at infinity.
 661  	p256MovCond(p, p, NewP256Point(), zero)
 662  }
 663  
 664  func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) {
 665  	// precomp is a table of precomputed points that stores powers of p
 666  	// from p^1 to p^16.
 667  	var precomp p256Table
 668  	var t0, t1, t2, t3 P256Point
 669  
 670  	// Prepare the table
 671  	precomp[0] = *p // 1
 672  
 673  	p256PointDoubleAsm(&t0, p)
 674  	p256PointDoubleAsm(&t1, &t0)
 675  	p256PointDoubleAsm(&t2, &t1)
 676  	p256PointDoubleAsm(&t3, &t2)
 677  	precomp[1] = t0  // 2
 678  	precomp[3] = t1  // 4
 679  	precomp[7] = t2  // 8
 680  	precomp[15] = t3 // 16
 681  
 682  	p256PointAddAsm(&t0, &t0, p)
 683  	p256PointAddAsm(&t1, &t1, p)
 684  	p256PointAddAsm(&t2, &t2, p)
 685  	precomp[2] = t0 // 3
 686  	precomp[4] = t1 // 5
 687  	precomp[8] = t2 // 9
 688  
 689  	p256PointDoubleAsm(&t0, &t0)
 690  	p256PointDoubleAsm(&t1, &t1)
 691  	precomp[5] = t0 // 6
 692  	precomp[9] = t1 // 10
 693  
 694  	p256PointAddAsm(&t2, &t0, p)
 695  	p256PointAddAsm(&t1, &t1, p)
 696  	precomp[6] = t2  // 7
 697  	precomp[10] = t1 // 11
 698  
 699  	p256PointDoubleAsm(&t0, &t0)
 700  	p256PointDoubleAsm(&t2, &t2)
 701  	precomp[11] = t0 // 12
 702  	precomp[13] = t2 // 14
 703  
 704  	p256PointAddAsm(&t0, &t0, p)
 705  	p256PointAddAsm(&t2, &t2, p)
 706  	precomp[12] = t0 // 13
 707  	precomp[14] = t2 // 15
 708  
 709  	// Start scanning the window from top bit
 710  	index := uint(254)
 711  	var sel, sign int
 712  
 713  	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
 714  	sel, _ = boothW5(uint(wvalue))
 715  
 716  	p256Select(p, &precomp, sel)
 717  	zero := sel
 718  
 719  	for index > 4 {
 720  		index -= 5
 721  		p256PointDoubleAsm(p, p)
 722  		p256PointDoubleAsm(p, p)
 723  		p256PointDoubleAsm(p, p)
 724  		p256PointDoubleAsm(p, p)
 725  		p256PointDoubleAsm(p, p)
 726  
 727  		if index < 192 {
 728  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
 729  		} else {
 730  			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
 731  		}
 732  
 733  		sel, sign = boothW5(uint(wvalue))
 734  
 735  		p256Select(&t0, &precomp, sel)
 736  		p256NegCond(&t0.y, sign)
 737  		p256PointAddAsm(&t1, p, &t0)
 738  		p256MovCond(&t1, &t1, p, sel)
 739  		p256MovCond(p, &t1, &t0, zero)
 740  		zero |= sel
 741  	}
 742  
 743  	p256PointDoubleAsm(p, p)
 744  	p256PointDoubleAsm(p, p)
 745  	p256PointDoubleAsm(p, p)
 746  	p256PointDoubleAsm(p, p)
 747  	p256PointDoubleAsm(p, p)
 748  
 749  	wvalue = (scalar[0] << 1) & 0x3f
 750  	sel, sign = boothW5(uint(wvalue))
 751  
 752  	p256Select(&t0, &precomp, sel)
 753  	p256NegCond(&t0.y, sign)
 754  	p256PointAddAsm(&t1, p, &t0)
 755  	p256MovCond(&t1, &t1, p, sel)
 756  	p256MovCond(p, &t1, &t0, zero)
 757  }
 758