modnscalar.mx raw

   1  // Copyright (c) 2020-2023 The Decred developers
   2  // Use of this source code is governed by an ISC
   3  // license that can be found in the LICENSE file.
   4  
   5  package secp256k1
   6  
   7  import (
   8  	"math/big"
   9  
  10  	"smesh.lol/pkg/nostr/hex"
  11  )
  12  
  13  // References:
  14  //   [SECG]: Recommended Elliptic Curve Domain Parameters
  15  //     https://www.secg.org/sec2-v2.pdf
  16  //
  17  //   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
  18  //     http://cacr.uwaterloo.ca/hac/
  19  
  20  // Many elliptic curve operations require working with scalars in a finite field
  21  // characterized by the order of the group underlying the secp256k1 curve.
  22  // Given this precision is larger than the biggest available native type,
  23  // obviously some form of bignum math is needed.  This code implements
  24  // specialized fixed-precision field arithmetic rather than relying on an
  25  // arbitrary-precision arithmetic package such as math/big for dealing with the
  26  // math modulo the group order since the size is known.  As a result, rather
  27  // large performance gains are achieved by taking advantage of many
  28  // optimizations not available to arbitrary-precision arithmetic and generic
  29  // modular arithmetic algorithms.
  30  //
  31  // There are various ways to internally represent each element.  For example,
  32  // the most obvious representation would be to use an array of 4 uint64s (64
  33  // bits * 4 = 256 bits).  However, that representation suffers from the fact
  34  // that there is no native Go type large enough to handle the intermediate
  35  // results while adding or multiplying two 64-bit numbers.
  36  //
  37  // Given the above, this implementation represents the field elements as 8
  38  // uint32s with each word (array entry) treated as base 2^32.  This was chosen
  39  // because most systems at the current time are 64-bit (or at least have 64-bit
  40  // registers available for specialized purposes such as MMX) so the intermediate
  41  // results can typically be done using a native register (and using uint64s to
  42  // avoid the need for additional half-word arithmetic)
  43  
const (
	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N to improve code readability.
	//
	// Word zero is the least significant 32-bit word of the 256-bit value.
	//
	// The group order of the curve per [SECG] is:
	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
	//
	// nolint: dupword
	orderWordZero  uint32 = 0xd0364141
	orderWordOne   uint32 = 0xbfd25e8c
	orderWordTwo   uint32 = 0xaf48a03b
	orderWordThree uint32 = 0xbaaedce6
	orderWordFour  uint32 = 0xfffffffe
	orderWordFive  uint32 = 0xffffffff
	orderWordSix   uint32 = 0xffffffff
	orderWordSeven uint32 = 0xffffffff

	// These fields provide convenient access to each of the words of the two's
	// complement of the secp256k1 curve group order N to improve code
	// readability.
	//
	// Adding the two's complement of N (and discarding the final carry) is
	// equivalent to subtracting N, which is how the reduction code below makes
	// use of these values.
	//
	// The two's complement of the group order is:
	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
	orderComplementWordZero  = (^orderWordZero) + 1
	orderComplementWordOne   = ^orderWordOne
	orderComplementWordTwo   = ^orderWordTwo
	orderComplementWordThree = ^orderWordThree
	// orderComplementWordFour  uint32 = ^orderWordFour  // unused
	// orderComplementWordFive  uint32 = ^orderWordFive  // unused
	// orderComplementWordSix   uint32 = ^orderWordSix   // unused
	// orderComplementWordSeven uint32 = ^orderWordSeven // unused

	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N / 2 to improve code readability and avoid
	// the need to recalculate them.
	//
	// The half order of the secp256k1 curve group is:
	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
	//
	// nolint: dupword
	halfOrderWordZero  uint32 = 0x681b20a0
	halfOrderWordOne   uint32 = 0xdfe92f46
	halfOrderWordTwo   uint32 = 0x57a4501d
	halfOrderWordThree uint32 = 0x5d576e73
	halfOrderWordFour  uint32 = 0xffffffff
	halfOrderWordFive  uint32 = 0xffffffff
	halfOrderWordSix   uint32 = 0xffffffff
	halfOrderWordSeven uint32 = 0x7fffffff

	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
	// improve the readability of the code.
	uint32Mask = 0xffffffff
)
  94  
var (
	// zero32 is an array of 32 bytes used for the purposes of zeroing and is
	// defined here to avoid extra allocations.
	//
	// NOTE: This must only ever be used as the source of a copy and never be
	// written to.
	zero32 = [32]byte{}
)
 100  
// ModNScalar implements optimized 256-bit constant-time fixed-precision
// arithmetic over the secp256k1 group order. This means all arithmetic is
// performed modulo:
//
//	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
//
// It only implements the arithmetic needed for elliptic curve operations,
// however, the operations that are not implemented can typically be worked
// around if absolutely needed.  For example, subtraction can be performed by
// adding the negation.
//
// Should it be absolutely necessary, conversion to the standard library
// math/big.Int can be accomplished by using the Bytes method, slicing the
// resulting fixed-size array, and feeding it to big.Int.SetBytes.  However,
// that should typically be avoided when possible as conversion to big.Ints
// requires allocations, is not constant time, and is slower when working modulo
// the group order.
type ModNScalar struct {
	// The scalar is represented as 8 32-bit integers in base 2^32 stored in
	// little-endian word order, meaning n[0] is the least significant word.
	//
	// The following depicts the internal representation:
	// 	 ---------------------------------------------------------
	// 	|       n[7]     |      n[6]      | ... |      n[0]      |
	// 	| 32 bits        | 32 bits        | ... | 32 bits        |
	// 	| Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
	// 	 ---------------------------------------------------------
	//
	// For example, consider the number 2^87 + 2^42 + 1.  It would be
	// represented as:
	// 	n[0] = 1
	// 	n[1] = 2^10
	// 	n[2] = 2^23
	// 	n[3..7] = 0
	//
	// The full 256-bit value is then calculated by looping i from 7..0 and
	// doing sum(n[i] * 2^(32i)) like so:
	// 	n[7] * 2^(32*7) = 0    * 2^224 = 0
	// 	n[6] * 2^(32*6) = 0    * 2^192 = 0
	// 	...
	// 	n[2] * 2^(32*2) = 2^23 * 2^64  = 2^87
	// 	n[1] * 2^(32*1) = 2^10 * 2^32  = 2^42
	// 	n[0] * 2^(32*0) = 1    * 2^0   = 1
	// 	Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
	n [8]uint32
}
 146  
 147  // String returns the scalar as a human-readable hex string.
 148  //
 149  // This is NOT constant time.
 150  func (s ModNScalar) String() string {
 151  	b := s.Bytes()
 152  	return hex.Enc(b[:])
 153  }
 154  
 155  // Set sets the scalar equal to a copy of the passed one in constant time.
 156  //
 157  // The scalar is returned to support chaining.  This enables syntax like:
 158  // s := new(ModNScalar).Set(s2).Add(1) so that s = s2 + 1 where s2 is not
 159  // modified.
 160  func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
 161  	*s = *val
 162  	return s
 163  }
 164  
 165  // Zero sets the scalar to zero in constant time.  A newly created scalar is
 166  // already set to zero.  This function can be useful to clear an existing scalar
 167  // for reuse.
 168  func (s *ModNScalar) Zero() {
 169  	s.n[0] = 0
 170  	s.n[1] = 0
 171  	s.n[2] = 0
 172  	s.n[3] = 0
 173  	s.n[4] = 0
 174  	s.n[5] = 0
 175  	s.n[6] = 0
 176  	s.n[7] = 0
 177  }
 178  
 179  // IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
 180  // constant time.
 181  //
 182  // Note that a bool is not used here because it is not possible in Go to convert
 183  // from a bool to numeric value in constant time and many constant-time
 184  // operations require a numeric value.  See IsZero for the version that returns
 185  // a bool.
 186  func (s *ModNScalar) IsZeroBit() uint32 {
 187  	// The scalar can only be zero if no bits are set in any of the words.
 188  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
 189  	return constantTimeEq(bits, 0)
 190  }
 191  
 192  // IsZero returns whether or not the scalar is equal to zero in constant time.
 193  func (s *ModNScalar) IsZero() bool {
 194  	// The scalar can only be zero if no bits are set in any of the words.
 195  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
 196  	return bits == 0
 197  }
 198  
 199  // SetInt sets the scalar to the passed integer in constant time.  This is a
 200  // convenience function since it is fairly common to perform some arithmetic
 201  // with small native integers.
 202  //
 203  // The scalar is returned to support chaining.  This enables syntax like:
 204  // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
 205  func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
 206  	s.Zero()
 207  	s.n[0] = ui
 208  	return s
 209  }
 210  
 211  // constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
 212  func constantTimeEq(a, b uint32) uint32 {
 213  	return uint32((uint64(a^b) - 1) >> 63)
 214  }
 215  
 216  // constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
 217  func constantTimeNotEq(a, b uint32) uint32 {
 218  	return ^uint32((uint64(a^b)-1)>>63) & 1
 219  }
 220  
 221  // constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
 222  func constantTimeLess(a, b uint32) uint32 {
 223  	return uint32((uint64(a) - uint64(b)) >> 63)
 224  }
 225  
 226  // constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
 227  func constantTimeLessOrEq(a, b uint32) uint32 {
 228  	return uint32((uint64(a) - uint64(b) - 1) >> 63)
 229  }
 230  
 231  // constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
 232  func constantTimeGreater(a, b uint32) uint32 {
 233  	return constantTimeLess(b, a)
 234  }
 235  
 236  // constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
 237  func constantTimeGreaterOrEq(a, b uint32) uint32 {
 238  	return constantTimeLessOrEq(b, a)
 239  }
 240  
 241  // constantTimeMin returns min(a,b) in constant time.
 242  func constantTimeMin(a, b uint32) uint32 {
 243  	return b ^ ((a ^ b) & -constantTimeLess(a, b))
 244  }
 245  
// overflows determines if the current scalar is greater than or equal to the
// group order in constant time and returns 1 if it is or 0 otherwise.
func (s *ModNScalar) overflows() uint32 {
	// The intuition here is that the scalar is greater than the group order if
	// one of the higher individual words is greater than corresponding word of
	// the group order and all higher words in the scalar are equal to their
	// corresponding word of the group order.  Since this type is modulo the
	// group order, being equal is also an overflow back to 0.
	//
	// Note that the words 5, 6, and 7 are all the max uint32 value, so there is
	// no need to test if those individual words of the scalar exceeds them,
	// hence, only equality is checked for them.
	//
	// The cascade below walks the words from most to least significant:
	// highWordsEqual tracks whether every word examined so far matches N
	// exactly, while overflow accumulates the strictly-greater comparisons.
	// The final word uses >= so that a scalar exactly equal to N also reports
	// an overflow.
	highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
	highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
	highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
	overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
	highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
	overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
	highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
	overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
	highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
	overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
	highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
	overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)
	return overflow
}
 272  
// reduce256 reduces the current scalar modulo the group order in accordance
// with the overflows parameter in constant time.  The overflows parameter
// specifies whether or not the scalar is known to be greater than the group
// order and MUST either be 1 in the case it is or 0 in the case it is not for a
// correct result.
func (s *ModNScalar) reduce256(overflows uint32) {
	// Notice that since s < 2^256 < 2N (where N is the group order), the max
	// possible number of reductions required is one.  Therefore, in the case a
	// reduction is needed, it can be performed with a single subtraction of N.
	// Also, recall that subtraction is equivalent to addition by the two's
	// complement while ignoring the carry.
	//
	// When s >= N, the overflows parameter will be 1.  Conversely, it will be 0
	// when s < N.  Thus multiplying by the overflows parameter will either
	// result in 0 or the multiplicand itself.
	//
	// Combining the above along with the fact that s + 0 = s, the following is
	// a constant time implementation that works by either adding 0 or the two's
	// complement of N as needed.
	//
	// The final result will be in the range 0 <= s < N as expected.
	//
	// In the chain below, the mask extracts the low 32 bits of each 64-bit
	// intermediate sum while the shift carries the high bits into the next
	// word.
	overflows64 := uint64(overflows)
	c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
	s.n[7] = uint32(c & uint32Mask)
}
 312  
// SetBytes interprets the provided array as a 256-bit big-endian unsigned
// integer, reduces it modulo the group order, sets the scalar to the result,
// and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
// constant time.
//
// Note that a bool is not used here because it is not possible in Go to convert
// from a bool to numeric value in constant time and many constant-time
// operations require a numeric value.
func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
	// Pack the 256 total bits across the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant that uses a loop.
	//
	// Since the input is big endian, b[0] is the most significant byte and
	// therefore lands in the high byte of n[7], while b[31] is the least
	// significant byte and lands in the low byte of n[0].
	s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
	s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
	s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
	s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
	s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
	s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
	s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
	s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
	// The value might be >= N, so reduce it as required and return whether or
	// not it was reduced.
	needsReduce := s.overflows()
	s.reduce256(needsReduce)
	return needsReduce
}
 339  
 340  // zeroArray32 zeroes the provided 32-byte buffer.
 341  func zeroArray32(b *[32]byte) { copy(b[:], zero32[:]) }
 342  
// SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
// integer (meaning it is truncated to the first 32 bytes), reduces it modulo
// the group order, sets the scalar to the result, and returns whether or not
// the resulting truncated 256-bit integer overflowed in constant time.
//
// Note that since passing a slice with more than 32 bytes is truncated, it is
// possible that the truncated value is less than the order of the curve and
// hence it will not be reported as having overflowed in that case.  It is up to
// the caller to decide whether it needs to provide numbers of the appropriate
// size or it is acceptable to use this function with the described truncation
// and overflow behavior.
func (s *ModNScalar) SetByteSlice(b []byte) bool {
	// Shorter inputs are right-aligned in the 32-byte buffer so they are
	// treated as a big-endian value with leading zero bytes.
	var b32 [32]byte
	b = b[:constantTimeMin(uint32(len(b)), 32)]
	// NOTE(review): this copy reads and writes the same (still zero) prefix of
	// b32 and therefore has no observable effect -- presumably present so the
	// amount of copying done is independent of len(b); confirm the intent
	// before removing.
	copy(b32[:], b32[:32-len(b)])
	copy(b32[32-len(b):], b)
	result := s.SetBytes(&b32)
	// Zero the temporary buffer so no copy of potentially sensitive material
	// is left behind after the call.
	zeroArray32(&b32)
	return result != 0
}
 363  
// PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
// into the passed byte slice in constant time.  The target slice must have at
// least 32 bytes available or it will panic.
//
// There is a similar function, PutBytes, which unpacks the scalar into a
// 32-byte array directly.  This version is provided since it can be useful to
// write directly into part of a larger buffer without needing a separate
// allocation.
//
// Preconditions:
//   - The target slice MUST have at least 32 bytes available
func (s *ModNScalar) PutBytesUnchecked(b []byte) {
	// Unpack the 256 total bits from the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant which uses a loop.
	//
	// Since the output is big endian, the least significant word n[0] maps to
	// the trailing bytes b[28..31] and the most significant word n[7] maps to
	// the leading bytes b[0..3].
	b[31] = byte(s.n[0])
	b[30] = byte(s.n[0] >> 8)
	b[29] = byte(s.n[0] >> 16)
	b[28] = byte(s.n[0] >> 24)
	b[27] = byte(s.n[1])
	b[26] = byte(s.n[1] >> 8)
	b[25] = byte(s.n[1] >> 16)
	b[24] = byte(s.n[1] >> 24)
	b[23] = byte(s.n[2])
	b[22] = byte(s.n[2] >> 8)
	b[21] = byte(s.n[2] >> 16)
	b[20] = byte(s.n[2] >> 24)
	b[19] = byte(s.n[3])
	b[18] = byte(s.n[3] >> 8)
	b[17] = byte(s.n[3] >> 16)
	b[16] = byte(s.n[3] >> 24)
	b[15] = byte(s.n[4])
	b[14] = byte(s.n[4] >> 8)
	b[13] = byte(s.n[4] >> 16)
	b[12] = byte(s.n[4] >> 24)
	b[11] = byte(s.n[5])
	b[10] = byte(s.n[5] >> 8)
	b[9] = byte(s.n[5] >> 16)
	b[8] = byte(s.n[5] >> 24)
	b[7] = byte(s.n[6])
	b[6] = byte(s.n[6] >> 8)
	b[5] = byte(s.n[6] >> 16)
	b[4] = byte(s.n[6] >> 24)
	b[3] = byte(s.n[7])
	b[2] = byte(s.n[7] >> 8)
	b[1] = byte(s.n[7] >> 16)
	b[0] = byte(s.n[7] >> 24)
}
 412  
// PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
// byte array in constant time.  The entire target array is overwritten.
//
// There is a similar function, PutBytesUnchecked, which unpacks the scalar into
// a slice that must have at least 32 bytes available.  This version is provided
// since it can be useful to write directly into an array that is type checked.
//
// Alternatively, there is also Bytes, which unpacks the scalar into a new array
// and returns that which can sometimes be more ergonomic in applications that
// aren't concerned about an additional copy.
func (s *ModNScalar) PutBytes(b *[32]byte) { s.PutBytesUnchecked(b[:]) }
 424  
 425  // Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
 426  //
 427  // See PutBytes and PutBytesUnchecked for variants that allow an array or slice
 428  // to be passed which can be useful to cut down on the number of allocations
 429  // by allowing the caller to reuse a buffer or write directly into part of a
 430  // larger buffer.
 431  func (s *ModNScalar) Bytes() [32]byte {
 432  	var b [32]byte
 433  	s.PutBytesUnchecked(b[:])
 434  	return b
 435  }
 436  
 437  // IsOdd returns whether the scalar is an odd number in constant time.
 438  func (s *ModNScalar) IsOdd() bool {
 439  	// Only odd numbers have the bottom bit set.
 440  	return s.n[0]&1 == 1
 441  }
 442  
 443  // Equals returns whether or not the two scalars are the same in constant time.
 444  func (s *ModNScalar) Equals(val *ModNScalar) bool {
 445  	// Xor only sets bits when they are different, so the two scalars can only
 446  	// be the same if no bits are set after xoring each word.
 447  	bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
 448  		(s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
 449  		(s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])
 450  
 451  	return bits == 0
 452  }
 453  
// Add2 adds the passed two scalars together modulo the group order in constant
// time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
	// Add word by word from least to most significant using a uint64 so the
	// carry out of each 32-bit word is available in the upper half of c.
	//
	// Note that it is safe for s to alias val1 and/or val2 (as s.Add relies
	// on) because each input word is read before the same-index word of s is
	// written and is never read again afterwards.
	c := uint64(val1.n[0]) + uint64(val2.n[0])
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
	s.n[7] = uint32(c & uint32Mask)
	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	//
	// The final carry out (c >> 32) and s.overflows() cannot both be 1 since
	// the sum is at most 2(N-1), so their sum is a valid 0-or-1 argument.
	s.reduce256(uint32(c>>32) + s.overflows())
	return s
}
 481  
// Add adds the passed scalar to the existing one modulo the group order in
// constant time and stores the result in s.  It is a thin convenience wrapper
// around Add2 with s as the first operand.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Add(s2).AddInt(1) so that s = s + s2 + 1.
func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar { return s.Add2(s, val) }
 488  
// accumulator96 provides a 96-bit accumulator for use in the intermediate
// calculations requiring more than 64-bits.
type accumulator96 struct {
	// n holds the accumulator as 3 base-2^32 words with n[0] being the least
	// significant word.
	n [3]uint32
}
 494  
// Add adds the passed unsigned 64-bit value to the accumulator.
func (a *accumulator96) Add(v uint64) {
	// Split the value into its low and high 32-bit halves and add each to the
	// corresponding accumulator word.  Unsigned addition wraps on overflow,
	// leaving a sum that is smaller than the addend, which is exactly what
	// constantTimeLess detects to propagate the carry into the next word.
	low := uint32(v & uint32Mask)
	hi := uint32(v >> 32)
	a.n[0] += low
	hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
	a.n[1] += hi
	a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
}
 504  
 505  // Rsh32 right shifts the accumulator by 32 bits.
 506  func (a *accumulator96) Rsh32() {
 507  	a.n[0] = a.n[1]
 508  	a.n[1] = a.n[2]
 509  	a.n[2] = 0
 510  }
 511  
// reduce385 reduces the 385-bit intermediate result in the passed terms modulo
// the group order in constant time and stores the result in s.
//
// The terms t0 through t12 are the base-2^32 coefficients of the intermediate
// value with t0 being the least significant.
func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
	// At this point, the intermediate result in the passed terms has been
	// reduced to fit within 385 bits, so reduce it again using the same method
	// described in reduce512.  As before, the intermediate result will end up
	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
	// needed for this iteration.  The reduced terms are assigned back to t0
	// through t8.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead until the value is reduced enough to use native uint64s.
	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*4).
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*5).
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	t5 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	t6 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	t7 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	t8 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
	//
	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.
	//
	// At this point, the result is reduced to fit within 258 bits, so reduce it
	// again using a slightly modified version of the same method.  The maximum
	// value in t8 is 2 at this point and therefore multiplying it by each word
	// of the two's complement of N and adding it to a 32-bit term will result
	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
	// here for the intermediate term carry propagation.
	//
	// Also, since the maximum value in t8 is 2, this ends up reducing by
	// another 2 bits to 256 bits.
	c := t0 + t8*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
	s.n[7] = uint32(c & uint32Mask)
	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	s.reduce256(uint32(c>>32) + s.overflows())
}
 629  
 630  // reduce512 reduces the 512-bit intermediate result in the passed terms modulo
 631  // the group order down to 385 bits in constant time and stores the result in s.
func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
	// At this point, the intermediate result in the passed terms is grouped
	// into the respective bases.
	//
	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
	// when the modulus is of the special form m = b^t - c, where log_2(c) < t,
	// highly efficient reduction can be achieved per the provided algorithm.
	//
	// The secp256k1 group order fits this criteria since it is:
	//   2^256 - 432420386565659656852420866394968145599
	//
	// Technically the max possible value here is (N-1)^2 since the two scalars
	// being multiplied are always mod N.  Nevertheless, it is safer to consider
	// it to be (2^256-1)^2 = 2^512 - 2^256 + 1 since it is the product of two
	// 256-bit values.
	//
	// The algorithm is to reduce the result modulo the prime by subtracting
	// multiples of the group order N.  However, in order simplify carry
	// propagation, this adds with the two's complement of N to achieve the same
	// result.
	//
	// Since the two's complement of N has 127 leading zero bits, this will end
	// up reducing the intermediate result from 512 bits to 385 bits, resulting
	// in 13 32-bit terms.  The reduced terms are assigned back to t0 through
	// t12.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.
	//
	// Also note that the fourth word of the two's complement of N is 1 and its
	// fifth through seventh words are 0, which is why several multiplications
	// below simplify to plain additions or are omitted entirely (the original
	// expressions are retained in the adjacent comments).
	//
	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*4).
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*5).
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	acc.Add(t13 * uint64(orderComplementWordZero))
	t5 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	acc.Add(t13 * uint64(orderComplementWordOne))
	acc.Add(t14 * uint64(orderComplementWordZero))
	t6 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	acc.Add(t13 * uint64(orderComplementWordTwo))
	acc.Add(t14 * uint64(orderComplementWordOne))
	acc.Add(t15 * uint64(orderComplementWordZero))
	t7 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t13 * uint64(orderComplementWordThree))
	acc.Add(t14 * uint64(orderComplementWordTwo))
	acc.Add(t15 * uint64(orderComplementWordOne))
	t8 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*9).
	// acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t11 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t12 * uint64(orderComplementWordFive)) // 0
	acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t14 * uint64(orderComplementWordThree))
	acc.Add(t15 * uint64(orderComplementWordTwo))
	t9 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*10).
	// acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t12 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t13 * uint64(orderComplementWordFive)) // 0
	acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t15 * uint64(orderComplementWordThree))
	t10 = uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*11).
	// acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t13 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t14 * uint64(orderComplementWordFive)) // 0
	acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
	t11 = uint64(acc.n[0])
	acc.Rsh32()
	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.
	//
	// Terms for 2^(32*12).
	t12 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
	//
	// At this point, the result is reduced to fit within 385 bits, so reduce it
	// again using the same method accordingly.
	s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
}
 777  
// Mul2 multiplies the passed two scalars together modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
	// This could be done with for loops and an array to store the intermediate
	// terms, but this unrolled version is significantly faster.
	//
	// The overall strategy employed here is:
	// 1) Calculate the 512-bit product of the two scalars using the standard
	//    pencil-and-paper method.
	// 2) Reduce the result modulo the prime by effectively subtracting
	//    multiples of the group order N (actually performed by adding multiples
	//    of the two's complement of N to avoid implementing subtraction).
	// 3) Repeat step 2 noting that each iteration reduces the required number
	//    of bits by 127 because the two's complement of N has 127 leading zero
	//    bits.
	// 4) Once reduced to 256 bits, call the existing reduce method to perform
	//    a final reduction as needed.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.
	//
	// Each group below accumulates every product val.n[i]*val2.n[j] whose
	// indices satisfy i+j == term number, exactly as in longhand
	// multiplication, with the accumulator carrying overflow into the next
	// group via Rsh32.
	//
	// Terms for 2^(32*0).
	var acc accumulator96
	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
	t0 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*1).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
	t1 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*2).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
	t2 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*3).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
	t3 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*4).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
	t4 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*5).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
	t5 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*6).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
	t6 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*7).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
	t7 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*8).  From here on the lower input words are exhausted,
	// so each group has one fewer product than the previous.
	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
	t8 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*9).
	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
	t9 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*10).
	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
	t10 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*11).
	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
	t11 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*12).
	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
	t12 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*13).
	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
	t13 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*14).
	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
	t14 := uint64(acc.n[0])
	acc.Rsh32()
	// What's left is for 2^(32*15).
	t15 := uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
	//
	// At this point, all of the terms are grouped into their respective base
	// and occupy up to 512 bits.  Reduce the result accordingly.
	s.reduce512(
		t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
		t15,
	)
	return s
}
 925  
 926  // Mul multiplies the passed scalar with the existing one modulo the group order
 927  // in constant time and stores the result in s.
 928  //
 929  // The scalar is returned to support chaining.  This enables syntax like:
 930  // s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
 931  func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
 932  	return s.Mul2(s, val)
 933  }
 934  
 935  // SquareVal squares the passed scalar modulo the group order in constant time
 936  // and stores the result in s.
 937  //
 938  // The scalar is returned to support chaining.  This enables syntax like:
 939  // s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
 940  func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
 941  	// This could technically be optimized slightly to take advantage of the
 942  	// fact that many of the intermediate calculations in squaring are just
 943  	// doubling, however, benchmarking has shown that due to the need to use a
 944  	// 96-bit accumulator, any savings are essentially offset by that and
 945  	// consequently there is no real difference in performance over just
 946  	// multiplying the value by itself to justify the extra code for now.  This
 947  	// can be revisited in the future if it becomes a bottleneck in practice.
 948  	return s.Mul2(val, val)
 949  }
 950  
 951  // Square squares the scalar modulo the group order in constant time.  The
 952  // existing scalar is modified.
 953  //
 954  // The scalar is returned to support chaining.  This enables syntax like:
 955  // s.Square().Mul(s2) so that s = s^2 * s2.
 956  func (s *ModNScalar) Square() *ModNScalar {
 957  	return s.SquareVal(s)
 958  }
 959  
// NegateVal negates the passed scalar modulo the group order and stores the
// result in s in constant time.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
	// Since the scalar is already in the range 0 <= val < N, where N is the
	// group order, negation modulo the group order is just the group order
	// minus the value.  This implies that the result will always be in the
	// desired range with the sole exception of 0 because N - 0 = N itself.
	//
	// Therefore, in order to avoid the need to reduce the result for every
	// other case in order to achieve constant time, this creates a mask that is
	// all 0s in the case of the scalar being negated is 0 and all 1s otherwise
	// and bitwise ands that mask with each word.
	//
	// Finally, to simplify the carry propagation, this adds the two's
	// complement of the scalar to N in order to achieve the same result.
	//
	// bits is 0 if and only if every word of the scalar is 0.
	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
		val.n[6] | val.n[7]
	// Per the strategy above, mask is all 0s when val is 0 and all 1s
	// otherwise.
	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
	// Compute N + (^val + 1), i.e. N - val, one 32-bit word at a time,
	// carrying the overflow from the upper half of each 64-bit intermediate
	// into the next word and masking every result word so the val == 0 case
	// produces 0 instead of N.
	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
	s.n[0] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
	s.n[1] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
	s.n[2] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
	s.n[3] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
	s.n[4] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
	s.n[5] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
	s.n[6] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
	s.n[7] = uint32(c & mask)
	return s
}
 999  
1000  // Negate negates the scalar modulo the group order in constant time.  The
1001  // existing scalar is modified.
1002  //
1003  // The scalar is returned to support chaining.  This enables syntax like:
1004  // s.Negate().AddInt(1) so that s = -s + 1.
1005  func (s *ModNScalar) Negate() *ModNScalar { return s.NegateVal(s) }
1006  
1007  // InverseValNonConst finds the modular multiplicative inverse of the passed
1008  // scalar and stores result in s in *non-constant* time.
1009  //
1010  // The scalar is returned to support chaining.  This enables syntax like:
1011  // s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
1012  func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
1013  	// This is making use of big integers for now.  Ideally it will be replaced
1014  	// with an implementation that does not depend on big integers.
1015  	valBytes := val.Bytes()
1016  	bigVal := (&big.Int{}).SetBytes(valBytes[:])
1017  	bigVal.ModInverse(bigVal, curveParams.N)
1018  	s.SetByteSlice(bigVal.Bytes())
1019  	return s
1020  }
1021  
1022  // InverseNonConst finds the modular multiplicative inverse of the scalar in
1023  // *non-constant* time.  The existing scalar is modified.
1024  //
1025  // The scalar is returned to support chaining.  This enables syntax like:
1026  // s.Inverse().Mul(s2) so that s = s^-1 * s2.
1027  func (s *ModNScalar) InverseNonConst() *ModNScalar {
1028  	return s.InverseValNonConst(s)
1029  }
1030  
1031  // IsOverHalfOrder returns whether the scalar exceeds the group order
1032  // divided by 2 in constant time.
1033  func (s *ModNScalar) IsOverHalfOrder() bool {
1034  	// The intuition here is that the scalar is greater than half of the group
1035  	// order if one of the higher individual words is greater than the
1036  	// corresponding word of the half group order and all higher words in the
1037  	// scalar are equal to their corresponding word of the half group order.
1038  	//
1039  	// Note that the words 4, 5, and 6 are all the max uint32 value, so there is
1040  	// no need to test if those individual words of the scalar exceeds them,
1041  	// hence, only equality is checked for them.
1042  	result := constantTimeGreater(s.n[7], halfOrderWordSeven)
1043  	highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
1044  	highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
1045  	highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
1046  	highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
1047  	result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
1048  	highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
1049  	result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
1050  	highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
1051  	result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
1052  	highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
1053  	result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)
1054  	return result != 0
1055  }
1056