modnscalar.mx raw

   1  // Copyright (c) 2020-2023 The Decred developers
   2  // Use of this source code is governed by an ISC
   3  // license that can be found in the LICENSE file.
   4  
   5  package secp256k1
   6  
   7  import (
   8  	"math/big"
   9  
  10  	"encoding/hex"
  11  )
  12  
  13  // References:
  14  //   [SECG]: Recommended Elliptic Curve Domain Parameters
  15  //     https://www.secg.org/sec2-v2.pdf
  16  //
  17  //   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
  18  //     http://cacr.uwaterloo.ca/hac/
  19  
  20  // Many elliptic curve operations require working with scalars in a finite field
  21  // characterized by the order of the group underlying the secp256k1 curve.
  22  // Given this precision is larger than the biggest available native type,
  23  // obviously some form of bignum math is needed.  This code implements
  24  // specialized fixed-precision field arithmetic rather than relying on an
  25  // arbitrary-precision arithmetic package such as math/big for dealing with the
  26  // math modulo the group order since the size is known.  As a result, rather
  27  // large performance gains are achieved by taking advantage of many
  28  // optimizations not available to arbitrary-precision arithmetic and generic
  29  // modular arithmetic algorithms.
  30  //
  31  // There are various ways to internally represent each element.  For example,
  32  // the most obvious representation would be to use an array of 4 uint64s (64
  33  // bits * 4 = 256 bits).  However, that representation suffers from the fact
  34  // that there is no native Go type large enough to handle the intermediate
  35  // results while adding or multiplying two 64-bit numbers.
  36  //
  37  // Given the above, this implementation represents the field elements as 8
  38  // uint32s with each word (array entry) treated as base 2^32.  This was chosen
  39  // because most systems at the current time are 64-bit (or at least have 64-bit
  40  // registers available for specialized purposes such as MMX) so the intermediate
  41  // results can typically be done using a native register (and using uint64s to
  42  // avoid the need for additional half-word arithmetic)
  43  
  44  const (
  45  	// These fields provide convenient access to each of the words of the
  46  	// secp256k1 curve group order N to improve code readability.
  47  	//
  48  	// The group order of the curve per [SECG] is:
  49  	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
  50  	//
  51  	// nolint: dupword
  52  	orderWordZero  uint32 = 0xd0364141
  53  	orderWordOne   uint32 = 0xbfd25e8c
  54  	orderWordTwo   uint32 = 0xaf48a03b
  55  	orderWordThree uint32 = 0xbaaedce6
  56  	orderWordFour  uint32 = 0xfffffffe
  57  	orderWordFive  uint32 = 0xffffffff
  58  	orderWordSix   uint32 = 0xffffffff
  59  	orderWordSeven uint32 = 0xffffffff
  60  	// These fields provide convenient access to each of the words of the two's
  61  	// complement of the secp256k1 curve group order N to improve code
  62  	// readability.
  63  	//
  64  	// The two's complement of the group order is:
  65  	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
  66  	orderComplementWordZero  = (^orderWordZero) + 1
  67  	orderComplementWordOne   = ^orderWordOne
  68  	orderComplementWordTwo   = ^orderWordTwo
  69  	orderComplementWordThree = ^orderWordThree
  70  	// orderComplementWordFour  uint32 = ^orderWordFour  // unused
  71  	// orderComplementWordFive  uint32 = ^orderWordFive  // unused
  72  	// orderComplementWordSix   uint32 = ^orderWordSix   // unused
  73  	// orderComplementWordSeven uint32 = ^orderWordSeven // unused
  74  	// These fields provide convenient access to each of the words of the
  75  	// secp256k1 curve group order N / 2 to improve code readability and avoid
  76  	// the need to recalculate them.
  77  	//
  78  	// The half order of the secp256k1 curve group is:
  79  	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
  80  	//
  81  	// nolint: dupword
  82  	halfOrderWordZero  uint32 = 0x681b20a0
  83  	halfOrderWordOne   uint32 = 0xdfe92f46
  84  	halfOrderWordTwo   uint32 = 0x57a4501d
  85  	halfOrderWordThree uint32 = 0x5d576e73
  86  	halfOrderWordFour  uint32 = 0xffffffff
  87  	halfOrderWordFive  uint32 = 0xffffffff
  88  	halfOrderWordSix   uint32 = 0xffffffff
  89  	halfOrderWordSeven uint32 = 0x7fffffff
  90  	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
  91  	// improve the readability of the code.
  92  	uint32Mask = 0xffffffff
  93  )
  94  
  95  
  96  // ModNScalar implements optimized 256-bit constant-time fixed-precision
  97  // arithmetic over the secp256k1 group order. This means all arithmetic is
  98  // performed modulo:
  99  //
 100  //	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
 101  //
 102  // It only implements the arithmetic needed for elliptic curve operations,
 103  // however, the operations that are not implemented can typically be worked
 104  // around if absolutely needed.  For example, subtraction can be performed by
 105  // adding the negation.
 106  //
 107  // Should it be absolutely necessary, conversion to the standard library
 108  // math/big.Int can be accomplished by using the Bytes method, slicing the
 109  // resulting fixed-size array, and feeding it to big.Int.SetBytes.  However,
 110  // that should typically be avoided when possible as conversion to big.Ints
 111  // requires allocations, is not constant time, and is slower when working modulo
 112  // the group order.
 113  type ModNScalar struct {
 114  	// The scalar is represented as 8 32-bit integers in base 2^32.
 115  	//
 116  	// The following depicts the internal representation:
 117  	// 	 ---------------------------------------------------------
 118  	// 	|       n[7]     |      n[6]      | ... |      n[0]      |
 119  	// 	| 32 bits        | 32 bits        | ... | 32 bits        |
 120  	// 	| Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
 121  	// 	 ---------------------------------------------------------
 122  	//
 123  	// For example, consider the number 2^87 + 2^42 + 1.  It would be
 124  	// represented as:
 125  	// 	n[0] = 1
 126  	// 	n[1] = 2^10
 127  	// 	n[2] = 2^23
 128  	// 	n[3..7] = 0
 129  	//
 130  	// The full 256-bit value is then calculated by looping i from 7..0 and
 131  	// doing sum(n[i] * 2^(32i)) like so:
 132  	// 	n[7] * 2^(32*7) = 0    * 2^224 = 0
 133  	// 	n[6] * 2^(32*6) = 0    * 2^192 = 0
 134  	// 	...
 135  	// 	n[2] * 2^(32*2) = 2^23 * 2^64  = 2^87
 136  	// 	n[1] * 2^(32*1) = 2^10 * 2^32  = 2^42
 137  	// 	n[0] * 2^(32*0) = 1    * 2^0   = 1
 138  	// 	Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
 139  	n [8]uint32
 140  }
 141  
 142  // String returns the scalar as a human-readable hex string.
 143  //
 144  // This is NOT constant time.
 145  func (s ModNScalar) String() string {
 146  	b := s.Bytes()
 147  	return string(hex.EncodeToString(b[:]))
 148  }
 149  
 150  // Set sets the scalar equal to a copy of the passed one in constant time.
 151  //
 152  // The scalar is returned to support chaining.  This enables syntax like:
 153  // s := new(ModNScalar).Set(s2).Add(1) so that s = s2 + 1 where s2 is not
 154  // modified.
 155  func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
 156  	*s = *val
 157  	return s
 158  }
 159  
 160  // Zero sets the scalar to zero in constant time.  A newly created scalar is
 161  // already set to zero.  This function can be useful to clear an existing scalar
 162  // for reuse.
 163  func (s *ModNScalar) Zero() {
 164  	s.n[0] = 0
 165  	s.n[1] = 0
 166  	s.n[2] = 0
 167  	s.n[3] = 0
 168  	s.n[4] = 0
 169  	s.n[5] = 0
 170  	s.n[6] = 0
 171  	s.n[7] = 0
 172  }
 173  
 174  // IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
 175  // constant time.
 176  //
 177  // Note that a bool is not used here because it is not possible in Go to convert
 178  // from a bool to numeric value in constant time and many constant-time
 179  // operations require a numeric value.  See IsZero for the version that returns
 180  // a bool.
 181  func (s *ModNScalar) IsZeroBit() uint32 {
 182  	// The scalar can only be zero if no bits are set in any of the words.
 183  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
 184  	return constantTimeEq(bits, 0)
 185  }
 186  
 187  // IsZero returns whether or not the scalar is equal to zero in constant time.
 188  func (s *ModNScalar) IsZero() bool {
 189  	// The scalar can only be zero if no bits are set in any of the words.
 190  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
 191  	return bits == 0
 192  }
 193  
 194  // SetInt sets the scalar to the passed integer in constant time.  This is a
 195  // convenience function since it is fairly common to perform some arithmetic
 196  // with small native integers.
 197  //
 198  // The scalar is returned to support chaining.  This enables syntax like:
 199  // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
 200  func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
 201  	s.Zero()
 202  	s.n[0] = ui
 203  	return s
 204  }
 205  
 206  // constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
 207  func constantTimeEq(a, b uint32) uint32 {
 208  	return uint32((uint64(a^b) - 1) >> 63)
 209  }
 210  
 211  // constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
 212  func constantTimeNotEq(a, b uint32) uint32 {
 213  	return ^uint32((uint64(a^b)-1)>>63) & 1
 214  }
 215  
 216  // constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
 217  func constantTimeLess(a, b uint32) uint32 {
 218  	return uint32((uint64(a) - uint64(b)) >> 63)
 219  }
 220  
 221  // constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
 222  func constantTimeLessOrEq(a, b uint32) uint32 {
 223  	return uint32((uint64(a) - uint64(b) - 1) >> 63)
 224  }
 225  
 226  // constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
 227  func constantTimeGreater(a, b uint32) uint32 {
 228  	return constantTimeLess(b, a)
 229  }
 230  
 231  // constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
 232  func constantTimeGreaterOrEq(a, b uint32) uint32 {
 233  	return constantTimeLessOrEq(b, a)
 234  }
 235  
 236  // constantTimeMin returns min(a,b) in constant time.
 237  func constantTimeMin(a, b uint32) uint32 {
 238  	return b ^ ((a ^ b) & -constantTimeLess(a, b))
 239  }
 240  
 241  // overflows determines if the current scalar is greater than or equal to the
 242  // group order in constant time and returns 1 if it is or 0 otherwise.
 243  func (s *ModNScalar) overflows() uint32 {
 244  	// The intuition here is that the scalar is greater than the group order if
 245  	// one of the higher individual words is greater than corresponding word of
 246  	// the group order and all higher words in the scalar are equal to their
 247  	// corresponding word of the group order.  Since this type is modulo the
 248  	// group order, being equal is also an overflow back to 0.
 249  	//
 250  	// Note that the words 5, 6, and 7 are all the max uint32 value, so there is
 251  	// no need to test if those individual words of the scalar exceeds them,
 252  	// hence, only equality is checked for them.
 253  	highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
 254  	highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
 255  	highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
 256  	overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
 257  	highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
 258  	overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
 259  	highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
 260  	overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
 261  	highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
 262  	overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
 263  	highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
 264  	overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)
 265  	return overflow
 266  }
 267  
 268  // reduce256 reduces the current scalar modulo the group order in accordance
 269  // with the overflows parameter in constant time.  The overflows parameter
 270  // specifies whether or not the scalar is known to be greater than the group
 271  // order and MUST either be 1 in the case it is or 0 in the case it is not for a
 272  // correct result.
 273  func (s *ModNScalar) reduce256(overflows uint32) {
 274  	// Notice that since s < 2^256 < 2N (where N is the group order), the max
 275  	// possible number of reductions required is one.  Therefore, in the case a
 276  	// reduction is needed, it can be performed with a single subtraction of N.
 277  	// Also, recall that subtraction is equivalent to addition by the two's
 278  	// complement while ignoring the carry.
 279  	//
 280  	// When s >= N, the overflows parameter will be 1.  Conversely, it will be 0
 281  	// when s < N.  Thus multiplying by the overflows parameter will either
 282  	// result in 0 or the multiplicand itself.
 283  	//
 284  	// Combining the above along with the fact that s + 0 = s, the following is
 285  	// a constant time implementation that works by either adding 0 or the two's
 286  	// complement of N as needed.
 287  	//
 288  	// The final result will be in the range 0 <= s < N as expected.
 289  	overflows64 := uint64(overflows)
 290  	c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
 291  	s.n[0] = uint32(c & uint32Mask)
 292  	c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
 293  	s.n[1] = uint32(c & uint32Mask)
 294  	c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
 295  	s.n[2] = uint32(c & uint32Mask)
 296  	c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
 297  	s.n[3] = uint32(c & uint32Mask)
 298  	c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
 299  	s.n[4] = uint32(c & uint32Mask)
 300  	c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
 301  	s.n[5] = uint32(c & uint32Mask)
 302  	c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
 303  	s.n[6] = uint32(c & uint32Mask)
 304  	c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
 305  	s.n[7] = uint32(c & uint32Mask)
 306  }
 307  
 308  // SetBytes interprets the provided array as a 256-bit big-endian unsigned
 309  // integer, reduces it modulo the group order, sets the scalar to the result,
 310  // and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
 311  // constant time.
 312  //
 313  // Note that a bool is not used here because it is not possible in Go to convert
 314  // from a bool to numeric value in constant time and many constant-time
 315  // operations require a numeric value.
 316  func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
 317  	// Pack the 256 total bits across the 8 uint32 words.  This could be done
 318  	// with a for loop, but benchmarks show this unrolled version is about 2
 319  	// times faster than the variant that uses a loop.
 320  	s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
 321  	s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
 322  	s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
 323  	s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
 324  	s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
 325  	s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
 326  	s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
 327  	s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
 328  	// The value might be >= N, so reduce it as required and return whether or
 329  	// not it was reduced.
 330  	needsReduce := s.overflows()
 331  	s.reduce256(needsReduce)
 332  	return needsReduce
 333  }
 334  
 335  // zeroArray32 zeroes the provided 32-byte buffer.
 336  func zeroArray32(b *[32]byte) { *b = [32]byte{} }
 337  
 338  // SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
 339  // integer (meaning it is truncated to the first 32 bytes), reduces it modulo
 340  // the group order, sets the scalar to the result, and returns whether or not
 341  // the resulting truncated 256-bit integer overflowed in constant time.
 342  //
 343  // Note that since passing a slice with more than 32 bytes is truncated, it is
 344  // possible that the truncated value is less than the order of the curve and
 345  // hence it will not be reported as having overflowed in that case.  It is up to
 346  // the caller to decide whether it needs to provide numbers of the appropriate
 347  // size or it is acceptable to use this function with the described truncation
 348  // and overflow behavior.
 349  func (s *ModNScalar) SetByteSlice(b []byte) bool {
 350  	var b32 [32]byte
 351  	b = b[:constantTimeMin(uint32(len(b)), 32)]
 352  	copy(b32[:], b32[:32-len(b)])
 353  	copy(b32[32-len(b):], b)
 354  	result := s.SetBytes(&b32)
 355  	zeroArray32(&b32)
 356  	return result != 0
 357  }
 358  
 359  // PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
 360  // into the passed byte slice in constant time.  The target slice must have at
 361  // least 32 bytes available or it will panic.
 362  //
 363  // There is a similar function, PutBytes, which unpacks the scalar into a
 364  // 32-byte array directly.  This version is provided since it can be useful to
 365  // write directly into part of a larger buffer without needing a separate
 366  // allocation.
 367  //
 368  // Preconditions:
 369  //   - The target slice MUST have at least 32 bytes available
 370  func (s *ModNScalar) PutBytesUnchecked(b []byte) {
 371  	// Unpack the 256 total bits from the 8 uint32 words.  This could be done
 372  	// with a for loop, but benchmarks show this unrolled version is about 2
 373  	// times faster than the variant which uses a loop.
 374  	b[31] = byte(s.n[0])
 375  	b[30] = byte(s.n[0] >> 8)
 376  	b[29] = byte(s.n[0] >> 16)
 377  	b[28] = byte(s.n[0] >> 24)
 378  	b[27] = byte(s.n[1])
 379  	b[26] = byte(s.n[1] >> 8)
 380  	b[25] = byte(s.n[1] >> 16)
 381  	b[24] = byte(s.n[1] >> 24)
 382  	b[23] = byte(s.n[2])
 383  	b[22] = byte(s.n[2] >> 8)
 384  	b[21] = byte(s.n[2] >> 16)
 385  	b[20] = byte(s.n[2] >> 24)
 386  	b[19] = byte(s.n[3])
 387  	b[18] = byte(s.n[3] >> 8)
 388  	b[17] = byte(s.n[3] >> 16)
 389  	b[16] = byte(s.n[3] >> 24)
 390  	b[15] = byte(s.n[4])
 391  	b[14] = byte(s.n[4] >> 8)
 392  	b[13] = byte(s.n[4] >> 16)
 393  	b[12] = byte(s.n[4] >> 24)
 394  	b[11] = byte(s.n[5])
 395  	b[10] = byte(s.n[5] >> 8)
 396  	b[9] = byte(s.n[5] >> 16)
 397  	b[8] = byte(s.n[5] >> 24)
 398  	b[7] = byte(s.n[6])
 399  	b[6] = byte(s.n[6] >> 8)
 400  	b[5] = byte(s.n[6] >> 16)
 401  	b[4] = byte(s.n[6] >> 24)
 402  	b[3] = byte(s.n[7])
 403  	b[2] = byte(s.n[7] >> 8)
 404  	b[1] = byte(s.n[7] >> 16)
 405  	b[0] = byte(s.n[7] >> 24)
 406  }
 407  
 408  // PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
 409  // byte array in constant time.
 410  //
 411  // There is a similar function, PutBytesUnchecked, which unpacks the scalar into
 412  // a slice that must have at least 32 bytes available.  This version is provided
 413  // since it can be useful to write directly into an array that is type checked.
 414  //
 415  // Alternatively, there is also Bytes, which unpacks the scalar into a new array
 416  // and returns that which can sometimes be more ergonomic in applications that
 417  // aren't concerned about an additional copy.
 418  func (s *ModNScalar) PutBytes(b *[32]byte) { s.PutBytesUnchecked(b[:]) }
 419  
 420  // Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
 421  //
 422  // See PutBytes and PutBytesUnchecked for variants that allow an array or slice
 423  // to be passed which can be useful to cut down on the number of allocations
 424  // by allowing the caller to reuse a buffer or write directly into part of a
 425  // larger buffer.
 426  func (s *ModNScalar) Bytes() [32]byte {
 427  	var b [32]byte
 428  	s.PutBytesUnchecked(b[:])
 429  	return b
 430  }
 431  
 432  // IsOdd returns whether the scalar is an odd number in constant time.
 433  func (s *ModNScalar) IsOdd() bool {
 434  	// Only odd numbers have the bottom bit set.
 435  	return s.n[0]&1 == 1
 436  }
 437  
 438  // Equals returns whether or not the two scalars are the same in constant time.
 439  func (s *ModNScalar) Equals(val *ModNScalar) bool {
 440  	// Xor only sets bits when they are different, so the two scalars can only
 441  	// be the same if no bits are set after xoring each word.
 442  	bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
 443  		(s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
 444  		(s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])
 445  
 446  	return bits == 0
 447  }
 448  
 449  // Add2 adds the passed two scalars together modulo the group order in constant
 450  // time and stores the result in s.
 451  //
 452  // The scalar is returned to support chaining.  This enables syntax like:
 453  // s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
 454  func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
 455  	c := uint64(val1.n[0]) + uint64(val2.n[0])
 456  	s.n[0] = uint32(c & uint32Mask)
 457  	c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
 458  	s.n[1] = uint32(c & uint32Mask)
 459  	c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
 460  	s.n[2] = uint32(c & uint32Mask)
 461  	c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
 462  	s.n[3] = uint32(c & uint32Mask)
 463  	c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
 464  	s.n[4] = uint32(c & uint32Mask)
 465  	c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
 466  	s.n[5] = uint32(c & uint32Mask)
 467  	c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
 468  	s.n[6] = uint32(c & uint32Mask)
 469  	c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
 470  	s.n[7] = uint32(c & uint32Mask)
 471  	// The result is now 256 bits, but it might still be >= N, so use the
 472  	// existing normal reduce method for 256-bit values.
 473  	s.reduce256(uint32(c>>32) + s.overflows())
 474  	return s
 475  }
 476  
 477  // Add adds the passed scalar to the existing one modulo the group order in
 478  // constant time and stores the result in s.
 479  //
 480  // The scalar is returned to support chaining.  This enables syntax like:
 481  // s.Add(s2).AddInt(1) so that s = s + s2 + 1.
 482  func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar { return s.Add2(s, val) }
 483  
 484  // accumulator96 provides a 96-bit accumulator for use in the intermediate
 485  // calculations requiring more than 64-bits.
 486  type accumulator96 struct {
 487  	n [3]uint32
 488  }
 489  
 490  // Add adds the passed unsigned 64-bit value to the accumulator.
 491  func (a *accumulator96) Add(v uint64) {
 492  	low := uint32(v & uint32Mask)
 493  	hi := uint32(v >> 32)
 494  	a.n[0] += low
 495  	hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
 496  	a.n[1] += hi
 497  	a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
 498  }
 499  
 500  // Rsh32 right shifts the accumulator by 32 bits.
 501  func (a *accumulator96) Rsh32() {
 502  	a.n[0] = a.n[1]
 503  	a.n[1] = a.n[2]
 504  	a.n[2] = 0
 505  }
 506  
 507  // reduce385 reduces the 385-bit intermediate result in the passed terms modulo
 508  // the group order in constant time and stores the result in s.
 509  func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
 510  	// At this point, the intermediate result in the passed terms has been
 511  	// reduced to fit within 385 bits, so reduce it again using the same method
 512  	// described in reduce512.  As before, the intermediate result will end up
 513  	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
 514  	// needed for this iteration.  The reduced terms are assigned back to t0
 515  	// through t8.
 516  	//
 517  	// Note that several of the intermediate calculations require adding 64-bit
 518  	// products together which would overflow a uint64, so a 96-bit accumulator
 519  	// is used instead until the value is reduced enough to use native uint64s.
 520  	// Terms for 2^(32*0).
 521  	var acc accumulator96
 522  	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
 523  	acc.Add(t8 * uint64(orderComplementWordZero))
 524  	t0 = uint64(acc.n[0])
 525  	acc.Rsh32()
 526  	// Terms for 2^(32*1).
 527  	acc.Add(t1)
 528  	acc.Add(t8 * uint64(orderComplementWordOne))
 529  	acc.Add(t9 * uint64(orderComplementWordZero))
 530  	t1 = uint64(acc.n[0])
 531  	acc.Rsh32()
 532  	// Terms for 2^(32*2).
 533  	acc.Add(t2)
 534  	acc.Add(t8 * uint64(orderComplementWordTwo))
 535  	acc.Add(t9 * uint64(orderComplementWordOne))
 536  	acc.Add(t10 * uint64(orderComplementWordZero))
 537  	t2 = uint64(acc.n[0])
 538  	acc.Rsh32()
 539  	// Terms for 2^(32*3).
 540  	acc.Add(t3)
 541  	acc.Add(t8 * uint64(orderComplementWordThree))
 542  	acc.Add(t9 * uint64(orderComplementWordTwo))
 543  	acc.Add(t10 * uint64(orderComplementWordOne))
 544  	acc.Add(t11 * uint64(orderComplementWordZero))
 545  	t3 = uint64(acc.n[0])
 546  	acc.Rsh32()
 547  	// Terms for 2^(32*4).
 548  	acc.Add(t4)
 549  	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
 550  	acc.Add(t9 * uint64(orderComplementWordThree))
 551  	acc.Add(t10 * uint64(orderComplementWordTwo))
 552  	acc.Add(t11 * uint64(orderComplementWordOne))
 553  	acc.Add(t12 * uint64(orderComplementWordZero))
 554  	t4 = uint64(acc.n[0])
 555  	acc.Rsh32()
 556  	// Terms for 2^(32*5).
 557  	acc.Add(t5)
 558  	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
 559  	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
 560  	acc.Add(t10 * uint64(orderComplementWordThree))
 561  	acc.Add(t11 * uint64(orderComplementWordTwo))
 562  	acc.Add(t12 * uint64(orderComplementWordOne))
 563  	t5 = uint64(acc.n[0])
 564  	acc.Rsh32()
 565  	// Terms for 2^(32*6).
 566  	acc.Add(t6)
 567  	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
 568  	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
 569  	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
 570  	acc.Add(t11 * uint64(orderComplementWordThree))
 571  	acc.Add(t12 * uint64(orderComplementWordTwo))
 572  	t6 = uint64(acc.n[0])
 573  	acc.Rsh32()
 574  	// Terms for 2^(32*7).
 575  	acc.Add(t7)
 576  	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
 577  	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
 578  	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
 579  	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
 580  	acc.Add(t12 * uint64(orderComplementWordThree))
 581  	t7 = uint64(acc.n[0])
 582  	acc.Rsh32()
 583  	// Terms for 2^(32*8).
 584  	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
 585  	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
 586  	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
 587  	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
 588  	t8 = uint64(acc.n[0])
 589  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
 590  	//
 591  	// NOTE: All of the remaining multiplications for this iteration result in 0
 592  	// as they all involve multiplying by combinations of the fifth, sixth, and
 593  	// seventh words of the two's complement of N, which are 0, so skip them.
 594  	//
 595  	// At this point, the result is reduced to fit within 258 bits, so reduce it
 596  	// again using a slightly modified version of the same method.  The maximum
 597  	// value in t8 is 2 at this point and therefore multiplying it by each word
 598  	// of the two's complement of N and adding it to a 32-bit term will result
 599  	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
 600  	// here for the intermediate term carry propagation.
 601  	//
 602  	// Also, since the maximum value in t8 is 2, this ends up reducing by
 603  	// another 2 bits to 256 bits.
 604  	c := t0 + t8*uint64(orderComplementWordZero)
 605  	s.n[0] = uint32(c & uint32Mask)
 606  	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
 607  	s.n[1] = uint32(c & uint32Mask)
 608  	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
 609  	s.n[2] = uint32(c & uint32Mask)
 610  	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
 611  	s.n[3] = uint32(c & uint32Mask)
 612  	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
 613  	s.n[4] = uint32(c & uint32Mask)
 614  	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
 615  	s.n[5] = uint32(c & uint32Mask)
 616  	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
 617  	s.n[6] = uint32(c & uint32Mask)
 618  	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
 619  	s.n[7] = uint32(c & uint32Mask)
 620  	// The result is now 256 bits, but it might still be >= N, so use the
 621  	// existing normal reduce method for 256-bit values.
 622  	s.reduce256(uint32(c>>32) + s.overflows())
 623  }
 624  
 625  // reduce512 reduces the 512-bit intermediate result in the passed terms modulo
 626  // the group order down to 385 bits in constant time and stores the result in s.
 627  func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
 628  	// At this point, the intermediate result in the passed terms is grouped
 629  	// into the respective bases.
 630  	//
 631  	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
 632  	// when the modulus is of the special form m = b^t - c, where log_2(c) < t,
 633  	// highly efficient reduction can be achieved per the provided algorithm.
 634  	//
 635  	// The secp256k1 group order fits this criteria since it is:
 636  	//   2^256 - 432420386565659656852420866394968145599
 637  	//
 638  	// Technically the max possible value here is (N-1)^2 since the two scalars
 639  	// being multiplied are always mod N.  Nevertheless, it is safer to consider
 640  	// it to be (2^256-1)^2 = 2^512 - 2^256 + 1 since it is the product of two
 641  	// 256-bit values.
 642  	//
 643  	// The algorithm is to reduce the result modulo the prime by subtracting
 644  	// multiples of the group order N.  However, in order simplify carry
 645  	// propagation, this adds with the two's complement of N to achieve the same
 646  	// result.
 647  	//
 648  	// Since the two's complement of N has 127 leading zero bits, this will end
 649  	// up reducing the intermediate result from 512 bits to 385 bits, resulting
 650  	// in 13 32-bit terms.  The reduced terms are assigned back to t0 through
 651  	// t12.
 652  	//
 653  	// Note that several of the intermediate calculations require adding 64-bit
 654  	// products together which would overflow a uint64, so a 96-bit accumulator
 655  	// is used instead.
 656  	//
 657  	// Terms for 2^(32*0).
 658  	var acc accumulator96
 659  	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
 660  	acc.Add(t8 * uint64(orderComplementWordZero))
 661  	t0 = uint64(acc.n[0])
 662  	acc.Rsh32()
 663  	// Terms for 2^(32*1).
 664  	acc.Add(t1)
 665  	acc.Add(t8 * uint64(orderComplementWordOne))
 666  	acc.Add(t9 * uint64(orderComplementWordZero))
 667  	t1 = uint64(acc.n[0])
 668  	acc.Rsh32()
 669  	// Terms for 2^(32*2).
 670  	acc.Add(t2)
 671  	acc.Add(t8 * uint64(orderComplementWordTwo))
 672  	acc.Add(t9 * uint64(orderComplementWordOne))
 673  	acc.Add(t10 * uint64(orderComplementWordZero))
 674  	t2 = uint64(acc.n[0])
 675  	acc.Rsh32()
 676  	// Terms for 2^(32*3).
 677  	acc.Add(t3)
 678  	acc.Add(t8 * uint64(orderComplementWordThree))
 679  	acc.Add(t9 * uint64(orderComplementWordTwo))
 680  	acc.Add(t10 * uint64(orderComplementWordOne))
 681  	acc.Add(t11 * uint64(orderComplementWordZero))
 682  	t3 = uint64(acc.n[0])
 683  	acc.Rsh32()
 684  	// Terms for 2^(32*4).
 685  	acc.Add(t4)
 686  	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
 687  	acc.Add(t9 * uint64(orderComplementWordThree))
 688  	acc.Add(t10 * uint64(orderComplementWordTwo))
 689  	acc.Add(t11 * uint64(orderComplementWordOne))
 690  	acc.Add(t12 * uint64(orderComplementWordZero))
 691  	t4 = uint64(acc.n[0])
 692  	acc.Rsh32()
 693  	// Terms for 2^(32*5).
 694  	acc.Add(t5)
 695  	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
 696  	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
 697  	acc.Add(t10 * uint64(orderComplementWordThree))
 698  	acc.Add(t11 * uint64(orderComplementWordTwo))
 699  	acc.Add(t12 * uint64(orderComplementWordOne))
 700  	acc.Add(t13 * uint64(orderComplementWordZero))
 701  	t5 = uint64(acc.n[0])
 702  	acc.Rsh32()
 703  	// Terms for 2^(32*6).
 704  	acc.Add(t6)
 705  	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
 706  	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
 707  	acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1
 708  	acc.Add(t11 * uint64(orderComplementWordThree))
 709  	acc.Add(t12 * uint64(orderComplementWordTwo))
 710  	acc.Add(t13 * uint64(orderComplementWordOne))
 711  	acc.Add(t14 * uint64(orderComplementWordZero))
 712  	t6 = uint64(acc.n[0])
 713  	acc.Rsh32()
 714  	// Terms for 2^(32*7).
 715  	acc.Add(t7)
 716  	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
 717  	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
 718  	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
 719  	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
 720  	acc.Add(t12 * uint64(orderComplementWordThree))
 721  	acc.Add(t13 * uint64(orderComplementWordTwo))
 722  	acc.Add(t14 * uint64(orderComplementWordOne))
 723  	acc.Add(t15 * uint64(orderComplementWordZero))
 724  	t7 = uint64(acc.n[0])
 725  	acc.Rsh32()
 726  	// Terms for 2^(32*8).
 727  	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
 728  	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
 729  	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
 730  	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
 731  	acc.Add(t13 * uint64(orderComplementWordThree))
 732  	acc.Add(t14 * uint64(orderComplementWordTwo))
 733  	acc.Add(t15 * uint64(orderComplementWordOne))
 734  	t8 = uint64(acc.n[0])
 735  	acc.Rsh32()
 736  	// Terms for 2^(32*9).
 737  	// acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
 738  	// acc.Add(t11 * uint64(orderComplementWordSix)) // 0
 739  	// acc.Add(t12 * uint64(orderComplementWordFive)) // 0
 740  	acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
 741  	acc.Add(t14 * uint64(orderComplementWordThree))
 742  	acc.Add(t15 * uint64(orderComplementWordTwo))
 743  	t9 = uint64(acc.n[0])
 744  	acc.Rsh32()
 745  	// Terms for 2^(32*10).
 746  	// acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
 747  	// acc.Add(t12 * uint64(orderComplementWordSix)) // 0
 748  	// acc.Add(t13 * uint64(orderComplementWordFive)) // 0
 749  	acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
 750  	acc.Add(t15 * uint64(orderComplementWordThree))
 751  	t10 = uint64(acc.n[0])
 752  	acc.Rsh32()
 753  	// Terms for 2^(32*11).
 754  	// acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
 755  	// acc.Add(t13 * uint64(orderComplementWordSix)) // 0
 756  	// acc.Add(t14 * uint64(orderComplementWordFive)) // 0
 757  	acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
 758  	t11 = uint64(acc.n[0])
 759  	acc.Rsh32()
 760  	// NOTE: All of the remaining multiplications for this iteration result in 0
 761  	// as they all involve multiplying by combinations of the fifth, sixth, and
 762  	// seventh words of the two's complement of N, which are 0, so skip them.
 763  	//
 764  	// Terms for 2^(32*12).
 765  	t12 = uint64(acc.n[0])
 766  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
 767  	//
 768  	// At this point, the result is reduced to fit within 385 bits, so reduce it
 769  	// again using the same method accordingly.
 770  	s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
 771  }
 772  
 773  // Mul2 multiplies the passed two scalars together modulo the group order in
 774  // constant time and stores the result in s.
 775  //
 776  // The scalar is returned to support chaining.  This enables syntax like:
 777  // s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
 778  func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
 779  	// This could be done with for loops and an array to store the intermediate
 780  	// terms, but this unrolled version is significantly faster.
 781  	//
 782  	// The overall strategy employed here is:
 783  	// 1) Calculate the 512-bit product of the two scalars using the standard
 784  	//    pencil-and-paper method.
 785  	// 2) Reduce the result modulo the prime by effectively subtracting
 786  	//    multiples of the group order N (actually performed by adding multiples
 787  	//    of the two's complement of N to avoid implementing subtraction).
 788  	// 3) Repeat step 2 noting that each iteration reduces the required number
 789  	//    of bits by 127 because the two's complement of N has 127 leading zero
 790  	//    bits.
 791  	// 4) Once reduced to 256 bits, call the existing reduce method to perform
 792  	//    a final reduction as needed.
 793  	//
 794  	// Note that several of the intermediate calculations require adding 64-bit
 795  	// products together which would overflow a uint64, so a 96-bit accumulator
 796  	// is used instead.
 797  	//
 798  	// Terms for 2^(32*0).
 799  	var acc accumulator96
 800  	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
 801  	t0 := uint64(acc.n[0])
 802  	acc.Rsh32()
 803  	// Terms for 2^(32*1).
 804  	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
 805  	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
 806  	t1 := uint64(acc.n[0])
 807  	acc.Rsh32()
 808  	// Terms for 2^(32*2).
 809  	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
 810  	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
 811  	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
 812  	t2 := uint64(acc.n[0])
 813  	acc.Rsh32()
 814  	// Terms for 2^(32*3).
 815  	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
 816  	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
 817  	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
 818  	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
 819  	t3 := uint64(acc.n[0])
 820  	acc.Rsh32()
 821  	// Terms for 2^(32*4).
 822  	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
 823  	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
 824  	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
 825  	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
 826  	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
 827  	t4 := uint64(acc.n[0])
 828  	acc.Rsh32()
 829  	// Terms for 2^(32*5).
 830  	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
 831  	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
 832  	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
 833  	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
 834  	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
 835  	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
 836  	t5 := uint64(acc.n[0])
 837  	acc.Rsh32()
 838  	// Terms for 2^(32*6).
 839  	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
 840  	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
 841  	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
 842  	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
 843  	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
 844  	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
 845  	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
 846  	t6 := uint64(acc.n[0])
 847  	acc.Rsh32()
 848  	// Terms for 2^(32*7).
 849  	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
 850  	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
 851  	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
 852  	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
 853  	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
 854  	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
 855  	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
 856  	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
 857  	t7 := uint64(acc.n[0])
 858  	acc.Rsh32()
 859  	// Terms for 2^(32*8).
 860  	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
 861  	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
 862  	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
 863  	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
 864  	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
 865  	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
 866  	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
 867  	t8 := uint64(acc.n[0])
 868  	acc.Rsh32()
 869  	// Terms for 2^(32*9).
 870  	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
 871  	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
 872  	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
 873  	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
 874  	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
 875  	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
 876  	t9 := uint64(acc.n[0])
 877  	acc.Rsh32()
 878  	// Terms for 2^(32*10).
 879  	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
 880  	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
 881  	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
 882  	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
 883  	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
 884  	t10 := uint64(acc.n[0])
 885  	acc.Rsh32()
 886  	// Terms for 2^(32*11).
 887  	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
 888  	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
 889  	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
 890  	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
 891  	t11 := uint64(acc.n[0])
 892  	acc.Rsh32()
 893  	// Terms for 2^(32*12).
 894  	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
 895  	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
 896  	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
 897  	t12 := uint64(acc.n[0])
 898  	acc.Rsh32()
 899  	// Terms for 2^(32*13).
 900  	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
 901  	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
 902  	t13 := uint64(acc.n[0])
 903  	acc.Rsh32()
 904  	// Terms for 2^(32*14).
 905  	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
 906  	t14 := uint64(acc.n[0])
 907  	acc.Rsh32()
 908  	// What's left is for 2^(32*15).
 909  	t15 := uint64(acc.n[0])
 910  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
 911  	//
 912  	// At this point, all of the terms are grouped into their respective base
 913  	// and occupy up to 512 bits.  Reduce the result accordingly.
 914  	s.reduce512(
 915  		t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
 916  		t15,
 917  	)
 918  	return s
 919  }
 920  
 921  // Mul multiplies the passed scalar with the existing one modulo the group order
 922  // in constant time and stores the result in s.
 923  //
 924  // The scalar is returned to support chaining.  This enables syntax like:
 925  // s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
 926  func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
 927  	return s.Mul2(s, val)
 928  }
 929  
 930  // SquareVal squares the passed scalar modulo the group order in constant time
 931  // and stores the result in s.
 932  //
 933  // The scalar is returned to support chaining.  This enables syntax like:
 934  // s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
 935  func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
 936  	// This could technically be optimized slightly to take advantage of the
 937  	// fact that many of the intermediate calculations in squaring are just
 938  	// doubling, however, benchmarking has shown that due to the need to use a
 939  	// 96-bit accumulator, any savings are essentially offset by that and
 940  	// consequently there is no real difference in performance over just
 941  	// multiplying the value by itself to justify the extra code for now.  This
 942  	// can be revisited in the future if it becomes a bottleneck in practice.
 943  	return s.Mul2(val, val)
 944  }
 945  
 946  // Square squares the scalar modulo the group order in constant time.  The
 947  // existing scalar is modified.
 948  //
 949  // The scalar is returned to support chaining.  This enables syntax like:
 950  // s.Square().Mul(s2) so that s = s^2 * s2.
 951  func (s *ModNScalar) Square() *ModNScalar {
 952  	return s.SquareVal(s)
 953  }
 954  
 955  // NegateVal negates the passed scalar modulo the group order and stores the
 956  // result in s in constant time.
 957  //
 958  // The scalar is returned to support chaining.  This enables syntax like:
 959  // s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
 960  func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
 961  	// Since the scalar is already in the range 0 <= val < N, where N is the
 962  	// group order, negation modulo the group order is just the group order
 963  	// minus the value.  This implies that the result will always be in the
 964  	// desired range with the sole exception of 0 because N - 0 = N itself.
 965  	//
 966  	// Therefore, in order to avoid the need to reduce the result for every
 967  	// other case in order to achieve constant time, this creates a mask that is
 968  	// all 0s in the case of the scalar being negated is 0 and all 1s otherwise
 969  	// and bitwise ands that mask with each word.
 970  	//
 971  	// Finally, to simplify the carry propagation, this adds the two's
 972  	// complement of the scalar to N in order to achieve the same result.
 973  	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
 974  		val.n[6] | val.n[7]
 975  	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
 976  	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
 977  	s.n[0] = uint32(c & mask)
 978  	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
 979  	s.n[1] = uint32(c & mask)
 980  	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
 981  	s.n[2] = uint32(c & mask)
 982  	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
 983  	s.n[3] = uint32(c & mask)
 984  	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
 985  	s.n[4] = uint32(c & mask)
 986  	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
 987  	s.n[5] = uint32(c & mask)
 988  	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
 989  	s.n[6] = uint32(c & mask)
 990  	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
 991  	s.n[7] = uint32(c & mask)
 992  	return s
 993  }
 994  
 995  // Negate negates the scalar modulo the group order in constant time.  The
 996  // existing scalar is modified.
 997  //
 998  // The scalar is returned to support chaining.  This enables syntax like:
 999  // s.Negate().AddInt(1) so that s = -s + 1.
1000  func (s *ModNScalar) Negate() *ModNScalar { return s.NegateVal(s) }
1001  
1002  // InverseValNonConst finds the modular multiplicative inverse of the passed
1003  // scalar and stores result in s in *non-constant* time.
1004  //
1005  // The scalar is returned to support chaining.  This enables syntax like:
1006  // s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
1007  func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
1008  	// This is making use of big integers for now.  Ideally it will be replaced
1009  	// with an implementation that does not depend on big integers.
1010  	valBytes := val.Bytes()
1011  	bigVal := (&big.Int{}).SetBytes(valBytes[:])
1012  	bigVal.ModInverse(bigVal, Params().N)
1013  	s.SetByteSlice(bigVal.Bytes())
1014  	return s
1015  }
1016  
1017  // InverseNonConst finds the modular multiplicative inverse of the scalar in
1018  // *non-constant* time.  The existing scalar is modified.
1019  //
1020  // The scalar is returned to support chaining.  This enables syntax like:
1021  // s.Inverse().Mul(s2) so that s = s^-1 * s2.
1022  func (s *ModNScalar) InverseNonConst() *ModNScalar {
1023  	return s.InverseValNonConst(s)
1024  }
1025  
1026  // IsOverHalfOrder returns whether the scalar exceeds the group order
1027  // divided by 2 in constant time.
1028  func (s *ModNScalar) IsOverHalfOrder() bool {
1029  	// The intuition here is that the scalar is greater than half of the group
1030  	// order if one of the higher individual words is greater than the
1031  	// corresponding word of the half group order and all higher words in the
1032  	// scalar are equal to their corresponding word of the half group order.
1033  	//
1034  	// Note that the words 4, 5, and 6 are all the max uint32 value, so there is
1035  	// no need to test if those individual words of the scalar exceeds them,
1036  	// hence, only equality is checked for them.
1037  	result := constantTimeGreater(s.n[7], halfOrderWordSeven)
1038  	highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
1039  	highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
1040  	highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
1041  	highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
1042  	result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
1043  	highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
1044  	result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
1045  	highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
1046  	result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
1047  	highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
1048  	result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)
1049  	return result != 0
1050  }
1051