1 // Copyright (c) 2020-2024 The Decred developers
2 // Use of this source code is governed by an ISC
3 // license that can be found in the LICENSE file.
4 5 package secp256k1
6 7 import (
8 "encoding/hex"
9 "math/big"
10 )
11 12 // References:
13 // [SECG]: Recommended Elliptic Curve Domain Parameters
14 // https://www.secg.org/sec2-v2.pdf
15 //
16 // [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
17 // http://cacr.uwaterloo.ca/hac/
18 19 // Many elliptic curve operations require working with scalars in a finite field
20 // characterized by the order of the group underlying the secp256k1 curve.
21 // Given this precision is larger than the biggest available native type,
22 // obviously some form of bignum math is needed. This code implements
23 // specialized fixed-precision field arithmetic rather than relying on an
24 // arbitrary-precision arithmetic package such as math/big for dealing with the
25 // math modulo the group order since the size is known. As a result, rather
26 // large performance gains are achieved by taking advantage of many
27 // optimizations not available to arbitrary-precision arithmetic and generic
28 // modular arithmetic algorithms.
29 //
30 // There are various ways to internally represent each element. For example,
31 // the most obvious representation would be to use an array of 4 uint64s (64
32 // bits * 4 = 256 bits). However, that representation suffers from the fact
33 // that there is no native Go type large enough to handle the intermediate
34 // results while adding or multiplying two 64-bit numbers.
35 //
36 // Given the above, this implementation represents the field elements as 8
37 // uint32s with each word (array entry) treated as base 2^32. This was chosen
38 // because most systems at the current time are 64-bit (or at least have 64-bit
39 // registers available for specialized purposes such as MMX) so the intermediate
40 // results can typically be done using a native register (and using uint64s to
41 // avoid the need for additional half-word arithmetic)
42 43 const (
44 // These fields provide convenient access to each of the words of the
45 // secp256k1 curve group order N to improve code readability.
46 //
47 // The group order of the curve per [SECG] is:
48 // 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
49 //
50 // nolint: dupword
51 orderWordZero uint32 = 0xd0364141
52 orderWordOne uint32 = 0xbfd25e8c
53 orderWordTwo uint32 = 0xaf48a03b
54 orderWordThree uint32 = 0xbaaedce6
55 orderWordFour uint32 = 0xfffffffe
56 orderWordFive uint32 = 0xffffffff
57 orderWordSix uint32 = 0xffffffff
58 orderWordSeven uint32 = 0xffffffff
59 60 // These fields provide convenient access to each of the words of the two's
61 // complement of the secp256k1 curve group order N to improve code
62 // readability.
63 //
64 // The two's complement of the group order is:
65 // 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
66 orderComplementWordZero uint32 = (^orderWordZero) + 1
67 orderComplementWordOne uint32 = ^orderWordOne
68 orderComplementWordTwo uint32 = ^orderWordTwo
69 orderComplementWordThree uint32 = ^orderWordThree
70 // orderComplementWordFour uint32 = ^orderWordFour // unused
71 // orderComplementWordFive uint32 = ^orderWordFive // unused
72 // orderComplementWordSix uint32 = ^orderWordSix // unused
73 // orderComplementWordSeven uint32 = ^orderWordSeven // unused
74 75 // These fields provide convenient access to each of the words of the
76 // secp256k1 curve group order N / 2 to improve code readability and avoid
77 // the need to recalculate them.
78 //
79 // The half order of the secp256k1 curve group is:
80 // 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
81 //
82 // nolint: dupword
83 halfOrderWordZero uint32 = 0x681b20a0
84 halfOrderWordOne uint32 = 0xdfe92f46
85 halfOrderWordTwo uint32 = 0x57a4501d
86 halfOrderWordThree uint32 = 0x5d576e73
87 halfOrderWordFour uint32 = 0xffffffff
88 halfOrderWordFive uint32 = 0xffffffff
89 halfOrderWordSix uint32 = 0xffffffff
90 halfOrderWordSeven uint32 = 0x7fffffff
91 92 // uint32Mask is simply a mask with all bits set for a uint32 and is used to
93 // improve the readability of the code.
94 uint32Mask = 0xffffffff
95 )
96 97 var (
98 // zero32 is an array of 32 bytes used for the purposes of zeroing and is
99 // defined here to avoid extra allocations.
100 zero32 = [32]byte{}
101 )
102 103 // ModNScalar implements optimized 256-bit constant-time fixed-precision
104 // arithmetic over the secp256k1 group order. This means all arithmetic is
105 // performed modulo:
106 //
107 // 0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
108 //
109 // It only implements the arithmetic needed for elliptic curve operations,
110 // however, the operations that are not implemented can typically be worked
111 // around if absolutely needed. For example, subtraction can be performed by
112 // adding the negation.
113 //
114 // Should it be absolutely necessary, conversion to the standard library
115 // math/big.Int can be accomplished by using the Bytes method, slicing the
116 // resulting fixed-size array, and feeding it to big.Int.SetBytes. However,
117 // that should typically be avoided when possible as conversion to big.Ints
118 // requires allocations, is not constant time, and is slower when working modulo
119 // the group order.
120 type ModNScalar struct {
121 // The scalar is represented as 8 32-bit integers in base 2^32.
122 //
123 // The following depicts the internal representation:
124 // ---------------------------------------------------------
125 // | n[7] | n[6] | ... | n[0] |
126 // | 32 bits | 32 bits | ... | 32 bits |
127 // | Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
128 // ---------------------------------------------------------
129 //
130 // For example, consider the number 2^87 + 2^42 + 1. It would be
131 // represented as:
132 // n[0] = 1
133 // n[1] = 2^10
134 // n[2] = 2^23
135 // n[3..7] = 0
136 //
137 // The full 256-bit value is then calculated by looping i from 7..0 and
138 // doing sum(n[i] * 2^(32i)) like so:
139 // n[7] * 2^(32*7) = 0 * 2^224 = 0
140 // n[6] * 2^(32*6) = 0 * 2^192 = 0
141 // ...
142 // n[2] * 2^(32*2) = 2^23 * 2^64 = 2^87
143 // n[1] * 2^(32*1) = 2^10 * 2^32 = 2^42
144 // n[0] * 2^(32*0) = 1 * 2^0 = 1
145 // Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
146 n [8]uint32
147 }
148 149 // String returns the scalar as a human-readable hex string.
150 //
151 // This is NOT constant time.
152 func (s ModNScalar) String() string {
153 b := s.Bytes()
154 return hex.EncodeToString(b[:])
155 }
156 157 // Set sets the scalar equal to a copy of the passed one in constant time.
158 //
159 // The scalar is returned to support chaining. This enables syntax like:
160 // s := new(ModNScalar).Set(s2).Add(1) so that s = s2 + 1 where s2 is not
161 // modified.
162 func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
163 *s = *val
164 return s
165 }
166 167 // Zero sets the scalar to zero in constant time. A newly created scalar is
168 // already set to zero. This function can be useful to clear an existing scalar
169 // for reuse.
170 func (s *ModNScalar) Zero() {
171 s.n[0] = 0
172 s.n[1] = 0
173 s.n[2] = 0
174 s.n[3] = 0
175 s.n[4] = 0
176 s.n[5] = 0
177 s.n[6] = 0
178 s.n[7] = 0
179 }
180 181 // IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
182 // constant time.
183 //
184 // Note that a bool is not used here because it is not possible in Go to convert
185 // from a bool to numeric value in constant time and many constant-time
186 // operations require a numeric value. See IsZero for the version that returns
187 // a bool.
188 func (s *ModNScalar) IsZeroBit() uint32 {
189 // The scalar can only be zero if no bits are set in any of the words.
190 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
191 return constantTimeEq(bits, 0)
192 }
193 194 // IsZero returns whether or not the scalar is equal to zero in constant time.
195 func (s *ModNScalar) IsZero() bool {
196 // The scalar can only be zero if no bits are set in any of the words.
197 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
198 return bits == 0
199 }
200 201 // SetInt sets the scalar to the passed integer in constant time. This is a
202 // convenience function since it is fairly common to perform some arithmetic
203 // with small native integers.
204 //
205 // The scalar is returned to support chaining. This enables syntax like:
206 // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
207 func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
208 s.Zero()
209 s.n[0] = ui
210 return s
211 }
212 213 // constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
214 func constantTimeEq(a, b uint32) uint32 {
215 return uint32((uint64(a^b) - 1) >> 63)
216 }
217 218 // constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
219 func constantTimeNotEq(a, b uint32) uint32 {
220 return ^uint32((uint64(a^b)-1)>>63) & 1
221 }
222 223 // constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
224 func constantTimeLess(a, b uint32) uint32 {
225 return uint32((uint64(a) - uint64(b)) >> 63)
226 }
227 228 // constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
229 func constantTimeLessOrEq(a, b uint32) uint32 {
230 return uint32((uint64(a) - uint64(b) - 1) >> 63)
231 }
232 233 // constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
234 func constantTimeGreater(a, b uint32) uint32 {
235 return constantTimeLess(b, a)
236 }
237 238 // constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
239 func constantTimeGreaterOrEq(a, b uint32) uint32 {
240 return constantTimeLessOrEq(b, a)
241 }
242 243 // constantTimeMin returns min(a,b) in constant time.
244 func constantTimeMin(a, b uint32) uint32 {
245 return b ^ ((a ^ b) & -constantTimeLess(a, b))
246 }
247 248 // overflows determines if the current scalar is greater than or equal to the
249 // group order in constant time and returns 1 if it is or 0 otherwise.
250 func (s *ModNScalar) overflows() uint32 {
251 // The intuition here is that the scalar is greater than the group order if
252 // one of the higher individual words is greater than corresponding word of
253 // the group order and all higher words in the scalar are equal to their
254 // corresponding word of the group order. Since this type is modulo the
255 // group order, being equal is also an overflow back to 0.
256 //
257 // Note that the words 5, 6, and 7 are all the max uint32 value, so there is
258 // no need to test if those individual words of the scalar exceeds them,
259 // hence, only equality is checked for them.
260 highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
261 highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
262 highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
263 overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
264 highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
265 overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
266 highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
267 overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
268 highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
269 overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
270 highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
271 overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)
272 273 return overflow
274 }
275 276 // reduce256 reduces the current scalar modulo the group order in accordance
277 // with the overflows parameter in constant time. The overflows parameter
278 // specifies whether or not the scalar is known to be greater than the group
279 // order and MUST either be 1 in the case it is or 0 in the case it is not for a
280 // correct result.
281 func (s *ModNScalar) reduce256(overflows uint32) {
282 // Notice that since s < 2^256 < 2N (where N is the group order), the max
283 // possible number of reductions required is one. Therefore, in the case a
284 // reduction is needed, it can be performed with a single subtraction of N.
285 // Also, recall that subtraction is equivalent to addition by the two's
286 // complement while ignoring the carry.
287 //
288 // When s >= N, the overflows parameter will be 1. Conversely, it will be 0
289 // when s < N. Thus multiplying by the overflows parameter will either
290 // result in 0 or the multiplicand itself.
291 //
292 // Combining the above along with the fact that s + 0 = s, the following is
293 // a constant time implementation that works by either adding 0 or the two's
294 // complement of N as needed.
295 //
296 // The final result will be in the range 0 <= s < N as expected.
297 overflows64 := uint64(overflows)
298 c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
299 s.n[0] = uint32(c & uint32Mask)
300 c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
301 s.n[1] = uint32(c & uint32Mask)
302 c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
303 s.n[2] = uint32(c & uint32Mask)
304 c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
305 s.n[3] = uint32(c & uint32Mask)
306 c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
307 s.n[4] = uint32(c & uint32Mask)
308 c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
309 s.n[5] = uint32(c & uint32Mask)
310 c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
311 s.n[6] = uint32(c & uint32Mask)
312 c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
313 s.n[7] = uint32(c & uint32Mask)
314 }
315 316 // SetBytes interprets the provided array as a 256-bit big-endian unsigned
317 // integer, reduces it modulo the group order, sets the scalar to the result,
318 // and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
319 // constant time.
320 //
321 // Note that a bool is not used here because it is not possible in Go to convert
322 // from a bool to numeric value in constant time and many constant-time
323 // operations require a numeric value.
324 func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
325 // Pack the 256 total bits across the 8 uint32 words. This could be done
326 // with a for loop, but benchmarks show this unrolled version is about 2
327 // times faster than the variant that uses a loop.
328 s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
329 s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
330 s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
331 s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
332 s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
333 s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
334 s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
335 s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
336 337 // The value might be >= N, so reduce it as required and return whether or
338 // not it was reduced.
339 needsReduce := s.overflows()
340 s.reduce256(needsReduce)
341 return needsReduce
342 }
343 344 // zeroArray32 zeroes the provided 32-byte buffer.
345 func zeroArray32(b *[32]byte) {
346 copy(b[:], zero32[:])
347 }
348 349 // SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
350 // integer (meaning it is truncated to the first 32 bytes), reduces it modulo
351 // the group order, sets the scalar to the result, and returns whether or not
352 // the resulting truncated 256-bit integer overflowed in constant time.
353 //
354 // Note that since passing a slice with more than 32 bytes is truncated, it is
355 // possible that the truncated value is less than the order of the curve and
356 // hence it will not be reported as having overflowed in that case. It is up to
357 // the caller to decide whether it needs to provide numbers of the appropriate
358 // size or it is acceptable to use this function with the described truncation
359 // and overflow behavior.
360 func (s *ModNScalar) SetByteSlice(b []byte) bool {
361 var b32 [32]byte
362 b = b[:constantTimeMin(uint32(len(b)), 32)]
363 copy(b32[:], b32[:32-len(b)])
364 copy(b32[32-len(b):], b)
365 result := s.SetBytes(&b32)
366 zeroArray32(&b32)
367 return result != 0
368 }
369 370 // PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
371 // into the passed byte slice in constant time. The target slice must have at
372 // least 32 bytes available or it will panic.
373 //
374 // There is a similar function, PutBytes, which unpacks the scalar into a
375 // 32-byte array directly. This version is provided since it can be useful to
376 // write directly into part of a larger buffer without needing a separate
377 // allocation.
378 //
379 // Preconditions:
380 // - The target slice MUST have at least 32 bytes available
381 func (s *ModNScalar) PutBytesUnchecked(b []byte) {
382 // Unpack the 256 total bits from the 8 uint32 words. This could be done
383 // with a for loop, but benchmarks show this unrolled version is about 2
384 // times faster than the variant which uses a loop.
385 b[31] = byte(s.n[0])
386 b[30] = byte(s.n[0] >> 8)
387 b[29] = byte(s.n[0] >> 16)
388 b[28] = byte(s.n[0] >> 24)
389 b[27] = byte(s.n[1])
390 b[26] = byte(s.n[1] >> 8)
391 b[25] = byte(s.n[1] >> 16)
392 b[24] = byte(s.n[1] >> 24)
393 b[23] = byte(s.n[2])
394 b[22] = byte(s.n[2] >> 8)
395 b[21] = byte(s.n[2] >> 16)
396 b[20] = byte(s.n[2] >> 24)
397 b[19] = byte(s.n[3])
398 b[18] = byte(s.n[3] >> 8)
399 b[17] = byte(s.n[3] >> 16)
400 b[16] = byte(s.n[3] >> 24)
401 b[15] = byte(s.n[4])
402 b[14] = byte(s.n[4] >> 8)
403 b[13] = byte(s.n[4] >> 16)
404 b[12] = byte(s.n[4] >> 24)
405 b[11] = byte(s.n[5])
406 b[10] = byte(s.n[5] >> 8)
407 b[9] = byte(s.n[5] >> 16)
408 b[8] = byte(s.n[5] >> 24)
409 b[7] = byte(s.n[6])
410 b[6] = byte(s.n[6] >> 8)
411 b[5] = byte(s.n[6] >> 16)
412 b[4] = byte(s.n[6] >> 24)
413 b[3] = byte(s.n[7])
414 b[2] = byte(s.n[7] >> 8)
415 b[1] = byte(s.n[7] >> 16)
416 b[0] = byte(s.n[7] >> 24)
417 }
418 419 // PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
420 // byte array in constant time.
421 //
422 // There is a similar function, PutBytesUnchecked, which unpacks the scalar into
423 // a slice that must have at least 32 bytes available. This version is provided
424 // since it can be useful to write directly into an array that is type checked.
425 //
426 // Alternatively, there is also Bytes, which unpacks the scalar into a new array
427 // and returns that which can sometimes be more ergonomic in applications that
428 // aren't concerned about an additional copy.
429 func (s *ModNScalar) PutBytes(b *[32]byte) {
430 s.PutBytesUnchecked(b[:])
431 }
432 433 // Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
434 //
435 // See PutBytes and PutBytesUnchecked for variants that allow an array or slice
436 // to be passed which can be useful to cut down on the number of allocations
437 // by allowing the caller to reuse a buffer or write directly into part of a
438 // larger buffer.
439 func (s *ModNScalar) Bytes() [32]byte {
440 var b [32]byte
441 s.PutBytesUnchecked(b[:])
442 return b
443 }
444 445 // IsOdd returns whether or not the scalar is an odd number in constant time.
446 func (s *ModNScalar) IsOdd() bool {
447 // Only odd numbers have the bottom bit set.
448 return s.n[0]&1 == 1
449 }
450 451 // Equals returns whether or not the two scalars are the same in constant time.
452 func (s *ModNScalar) Equals(val *ModNScalar) bool {
453 // Xor only sets bits when they are different, so the two scalars can only
454 // be the same if no bits are set after xoring each word.
455 bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
456 (s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
457 (s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])
458 459 return bits == 0
460 }
461 462 // Add2 adds the passed two scalars together modulo the group order in constant
463 // time and stores the result in s.
464 //
465 // The scalar is returned to support chaining. This enables syntax like:
466 // s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
467 func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
468 c := uint64(val1.n[0]) + uint64(val2.n[0])
469 s.n[0] = uint32(c & uint32Mask)
470 c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
471 s.n[1] = uint32(c & uint32Mask)
472 c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
473 s.n[2] = uint32(c & uint32Mask)
474 c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
475 s.n[3] = uint32(c & uint32Mask)
476 c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
477 s.n[4] = uint32(c & uint32Mask)
478 c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
479 s.n[5] = uint32(c & uint32Mask)
480 c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
481 s.n[6] = uint32(c & uint32Mask)
482 c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
483 s.n[7] = uint32(c & uint32Mask)
484 485 // The result is now 256 bits, but it might still be >= N, so use the
486 // existing normal reduce method for 256-bit values.
487 s.reduce256(uint32(c>>32) + s.overflows())
488 return s
489 }
490 491 // Add adds the passed scalar to the existing one modulo the group order in
492 // constant time and stores the result in s.
493 //
494 // The scalar is returned to support chaining. This enables syntax like:
495 // s.Add(s2).AddInt(1) so that s = s + s2 + 1.
496 func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar {
497 return s.Add2(s, val)
498 }
499 500 // accumulator96 provides a 96-bit accumulator for use in the intermediate
501 // calculations requiring more than 64-bits.
502 type accumulator96 struct {
503 n [3]uint32
504 }
505 506 // Add adds the passed unsigned 64-bit value to the accumulator.
507 func (a *accumulator96) Add(v uint64) {
508 low := uint32(v & uint32Mask)
509 hi := uint32(v >> 32)
510 a.n[0] += low
511 hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
512 a.n[1] += hi
513 a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
514 }
515 516 // Rsh32 right shifts the accumulator by 32 bits.
517 func (a *accumulator96) Rsh32() {
518 a.n[0] = a.n[1]
519 a.n[1] = a.n[2]
520 a.n[2] = 0
521 }
522 523 // reduce385 reduces the 385-bit intermediate result in the passed terms modulo
524 // the group order in constant time and stores the result in s.
525 func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
526 // At this point, the intermediate result in the passed terms has been
527 // reduced to fit within 385 bits, so reduce it again using the same method
528 // described in reduce512. As before, the intermediate result will end up
529 // being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
530 // needed for this iteration. The reduced terms are assigned back to t0
531 // through t8.
532 //
533 // Note that several of the intermediate calculations require adding 64-bit
534 // products together which would overflow a uint64, so a 96-bit accumulator
535 // is used instead until the value is reduced enough to use native uint64s.
536 537 // Terms for 2^(32*0).
538 var acc accumulator96
539 acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
540 acc.Add(t8 * uint64(orderComplementWordZero))
541 t0 = uint64(acc.n[0])
542 acc.Rsh32()
543 544 // Terms for 2^(32*1).
545 acc.Add(t1)
546 acc.Add(t8 * uint64(orderComplementWordOne))
547 acc.Add(t9 * uint64(orderComplementWordZero))
548 t1 = uint64(acc.n[0])
549 acc.Rsh32()
550 551 // Terms for 2^(32*2).
552 acc.Add(t2)
553 acc.Add(t8 * uint64(orderComplementWordTwo))
554 acc.Add(t9 * uint64(orderComplementWordOne))
555 acc.Add(t10 * uint64(orderComplementWordZero))
556 t2 = uint64(acc.n[0])
557 acc.Rsh32()
558 559 // Terms for 2^(32*3).
560 acc.Add(t3)
561 acc.Add(t8 * uint64(orderComplementWordThree))
562 acc.Add(t9 * uint64(orderComplementWordTwo))
563 acc.Add(t10 * uint64(orderComplementWordOne))
564 acc.Add(t11 * uint64(orderComplementWordZero))
565 t3 = uint64(acc.n[0])
566 acc.Rsh32()
567 568 // Terms for 2^(32*4).
569 acc.Add(t4)
570 acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
571 acc.Add(t9 * uint64(orderComplementWordThree))
572 acc.Add(t10 * uint64(orderComplementWordTwo))
573 acc.Add(t11 * uint64(orderComplementWordOne))
574 acc.Add(t12 * uint64(orderComplementWordZero))
575 t4 = uint64(acc.n[0])
576 acc.Rsh32()
577 578 // Terms for 2^(32*5).
579 acc.Add(t5)
580 // acc.Add(t8 * uint64(orderComplementWordFive)) // 0
581 acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
582 acc.Add(t10 * uint64(orderComplementWordThree))
583 acc.Add(t11 * uint64(orderComplementWordTwo))
584 acc.Add(t12 * uint64(orderComplementWordOne))
585 t5 = uint64(acc.n[0])
586 acc.Rsh32()
587 588 // Terms for 2^(32*6).
589 acc.Add(t6)
590 // acc.Add(t8 * uint64(orderComplementWordSix)) // 0
591 // acc.Add(t9 * uint64(orderComplementWordFive)) // 0
592 acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
593 acc.Add(t11 * uint64(orderComplementWordThree))
594 acc.Add(t12 * uint64(orderComplementWordTwo))
595 t6 = uint64(acc.n[0])
596 acc.Rsh32()
597 598 // Terms for 2^(32*7).
599 acc.Add(t7)
600 // acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
601 // acc.Add(t9 * uint64(orderComplementWordSix)) // 0
602 // acc.Add(t10 * uint64(orderComplementWordFive)) // 0
603 acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
604 acc.Add(t12 * uint64(orderComplementWordThree))
605 t7 = uint64(acc.n[0])
606 acc.Rsh32()
607 608 // Terms for 2^(32*8).
609 // acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
610 // acc.Add(t10 * uint64(orderComplementWordSix)) // 0
611 // acc.Add(t11 * uint64(orderComplementWordFive)) // 0
612 acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
613 t8 = uint64(acc.n[0])
614 // acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
615 616 // NOTE: All of the remaining multiplications for this iteration result in 0
617 // as they all involve multiplying by combinations of the fifth, sixth, and
618 // seventh words of the two's complement of N, which are 0, so skip them.
619 620 // At this point, the result is reduced to fit within 258 bits, so reduce it
621 // again using a slightly modified version of the same method. The maximum
622 // value in t8 is 2 at this point and therefore multiplying it by each word
623 // of the two's complement of N and adding it to a 32-bit term will result
624 // in a maximum requirement of 33 bits, so it is safe to use native uint64s
625 // here for the intermediate term carry propagation.
626 //
627 // Also, since the maximum value in t8 is 2, this ends up reducing by
628 // another 2 bits to 256 bits.
629 c := t0 + t8*uint64(orderComplementWordZero)
630 s.n[0] = uint32(c & uint32Mask)
631 c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
632 s.n[1] = uint32(c & uint32Mask)
633 c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
634 s.n[2] = uint32(c & uint32Mask)
635 c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
636 s.n[3] = uint32(c & uint32Mask)
637 c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
638 s.n[4] = uint32(c & uint32Mask)
639 c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
640 s.n[5] = uint32(c & uint32Mask)
641 c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
642 s.n[6] = uint32(c & uint32Mask)
643 c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
644 s.n[7] = uint32(c & uint32Mask)
645 646 // The result is now 256 bits, but it might still be >= N, so use the
647 // existing normal reduce method for 256-bit values.
648 s.reduce256(uint32(c>>32) + s.overflows())
649 }
650 651 // reduce512 reduces the 512-bit intermediate result in the passed terms modulo
652 // the group order down to 385 bits in constant time and stores the result in s.
653 func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
654 // At this point, the intermediate result in the passed terms is grouped
655 // into the respective bases.
656 //
657 // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
658 // when the modulus is of the special form m = b^t - c, where log_2(c) < t,
659 // highly efficient reduction can be achieved per the provided algorithm.
660 //
661 // The secp256k1 group order fits this criteria since it is:
662 // 2^256 - 432420386565659656852420866394968145599
663 //
664 // Technically the max possible value here is (N-1)^2 since the two scalars
665 // being multiplied are always mod N. Nevertheless, it is safer to consider
666 // it to be (2^256-1)^2 = 2^512 - 2^257 + 1 since it is the product of two
667 // 256-bit values.
668 //
669 // The algorithm is to reduce the result modulo the prime by subtracting
670 // multiples of the group order N. However, in order simplify carry
671 // propagation, this adds with the two's complement of N to achieve the same
672 // result.
673 //
674 // Since the two's complement of N has 127 leading zero bits, this will end
675 // up reducing the intermediate result from 512 bits to 385 bits, resulting
676 // in 13 32-bit terms. The reduced terms are assigned back to t0 through
677 // t12.
678 //
679 // Note that several of the intermediate calculations require adding 64-bit
680 // products together which would overflow a uint64, so a 96-bit accumulator
681 // is used instead.
682 683 // Terms for 2^(32*0).
684 var acc accumulator96
685 acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
686 acc.Add(t8 * uint64(orderComplementWordZero))
687 t0 = uint64(acc.n[0])
688 acc.Rsh32()
689 690 // Terms for 2^(32*1).
691 acc.Add(t1)
692 acc.Add(t8 * uint64(orderComplementWordOne))
693 acc.Add(t9 * uint64(orderComplementWordZero))
694 t1 = uint64(acc.n[0])
695 acc.Rsh32()
696 697 // Terms for 2^(32*2).
698 acc.Add(t2)
699 acc.Add(t8 * uint64(orderComplementWordTwo))
700 acc.Add(t9 * uint64(orderComplementWordOne))
701 acc.Add(t10 * uint64(orderComplementWordZero))
702 t2 = uint64(acc.n[0])
703 acc.Rsh32()
704 705 // Terms for 2^(32*3).
706 acc.Add(t3)
707 acc.Add(t8 * uint64(orderComplementWordThree))
708 acc.Add(t9 * uint64(orderComplementWordTwo))
709 acc.Add(t10 * uint64(orderComplementWordOne))
710 acc.Add(t11 * uint64(orderComplementWordZero))
711 t3 = uint64(acc.n[0])
712 acc.Rsh32()
713 714 // Terms for 2^(32*4).
715 acc.Add(t4)
716 acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
717 acc.Add(t9 * uint64(orderComplementWordThree))
718 acc.Add(t10 * uint64(orderComplementWordTwo))
719 acc.Add(t11 * uint64(orderComplementWordOne))
720 acc.Add(t12 * uint64(orderComplementWordZero))
721 t4 = uint64(acc.n[0])
722 acc.Rsh32()
723 724 // Terms for 2^(32*5).
725 acc.Add(t5)
726 // acc.Add(t8 * uint64(orderComplementWordFive)) // 0
727 acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
728 acc.Add(t10 * uint64(orderComplementWordThree))
729 acc.Add(t11 * uint64(orderComplementWordTwo))
730 acc.Add(t12 * uint64(orderComplementWordOne))
731 acc.Add(t13 * uint64(orderComplementWordZero))
732 t5 = uint64(acc.n[0])
733 acc.Rsh32()
734 735 // Terms for 2^(32*6).
736 acc.Add(t6)
737 // acc.Add(t8 * uint64(orderComplementWordSix)) // 0
738 // acc.Add(t9 * uint64(orderComplementWordFive)) // 0
739 acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1
740 acc.Add(t11 * uint64(orderComplementWordThree))
741 acc.Add(t12 * uint64(orderComplementWordTwo))
742 acc.Add(t13 * uint64(orderComplementWordOne))
743 acc.Add(t14 * uint64(orderComplementWordZero))
744 t6 = uint64(acc.n[0])
745 acc.Rsh32()
746 747 // Terms for 2^(32*7).
748 acc.Add(t7)
749 // acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
750 // acc.Add(t9 * uint64(orderComplementWordSix)) // 0
751 // acc.Add(t10 * uint64(orderComplementWordFive)) // 0
752 acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
753 acc.Add(t12 * uint64(orderComplementWordThree))
754 acc.Add(t13 * uint64(orderComplementWordTwo))
755 acc.Add(t14 * uint64(orderComplementWordOne))
756 acc.Add(t15 * uint64(orderComplementWordZero))
757 t7 = uint64(acc.n[0])
758 acc.Rsh32()
759 760 // Terms for 2^(32*8).
761 // acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
762 // acc.Add(t10 * uint64(orderComplementWordSix)) // 0
763 // acc.Add(t11 * uint64(orderComplementWordFive)) // 0
764 acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
765 acc.Add(t13 * uint64(orderComplementWordThree))
766 acc.Add(t14 * uint64(orderComplementWordTwo))
767 acc.Add(t15 * uint64(orderComplementWordOne))
768 t8 = uint64(acc.n[0])
769 acc.Rsh32()
770 771 // Terms for 2^(32*9).
772 // acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
773 // acc.Add(t11 * uint64(orderComplementWordSix)) // 0
774 // acc.Add(t12 * uint64(orderComplementWordFive)) // 0
775 acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
776 acc.Add(t14 * uint64(orderComplementWordThree))
777 acc.Add(t15 * uint64(orderComplementWordTwo))
778 t9 = uint64(acc.n[0])
779 acc.Rsh32()
780 781 // Terms for 2^(32*10).
782 // acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
783 // acc.Add(t12 * uint64(orderComplementWordSix)) // 0
784 // acc.Add(t13 * uint64(orderComplementWordFive)) // 0
785 acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
786 acc.Add(t15 * uint64(orderComplementWordThree))
787 t10 = uint64(acc.n[0])
788 acc.Rsh32()
789 790 // Terms for 2^(32*11).
791 // acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
792 // acc.Add(t13 * uint64(orderComplementWordSix)) // 0
793 // acc.Add(t14 * uint64(orderComplementWordFive)) // 0
794 acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
795 t11 = uint64(acc.n[0])
796 acc.Rsh32()
797 798 // NOTE: All of the remaining multiplications for this iteration result in 0
799 // as they all involve multiplying by combinations of the fifth, sixth, and
800 // seventh words of the two's complement of N, which are 0, so skip them.
801 802 // Terms for 2^(32*12).
803 t12 = uint64(acc.n[0])
804 // acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
805 806 // At this point, the result is reduced to fit within 385 bits, so reduce it
807 // again using the same method accordingly.
808 s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
809 }
810 811 // Mul2 multiplies the passed two scalars together modulo the group order in
812 // constant time and stores the result in s.
813 //
814 // The scalar is returned to support chaining. This enables syntax like:
815 // s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
816 func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
817 // This could be done with for loops and an array to store the intermediate
818 // terms, but this unrolled version is significantly faster.
819 820 // The overall strategy employed here is:
821 // 1) Calculate the 512-bit product of the two scalars using the standard
822 // pencil-and-paper method.
823 // 2) Reduce the result modulo the prime by effectively subtracting
824 // multiples of the group order N (actually performed by adding multiples
825 // of the two's complement of N to avoid implementing subtraction).
826 // 3) Repeat step 2 noting that each iteration reduces the required number
827 // of bits by 127 because the two's complement of N has 127 leading zero
828 // bits.
829 // 4) Once reduced to 256 bits, call the existing reduce method to perform
830 // a final reduction as needed.
831 //
832 // Note that several of the intermediate calculations require adding 64-bit
833 // products together which would overflow a uint64, so a 96-bit accumulator
834 // is used instead.
835 836 // Terms for 2^(32*0).
837 var acc accumulator96
838 acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
839 t0 := uint64(acc.n[0])
840 acc.Rsh32()
841 842 // Terms for 2^(32*1).
843 acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
844 acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
845 t1 := uint64(acc.n[0])
846 acc.Rsh32()
847 848 // Terms for 2^(32*2).
849 acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
850 acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
851 acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
852 t2 := uint64(acc.n[0])
853 acc.Rsh32()
854 855 // Terms for 2^(32*3).
856 acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
857 acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
858 acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
859 acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
860 t3 := uint64(acc.n[0])
861 acc.Rsh32()
862 863 // Terms for 2^(32*4).
864 acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
865 acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
866 acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
867 acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
868 acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
869 t4 := uint64(acc.n[0])
870 acc.Rsh32()
871 872 // Terms for 2^(32*5).
873 acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
874 acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
875 acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
876 acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
877 acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
878 acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
879 t5 := uint64(acc.n[0])
880 acc.Rsh32()
881 882 // Terms for 2^(32*6).
883 acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
884 acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
885 acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
886 acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
887 acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
888 acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
889 acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
890 t6 := uint64(acc.n[0])
891 acc.Rsh32()
892 893 // Terms for 2^(32*7).
894 acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
895 acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
896 acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
897 acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
898 acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
899 acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
900 acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
901 acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
902 t7 := uint64(acc.n[0])
903 acc.Rsh32()
904 905 // Terms for 2^(32*8).
906 acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
907 acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
908 acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
909 acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
910 acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
911 acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
912 acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
913 t8 := uint64(acc.n[0])
914 acc.Rsh32()
915 916 // Terms for 2^(32*9).
917 acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
918 acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
919 acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
920 acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
921 acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
922 acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
923 t9 := uint64(acc.n[0])
924 acc.Rsh32()
925 926 // Terms for 2^(32*10).
927 acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
928 acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
929 acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
930 acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
931 acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
932 t10 := uint64(acc.n[0])
933 acc.Rsh32()
934 935 // Terms for 2^(32*11).
936 acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
937 acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
938 acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
939 acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
940 t11 := uint64(acc.n[0])
941 acc.Rsh32()
942 943 // Terms for 2^(32*12).
944 acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
945 acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
946 acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
947 t12 := uint64(acc.n[0])
948 acc.Rsh32()
949 950 // Terms for 2^(32*13).
951 acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
952 acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
953 t13 := uint64(acc.n[0])
954 acc.Rsh32()
955 956 // Terms for 2^(32*14).
957 acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
958 t14 := uint64(acc.n[0])
959 acc.Rsh32()
960 961 // What's left is for 2^(32*15).
962 t15 := uint64(acc.n[0])
963 // acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
964 965 // At this point, all of the terms are grouped into their respective base
966 // and occupy up to 512 bits. Reduce the result accordingly.
967 s.reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
968 t15)
969 return s
970 }
971 972 // Mul multiplies the passed scalar with the existing one modulo the group order
973 // in constant time and stores the result in s.
974 //
975 // The scalar is returned to support chaining. This enables syntax like:
976 // s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
977 func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
978 return s.Mul2(s, val)
979 }
980 981 // SquareVal squares the passed scalar modulo the group order in constant time
982 // and stores the result in s.
983 //
984 // The scalar is returned to support chaining. This enables syntax like:
985 // s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
986 func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
987 // This could technically be optimized slightly to take advantage of the
988 // fact that many of the intermediate calculations in squaring are just
989 // doubling, however, benchmarking has shown that due to the need to use a
990 // 96-bit accumulator, any savings are essentially offset by that and
991 // consequently there is no real difference in performance over just
992 // multiplying the value by itself to justify the extra code for now. This
993 // can be revisited in the future if it becomes a bottleneck in practice.
994 995 return s.Mul2(val, val)
996 }
997 998 // Square squares the scalar modulo the group order in constant time. The
999 // existing scalar is modified.
1000 //
1001 // The scalar is returned to support chaining. This enables syntax like:
1002 // s.Square().Mul(s2) so that s = s^2 * s2.
1003 func (s *ModNScalar) Square() *ModNScalar {
1004 return s.SquareVal(s)
1005 }
1006 1007 // NegateVal negates the passed scalar modulo the group order and stores the
1008 // result in s in constant time.
1009 //
1010 // The scalar is returned to support chaining. This enables syntax like:
1011 // s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
1012 func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
1013 // Since the scalar is already in the range 0 <= val < N, where N is the
1014 // group order, negation modulo the group order is just the group order
1015 // minus the value. This implies that the result will always be in the
1016 // desired range with the sole exception of 0 because N - 0 = N itself.
1017 //
1018 // Therefore, in order to avoid the need to reduce the result for every
1019 // other case in order to achieve constant time, this creates a mask that is
1020 // all 0s in the case of the scalar being negated is 0 and all 1s otherwise
1021 // and bitwise ands that mask with each word.
1022 //
1023 // Finally, to simplify the carry propagation, this adds the two's
1024 // complement of the scalar to N in order to achieve the same result.
1025 bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
1026 val.n[6] | val.n[7]
1027 mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
1028 c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
1029 s.n[0] = uint32(c & mask)
1030 c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
1031 s.n[1] = uint32(c & mask)
1032 c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
1033 s.n[2] = uint32(c & mask)
1034 c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
1035 s.n[3] = uint32(c & mask)
1036 c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
1037 s.n[4] = uint32(c & mask)
1038 c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
1039 s.n[5] = uint32(c & mask)
1040 c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
1041 s.n[6] = uint32(c & mask)
1042 c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
1043 s.n[7] = uint32(c & mask)
1044 return s
1045 }
1046 1047 // Negate negates the scalar modulo the group order in constant time. The
1048 // existing scalar is modified.
1049 //
1050 // The scalar is returned to support chaining. This enables syntax like:
1051 // s.Negate().AddInt(1) so that s = -s + 1.
1052 func (s *ModNScalar) Negate() *ModNScalar {
1053 return s.NegateVal(s)
1054 }
1055 1056 // InverseValNonConst finds the modular multiplicative inverse of the passed
1057 // scalar and stores result in s in *non-constant* time.
1058 //
1059 // The scalar is returned to support chaining. This enables syntax like:
1060 // s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
1061 func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
1062 // This is making use of big integers for now. Ideally it will be replaced
1063 // with an implementation that does not depend on big integers.
1064 valBytes := val.Bytes()
1065 bigVal := new(big.Int).SetBytes(valBytes[:])
1066 bigVal.ModInverse(bigVal, curveParams.N)
1067 s.SetByteSlice(bigVal.Bytes())
1068 return s
1069 }
1070 1071 // InverseNonConst finds the modular multiplicative inverse of the scalar in
1072 // *non-constant* time. The existing scalar is modified.
1073 //
1074 // The scalar is returned to support chaining. This enables syntax like:
1075 // s.Inverse().Mul(s2) so that s = s^-1 * s2.
1076 func (s *ModNScalar) InverseNonConst() *ModNScalar {
1077 return s.InverseValNonConst(s)
1078 }
1079 1080 // IsOverHalfOrder returns whether or not the scalar exceeds the group order
1081 // divided by 2 in constant time.
1082 func (s *ModNScalar) IsOverHalfOrder() bool {
1083 // The intuition here is that the scalar is greater than half of the group
1084 // order if one of the higher individual words is greater than the
1085 // corresponding word of the half group order and all higher words in the
1086 // scalar are equal to their corresponding word of the half group order.
1087 //
1088 // Note that the words 4, 5, and 6 are all the max uint32 value, so there is
1089 // no need to test if those individual words of the scalar exceeds them,
1090 // hence, only equality is checked for them.
1091 result := constantTimeGreater(s.n[7], halfOrderWordSeven)
1092 highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
1093 highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
1094 highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
1095 highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
1096 result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
1097 highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
1098 result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
1099 highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
1100 result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
1101 highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
1102 result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)
1103 1104 return result != 0
1105 }
1106