1 // Copyright (c) 2020-2023 The Decred developers
2 // Use of this source code is governed by an ISC
3 // license that can be found in the LICENSE file.
4 5 package secp256k1
6 7 import (
8 "math/big"
9 10 "smesh.lol/pkg/nostr/hex"
11 )
12 13 // References:
14 // [SECG]: Recommended Elliptic Curve Domain Parameters
15 // https://www.secg.org/sec2-v2.pdf
16 //
17 // [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
18 // http://cacr.uwaterloo.ca/hac/
19 20 // Many elliptic curve operations require working with scalars in a finite field
21 // characterized by the order of the group underlying the secp256k1 curve.
22 // Given this precision is larger than the biggest available native type,
23 // obviously some form of bignum math is needed. This code implements
24 // specialized fixed-precision field arithmetic rather than relying on an
25 // arbitrary-precision arithmetic package such as math/big for dealing with the
26 // math modulo the group order since the size is known. As a result, rather
27 // large performance gains are achieved by taking advantage of many
28 // optimizations not available to arbitrary-precision arithmetic and generic
29 // modular arithmetic algorithms.
30 //
31 // There are various ways to internally represent each element. For example,
32 // the most obvious representation would be to use an array of 4 uint64s (64
33 // bits * 4 = 256 bits). However, that representation suffers from the fact
34 // that there is no native Go type large enough to handle the intermediate
35 // results while adding or multiplying two 64-bit numbers.
36 //
37 // Given the above, this implementation represents the field elements as 8
38 // uint32s with each word (array entry) treated as base 2^32. This was chosen
39 // because most systems at the current time are 64-bit (or at least have 64-bit
40 // registers available for specialized purposes such as MMX) so the intermediate
41 // results can typically be done using a native register (and using uint64s to
42 // avoid the need for additional half-word arithmetic)
const (
	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N to improve code readability.
	//
	// The group order of the curve per [SECG] is:
	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
	//
	// orderWordZero is the least significant word (coefficient of 2^(32*0)).
	//
	// nolint: dupword
	orderWordZero  uint32 = 0xd0364141
	orderWordOne   uint32 = 0xbfd25e8c
	orderWordTwo   uint32 = 0xaf48a03b
	orderWordThree uint32 = 0xbaaedce6
	orderWordFour  uint32 = 0xfffffffe
	orderWordFive  uint32 = 0xffffffff
	orderWordSix   uint32 = 0xffffffff
	orderWordSeven uint32 = 0xffffffff

	// These fields provide convenient access to each of the words of the two's
	// complement of the secp256k1 curve group order N to improve code
	// readability.
	//
	// The two's complement of the group order is:
	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
	//
	// The words derive their uint32 type from the order words above, so the
	// bitwise complement and the +1 wrap modulo 2^32 as required.
	orderComplementWordZero  = (^orderWordZero) + 1
	orderComplementWordOne   = ^orderWordOne
	orderComplementWordTwo   = ^orderWordTwo
	orderComplementWordThree = ^orderWordThree
	// orderComplementWordFour uint32 = ^orderWordFour // unused
	// orderComplementWordFive uint32 = ^orderWordFive // unused
	// orderComplementWordSix uint32 = ^orderWordSix // unused
	// orderComplementWordSeven uint32 = ^orderWordSeven // unused

	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N / 2 to improve code readability and avoid
	// the need to recalculate them.
	//
	// The half order of the secp256k1 curve group is:
	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
	//
	// nolint: dupword
	halfOrderWordZero  uint32 = 0x681b20a0
	halfOrderWordOne   uint32 = 0xdfe92f46
	halfOrderWordTwo   uint32 = 0x57a4501d
	halfOrderWordThree uint32 = 0x5d576e73
	halfOrderWordFour  uint32 = 0xffffffff
	halfOrderWordFive  uint32 = 0xffffffff
	halfOrderWordSix   uint32 = 0xffffffff
	halfOrderWordSeven uint32 = 0x7fffffff

	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
	// improve the readability of the code.
	//
	// It is intentionally left untyped so it can be used directly in both
	// uint32 and uint64 expressions without conversion.
	uint32Mask = 0xffffffff
)
var (
	// zero32 is an array of 32 bytes used for the purposes of zeroing and is
	// defined here to avoid extra allocations.
	//
	// It MUST NOT be modified since it is copied over caller buffers to clear
	// them (see zeroArray32).
	zero32 = [32]byte{}
)
// ModNScalar implements optimized 256-bit constant-time fixed-precision
// arithmetic over the secp256k1 group order. This means all arithmetic is
// performed modulo:
//
//	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
//
// It only implements the arithmetic needed for elliptic curve operations,
// however, the operations that are not implemented can typically be worked
// around if absolutely needed. For example, subtraction can be performed by
// adding the negation.
//
// Should it be absolutely necessary, conversion to the standard library
// math/big.Int can be accomplished by using the Bytes method, slicing the
// resulting fixed-size array, and feeding it to big.Int.SetBytes. However,
// that should typically be avoided when possible as conversion to big.Ints
// requires allocations, is not constant time, and is slower when working modulo
// the group order.
type ModNScalar struct {
	// The scalar is represented as 8 32-bit integers in base 2^32.
	//
	// The following depicts the internal representation:
	// ---------------------------------------------------------
	// | n[7] | n[6] | ... | n[0] |
	// | 32 bits | 32 bits | ... | 32 bits |
	// | Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
	// ---------------------------------------------------------
	//
	// For example, consider the number 2^87 + 2^42 + 1. It would be
	// represented as:
	// n[0] = 1
	// n[1] = 2^10
	// n[2] = 2^23
	// n[3..7] = 0
	//
	// The full 256-bit value is then calculated by looping i from 7..0 and
	// doing sum(n[i] * 2^(32i)) like so:
	// n[7] * 2^(32*7) = 0 * 2^224 = 0
	// n[6] * 2^(32*6) = 0 * 2^192 = 0
	// ...
	// n[2] * 2^(32*2) = 2^23 * 2^64 = 2^87
	// n[1] * 2^(32*1) = 2^10 * 2^32 = 2^42
	// n[0] * 2^(32*0) = 1 * 2^0 = 1
	// Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
	//
	// NOTE(review): the setters and arithmetic methods in this file reduce
	// their results modulo N, so n is expected to always hold a fully
	// reduced value.
	n [8]uint32
}
146 147 // String returns the scalar as a human-readable hex string.
148 //
149 // This is NOT constant time.
150 func (s ModNScalar) String() string {
151 b := s.Bytes()
152 return hex.Enc(b[:])
153 }
154 155 // Set sets the scalar equal to a copy of the passed one in constant time.
156 //
157 // The scalar is returned to support chaining. This enables syntax like:
158 // s := new(ModNScalar).Set(s2).Add(1) so that s = s2 + 1 where s2 is not
159 // modified.
160 func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
161 *s = *val
162 return s
163 }
164 165 // Zero sets the scalar to zero in constant time. A newly created scalar is
166 // already set to zero. This function can be useful to clear an existing scalar
167 // for reuse.
168 func (s *ModNScalar) Zero() {
169 s.n[0] = 0
170 s.n[1] = 0
171 s.n[2] = 0
172 s.n[3] = 0
173 s.n[4] = 0
174 s.n[5] = 0
175 s.n[6] = 0
176 s.n[7] = 0
177 }
178 179 // IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
180 // constant time.
181 //
182 // Note that a bool is not used here because it is not possible in Go to convert
183 // from a bool to numeric value in constant time and many constant-time
184 // operations require a numeric value. See IsZero for the version that returns
185 // a bool.
186 func (s *ModNScalar) IsZeroBit() uint32 {
187 // The scalar can only be zero if no bits are set in any of the words.
188 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
189 return constantTimeEq(bits, 0)
190 }
191 192 // IsZero returns whether or not the scalar is equal to zero in constant time.
193 func (s *ModNScalar) IsZero() bool {
194 // The scalar can only be zero if no bits are set in any of the words.
195 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
196 return bits == 0
197 }
198 199 // SetInt sets the scalar to the passed integer in constant time. This is a
200 // convenience function since it is fairly common to perform some arithmetic
201 // with small native integers.
202 //
203 // The scalar is returned to support chaining. This enables syntax like:
204 // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
205 func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
206 s.Zero()
207 s.n[0] = ui
208 return s
209 }
// constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
func constantTimeEq(a, b uint32) uint32 {
	// a^b is zero only when the operands match. Subtracting 1 from zero in
	// uint64 wraps to all ones, so the top bit is set exactly in the equal
	// case, and the shift extracts it as 0 or 1.
	diff := uint64(a ^ b)
	return uint32((diff - 1) >> 63)
}
// constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
func constantTimeNotEq(a, b uint32) uint32 {
	// For any nonzero 64-bit x, at least one of x and -x has the most
	// significant bit set, so (x | -x) >> 63 is 1 exactly when x != 0 and 0
	// when x == 0.
	x := uint64(a ^ b)
	return uint32((x | -x) >> 63)
}
// constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
func constantTimeLess(a, b uint32) uint32 {
	// When a < b the widened subtraction wraps, which sets the borrow into
	// the most significant bit of the 64-bit result.
	borrow := uint64(a) - uint64(b)
	return uint32(borrow >> 63)
}
// constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
func constantTimeLessOrEq(a, b uint32) uint32 {
	// a <= b is the same as a < b+1, computed here via the borrow of the
	// widened a - b - 1, which cannot itself underflow spuriously since both
	// operands fit in 32 bits.
	borrow := uint64(a) - uint64(b) - 1
	return uint32(borrow >> 63)
}
// constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
func constantTimeGreater(a, b uint32) uint32 {
	// a > b is exactly b < a: the widened subtraction b - a wraps (setting
	// the high bit) precisely when b is the smaller operand.
	return uint32((uint64(b) - uint64(a)) >> 63)
}
// constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
func constantTimeGreaterOrEq(a, b uint32) uint32 {
	// a >= b is exactly b <= a, computed via the borrow of the widened
	// b - a - 1.
	return uint32((uint64(b) - uint64(a) - 1) >> 63)
}
// constantTimeMin returns min(a,b) in constant time.
func constantTimeMin(a, b uint32) uint32 {
	// mask is all ones when a < b (borrow set) and all zeros otherwise,
	// selecting between the operands without branching.
	mask := -uint32((uint64(a) - uint64(b)) >> 63)
	return (a & mask) | (b &^ mask)
}
245 246 // overflows determines if the current scalar is greater than or equal to the
247 // group order in constant time and returns 1 if it is or 0 otherwise.
248 func (s *ModNScalar) overflows() uint32 {
249 // The intuition here is that the scalar is greater than the group order if
250 // one of the higher individual words is greater than corresponding word of
251 // the group order and all higher words in the scalar are equal to their
252 // corresponding word of the group order. Since this type is modulo the
253 // group order, being equal is also an overflow back to 0.
254 //
255 // Note that the words 5, 6, and 7 are all the max uint32 value, so there is
256 // no need to test if those individual words of the scalar exceeds them,
257 // hence, only equality is checked for them.
258 highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
259 highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
260 highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
261 overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
262 highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
263 overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
264 highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
265 overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
266 highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
267 overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
268 highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
269 overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)
270 return overflow
271 }
272 273 // reduce256 reduces the current scalar modulo the group order in accordance
274 // with the overflows parameter in constant time. The overflows parameter
275 // specifies whether or not the scalar is known to be greater than the group
276 // order and MUST either be 1 in the case it is or 0 in the case it is not for a
277 // correct result.
278 func (s *ModNScalar) reduce256(overflows uint32) {
279 // Notice that since s < 2^256 < 2N (where N is the group order), the max
280 // possible number of reductions required is one. Therefore, in the case a
281 // reduction is needed, it can be performed with a single subtraction of N.
282 // Also, recall that subtraction is equivalent to addition by the two's
283 // complement while ignoring the carry.
284 //
285 // When s >= N, the overflows parameter will be 1. Conversely, it will be 0
286 // when s < N. Thus multiplying by the overflows parameter will either
287 // result in 0 or the multiplicand itself.
288 //
289 // Combining the above along with the fact that s + 0 = s, the following is
290 // a constant time implementation that works by either adding 0 or the two's
291 // complement of N as needed.
292 //
293 // The final result will be in the range 0 <= s < N as expected.
294 overflows64 := uint64(overflows)
295 c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
296 s.n[0] = uint32(c & uint32Mask)
297 c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
298 s.n[1] = uint32(c & uint32Mask)
299 c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
300 s.n[2] = uint32(c & uint32Mask)
301 c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
302 s.n[3] = uint32(c & uint32Mask)
303 c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
304 s.n[4] = uint32(c & uint32Mask)
305 c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
306 s.n[5] = uint32(c & uint32Mask)
307 c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
308 s.n[6] = uint32(c & uint32Mask)
309 c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
310 s.n[7] = uint32(c & uint32Mask)
311 }
312 313 // SetBytes interprets the provided array as a 256-bit big-endian unsigned
314 // integer, reduces it modulo the group order, sets the scalar to the result,
315 // and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
316 // constant time.
317 //
318 // Note that a bool is not used here because it is not possible in Go to convert
319 // from a bool to numeric value in constant time and many constant-time
320 // operations require a numeric value.
321 func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
322 // Pack the 256 total bits across the 8 uint32 words. This could be done
323 // with a for loop, but benchmarks show this unrolled version is about 2
324 // times faster than the variant that uses a loop.
325 s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
326 s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
327 s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
328 s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
329 s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
330 s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
331 s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
332 s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
333 // The value might be >= N, so reduce it as required and return whether or
334 // not it was reduced.
335 needsReduce := s.overflows()
336 s.reduce256(needsReduce)
337 return needsReduce
338 }
339 340 // zeroArray32 zeroes the provided 32-byte buffer.
341 func zeroArray32(b *[32]byte) { copy(b[:], zero32[:]) }
342 343 // SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
344 // integer (meaning it is truncated to the first 32 bytes), reduces it modulo
345 // the group order, sets the scalar to the result, and returns whether or not
346 // the resulting truncated 256-bit integer overflowed in constant time.
347 //
348 // Note that since passing a slice with more than 32 bytes is truncated, it is
349 // possible that the truncated value is less than the order of the curve and
350 // hence it will not be reported as having overflowed in that case. It is up to
351 // the caller to decide whether it needs to provide numbers of the appropriate
352 // size or it is acceptable to use this function with the described truncation
353 // and overflow behavior.
354 func (s *ModNScalar) SetByteSlice(b []byte) bool {
355 var b32 [32]byte
356 b = b[:constantTimeMin(uint32(len(b)), 32)]
357 copy(b32[:], b32[:32-len(b)])
358 copy(b32[32-len(b):], b)
359 result := s.SetBytes(&b32)
360 zeroArray32(&b32)
361 return result != 0
362 }
363 364 // PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
365 // into the passed byte slice in constant time. The target slice must have at
366 // least 32 bytes available or it will panic.
367 //
368 // There is a similar function, PutBytes, which unpacks the scalar into a
369 // 32-byte array directly. This version is provided since it can be useful to
370 // write directly into part of a larger buffer without needing a separate
371 // allocation.
372 //
373 // Preconditions:
374 // - The target slice MUST have at least 32 bytes available
375 func (s *ModNScalar) PutBytesUnchecked(b []byte) {
376 // Unpack the 256 total bits from the 8 uint32 words. This could be done
377 // with a for loop, but benchmarks show this unrolled version is about 2
378 // times faster than the variant which uses a loop.
379 b[31] = byte(s.n[0])
380 b[30] = byte(s.n[0] >> 8)
381 b[29] = byte(s.n[0] >> 16)
382 b[28] = byte(s.n[0] >> 24)
383 b[27] = byte(s.n[1])
384 b[26] = byte(s.n[1] >> 8)
385 b[25] = byte(s.n[1] >> 16)
386 b[24] = byte(s.n[1] >> 24)
387 b[23] = byte(s.n[2])
388 b[22] = byte(s.n[2] >> 8)
389 b[21] = byte(s.n[2] >> 16)
390 b[20] = byte(s.n[2] >> 24)
391 b[19] = byte(s.n[3])
392 b[18] = byte(s.n[3] >> 8)
393 b[17] = byte(s.n[3] >> 16)
394 b[16] = byte(s.n[3] >> 24)
395 b[15] = byte(s.n[4])
396 b[14] = byte(s.n[4] >> 8)
397 b[13] = byte(s.n[4] >> 16)
398 b[12] = byte(s.n[4] >> 24)
399 b[11] = byte(s.n[5])
400 b[10] = byte(s.n[5] >> 8)
401 b[9] = byte(s.n[5] >> 16)
402 b[8] = byte(s.n[5] >> 24)
403 b[7] = byte(s.n[6])
404 b[6] = byte(s.n[6] >> 8)
405 b[5] = byte(s.n[6] >> 16)
406 b[4] = byte(s.n[6] >> 24)
407 b[3] = byte(s.n[7])
408 b[2] = byte(s.n[7] >> 8)
409 b[1] = byte(s.n[7] >> 16)
410 b[0] = byte(s.n[7] >> 24)
411 }
412 413 // PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
414 // byte array in constant time.
415 //
416 // There is a similar function, PutBytesUnchecked, which unpacks the scalar into
417 // a slice that must have at least 32 bytes available. This version is provided
418 // since it can be useful to write directly into an array that is type checked.
419 //
420 // Alternatively, there is also Bytes, which unpacks the scalar into a new array
421 // and returns that which can sometimes be more ergonomic in applications that
422 // aren't concerned about an additional copy.
423 func (s *ModNScalar) PutBytes(b *[32]byte) { s.PutBytesUnchecked(b[:]) }
424 425 // Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
426 //
427 // See PutBytes and PutBytesUnchecked for variants that allow an array or slice
428 // to be passed which can be useful to cut down on the number of allocations
429 // by allowing the caller to reuse a buffer or write directly into part of a
430 // larger buffer.
431 func (s *ModNScalar) Bytes() [32]byte {
432 var b [32]byte
433 s.PutBytesUnchecked(b[:])
434 return b
435 }
436 437 // IsOdd returns whether the scalar is an odd number in constant time.
438 func (s *ModNScalar) IsOdd() bool {
439 // Only odd numbers have the bottom bit set.
440 return s.n[0]&1 == 1
441 }
442 443 // Equals returns whether or not the two scalars are the same in constant time.
444 func (s *ModNScalar) Equals(val *ModNScalar) bool {
445 // Xor only sets bits when they are different, so the two scalars can only
446 // be the same if no bits are set after xoring each word.
447 bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
448 (s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
449 (s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])
450 451 return bits == 0
452 }
453 454 // Add2 adds the passed two scalars together modulo the group order in constant
455 // time and stores the result in s.
456 //
457 // The scalar is returned to support chaining. This enables syntax like:
458 // s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
459 func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
460 c := uint64(val1.n[0]) + uint64(val2.n[0])
461 s.n[0] = uint32(c & uint32Mask)
462 c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
463 s.n[1] = uint32(c & uint32Mask)
464 c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
465 s.n[2] = uint32(c & uint32Mask)
466 c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
467 s.n[3] = uint32(c & uint32Mask)
468 c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
469 s.n[4] = uint32(c & uint32Mask)
470 c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
471 s.n[5] = uint32(c & uint32Mask)
472 c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
473 s.n[6] = uint32(c & uint32Mask)
474 c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
475 s.n[7] = uint32(c & uint32Mask)
476 // The result is now 256 bits, but it might still be >= N, so use the
477 // existing normal reduce method for 256-bit values.
478 s.reduce256(uint32(c>>32) + s.overflows())
479 return s
480 }
481 482 // Add adds the passed scalar to the existing one modulo the group order in
483 // constant time and stores the result in s.
484 //
485 // The scalar is returned to support chaining. This enables syntax like:
486 // s.Add(s2).AddInt(1) so that s = s + s2 + 1.
487 func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar { return s.Add2(s, val) }
// accumulator96 provides a 96-bit accumulator for use in the intermediate
// calculations requiring more than 64-bits.
type accumulator96 struct {
	// n holds the accumulated value as 3 base-2^32 words, least significant
	// word first. The zero value is a zeroed accumulator ready for use.
	n [3]uint32
}
494 495 // Add adds the passed unsigned 64-bit value to the accumulator.
496 func (a *accumulator96) Add(v uint64) {
497 low := uint32(v & uint32Mask)
498 hi := uint32(v >> 32)
499 a.n[0] += low
500 hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
501 a.n[1] += hi
502 a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
503 }
504 505 // Rsh32 right shifts the accumulator by 32 bits.
506 func (a *accumulator96) Rsh32() {
507 a.n[0] = a.n[1]
508 a.n[1] = a.n[2]
509 a.n[2] = 0
510 }
// reduce385 reduces the 385-bit intermediate result in the passed terms modulo
// the group order in constant time and stores the result in s.
//
// The terms t0 through t12 are the base-2^32 words of the intermediate value,
// least significant first (t0 is the coefficient of 2^(32*0)).
func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
	// At this point, the intermediate result in the passed terms has been
	// reduced to fit within 385 bits, so reduce it again using the same method
	// described in reduce512. As before, the intermediate result will end up
	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
	// needed for this iteration. The reduced terms are assigned back to t0
	// through t8.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead until the value is reduced enough to use native uint64s.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	t5 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	t6 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	t7 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	t8 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
	//
	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.
	//
	// At this point, the result is reduced to fit within 258 bits, so reduce it
	// again using a slightly modified version of the same method. The maximum
	// value in t8 is 2 at this point and therefore multiplying it by each word
	// of the two's complement of N and adding it to a 32-bit term will result
	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
	// here for the intermediate term carry propagation.
	//
	// Also, since the maximum value in t8 is 2, this ends up reducing by
	// another 2 bits to 256 bits.
	c := t0 + t8*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
	s.n[7] = uint32(c & uint32Mask)
	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	s.reduce256(uint32(c>>32) + s.overflows())
}
629 630 // reduce512 reduces the 512-bit intermediate result in the passed terms modulo
631 // the group order down to 385 bits in constant time and stores the result in s.
632 func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
633 // At this point, the intermediate result in the passed terms is grouped
634 // into the respective bases.
635 //
636 // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
637 // when the modulus is of the special form m = b^t - c, where log_2(c) < t,
638 // highly efficient reduction can be achieved per the provided algorithm.
639 //
640 // The secp256k1 group order fits this criteria since it is:
641 // 2^256 - 432420386565659656852420866394968145599
642 //
643 // Technically the max possible value here is (N-1)^2 since the two scalars
644 // being multiplied are always mod N. Nevertheless, it is safer to consider
645 // it to be (2^256-1)^2 = 2^512 - 2^256 + 1 since it is the product of two
646 // 256-bit values.
647 //
648 // The algorithm is to reduce the result modulo the prime by subtracting
649 // multiples of the group order N. However, in order simplify carry
650 // propagation, this adds with the two's complement of N to achieve the same
651 // result.
652 //
653 // Since the two's complement of N has 127 leading zero bits, this will end
654 // up reducing the intermediate result from 512 bits to 385 bits, resulting
655 // in 13 32-bit terms. The reduced terms are assigned back to t0 through
656 // t12.
657 //
658 // Note that several of the intermediate calculations require adding 64-bit
659 // products together which would overflow a uint64, so a 96-bit accumulator
660 // is used instead.
661 //
662 // Terms for 2^(32*0).
663 var acc accumulator96
664 acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
665 acc.Add(t8 * uint64(orderComplementWordZero))
666 t0 = uint64(acc.n[0])
667 acc.Rsh32()
668 // Terms for 2^(32*1).
669 acc.Add(t1)
670 acc.Add(t8 * uint64(orderComplementWordOne))
671 acc.Add(t9 * uint64(orderComplementWordZero))
672 t1 = uint64(acc.n[0])
673 acc.Rsh32()
674 // Terms for 2^(32*2).
675 acc.Add(t2)
676 acc.Add(t8 * uint64(orderComplementWordTwo))
677 acc.Add(t9 * uint64(orderComplementWordOne))
678 acc.Add(t10 * uint64(orderComplementWordZero))
679 t2 = uint64(acc.n[0])
680 acc.Rsh32()
681 // Terms for 2^(32*3).
682 acc.Add(t3)
683 acc.Add(t8 * uint64(orderComplementWordThree))
684 acc.Add(t9 * uint64(orderComplementWordTwo))
685 acc.Add(t10 * uint64(orderComplementWordOne))
686 acc.Add(t11 * uint64(orderComplementWordZero))
687 t3 = uint64(acc.n[0])
688 acc.Rsh32()
689 // Terms for 2^(32*4).
690 acc.Add(t4)
691 acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
692 acc.Add(t9 * uint64(orderComplementWordThree))
693 acc.Add(t10 * uint64(orderComplementWordTwo))
694 acc.Add(t11 * uint64(orderComplementWordOne))
695 acc.Add(t12 * uint64(orderComplementWordZero))
696 t4 = uint64(acc.n[0])
697 acc.Rsh32()
698 // Terms for 2^(32*5).
699 acc.Add(t5)
700 // acc.Add(t8 * uint64(orderComplementWordFive)) // 0
701 acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
702 acc.Add(t10 * uint64(orderComplementWordThree))
703 acc.Add(t11 * uint64(orderComplementWordTwo))
704 acc.Add(t12 * uint64(orderComplementWordOne))
705 acc.Add(t13 * uint64(orderComplementWordZero))
706 t5 = uint64(acc.n[0])
707 acc.Rsh32()
708 // Terms for 2^(32*6).
709 acc.Add(t6)
710 // acc.Add(t8 * uint64(orderComplementWordSix)) // 0
711 // acc.Add(t9 * uint64(orderComplementWordFive)) // 0
712 acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1
713 acc.Add(t11 * uint64(orderComplementWordThree))
714 acc.Add(t12 * uint64(orderComplementWordTwo))
715 acc.Add(t13 * uint64(orderComplementWordOne))
716 acc.Add(t14 * uint64(orderComplementWordZero))
717 t6 = uint64(acc.n[0])
718 acc.Rsh32()
719 // Terms for 2^(32*7).
720 acc.Add(t7)
721 // acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
722 // acc.Add(t9 * uint64(orderComplementWordSix)) // 0
723 // acc.Add(t10 * uint64(orderComplementWordFive)) // 0
724 acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
725 acc.Add(t12 * uint64(orderComplementWordThree))
726 acc.Add(t13 * uint64(orderComplementWordTwo))
727 acc.Add(t14 * uint64(orderComplementWordOne))
728 acc.Add(t15 * uint64(orderComplementWordZero))
729 t7 = uint64(acc.n[0])
730 acc.Rsh32()
731 // Terms for 2^(32*8).
732 // acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
733 // acc.Add(t10 * uint64(orderComplementWordSix)) // 0
734 // acc.Add(t11 * uint64(orderComplementWordFive)) // 0
735 acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
736 acc.Add(t13 * uint64(orderComplementWordThree))
737 acc.Add(t14 * uint64(orderComplementWordTwo))
738 acc.Add(t15 * uint64(orderComplementWordOne))
739 t8 = uint64(acc.n[0])
740 acc.Rsh32()
741 // Terms for 2^(32*9).
742 // acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
743 // acc.Add(t11 * uint64(orderComplementWordSix)) // 0
744 // acc.Add(t12 * uint64(orderComplementWordFive)) // 0
745 acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
746 acc.Add(t14 * uint64(orderComplementWordThree))
747 acc.Add(t15 * uint64(orderComplementWordTwo))
748 t9 = uint64(acc.n[0])
749 acc.Rsh32()
750 // Terms for 2^(32*10).
751 // acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
752 // acc.Add(t12 * uint64(orderComplementWordSix)) // 0
753 // acc.Add(t13 * uint64(orderComplementWordFive)) // 0
754 acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
755 acc.Add(t15 * uint64(orderComplementWordThree))
756 t10 = uint64(acc.n[0])
757 acc.Rsh32()
758 // Terms for 2^(32*11).
759 // acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
760 // acc.Add(t13 * uint64(orderComplementWordSix)) // 0
761 // acc.Add(t14 * uint64(orderComplementWordFive)) // 0
762 acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
763 t11 = uint64(acc.n[0])
764 acc.Rsh32()
765 // NOTE: All of the remaining multiplications for this iteration result in 0
766 // as they all involve multiplying by combinations of the fifth, sixth, and
767 // seventh words of the two's complement of N, which are 0, so skip them.
768 //
769 // Terms for 2^(32*12).
770 t12 = uint64(acc.n[0])
771 // acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
772 //
773 // At this point, the result is reduced to fit within 385 bits, so reduce it
774 // again using the same method accordingly.
775 s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
776 }

// Mul2 multiplies the passed two scalars together modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining. This enables syntax like:
// s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
	// This could be done with for loops and an array to store the intermediate
	// terms, but this unrolled version is significantly faster.
	//
	// The overall strategy employed here is:
	// 1) Calculate the 512-bit product of the two scalars using the standard
	//    pencil-and-paper method.
	// 2) Reduce the result modulo the prime by effectively subtracting
	//    multiples of the group order N (actually performed by adding multiples
	//    of the two's complement of N to avoid implementing subtraction).
	// 3) Repeat step 2 noting that each iteration reduces the required number
	//    of bits by 127 because the two's complement of N has 127 leading zero
	//    bits.
	// 4) Once reduced to 256 bits, call the existing reduce method to perform
	//    a final reduction as needed.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.
	//
	// Each "Terms for 2^(32*k)" section below sums every partial product
	// val.n[i] * val2.n[j] with i+j == k along with any carry from the
	// previous column.  The low 32 bits of the accumulator become digit tk of
	// the full 512-bit product and the accumulator is shifted right 32 bits
	// to carry the remainder into the next column.
	//
	// Terms for 2^(32*0).
	var acc accumulator96
	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
	t0 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*1).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
	t1 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*2).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
	t2 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*3).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
	t3 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*4).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
	t4 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*5).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
	t5 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*6).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
	t6 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*7).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
	t7 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*8).
	// From here on the columns shrink since there are no val.n[i] or
	// val2.n[j] words beyond index 7.
	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
	t8 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*9).
	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
	t9 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*10).
	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
	t10 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*11).
	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
	t11 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*12).
	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
	t12 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*13).
	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
	t13 := uint64(acc.n[0])
	acc.Rsh32()
	// Terms for 2^(32*14).
	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
	t14 := uint64(acc.n[0])
	acc.Rsh32()
	// What's left is for 2^(32*15).
	t15 := uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this. Guaranteed to be 0.
	//
	// At this point, all of the terms are grouped into their respective base
	// and occupy up to 512 bits. Reduce the result accordingly.
	s.reduce512(
		t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
		t15,
	)
	return s
}
925 926 // Mul multiplies the passed scalar with the existing one modulo the group order
927 // in constant time and stores the result in s.
928 //
929 // The scalar is returned to support chaining. This enables syntax like:
930 // s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
931 func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
932 return s.Mul2(s, val)
933 }
934 935 // SquareVal squares the passed scalar modulo the group order in constant time
936 // and stores the result in s.
937 //
938 // The scalar is returned to support chaining. This enables syntax like:
939 // s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
940 func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
941 // This could technically be optimized slightly to take advantage of the
942 // fact that many of the intermediate calculations in squaring are just
943 // doubling, however, benchmarking has shown that due to the need to use a
944 // 96-bit accumulator, any savings are essentially offset by that and
945 // consequently there is no real difference in performance over just
946 // multiplying the value by itself to justify the extra code for now. This
947 // can be revisited in the future if it becomes a bottleneck in practice.
948 return s.Mul2(val, val)
949 }
950 951 // Square squares the scalar modulo the group order in constant time. The
952 // existing scalar is modified.
953 //
954 // The scalar is returned to support chaining. This enables syntax like:
955 // s.Square().Mul(s2) so that s = s^2 * s2.
956 func (s *ModNScalar) Square() *ModNScalar {
957 return s.SquareVal(s)
958 }

// NegateVal negates the passed scalar modulo the group order and stores the
// result in s in constant time.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
	// Since the scalar is already in the range 0 <= val < N, where N is the
	// group order, negation modulo the group order is just the group order
	// minus the value. This implies that the result will always be in the
	// desired range with the sole exception of 0 because N - 0 = N itself.
	//
	// Therefore, in order to avoid the need to reduce the result for every
	// other case in order to achieve constant time, this creates a mask that is
	// all 0s in the case of the scalar being negated is 0 and all 1s otherwise
	// and bitwise ands that mask with each word.
	//
	// Finally, to simplify the carry propagation, this adds the two's
	// complement of the scalar to N in order to achieve the same result.
	//
	// The OR of all words is zero if and only if the scalar itself is zero,
	// which allows the zero check to be done without branching.
	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
		val.n[6] | val.n[7]
	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
	// Compute N + ^val + 1 == N - val one 32-bit word at a time.  The low 32
	// bits of c hold the word result and the high bits hold the carry that is
	// propagated into the next word via c >> 32.  The "+ 1" on the first word
	// completes the two's complement of val; subsequent words only add the
	// one's complement since the carry chain handles the rest.
	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
	s.n[0] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
	s.n[1] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
	s.n[2] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
	s.n[3] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
	s.n[4] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
	s.n[5] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
	s.n[6] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
	s.n[7] = uint32(c & mask)
	return s
}
999 1000 // Negate negates the scalar modulo the group order in constant time. The
1001 // existing scalar is modified.
1002 //
1003 // The scalar is returned to support chaining. This enables syntax like:
1004 // s.Negate().AddInt(1) so that s = -s + 1.
1005 func (s *ModNScalar) Negate() *ModNScalar { return s.NegateVal(s) }
1006 1007 // InverseValNonConst finds the modular multiplicative inverse of the passed
1008 // scalar and stores result in s in *non-constant* time.
1009 //
1010 // The scalar is returned to support chaining. This enables syntax like:
1011 // s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
1012 func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
1013 // This is making use of big integers for now. Ideally it will be replaced
1014 // with an implementation that does not depend on big integers.
1015 valBytes := val.Bytes()
1016 bigVal := (&big.Int{}).SetBytes(valBytes[:])
1017 bigVal.ModInverse(bigVal, curveParams.N)
1018 s.SetByteSlice(bigVal.Bytes())
1019 return s
1020 }
1021 1022 // InverseNonConst finds the modular multiplicative inverse of the scalar in
1023 // *non-constant* time. The existing scalar is modified.
1024 //
1025 // The scalar is returned to support chaining. This enables syntax like:
1026 // s.Inverse().Mul(s2) so that s = s^-1 * s2.
1027 func (s *ModNScalar) InverseNonConst() *ModNScalar {
1028 return s.InverseValNonConst(s)
1029 }
1030 1031 // IsOverHalfOrder returns whether the scalar exceeds the group order
1032 // divided by 2 in constant time.
1033 func (s *ModNScalar) IsOverHalfOrder() bool {
1034 // The intuition here is that the scalar is greater than half of the group
1035 // order if one of the higher individual words is greater than the
1036 // corresponding word of the half group order and all higher words in the
1037 // scalar are equal to their corresponding word of the half group order.
1038 //
1039 // Note that the words 4, 5, and 6 are all the max uint32 value, so there is
1040 // no need to test if those individual words of the scalar exceeds them,
1041 // hence, only equality is checked for them.
1042 result := constantTimeGreater(s.n[7], halfOrderWordSeven)
1043 highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
1044 highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
1045 highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
1046 highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
1047 result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
1048 highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
1049 result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
1050 highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
1051 result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
1052 highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
1053 result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)
1054 return result != 0
1055 }
1056