1 // Copyright (c) 2013-2016 The btcsuite developers
2 // Copyright (c) 2013-2016 Dave Collins
3 // Use of this source code is governed by an ISC
4 // license that can be found in the LICENSE file.
5 6 package ecc
7 8 // References:
9 // [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
10 // http://cacr.uwaterloo.ca/hac/
11 12 // All elliptic curve operations for secp256k1 are done in a finite field
13 // characterized by a 256-bit prime. Given this precision is larger than the
14 // biggest available native type, obviously some form of bignum math is needed.
15 // This package implements specialized fixed-precision field arithmetic rather
16 // than relying on an arbitrary-precision arithmetic package such as math/big
17 // for dealing with the field math since the size is known. As a result, rather
18 // large performance gains are achieved by taking advantage of many
19 // optimizations not available to arbitrary-precision arithmetic and generic
20 // modular arithmetic algorithms.
21 //
22 // There are various ways to internally represent each finite field element.
23 // For example, the most obvious representation would be to use an array of 4
24 // uint64s (64 bits * 4 = 256 bits). However, that representation suffers from
25 // a couple of issues. First, there is no native Go type large enough to handle
26 // the intermediate results while adding or multiplying two 64-bit numbers, and
27 // second there is no space left for overflows when performing the intermediate
28 // arithmetic between each array element which would lead to expensive carry
29 // propagation.
30 //
31 // Given the above, this implementation represents the the field elements as
32 // 10 uint32s with each word (array entry) treated as base 2^26. This was
33 // chosen for the following reasons:
34 // 1) Most systems at the current time are 64-bit (or at least have 64-bit
35 // registers available for specialized purposes such as MMX) so the
36 // intermediate results can typically be done using a native register (and
37 // using uint64s to avoid the need for additional half-word arithmetic)
38 // 2) In order to allow addition of the internal words without having to
39 // propagate the the carry, the max normalized value for each register must
40 // be less than the number of bits available in the register
41 // 3) Since we're dealing with 32-bit values, 64-bits of overflow is a
42 // reasonable choice for #2
43 // 4) Given the need for 256-bits of precision and the properties stated in #1,
44 // #2, and #3, the representation which best accommodates this is 10 uint32s
45 // with base 2^26 (26 bits * 10 = 260 bits, so the final word only needs 22
46 // bits) which leaves the desired 64 bits (32 * 10 = 320, 320 - 256 = 64) for
47 // overflow
48 //
49 // Since it is so important that the field arithmetic is extremely fast for
50 // high performance crypto, this package does not perform any validation where
51 // it ordinarily would. For example, some functions only give the correct
52 // result is the field is normalized and there is no checking to ensure it is.
53 // While I typically prefer to ensure all state and input is valid for most
54 // packages, this code is really only used internally and every extra check
55 // counts.
56 57 import (
58 "encoding/hex"
59 )
60 61 // Constants used to make the code more readable.
62 const (
63 twoBitsMask = 0x3
64 fourBitsMask = 0xf
65 sixBitsMask = 0x3f
66 eightBitsMask = 0xff
67 )
68 69 // Constants related to the field representation.
70 const (
71 // fieldWords is the number of words used to internally represent the
72 // 256-bit value.
73 fieldWords = 10
74 75 // fieldBase is the exponent used to form the numeric base of each word.
76 // 2^(fieldBase*i) where i is the word position.
77 fieldBase = 26
78 79 // fieldOverflowBits is the minimum number of "overflow" bits for each
80 // word in the field value.
81 fieldOverflowBits = 32 - fieldBase
82 83 // fieldBaseMask is the mask for the bits in each word needed to
84 // represent the numeric base of each word (except the most significant
85 // word).
86 fieldBaseMask = (1 << fieldBase) - 1
87 88 // fieldMSBBits is the number of bits in the most significant word used
89 // to represent the value.
90 fieldMSBBits = 256 - (fieldBase * (fieldWords - 1))
91 92 // fieldMSBMask is the mask for the bits in the most significant word
93 // needed to represent the value.
94 fieldMSBMask = (1 << fieldMSBBits) - 1
95 96 // fieldPrimeWordZero is word zero of the secp256k1 prime in the
97 // internal field representation. It is used during negation.
98 fieldPrimeWordZero = 0x3fffc2f
99 100 // fieldPrimeWordOne is word one of the secp256k1 prime in the
101 // internal field representation. It is used during negation.
102 fieldPrimeWordOne = 0x3ffffbf
103 )
104 105 var (
106 // fieldQBytes is the value Q = (P+1)/4 for the secp256k1 prime P. This
107 // value is used to efficiently compute the square root of values in the
108 // field via exponentiation. The value of Q in hex is:
109 //
110 // Q = 3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffff0c
111 fieldQBytes = []byte{
112 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
113 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
114 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
115 0xff, 0xff, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x0c,
116 }
117 )
118 119 // fieldVal implements optimized fixed-precision arithmetic over the
120 // secp256k1 finite field. This means all arithmetic is performed modulo
121 // 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f. It
122 // represents each 256-bit value as 10 32-bit integers in base 2^26. This
123 // provides 6 bits of overflow in each word (10 bits in the most significant
124 // word) for a total of 64 bits of overflow (9*6 + 10 = 64). It only implements
125 // the arithmetic needed for elliptic curve operations.
126 //
127 // The following depicts the internal representation:
128 // -----------------------------------------------------------------
129 // | n[9] | n[8] | ... | n[0] |
130 // | 32 bits available | 32 bits available | ... | 32 bits available |
131 // | 22 bits for value | 26 bits for value | ... | 26 bits for value |
132 // | 10 bits overflow | 6 bits overflow | ... | 6 bits overflow |
133 // | Mult: 2^(26*9) | Mult: 2^(26*8) | ... | Mult: 2^(26*0) |
134 // -----------------------------------------------------------------
135 //
136 // For example, consider the number 2^49 + 1. It would be represented as:
137 // n[0] = 1
138 // n[1] = 2^23
139 // n[2..9] = 0
140 //
141 // The full 256-bit value is then calculated by looping i from 9..0 and
142 // doing sum(n[i] * 2^(26i)) like so:
143 // n[9] * 2^(26*9) = 0 * 2^234 = 0
144 // n[8] * 2^(26*8) = 0 * 2^208 = 0
145 // ...
146 // n[1] * 2^(26*1) = 2^23 * 2^26 = 2^49
147 // n[0] * 2^(26*0) = 1 * 2^0 = 1
148 // Sum: 0 + 0 + ... + 2^49 + 1 = 2^49 + 1
149 type fieldVal struct {
150 n [10]uint32
151 }
152 153 // String returns the field value as a human-readable hex string.
154 func (f fieldVal) String() string {
155 t := new(fieldVal).Set(&f).Normalize()
156 return hex.EncodeToString(t.Bytes()[:])
157 }
158 159 // Zero sets the field value to zero. A newly created field value is already
160 // set to zero. This function can be useful to clear an existing field value
161 // for reuse.
162 func (f *fieldVal) Zero() {
163 f.n[0] = 0
164 f.n[1] = 0
165 f.n[2] = 0
166 f.n[3] = 0
167 f.n[4] = 0
168 f.n[5] = 0
169 f.n[6] = 0
170 f.n[7] = 0
171 f.n[8] = 0
172 f.n[9] = 0
173 }
174 175 // Set sets the field value equal to the passed value.
176 //
177 // The field value is returned to support chaining. This enables syntax like:
178 // f := new(fieldVal).Set(f2).Add(1) so that f = f2 + 1 where f2 is not
179 // modified.
180 func (f *fieldVal) Set(val *fieldVal) *fieldVal {
181 *f = *val
182 return f
183 }
184 185 // SetInt sets the field value to the passed integer. This is a convenience
186 // function since it is fairly common to perform some arithemetic with small
187 // native integers.
188 //
189 // The field value is returned to support chaining. This enables syntax such
190 // as f := new(fieldVal).SetInt(2).Mul(f2) so that f = 2 * f2.
191 func (f *fieldVal) SetInt(ui uint) *fieldVal {
192 f.Zero()
193 f.n[0] = uint32(ui)
194 return f
195 }
196 197 // SetBytes packs the passed 32-byte big-endian value into the internal field
198 // value representation.
199 //
200 // The field value is returned to support chaining. This enables syntax like:
201 // f := new(fieldVal).SetBytes(byteArray).Mul(f2) so that f = ba * f2.
202 func (f *fieldVal) SetBytes(b *[32]byte) *fieldVal {
203 // Pack the 256 total bits across the 10 uint32 words with a max of
204 // 26-bits per word. This could be done with a couple of for loops,
205 // but this unrolled version is significantly faster. Benchmarks show
206 // this is about 34 times faster than the variant which uses loops.
207 f.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 |
208 (uint32(b[28])&twoBitsMask)<<24
209 f.n[1] = uint32(b[28])>>2 | uint32(b[27])<<6 | uint32(b[26])<<14 |
210 (uint32(b[25])&fourBitsMask)<<22
211 f.n[2] = uint32(b[25])>>4 | uint32(b[24])<<4 | uint32(b[23])<<12 |
212 (uint32(b[22])&sixBitsMask)<<20
213 f.n[3] = uint32(b[22])>>6 | uint32(b[21])<<2 | uint32(b[20])<<10 |
214 uint32(b[19])<<18
215 f.n[4] = uint32(b[18]) | uint32(b[17])<<8 | uint32(b[16])<<16 |
216 (uint32(b[15])&twoBitsMask)<<24
217 f.n[5] = uint32(b[15])>>2 | uint32(b[14])<<6 | uint32(b[13])<<14 |
218 (uint32(b[12])&fourBitsMask)<<22
219 f.n[6] = uint32(b[12])>>4 | uint32(b[11])<<4 | uint32(b[10])<<12 |
220 (uint32(b[9])&sixBitsMask)<<20
221 f.n[7] = uint32(b[9])>>6 | uint32(b[8])<<2 | uint32(b[7])<<10 |
222 uint32(b[6])<<18
223 f.n[8] = uint32(b[5]) | uint32(b[4])<<8 | uint32(b[3])<<16 |
224 (uint32(b[2])&twoBitsMask)<<24
225 f.n[9] = uint32(b[2])>>2 | uint32(b[1])<<6 | uint32(b[0])<<14
226 return f
227 }
228 229 // SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
230 // integer (meaning it is truncated to the first 32 bytes), packs it into the
231 // internal field value representation, and returns the updated field value.
232 //
233 // Note that since passing a slice with more than 32 bytes is truncated, it is
234 // possible that the truncated value is less than the field prime. It is up to
235 // the caller to decide whether it needs to provide numbers of the appropriate
236 // size or if it is acceptable to use this function with the described
237 // truncation behavior.
238 //
239 // The field value is returned to support chaining. This enables syntax like:
240 // f := new(fieldVal).SetByteSlice(byteSlice)
241 func (f *fieldVal) SetByteSlice(b []byte) *fieldVal {
242 var b32 [32]byte
243 if len(b) > 32 {
244 b = b[:32]
245 }
246 copy(b32[32-len(b):], b)
247 return f.SetBytes(&b32)
248 }
249 250 // SetHex decodes the passed big-endian hex string into the internal field value
251 // representation. Only the first 32-bytes are used.
252 //
253 // The field value is returned to support chaining. This enables syntax like:
254 // f := new(fieldVal).SetHex("0abc").Add(1) so that f = 0x0abc + 1
255 func (f *fieldVal) SetHex(hexString string) *fieldVal {
256 if len(hexString)%2 != 0 {
257 hexString = "0" + hexString
258 }
259 bytes, _ := hex.DecodeString(hexString)
260 return f.SetByteSlice(bytes)
261 }
262 263 // Normalize normalizes the internal field words into the desired range and
264 // performs fast modular reduction over the secp256k1 prime by making use of the
265 // special form of the prime.
266 func (f *fieldVal) Normalize() *fieldVal {
267 // The field representation leaves 6 bits of overflow in each word so
268 // intermediate calculations can be performed without needing to
269 // propagate the carry to each higher word during the calculations. In
270 // order to normalize, we need to "compact" the full 256-bit value to
271 // the right while propagating any carries through to the high order
272 // word.
273 //
274 // Since this field is doing arithmetic modulo the secp256k1 prime, we
275 // also need to perform modular reduction over the prime.
276 //
277 // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
278 // when the modulus is of the special form m = b^t - c, highly efficient
279 // reduction can be achieved.
280 //
281 // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
282 // this criteria.
283 //
284 // 4294968273 in field representation (base 2^26) is:
285 // n[0] = 977
286 // n[1] = 64
287 // That is to say (2^26 * 64) + 977 = 4294968273
288 //
289 // The algorithm presented in the referenced section typically repeats
290 // until the quotient is zero. However, due to our field representation
291 // we already know to within one reduction how many times we would need
292 // to repeat as it's the uppermost bits of the high order word. Thus we
293 // can simply multiply the magnitude by the field representation of the
294 // prime and do a single iteration. After this step there might be an
295 // additional carry to bit 256 (bit 22 of the high order word).
296 t9 := f.n[9]
297 m := t9 >> fieldMSBBits
298 t9 = t9 & fieldMSBMask
299 t0 := f.n[0] + m*977
300 t1 := (t0 >> fieldBase) + f.n[1] + (m << 6)
301 t0 = t0 & fieldBaseMask
302 t2 := (t1 >> fieldBase) + f.n[2]
303 t1 = t1 & fieldBaseMask
304 t3 := (t2 >> fieldBase) + f.n[3]
305 t2 = t2 & fieldBaseMask
306 t4 := (t3 >> fieldBase) + f.n[4]
307 t3 = t3 & fieldBaseMask
308 t5 := (t4 >> fieldBase) + f.n[5]
309 t4 = t4 & fieldBaseMask
310 t6 := (t5 >> fieldBase) + f.n[6]
311 t5 = t5 & fieldBaseMask
312 t7 := (t6 >> fieldBase) + f.n[7]
313 t6 = t6 & fieldBaseMask
314 t8 := (t7 >> fieldBase) + f.n[8]
315 t7 = t7 & fieldBaseMask
316 t9 = (t8 >> fieldBase) + t9
317 t8 = t8 & fieldBaseMask
318 319 // At this point, the magnitude is guaranteed to be one, however, the
320 // value could still be greater than the prime if there was either a
321 // carry through to bit 256 (bit 22 of the higher order word) or the
322 // value is greater than or equal to the field characteristic. The
323 // following determines if either or these conditions are true and does
324 // the final reduction in constant time.
325 //
326 // Note that the if/else statements here intentionally do the bitwise
327 // operators even when it won't change the value to ensure constant time
328 // between the branches. Also note that 'm' will be zero when neither
329 // of the aforementioned conditions are true and the value will not be
330 // changed when 'm' is zero.
331 m = 1
332 if t9 == fieldMSBMask {
333 m &= 1
334 } else {
335 m &= 0
336 }
337 if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
338 m &= 1
339 } else {
340 m &= 0
341 }
342 if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask {
343 m &= 1
344 } else {
345 m &= 0
346 }
347 if t9>>fieldMSBBits != 0 {
348 m |= 1
349 } else {
350 m |= 0
351 }
352 t0 = t0 + m*977
353 t1 = (t0 >> fieldBase) + t1 + (m << 6)
354 t0 = t0 & fieldBaseMask
355 t2 = (t1 >> fieldBase) + t2
356 t1 = t1 & fieldBaseMask
357 t3 = (t2 >> fieldBase) + t3
358 t2 = t2 & fieldBaseMask
359 t4 = (t3 >> fieldBase) + t4
360 t3 = t3 & fieldBaseMask
361 t5 = (t4 >> fieldBase) + t5
362 t4 = t4 & fieldBaseMask
363 t6 = (t5 >> fieldBase) + t6
364 t5 = t5 & fieldBaseMask
365 t7 = (t6 >> fieldBase) + t7
366 t6 = t6 & fieldBaseMask
367 t8 = (t7 >> fieldBase) + t8
368 t7 = t7 & fieldBaseMask
369 t9 = (t8 >> fieldBase) + t9
370 t8 = t8 & fieldBaseMask
371 t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256.
372 373 // Finally, set the normalized and reduced words.
374 f.n[0] = t0
375 f.n[1] = t1
376 f.n[2] = t2
377 f.n[3] = t3
378 f.n[4] = t4
379 f.n[5] = t5
380 f.n[6] = t6
381 f.n[7] = t7
382 f.n[8] = t8
383 f.n[9] = t9
384 return f
385 }
386 387 // PutBytes unpacks the field value to a 32-byte big-endian value using the
388 // passed byte array. There is a similar function, Bytes, which unpacks the
389 // field value into a new array and returns that. This version is provided
390 // since it can be useful to cut down on the number of allocations by allowing
391 // the caller to reuse a buffer.
392 //
393 // The field value must be normalized for this function to return the correct
394 // result.
395 func (f *fieldVal) PutBytes(b *[32]byte) {
396 // Unpack the 256 total bits from the 10 uint32 words with a max of
397 // 26-bits per word. This could be done with a couple of for loops,
398 // but this unrolled version is a bit faster. Benchmarks show this is
399 // about 10 times faster than the variant which uses loops.
400 b[31] = byte(f.n[0] & eightBitsMask)
401 b[30] = byte((f.n[0] >> 8) & eightBitsMask)
402 b[29] = byte((f.n[0] >> 16) & eightBitsMask)
403 b[28] = byte((f.n[0]>>24)&twoBitsMask | (f.n[1]&sixBitsMask)<<2)
404 b[27] = byte((f.n[1] >> 6) & eightBitsMask)
405 b[26] = byte((f.n[1] >> 14) & eightBitsMask)
406 b[25] = byte((f.n[1]>>22)&fourBitsMask | (f.n[2]&fourBitsMask)<<4)
407 b[24] = byte((f.n[2] >> 4) & eightBitsMask)
408 b[23] = byte((f.n[2] >> 12) & eightBitsMask)
409 b[22] = byte((f.n[2]>>20)&sixBitsMask | (f.n[3]&twoBitsMask)<<6)
410 b[21] = byte((f.n[3] >> 2) & eightBitsMask)
411 b[20] = byte((f.n[3] >> 10) & eightBitsMask)
412 b[19] = byte((f.n[3] >> 18) & eightBitsMask)
413 b[18] = byte(f.n[4] & eightBitsMask)
414 b[17] = byte((f.n[4] >> 8) & eightBitsMask)
415 b[16] = byte((f.n[4] >> 16) & eightBitsMask)
416 b[15] = byte((f.n[4]>>24)&twoBitsMask | (f.n[5]&sixBitsMask)<<2)
417 b[14] = byte((f.n[5] >> 6) & eightBitsMask)
418 b[13] = byte((f.n[5] >> 14) & eightBitsMask)
419 b[12] = byte((f.n[5]>>22)&fourBitsMask | (f.n[6]&fourBitsMask)<<4)
420 b[11] = byte((f.n[6] >> 4) & eightBitsMask)
421 b[10] = byte((f.n[6] >> 12) & eightBitsMask)
422 b[9] = byte((f.n[6]>>20)&sixBitsMask | (f.n[7]&twoBitsMask)<<6)
423 b[8] = byte((f.n[7] >> 2) & eightBitsMask)
424 b[7] = byte((f.n[7] >> 10) & eightBitsMask)
425 b[6] = byte((f.n[7] >> 18) & eightBitsMask)
426 b[5] = byte(f.n[8] & eightBitsMask)
427 b[4] = byte((f.n[8] >> 8) & eightBitsMask)
428 b[3] = byte((f.n[8] >> 16) & eightBitsMask)
429 b[2] = byte((f.n[8]>>24)&twoBitsMask | (f.n[9]&sixBitsMask)<<2)
430 b[1] = byte((f.n[9] >> 6) & eightBitsMask)
431 b[0] = byte((f.n[9] >> 14) & eightBitsMask)
432 }
433 434 // Bytes unpacks the field value to a 32-byte big-endian value. See PutBytes
435 // for a variant that allows the a buffer to be passed which can be useful to
436 // to cut down on the number of allocations by allowing the caller to reuse a
437 // buffer.
438 //
439 // The field value must be normalized for this function to return correct
440 // result.
441 func (f *fieldVal) Bytes() *[32]byte {
442 b := new([32]byte)
443 f.PutBytes(b)
444 return b
445 }
446 447 // IsZero returns whether or not the field value is equal to zero.
448 func (f *fieldVal) IsZero() bool {
449 // The value can only be zero if no bits are set in any of the words.
450 // This is a constant time implementation.
451 bits := f.n[0] | f.n[1] | f.n[2] | f.n[3] | f.n[4] |
452 f.n[5] | f.n[6] | f.n[7] | f.n[8] | f.n[9]
453 454 return bits == 0
455 }
456 457 // IsOdd returns whether or not the field value is an odd number.
458 //
459 // The field value must be normalized for this function to return correct
460 // result.
461 func (f *fieldVal) IsOdd() bool {
462 // Only odd numbers have the bottom bit set.
463 return f.n[0]&1 == 1
464 }
465 466 // Equals returns whether or not the two field values are the same. Both
467 // field values being compared must be normalized for this function to return
468 // the correct result.
469 func (f *fieldVal) Equals(val *fieldVal) bool {
470 // Xor only sets bits when they are different, so the two field values
471 // can only be the same if no bits are set after xoring each word.
472 // This is a constant time implementation.
473 bits := (f.n[0] ^ val.n[0]) | (f.n[1] ^ val.n[1]) | (f.n[2] ^ val.n[2]) |
474 (f.n[3] ^ val.n[3]) | (f.n[4] ^ val.n[4]) | (f.n[5] ^ val.n[5]) |
475 (f.n[6] ^ val.n[6]) | (f.n[7] ^ val.n[7]) | (f.n[8] ^ val.n[8]) |
476 (f.n[9] ^ val.n[9])
477 478 return bits == 0
479 }
480 481 // NegateVal negates the passed value and stores the result in f. The caller
482 // must provide the magnitude of the passed value for a correct result.
483 //
484 // The field value is returned to support chaining. This enables syntax like:
485 // f.NegateVal(f2).AddInt(1) so that f = -f2 + 1.
486 func (f *fieldVal) NegateVal(val *fieldVal, magnitude uint32) *fieldVal {
487 // Negation in the field is just the prime minus the value. However,
488 // in order to allow negation against a field value without having to
489 // normalize/reduce it first, multiply by the magnitude (that is how
490 // "far" away it is from the normalized value) to adjust. Also, since
491 // negating a value pushes it one more order of magnitude away from the
492 // normalized range, add 1 to compensate.
493 //
494 // For some intuition here, imagine you're performing mod 12 arithmetic
495 // (picture a clock) and you are negating the number 7. So you start at
496 // 12 (which is of course 0 under mod 12) and count backwards (left on
497 // the clock) 7 times to arrive at 5. Notice this is just 12-7 = 5.
498 // Now, assume you're starting with 19, which is a number that is
499 // already larger than the modulus and congruent to 7 (mod 12). When a
500 // value is already in the desired range, its magnitude is 1. Since 19
501 // is an additional "step", its magnitude (mod 12) is 2. Since any
502 // multiple of the modulus is conguent to zero (mod m), the answer can
503 // be shortcut by simply mulplying the magnitude by the modulus and
504 // subtracting. Keeping with the example, this would be (2*12)-19 = 5.
505 f.n[0] = (magnitude+1)*fieldPrimeWordZero - val.n[0]
506 f.n[1] = (magnitude+1)*fieldPrimeWordOne - val.n[1]
507 f.n[2] = (magnitude+1)*fieldBaseMask - val.n[2]
508 f.n[3] = (magnitude+1)*fieldBaseMask - val.n[3]
509 f.n[4] = (magnitude+1)*fieldBaseMask - val.n[4]
510 f.n[5] = (magnitude+1)*fieldBaseMask - val.n[5]
511 f.n[6] = (magnitude+1)*fieldBaseMask - val.n[6]
512 f.n[7] = (magnitude+1)*fieldBaseMask - val.n[7]
513 f.n[8] = (magnitude+1)*fieldBaseMask - val.n[8]
514 f.n[9] = (magnitude+1)*fieldMSBMask - val.n[9]
515 516 return f
517 }
518 519 // Negate negates the field value. The existing field value is modified. The
520 // caller must provide the magnitude of the field value for a correct result.
521 //
522 // The field value is returned to support chaining. This enables syntax like:
523 // f.Negate().AddInt(1) so that f = -f + 1.
524 func (f *fieldVal) Negate(magnitude uint32) *fieldVal {
525 return f.NegateVal(f, magnitude)
526 }
527 528 // AddInt adds the passed integer to the existing field value and stores the
529 // result in f. This is a convenience function since it is fairly common to
530 // perform some arithemetic with small native integers.
531 //
532 // The field value is returned to support chaining. This enables syntax like:
533 // f.AddInt(1).Add(f2) so that f = f + 1 + f2.
534 func (f *fieldVal) AddInt(ui uint) *fieldVal {
535 // Since the field representation intentionally provides overflow bits,
536 // it's ok to use carryless addition as the carry bit is safely part of
537 // the word and will be normalized out.
538 f.n[0] += uint32(ui)
539 540 return f
541 }
542 543 // Add adds the passed value to the existing field value and stores the result
544 // in f.
545 //
546 // The field value is returned to support chaining. This enables syntax like:
547 // f.Add(f2).AddInt(1) so that f = f + f2 + 1.
548 func (f *fieldVal) Add(val *fieldVal) *fieldVal {
549 // Since the field representation intentionally provides overflow bits,
550 // it's ok to use carryless addition as the carry bit is safely part of
551 // each word and will be normalized out. This could obviously be done
552 // in a loop, but the unrolled version is faster.
553 f.n[0] += val.n[0]
554 f.n[1] += val.n[1]
555 f.n[2] += val.n[2]
556 f.n[3] += val.n[3]
557 f.n[4] += val.n[4]
558 f.n[5] += val.n[5]
559 f.n[6] += val.n[6]
560 f.n[7] += val.n[7]
561 f.n[8] += val.n[8]
562 f.n[9] += val.n[9]
563 564 return f
565 }
566 567 // Add2 adds the passed two field values together and stores the result in f.
568 //
569 // The field value is returned to support chaining. This enables syntax like:
570 // f3.Add2(f, f2).AddInt(1) so that f3 = f + f2 + 1.
571 func (f *fieldVal) Add2(val *fieldVal, val2 *fieldVal) *fieldVal {
572 // Since the field representation intentionally provides overflow bits,
573 // it's ok to use carryless addition as the carry bit is safely part of
574 // each word and will be normalized out. This could obviously be done
575 // in a loop, but the unrolled version is faster.
576 f.n[0] = val.n[0] + val2.n[0]
577 f.n[1] = val.n[1] + val2.n[1]
578 f.n[2] = val.n[2] + val2.n[2]
579 f.n[3] = val.n[3] + val2.n[3]
580 f.n[4] = val.n[4] + val2.n[4]
581 f.n[5] = val.n[5] + val2.n[5]
582 f.n[6] = val.n[6] + val2.n[6]
583 f.n[7] = val.n[7] + val2.n[7]
584 f.n[8] = val.n[8] + val2.n[8]
585 f.n[9] = val.n[9] + val2.n[9]
586 587 return f
588 }
589 590 // MulInt multiplies the field value by the passed int and stores the result in
591 // f. Note that this function can overflow if multiplying the value by any of
592 // the individual words exceeds a max uint32. Therefore it is important that
593 // the caller ensures no overflows will occur before using this function.
594 //
595 // The field value is returned to support chaining. This enables syntax like:
596 // f.MulInt(2).Add(f2) so that f = 2 * f + f2.
597 func (f *fieldVal) MulInt(val uint) *fieldVal {
598 // Since each word of the field representation can hold up to
599 // fieldOverflowBits extra bits which will be normalized out, it's safe
600 // to multiply each word without using a larger type or carry
601 // propagation so long as the values won't overflow a uint32. This
602 // could obviously be done in a loop, but the unrolled version is
603 // faster.
604 ui := uint32(val)
605 f.n[0] *= ui
606 f.n[1] *= ui
607 f.n[2] *= ui
608 f.n[3] *= ui
609 f.n[4] *= ui
610 f.n[5] *= ui
611 f.n[6] *= ui
612 f.n[7] *= ui
613 f.n[8] *= ui
614 f.n[9] *= ui
615 616 return f
617 }
618 619 // Mul multiplies the passed value to the existing field value and stores the
620 // result in f. Note that this function can overflow if multiplying any
621 // of the individual words exceeds a max uint32. In practice, this means the
622 // magnitude of either value involved in the multiplication must be a max of
623 // 8.
624 //
625 // The field value is returned to support chaining. This enables syntax like:
626 // f.Mul(f2).AddInt(1) so that f = (f * f2) + 1.
627 func (f *fieldVal) Mul(val *fieldVal) *fieldVal {
628 return f.Mul2(f, val)
629 }
630 631 // Mul2 multiplies the passed two field values together and stores the result
632 // result in f. Note that this function can overflow if multiplying any of
633 // the individual words exceeds a max uint32. In practice, this means the
634 // magnitude of either value involved in the multiplication must be a max of
635 // 8.
636 //
637 // The field value is returned to support chaining. This enables syntax like:
638 // f3.Mul2(f, f2).AddInt(1) so that f3 = (f * f2) + 1.
639 func (f *fieldVal) Mul2(val *fieldVal, val2 *fieldVal) *fieldVal {
640 // This could be done with a couple of for loops and an array to store
641 // the intermediate terms, but this unrolled version is significantly
642 // faster.
643 644 // Terms for 2^(fieldBase*0).
645 m := uint64(val.n[0]) * uint64(val2.n[0])
646 t0 := m & fieldBaseMask
647 648 // Terms for 2^(fieldBase*1).
649 m = (m >> fieldBase) +
650 uint64(val.n[0])*uint64(val2.n[1]) +
651 uint64(val.n[1])*uint64(val2.n[0])
652 t1 := m & fieldBaseMask
653 654 // Terms for 2^(fieldBase*2).
655 m = (m >> fieldBase) +
656 uint64(val.n[0])*uint64(val2.n[2]) +
657 uint64(val.n[1])*uint64(val2.n[1]) +
658 uint64(val.n[2])*uint64(val2.n[0])
659 t2 := m & fieldBaseMask
660 661 // Terms for 2^(fieldBase*3).
662 m = (m >> fieldBase) +
663 uint64(val.n[0])*uint64(val2.n[3]) +
664 uint64(val.n[1])*uint64(val2.n[2]) +
665 uint64(val.n[2])*uint64(val2.n[1]) +
666 uint64(val.n[3])*uint64(val2.n[0])
667 t3 := m & fieldBaseMask
668 669 // Terms for 2^(fieldBase*4).
670 m = (m >> fieldBase) +
671 uint64(val.n[0])*uint64(val2.n[4]) +
672 uint64(val.n[1])*uint64(val2.n[3]) +
673 uint64(val.n[2])*uint64(val2.n[2]) +
674 uint64(val.n[3])*uint64(val2.n[1]) +
675 uint64(val.n[4])*uint64(val2.n[0])
676 t4 := m & fieldBaseMask
677 678 // Terms for 2^(fieldBase*5).
679 m = (m >> fieldBase) +
680 uint64(val.n[0])*uint64(val2.n[5]) +
681 uint64(val.n[1])*uint64(val2.n[4]) +
682 uint64(val.n[2])*uint64(val2.n[3]) +
683 uint64(val.n[3])*uint64(val2.n[2]) +
684 uint64(val.n[4])*uint64(val2.n[1]) +
685 uint64(val.n[5])*uint64(val2.n[0])
686 t5 := m & fieldBaseMask
687 688 // Terms for 2^(fieldBase*6).
689 m = (m >> fieldBase) +
690 uint64(val.n[0])*uint64(val2.n[6]) +
691 uint64(val.n[1])*uint64(val2.n[5]) +
692 uint64(val.n[2])*uint64(val2.n[4]) +
693 uint64(val.n[3])*uint64(val2.n[3]) +
694 uint64(val.n[4])*uint64(val2.n[2]) +
695 uint64(val.n[5])*uint64(val2.n[1]) +
696 uint64(val.n[6])*uint64(val2.n[0])
697 t6 := m & fieldBaseMask
698 699 // Terms for 2^(fieldBase*7).
700 m = (m >> fieldBase) +
701 uint64(val.n[0])*uint64(val2.n[7]) +
702 uint64(val.n[1])*uint64(val2.n[6]) +
703 uint64(val.n[2])*uint64(val2.n[5]) +
704 uint64(val.n[3])*uint64(val2.n[4]) +
705 uint64(val.n[4])*uint64(val2.n[3]) +
706 uint64(val.n[5])*uint64(val2.n[2]) +
707 uint64(val.n[6])*uint64(val2.n[1]) +
708 uint64(val.n[7])*uint64(val2.n[0])
709 t7 := m & fieldBaseMask
710 711 // Terms for 2^(fieldBase*8).
712 m = (m >> fieldBase) +
713 uint64(val.n[0])*uint64(val2.n[8]) +
714 uint64(val.n[1])*uint64(val2.n[7]) +
715 uint64(val.n[2])*uint64(val2.n[6]) +
716 uint64(val.n[3])*uint64(val2.n[5]) +
717 uint64(val.n[4])*uint64(val2.n[4]) +
718 uint64(val.n[5])*uint64(val2.n[3]) +
719 uint64(val.n[6])*uint64(val2.n[2]) +
720 uint64(val.n[7])*uint64(val2.n[1]) +
721 uint64(val.n[8])*uint64(val2.n[0])
722 t8 := m & fieldBaseMask
723 724 // Terms for 2^(fieldBase*9).
725 m = (m >> fieldBase) +
726 uint64(val.n[0])*uint64(val2.n[9]) +
727 uint64(val.n[1])*uint64(val2.n[8]) +
728 uint64(val.n[2])*uint64(val2.n[7]) +
729 uint64(val.n[3])*uint64(val2.n[6]) +
730 uint64(val.n[4])*uint64(val2.n[5]) +
731 uint64(val.n[5])*uint64(val2.n[4]) +
732 uint64(val.n[6])*uint64(val2.n[3]) +
733 uint64(val.n[7])*uint64(val2.n[2]) +
734 uint64(val.n[8])*uint64(val2.n[1]) +
735 uint64(val.n[9])*uint64(val2.n[0])
736 t9 := m & fieldBaseMask
737 738 // Terms for 2^(fieldBase*10).
739 m = (m >> fieldBase) +
740 uint64(val.n[1])*uint64(val2.n[9]) +
741 uint64(val.n[2])*uint64(val2.n[8]) +
742 uint64(val.n[3])*uint64(val2.n[7]) +
743 uint64(val.n[4])*uint64(val2.n[6]) +
744 uint64(val.n[5])*uint64(val2.n[5]) +
745 uint64(val.n[6])*uint64(val2.n[4]) +
746 uint64(val.n[7])*uint64(val2.n[3]) +
747 uint64(val.n[8])*uint64(val2.n[2]) +
748 uint64(val.n[9])*uint64(val2.n[1])
749 t10 := m & fieldBaseMask
750 751 // Terms for 2^(fieldBase*11).
752 m = (m >> fieldBase) +
753 uint64(val.n[2])*uint64(val2.n[9]) +
754 uint64(val.n[3])*uint64(val2.n[8]) +
755 uint64(val.n[4])*uint64(val2.n[7]) +
756 uint64(val.n[5])*uint64(val2.n[6]) +
757 uint64(val.n[6])*uint64(val2.n[5]) +
758 uint64(val.n[7])*uint64(val2.n[4]) +
759 uint64(val.n[8])*uint64(val2.n[3]) +
760 uint64(val.n[9])*uint64(val2.n[2])
761 t11 := m & fieldBaseMask
762 763 // Terms for 2^(fieldBase*12).
764 m = (m >> fieldBase) +
765 uint64(val.n[3])*uint64(val2.n[9]) +
766 uint64(val.n[4])*uint64(val2.n[8]) +
767 uint64(val.n[5])*uint64(val2.n[7]) +
768 uint64(val.n[6])*uint64(val2.n[6]) +
769 uint64(val.n[7])*uint64(val2.n[5]) +
770 uint64(val.n[8])*uint64(val2.n[4]) +
771 uint64(val.n[9])*uint64(val2.n[3])
772 t12 := m & fieldBaseMask
773 774 // Terms for 2^(fieldBase*13).
775 m = (m >> fieldBase) +
776 uint64(val.n[4])*uint64(val2.n[9]) +
777 uint64(val.n[5])*uint64(val2.n[8]) +
778 uint64(val.n[6])*uint64(val2.n[7]) +
779 uint64(val.n[7])*uint64(val2.n[6]) +
780 uint64(val.n[8])*uint64(val2.n[5]) +
781 uint64(val.n[9])*uint64(val2.n[4])
782 t13 := m & fieldBaseMask
783 784 // Terms for 2^(fieldBase*14).
785 m = (m >> fieldBase) +
786 uint64(val.n[5])*uint64(val2.n[9]) +
787 uint64(val.n[6])*uint64(val2.n[8]) +
788 uint64(val.n[7])*uint64(val2.n[7]) +
789 uint64(val.n[8])*uint64(val2.n[6]) +
790 uint64(val.n[9])*uint64(val2.n[5])
791 t14 := m & fieldBaseMask
792 793 // Terms for 2^(fieldBase*15).
794 m = (m >> fieldBase) +
795 uint64(val.n[6])*uint64(val2.n[9]) +
796 uint64(val.n[7])*uint64(val2.n[8]) +
797 uint64(val.n[8])*uint64(val2.n[7]) +
798 uint64(val.n[9])*uint64(val2.n[6])
799 t15 := m & fieldBaseMask
800 801 // Terms for 2^(fieldBase*16).
802 m = (m >> fieldBase) +
803 uint64(val.n[7])*uint64(val2.n[9]) +
804 uint64(val.n[8])*uint64(val2.n[8]) +
805 uint64(val.n[9])*uint64(val2.n[7])
806 t16 := m & fieldBaseMask
807 808 // Terms for 2^(fieldBase*17).
809 m = (m >> fieldBase) +
810 uint64(val.n[8])*uint64(val2.n[9]) +
811 uint64(val.n[9])*uint64(val2.n[8])
812 t17 := m & fieldBaseMask
813 814 // Terms for 2^(fieldBase*18).
815 m = (m >> fieldBase) + uint64(val.n[9])*uint64(val2.n[9])
816 t18 := m & fieldBaseMask
817 818 // What's left is for 2^(fieldBase*19).
819 t19 := m >> fieldBase
820 821 // At this point, all of the terms are grouped into their respective
822 // base.
823 //
824 // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
825 // when the modulus is of the special form m = b^t - c, highly efficient
826 // reduction can be achieved per the provided algorithm.
827 //
828 // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
829 // this criteria.
830 //
831 // 4294968273 in field representation (base 2^26) is:
832 // n[0] = 977
833 // n[1] = 64
834 // That is to say (2^26 * 64) + 977 = 4294968273
835 //
836 // Since each word is in base 26, the upper terms (t10 and up) start
837 // at 260 bits (versus the final desired range of 256 bits), so the
838 // field representation of 'c' from above needs to be adjusted for the
839 // extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 =
840 // 68719492368. Thus, the adjusted field representation of 'c' is:
841 // n[0] = 977 * 16 = 15632
842 // n[1] = 64 * 16 = 1024
843 // That is to say (2^26 * 1024) + 15632 = 68719492368
844 //
845 // To reduce the final term, t19, the entire 'c' value is needed instead
846 // of only n[0] because there are no more terms left to handle n[1].
847 // This means there might be some magnitude left in the upper bits that
848 // is handled below.
849 m = t0 + t10*15632
850 t0 = m & fieldBaseMask
851 m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
852 t1 = m & fieldBaseMask
853 m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
854 t2 = m & fieldBaseMask
855 m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
856 t3 = m & fieldBaseMask
857 m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
858 t4 = m & fieldBaseMask
859 m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
860 t5 = m & fieldBaseMask
861 m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
862 t6 = m & fieldBaseMask
863 m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
864 t7 = m & fieldBaseMask
865 m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
866 t8 = m & fieldBaseMask
867 m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
868 t9 = m & fieldMSBMask
869 m = m >> fieldMSBBits
870 871 // At this point, if the magnitude is greater than 0, the overall value
872 // is greater than the max possible 256-bit value. In particular, it is
873 // "how many times larger" than the max value it is.
874 //
875 // The algorithm presented in [HAC] section 14.3.4 repeats until the
876 // quotient is zero. However, due to the above, we already know at
877 // least how many times we would need to repeat as it's the value
878 // currently in m. Thus we can simply multiply the magnitude by the
879 // field representation of the prime and do a single iteration. Notice
880 // that nothing will be changed when the magnitude is zero, so we could
881 // skip this in that case, however always running regardless allows it
882 // to run in constant time. The final result will be in the range
883 // 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
884 // magnitude of 1, but it is denormalized.
885 d := t0 + m*977
886 f.n[0] = uint32(d & fieldBaseMask)
887 d = (d >> fieldBase) + t1 + m*64
888 f.n[1] = uint32(d & fieldBaseMask)
889 f.n[2] = uint32((d >> fieldBase) + t2)
890 f.n[3] = uint32(t3)
891 f.n[4] = uint32(t4)
892 f.n[5] = uint32(t5)
893 f.n[6] = uint32(t6)
894 f.n[7] = uint32(t7)
895 f.n[8] = uint32(t8)
896 f.n[9] = uint32(t9)
897 898 return f
899 }
900 901 // Square squares the field value. The existing field value is modified. Note
902 // that this function can overflow if multiplying any of the individual words
903 // exceeds a max uint32. In practice, this means the magnitude of the field
904 // must be a max of 8 to prevent overflow.
905 //
906 // The field value is returned to support chaining. This enables syntax like:
907 // f.Square().Mul(f2) so that f = f^2 * f2.
908 func (f *fieldVal) Square() *fieldVal {
909 return f.SquareVal(f)
910 }
911 912 // SquareVal squares the passed value and stores the result in f. Note that
913 // this function can overflow if multiplying any of the individual words
914 // exceeds a max uint32. In practice, this means the magnitude of the field
915 // being squred must be a max of 8 to prevent overflow.
916 //
917 // The field value is returned to support chaining. This enables syntax like:
918 // f3.SquareVal(f).Mul(f) so that f3 = f^2 * f = f^3.
919 func (f *fieldVal) SquareVal(val *fieldVal) *fieldVal {
920 // This could be done with a couple of for loops and an array to store
921 // the intermediate terms, but this unrolled version is significantly
922 // faster.
923 924 // Terms for 2^(fieldBase*0).
925 m := uint64(val.n[0]) * uint64(val.n[0])
926 t0 := m & fieldBaseMask
927 928 // Terms for 2^(fieldBase*1).
929 m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[1])
930 t1 := m & fieldBaseMask
931 932 // Terms for 2^(fieldBase*2).
933 m = (m >> fieldBase) +
934 2*uint64(val.n[0])*uint64(val.n[2]) +
935 uint64(val.n[1])*uint64(val.n[1])
936 t2 := m & fieldBaseMask
937 938 // Terms for 2^(fieldBase*3).
939 m = (m >> fieldBase) +
940 2*uint64(val.n[0])*uint64(val.n[3]) +
941 2*uint64(val.n[1])*uint64(val.n[2])
942 t3 := m & fieldBaseMask
943 944 // Terms for 2^(fieldBase*4).
945 m = (m >> fieldBase) +
946 2*uint64(val.n[0])*uint64(val.n[4]) +
947 2*uint64(val.n[1])*uint64(val.n[3]) +
948 uint64(val.n[2])*uint64(val.n[2])
949 t4 := m & fieldBaseMask
950 951 // Terms for 2^(fieldBase*5).
952 m = (m >> fieldBase) +
953 2*uint64(val.n[0])*uint64(val.n[5]) +
954 2*uint64(val.n[1])*uint64(val.n[4]) +
955 2*uint64(val.n[2])*uint64(val.n[3])
956 t5 := m & fieldBaseMask
957 958 // Terms for 2^(fieldBase*6).
959 m = (m >> fieldBase) +
960 2*uint64(val.n[0])*uint64(val.n[6]) +
961 2*uint64(val.n[1])*uint64(val.n[5]) +
962 2*uint64(val.n[2])*uint64(val.n[4]) +
963 uint64(val.n[3])*uint64(val.n[3])
964 t6 := m & fieldBaseMask
965 966 // Terms for 2^(fieldBase*7).
967 m = (m >> fieldBase) +
968 2*uint64(val.n[0])*uint64(val.n[7]) +
969 2*uint64(val.n[1])*uint64(val.n[6]) +
970 2*uint64(val.n[2])*uint64(val.n[5]) +
971 2*uint64(val.n[3])*uint64(val.n[4])
972 t7 := m & fieldBaseMask
973 974 // Terms for 2^(fieldBase*8).
975 m = (m >> fieldBase) +
976 2*uint64(val.n[0])*uint64(val.n[8]) +
977 2*uint64(val.n[1])*uint64(val.n[7]) +
978 2*uint64(val.n[2])*uint64(val.n[6]) +
979 2*uint64(val.n[3])*uint64(val.n[5]) +
980 uint64(val.n[4])*uint64(val.n[4])
981 t8 := m & fieldBaseMask
982 983 // Terms for 2^(fieldBase*9).
984 m = (m >> fieldBase) +
985 2*uint64(val.n[0])*uint64(val.n[9]) +
986 2*uint64(val.n[1])*uint64(val.n[8]) +
987 2*uint64(val.n[2])*uint64(val.n[7]) +
988 2*uint64(val.n[3])*uint64(val.n[6]) +
989 2*uint64(val.n[4])*uint64(val.n[5])
990 t9 := m & fieldBaseMask
991 992 // Terms for 2^(fieldBase*10).
993 m = (m >> fieldBase) +
994 2*uint64(val.n[1])*uint64(val.n[9]) +
995 2*uint64(val.n[2])*uint64(val.n[8]) +
996 2*uint64(val.n[3])*uint64(val.n[7]) +
997 2*uint64(val.n[4])*uint64(val.n[6]) +
998 uint64(val.n[5])*uint64(val.n[5])
999 t10 := m & fieldBaseMask
1000 1001 // Terms for 2^(fieldBase*11).
1002 m = (m >> fieldBase) +
1003 2*uint64(val.n[2])*uint64(val.n[9]) +
1004 2*uint64(val.n[3])*uint64(val.n[8]) +
1005 2*uint64(val.n[4])*uint64(val.n[7]) +
1006 2*uint64(val.n[5])*uint64(val.n[6])
1007 t11 := m & fieldBaseMask
1008 1009 // Terms for 2^(fieldBase*12).
1010 m = (m >> fieldBase) +
1011 2*uint64(val.n[3])*uint64(val.n[9]) +
1012 2*uint64(val.n[4])*uint64(val.n[8]) +
1013 2*uint64(val.n[5])*uint64(val.n[7]) +
1014 uint64(val.n[6])*uint64(val.n[6])
1015 t12 := m & fieldBaseMask
1016 1017 // Terms for 2^(fieldBase*13).
1018 m = (m >> fieldBase) +
1019 2*uint64(val.n[4])*uint64(val.n[9]) +
1020 2*uint64(val.n[5])*uint64(val.n[8]) +
1021 2*uint64(val.n[6])*uint64(val.n[7])
1022 t13 := m & fieldBaseMask
1023 1024 // Terms for 2^(fieldBase*14).
1025 m = (m >> fieldBase) +
1026 2*uint64(val.n[5])*uint64(val.n[9]) +
1027 2*uint64(val.n[6])*uint64(val.n[8]) +
1028 uint64(val.n[7])*uint64(val.n[7])
1029 t14 := m & fieldBaseMask
1030 1031 // Terms for 2^(fieldBase*15).
1032 m = (m >> fieldBase) +
1033 2*uint64(val.n[6])*uint64(val.n[9]) +
1034 2*uint64(val.n[7])*uint64(val.n[8])
1035 t15 := m & fieldBaseMask
1036 1037 // Terms for 2^(fieldBase*16).
1038 m = (m >> fieldBase) +
1039 2*uint64(val.n[7])*uint64(val.n[9]) +
1040 uint64(val.n[8])*uint64(val.n[8])
1041 t16 := m & fieldBaseMask
1042 1043 // Terms for 2^(fieldBase*17).
1044 m = (m >> fieldBase) + 2*uint64(val.n[8])*uint64(val.n[9])
1045 t17 := m & fieldBaseMask
1046 1047 // Terms for 2^(fieldBase*18).
1048 m = (m >> fieldBase) + uint64(val.n[9])*uint64(val.n[9])
1049 t18 := m & fieldBaseMask
1050 1051 // What's left is for 2^(fieldBase*19).
1052 t19 := m >> fieldBase
1053 1054 // At this point, all of the terms are grouped into their respective
1055 // base.
1056 //
1057 // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
1058 // when the modulus is of the special form m = b^t - c, highly efficient
1059 // reduction can be achieved per the provided algorithm.
1060 //
1061 // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
1062 // this criteria.
1063 //
1064 // 4294968273 in field representation (base 2^26) is:
1065 // n[0] = 977
1066 // n[1] = 64
1067 // That is to say (2^26 * 64) + 977 = 4294968273
1068 //
1069 // Since each word is in base 26, the upper terms (t10 and up) start
1070 // at 260 bits (versus the final desired range of 256 bits), so the
1071 // field representation of 'c' from above needs to be adjusted for the
1072 // extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 =
1073 // 68719492368. Thus, the adjusted field representation of 'c' is:
1074 // n[0] = 977 * 16 = 15632
1075 // n[1] = 64 * 16 = 1024
1076 // That is to say (2^26 * 1024) + 15632 = 68719492368
1077 //
1078 // To reduce the final term, t19, the entire 'c' value is needed instead
1079 // of only n[0] because there are no more terms left to handle n[1].
1080 // This means there might be some magnitude left in the upper bits that
1081 // is handled below.
1082 m = t0 + t10*15632
1083 t0 = m & fieldBaseMask
1084 m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
1085 t1 = m & fieldBaseMask
1086 m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
1087 t2 = m & fieldBaseMask
1088 m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
1089 t3 = m & fieldBaseMask
1090 m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
1091 t4 = m & fieldBaseMask
1092 m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
1093 t5 = m & fieldBaseMask
1094 m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
1095 t6 = m & fieldBaseMask
1096 m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
1097 t7 = m & fieldBaseMask
1098 m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
1099 t8 = m & fieldBaseMask
1100 m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
1101 t9 = m & fieldMSBMask
1102 m = m >> fieldMSBBits
1103 1104 // At this point, if the magnitude is greater than 0, the overall value
1105 // is greater than the max possible 256-bit value. In particular, it is
1106 // "how many times larger" than the max value it is.
1107 //
1108 // The algorithm presented in [HAC] section 14.3.4 repeats until the
1109 // quotient is zero. However, due to the above, we already know at
1110 // least how many times we would need to repeat as it's the value
1111 // currently in m. Thus we can simply multiply the magnitude by the
1112 // field representation of the prime and do a single iteration. Notice
1113 // that nothing will be changed when the magnitude is zero, so we could
1114 // skip this in that case, however always running regardless allows it
1115 // to run in constant time. The final result will be in the range
1116 // 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
1117 // magnitude of 1, but it is denormalized.
1118 n := t0 + m*977
1119 f.n[0] = uint32(n & fieldBaseMask)
1120 n = (n >> fieldBase) + t1 + m*64
1121 f.n[1] = uint32(n & fieldBaseMask)
1122 f.n[2] = uint32((n >> fieldBase) + t2)
1123 f.n[3] = uint32(t3)
1124 f.n[4] = uint32(t4)
1125 f.n[5] = uint32(t5)
1126 f.n[6] = uint32(t6)
1127 f.n[7] = uint32(t7)
1128 f.n[8] = uint32(t8)
1129 f.n[9] = uint32(t9)
1130 1131 return f
1132 }
1133 1134 // Inverse finds the modular multiplicative inverse of the field value. The
1135 // existing field value is modified.
1136 //
1137 // The field value is returned to support chaining. This enables syntax like:
1138 // f.Inverse().Mul(f2) so that f = f^-1 * f2.
1139 func (f *fieldVal) Inverse() *fieldVal {
1140 // Fermat's little theorem states that for a nonzero number a and prime
1141 // prime p, a^(p-1) = 1 (mod p). Since the multipliciative inverse is
1142 // a*b = 1 (mod p), it follows that b = a*a^(p-2) = a^(p-1) = 1 (mod p).
1143 // Thus, a^(p-2) is the multiplicative inverse.
1144 //
1145 // In order to efficiently compute a^(p-2), p-2 needs to be split into
1146 // a sequence of squares and multipications that minimizes the number of
1147 // multiplications needed (since they are more costly than squarings).
1148 // Intermediate results are saved and reused as well.
1149 //
1150 // The secp256k1 prime - 2 is 2^256 - 4294968275.
1151 //
1152 // This has a cost of 258 field squarings and 33 field multiplications.
1153 var a2, a3, a4, a10, a11, a21, a42, a45, a63, a1019, a1023 fieldVal
1154 a2.SquareVal(f)
1155 a3.Mul2(&a2, f)
1156 a4.SquareVal(&a2)
1157 a10.SquareVal(&a4).Mul(&a2)
1158 a11.Mul2(&a10, f)
1159 a21.Mul2(&a10, &a11)
1160 a42.SquareVal(&a21)
1161 a45.Mul2(&a42, &a3)
1162 a63.Mul2(&a42, &a21)
1163 a1019.SquareVal(&a63).Square().Square().Square().Mul(&a11)
1164 a1023.Mul2(&a1019, &a4)
1165 f.Set(&a63) // f = a^(2^6 - 1)
1166 f.Square().Square().Square().Square().Square() // f = a^(2^11 - 32)
1167 f.Square().Square().Square().Square().Square() // f = a^(2^16 - 1024)
1168 f.Mul(&a1023) // f = a^(2^16 - 1)
1169 f.Square().Square().Square().Square().Square() // f = a^(2^21 - 32)
1170 f.Square().Square().Square().Square().Square() // f = a^(2^26 - 1024)
1171 f.Mul(&a1023) // f = a^(2^26 - 1)
1172 f.Square().Square().Square().Square().Square() // f = a^(2^31 - 32)
1173 f.Square().Square().Square().Square().Square() // f = a^(2^36 - 1024)
1174 f.Mul(&a1023) // f = a^(2^36 - 1)
1175 f.Square().Square().Square().Square().Square() // f = a^(2^41 - 32)
1176 f.Square().Square().Square().Square().Square() // f = a^(2^46 - 1024)
1177 f.Mul(&a1023) // f = a^(2^46 - 1)
1178 f.Square().Square().Square().Square().Square() // f = a^(2^51 - 32)
1179 f.Square().Square().Square().Square().Square() // f = a^(2^56 - 1024)
1180 f.Mul(&a1023) // f = a^(2^56 - 1)
1181 f.Square().Square().Square().Square().Square() // f = a^(2^61 - 32)
1182 f.Square().Square().Square().Square().Square() // f = a^(2^66 - 1024)
1183 f.Mul(&a1023) // f = a^(2^66 - 1)
1184 f.Square().Square().Square().Square().Square() // f = a^(2^71 - 32)
1185 f.Square().Square().Square().Square().Square() // f = a^(2^76 - 1024)
1186 f.Mul(&a1023) // f = a^(2^76 - 1)
1187 f.Square().Square().Square().Square().Square() // f = a^(2^81 - 32)
1188 f.Square().Square().Square().Square().Square() // f = a^(2^86 - 1024)
1189 f.Mul(&a1023) // f = a^(2^86 - 1)
1190 f.Square().Square().Square().Square().Square() // f = a^(2^91 - 32)
1191 f.Square().Square().Square().Square().Square() // f = a^(2^96 - 1024)
1192 f.Mul(&a1023) // f = a^(2^96 - 1)
1193 f.Square().Square().Square().Square().Square() // f = a^(2^101 - 32)
1194 f.Square().Square().Square().Square().Square() // f = a^(2^106 - 1024)
1195 f.Mul(&a1023) // f = a^(2^106 - 1)
1196 f.Square().Square().Square().Square().Square() // f = a^(2^111 - 32)
1197 f.Square().Square().Square().Square().Square() // f = a^(2^116 - 1024)
1198 f.Mul(&a1023) // f = a^(2^116 - 1)
1199 f.Square().Square().Square().Square().Square() // f = a^(2^121 - 32)
1200 f.Square().Square().Square().Square().Square() // f = a^(2^126 - 1024)
1201 f.Mul(&a1023) // f = a^(2^126 - 1)
1202 f.Square().Square().Square().Square().Square() // f = a^(2^131 - 32)
1203 f.Square().Square().Square().Square().Square() // f = a^(2^136 - 1024)
1204 f.Mul(&a1023) // f = a^(2^136 - 1)
1205 f.Square().Square().Square().Square().Square() // f = a^(2^141 - 32)
1206 f.Square().Square().Square().Square().Square() // f = a^(2^146 - 1024)
1207 f.Mul(&a1023) // f = a^(2^146 - 1)
1208 f.Square().Square().Square().Square().Square() // f = a^(2^151 - 32)
1209 f.Square().Square().Square().Square().Square() // f = a^(2^156 - 1024)
1210 f.Mul(&a1023) // f = a^(2^156 - 1)
1211 f.Square().Square().Square().Square().Square() // f = a^(2^161 - 32)
1212 f.Square().Square().Square().Square().Square() // f = a^(2^166 - 1024)
1213 f.Mul(&a1023) // f = a^(2^166 - 1)
1214 f.Square().Square().Square().Square().Square() // f = a^(2^171 - 32)
1215 f.Square().Square().Square().Square().Square() // f = a^(2^176 - 1024)
1216 f.Mul(&a1023) // f = a^(2^176 - 1)
1217 f.Square().Square().Square().Square().Square() // f = a^(2^181 - 32)
1218 f.Square().Square().Square().Square().Square() // f = a^(2^186 - 1024)
1219 f.Mul(&a1023) // f = a^(2^186 - 1)
1220 f.Square().Square().Square().Square().Square() // f = a^(2^191 - 32)
1221 f.Square().Square().Square().Square().Square() // f = a^(2^196 - 1024)
1222 f.Mul(&a1023) // f = a^(2^196 - 1)
1223 f.Square().Square().Square().Square().Square() // f = a^(2^201 - 32)
1224 f.Square().Square().Square().Square().Square() // f = a^(2^206 - 1024)
1225 f.Mul(&a1023) // f = a^(2^206 - 1)
1226 f.Square().Square().Square().Square().Square() // f = a^(2^211 - 32)
1227 f.Square().Square().Square().Square().Square() // f = a^(2^216 - 1024)
1228 f.Mul(&a1023) // f = a^(2^216 - 1)
1229 f.Square().Square().Square().Square().Square() // f = a^(2^221 - 32)
1230 f.Square().Square().Square().Square().Square() // f = a^(2^226 - 1024)
1231 f.Mul(&a1019) // f = a^(2^226 - 5)
1232 f.Square().Square().Square().Square().Square() // f = a^(2^231 - 160)
1233 f.Square().Square().Square().Square().Square() // f = a^(2^236 - 5120)
1234 f.Mul(&a1023) // f = a^(2^236 - 4097)
1235 f.Square().Square().Square().Square().Square() // f = a^(2^241 - 131104)
1236 f.Square().Square().Square().Square().Square() // f = a^(2^246 - 4195328)
1237 f.Mul(&a1023) // f = a^(2^246 - 4194305)
1238 f.Square().Square().Square().Square().Square() // f = a^(2^251 - 134217760)
1239 f.Square().Square().Square().Square().Square() // f = a^(2^256 - 4294968320)
1240 return f.Mul(&a45) // f = a^(2^256 - 4294968275) = a^(p-2)
1241 }
1242 1243 // SqrtVal computes the square root of x modulo the curve's prime, and stores
1244 // the result in f. The square root is computed via exponentiation of x by the
1245 // value Q = (P+1)/4 using the curve's precomputed big-endian representation of
1246 // the Q. This method uses a modified version of square-and-multiply
1247 // exponentiation over secp256k1 fieldVals to operate on bytes instead of bits,
1248 // which offers better performance over both big.Int exponentiation and bit-wise
1249 // square-and-multiply.
1250 //
1251 // NOTE: This method only works when P is intended to be the secp256k1 prime and
1252 // is not constant time. The returned value is of magnitude 1, but is
1253 // denormalized.
1254 func (f *fieldVal) SqrtVal(x *fieldVal) *fieldVal {
1255 // The following computation iteratively computes x^((P+1)/4) = x^Q
1256 // using the recursive, piece-wise definition:
1257 //
1258 // x^n = (x^2)^(n/2) mod P if n is even
1259 // x^n = x(x^2)^(n-1/2) mod P if n is odd
1260 //
1261 // Given n in its big-endian representation b_k, ..., b_0, x^n can be
1262 // computed by defining the sequence r_k+1, ..., r_0, where:
1263 //
1264 // r_k+1 = 1
1265 // r_i = (r_i+1)^2 * x^b_i for i = k, ..., 0
1266 //
1267 // The final value r_0 = x^n.
1268 //
1269 // See https://en.wikipedia.org/wiki/Exponentiation_by_squaring for more
1270 // details.
1271 //
1272 // This can be further optimized, by observing that the value of Q in
1273 // secp256k1 has the value:
1274 //
1275 // Q = 3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffff0c
1276 //
1277 // We can unroll the typical bit-wise interpretation of the
1278 // exponentiation algorithm above to instead operate on bytes.
1279 // This reduces the number of comparisons by an order of magnitude,
1280 // reducing the overhead of failed branch predictions and additional
1281 // comparisons in this method.
1282 //
1283 // Since there there are only 4 unique bytes of Q, this keeps the jump
1284 // table small without the need to handle all possible 8-bit values.
1285 // Further, we observe that 29 of the 32 bytes are 0xff; making the
1286 // first case handle 0xff therefore optimizes the hot path.
1287 f.SetInt(1)
1288 for _, b := range fieldQBytes {
1289 switch b {
1290 1291 // Most common case, where all 8 bits are set.
1292 case 0xff:
1293 f.Square().Mul(x)
1294 f.Square().Mul(x)
1295 f.Square().Mul(x)
1296 f.Square().Mul(x)
1297 f.Square().Mul(x)
1298 f.Square().Mul(x)
1299 f.Square().Mul(x)
1300 f.Square().Mul(x)
1301 1302 // First byte of Q (0x3f), where all but the top two bits are
1303 // set. Note that this case only applies six operations, since
1304 // the highest bit of Q resides in bit six of the first byte. We
1305 // ignore the first two bits, since squaring for these bits will
1306 // result in an invalid result. We forgo squaring f before the
1307 // first multiply, since 1^2 = 1.
1308 case 0x3f:
1309 f.Mul(x)
1310 f.Square().Mul(x)
1311 f.Square().Mul(x)
1312 f.Square().Mul(x)
1313 f.Square().Mul(x)
1314 f.Square().Mul(x)
1315 1316 // Byte 28 of Q (0xbf), where only bit 7 is unset.
1317 case 0xbf:
1318 f.Square().Mul(x)
1319 f.Square()
1320 f.Square().Mul(x)
1321 f.Square().Mul(x)
1322 f.Square().Mul(x)
1323 f.Square().Mul(x)
1324 f.Square().Mul(x)
1325 f.Square().Mul(x)
1326 1327 // Byte 31 of Q (0x0c), where only bits 3 and 4 are set.
1328 default:
1329 f.Square()
1330 f.Square()
1331 f.Square()
1332 f.Square()
1333 f.Square().Mul(x)
1334 f.Square().Mul(x)
1335 f.Square()
1336 f.Square()
1337 }
1338 }
1339 1340 return f
1341 }
1342 1343 // Sqrt computes the square root of f modulo the curve's prime, and stores the
1344 // result in f. The square root is computed via exponentiation of x by the value
1345 // Q = (P+1)/4 using the curve's precomputed big-endian representation of the Q.
1346 // This method uses a modified version of square-and-multiply exponentiation
1347 // over secp256k1 fieldVals to operate on bytes instead of bits, which offers
1348 // better performance over both big.Int exponentiation and bit-wise
1349 // square-and-multiply.
1350 //
1351 // NOTE: This method only works when P is intended to be the secp256k1 prime and
1352 // is not constant time. The returned value is of magnitude 1, but is
1353 // denormalized.
1354 func (f *fieldVal) Sqrt() *fieldVal {
1355 return f.SqrtVal(f)
1356 }
1357