// field_mul.go
1 //go:build !js && !wasm && !tinygo && !wasm32
2
3 package p256k1
4
5 import "math/bits"
6
// uint128 is a 128-bit unsigned accumulator for the 5x52 field
// arithmetic below: high holds bits 127..64, low holds bits 63..0.
type uint128 struct {
	high, low uint64
}

// mulU64ToU128 returns the full 128-bit product a*b.
func mulU64ToU128(a, b uint64) uint128 {
	h, l := bits.Mul64(a, b)
	return uint128{high: h, low: l}
}

// addMulU128 returns c + a*b, wrapping modulo 2^128.
func addMulU128(c uint128, a, b uint64) uint128 {
	ph, pl := bits.Mul64(a, b)

	var sum uint128
	var carry uint64
	sum.low, carry = bits.Add64(c.low, pl, 0)
	sum.high, _ = bits.Add64(c.high, ph, carry)
	return sum
}

// addU128 returns c + a, wrapping modulo 2^128.
func addU128(c uint128, a uint64) uint128 {
	l, carry := bits.Add64(c.low, a, 0)
	return uint128{high: c.high + carry, low: l}
}

// lo returns the least-significant 64 bits of u.
func (u uint128) lo() uint64 { return u.low }

// hi returns the most-significant 64 bits of u.
func (u uint128) hi() uint64 { return u.high }

// rshift returns u logically shifted right by n bits. Go defines
// uint64 shifts with out-of-range counts as zero, so n == 0 (the
// u.high<<64 term vanishes) and n >= 128 also behave correctly.
func (u uint128) rshift(n uint) uint128 {
	if n < 64 {
		return uint128{
			high: u.high >> n,
			low:  u.low>>n | u.high<<(64-n),
		}
	}
	return uint128{low: u.high >> (n - 64)}
}
58
// mul multiplies two field elements: r = a * b.
//
// The pure-Go path follows the C secp256k1_fe_mul_inner algorithm for the
// 5x52 representation. Inputs with magnitude <= 8 are used in place; higher
// magnitudes are weakly normalized into a scratch copy first (8 is
// presumably the maximum magnitude the mul algorithm tolerates without
// overflow, matching the C implementation's contract — verify against the
// field element documentation).
//
// The result always has magnitude 1 and is not fully normalized.
func (r *FieldElement) mul(a, b *FieldElement) {
	// Use pointers directly if magnitude is low enough; only copy when a
	// weak normalization is actually required.
	var aNorm, bNorm *FieldElement
	var aTemp, bTemp FieldElement

	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	if b.magnitude > 8 {
		bTemp = *b
		bTemp.normalizeWeak()
		bNorm = &bTemp
	} else {
		bNorm = b // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): this path does not set r.magnitude/r.normalized here,
	// unlike the two asm paths below — presumably field4x64Mul sets them
	// itself; confirm.
	if hasField4x64() {
		field4x64Mul(r, aNorm, bNorm)
		return
	}

	// Use BMI2+ADX assembly if available.
	if hasFieldAsmBMI2() {
		fieldMulAsmBMI2(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Use regular assembly if available.
	if hasFieldAsm() {
		fieldMulAsm(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Pure-Go fallback. Extract limbs for easier access.
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]
	b0, b1, b2, b3, b4 := bNorm.n[0], bNorm.n[1], bNorm.n[2], bNorm.n[3], bNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                 // 2^52 - 1, the per-limb mask
	const R = fieldReductionConstantShifted   // 0x1000003D10 = (2^32 + 977) << 4

	// Following the C implementation algorithm exactly.
	// [... a b c] is shorthand for ... + a<<104 + b<<52 + c<<0 mod p.
	// c and d are 128-bit accumulators for the low and high product halves.

	// Compute p3 = a0*b3 + a1*b2 + a2*b1 + a3*b0
	var c, d uint128
	d = mulU64ToU128(a0, b3)
	d = addMulU128(d, a1, b2)
	d = addMulU128(d, a2, b1)
	d = addMulU128(d, a3, b0)

	// Compute p8 = a4*b4 (the term at weight 2^416, folded down via R)
	c = mulU64ToU128(a4, b4)

	// d += R * c_lo; c >>= 64 — fold the low 64 bits of p8 into weight 3
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 (limb 3 of the result before final carries) and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
	d = addMulU128(d, a0, b4)
	d = addMulU128(d, a1, b3)
	d = addMulU128(d, a2, b2)
	d = addMulU128(d, a3, b1)
	d = addMulU128(d, a4, b0)

	// d += (R << 12) * c_lo — fold the high part of p8 (shift accounts for
	// the 64-52=12 bit offset)
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx (the top 4 bits of t4, handled separately because
	// limb 4 is only 48 bits wide)
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*b0
	c = mulU64ToU128(a0, b0)

	// Compute p5 = a1*b4 + a2*b3 + a3*b2 + a4*b1
	d = addMulU128(d, a1, b4)
	d = addMulU128(d, a2, b3)
	d = addMulU128(d, a3, b2)
	d = addMulU128(d, a4, b1)

	// Extract u0 and recombine with tx so the fold below uses R>>4 (keeps
	// the multiplication within 64 bits)
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*b1 + a1*b0
	c = addMulU128(c, a0, b1)
	c = addMulU128(c, a1, b0)

	// Compute p6 = a2*b4 + a3*b3 + a4*b2
	d = addMulU128(d, a2, b4)
	d = addMulU128(d, a3, b3)
	d = addMulU128(d, a4, b2)

	// c += R * (d & M); d >>= 52 — fold p6 down into weight 1
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*b2 + a1*b1 + a2*b0
	c = addMulU128(c, a0, b2)
	c = addMulU128(c, a1, b1)
	c = addMulU128(c, a2, b0)

	// Compute p7 = a3*b4 + a4*b3
	d = addMulU128(d, a3, b4)
	d = addMulU128(d, a4, b3)

	// c += R * d_lo; d >>= 64 — fold the low 64 bits of p7
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3 — fold the high part of p7, add back t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4] — the final carry plus the saved 48-bit limb
	r.n[4] = c.lo() + t4

	// Result has magnitude 1 but may not be fully reduced below p.
	r.magnitude = 1
	r.normalized = false
}
219
// reduceFromWide reduces a 520-bit (10 limb) value modulo the field prime.
//
// The field prime is p = 2^256 - 2^32 - 977, so 2^256 ≡ 2^32 + 977 (mod p);
// the loop folds the upper limbs t[5..9] down into t[0..4] using that
// identity, then repacks t[0..4] into the 5x52 representation.
//
// NOTE(review): several things here look inconsistent and should be
// verified against this function's callers (none are visible in this file):
//   - For i == 6 the simple branch computes `M << shift` with shift == 56,
//     but M is a 33-bit constant, so the product factor is truncated in
//     uint64 — suspected overflow bug.
//   - The loop's own comments treat t[i] as having weight 2^(52*i), while
//     the final repacking treats t[0..4] as five contiguous 64-bit words
//     (weight 2^(64*j)). Both cannot be right; confirm the intended input
//     limb convention.
//   - `limbShift := shift / 52` indexes into t, which the repacking treats
//     as 64-bit-dense — mixed radices; verify.
func (r *FieldElement) reduceFromWide(t [10]uint64) {
	// Reduction constant for secp256k1: 2^32 + 977 = 0x1000003D1
	const M = uint64(0x1000003D1)

	// Fold the highest limbs down first. t is passed by value, so these
	// mutations are local scratch state.
	for i := 9; i >= 5; i-- {
		if t[i] == 0 {
			continue
		}

		// t[i] contributes t[i] * 2^(52*(i-5)) * 2^260, and
		// 2^260 ≡ 2^4 * M (mod p) — hence the extra 4 bits in shift.
		shift := uint(52*(i-5) + 4)

		var carry uint64
		if shift < 64 {
			// Simple case: fold t[i]*M<<shift directly into the bottom.
			// NOTE(review): M << shift truncates when shift > 31 (M has
			// 33 significant bits); this branch is hit with shift == 56
			// when i == 6 — suspected bug, see function comment.
			factor := M << shift
			hi, lo := bits.Mul64(t[i], factor)

			// Add the 128-bit product at position 0.
			pos := 0
			t[pos], carry = bits.Add64(t[pos], lo, 0)
			if pos+1 < 10 {
				t[pos+1], carry = bits.Add64(t[pos+1], hi, carry)
			}

			// Propagate any remaining carry upward.
			for j := pos + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		} else {
			// Larger shifts: multiply by M unshifted, then distribute the
			// 128-bit product across limbs at the shifted position.
			hi, lo := bits.Mul64(t[i], M)
			limbShift := shift / 52
			bitShift := shift % 52

			if bitShift == 0 {
				// Aligned to limb boundary.
				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], lo, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hi, carry)
					}
				}
			} else {
				// Split the product across two limbs at the bit offset.
				loShifted := lo << bitShift
				hiShifted := (lo >> (64 - bitShift)) | (hi << bitShift)

				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], loShifted, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hiShifted, carry)
					}
				}
			}

			// Propagate any remaining carry.
			for j := int(limbShift) + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		}

		t[i] = 0 // Clear the processed limb
	}

	// Repack t[0..4] — treated here as five dense 64-bit words — into the
	// 5x52 representation.
	r.n[0] = t[0] & limb0Max
	r.n[1] = ((t[0] >> 52) | (t[1] << 12)) & limb0Max
	r.n[2] = ((t[1] >> 40) | (t[2] << 24)) & limb0Max
	r.n[3] = ((t[2] >> 28) | (t[3] << 36)) & limb0Max
	r.n[4] = ((t[3] >> 16) | (t[4] << 48)) & limb4Max

	r.magnitude = 1
	r.normalized = false

	// Final conditional subtraction of p when the packed value could still
	// be >= p (all upper limbs saturated and limb 0 at or above p's limb 0).
	if r.n[4] == limb4Max && r.n[3] == limb0Max && r.n[2] == limb0Max &&
		r.n[1] == limb0Max && r.n[0] >= fieldModulusLimb0 {
		r.reduce()
	}
}
317
// sqr squares a field element: r = a^2.
//
// The pure-Go path follows the C secp256k1_fe_sqr_inner algorithm for the
// 5x52 representation, exploiting symmetry (a_i*a_j appears twice for
// i != j, hence the *2 doublings). Inputs with magnitude <= 8 are used in
// place; higher magnitudes are weakly normalized into a scratch copy.
//
// The result always has magnitude 1 and is not fully normalized.
func (r *FieldElement) sqr(a *FieldElement) {
	// Use pointer directly if magnitude is low enough (optimization).
	var aNorm *FieldElement
	var aTemp FieldElement

	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): this path does not set r.magnitude/r.normalized here,
	// unlike the asm paths below — presumably field4x64Sqr does; confirm.
	if hasField4x64() {
		field4x64Sqr(r, aNorm)
		return
	}

	// Use BMI2+ADX assembly if available.
	if hasFieldAsmBMI2() {
		fieldSqrAsmBMI2(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Use regular assembly if available.
	if hasFieldAsm() {
		fieldSqrAsm(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Pure-Go fallback. Extract limbs for easier access.
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                 // 2^52 - 1, the per-limb mask
	const R = fieldReductionConstantShifted   // 0x1000003D10 = (2^32 + 977) << 4

	// Following the C implementation algorithm exactly; c and d are
	// 128-bit accumulators for the low and high product halves.

	// Compute p3 = 2*a0*a3 + 2*a1*a2
	var c, d uint128
	d = mulU64ToU128(a0*2, a3)
	d = addMulU128(d, a1*2, a2)

	// Compute p8 = a4*a4 (weight 2^416, folded down via R)
	c = mulU64ToU128(a4, a4)

	// d += R * c_lo; c >>= 64
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*a4*2 + a1*a3*2 + a2*a2.
	// a4 is doubled in place here; every later use of a4 (p5, p6, p7)
	// needs the doubled value too, matching the C code.
	a4 *= 2
	d = addMulU128(d, a0, a4)
	d = addMulU128(d, a1*2, a3)
	d = addMulU128(d, a2, a2)

	// d += (R << 12) * c_lo — fold the high part of p8
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx (top 4 bits of t4; limb 4 is only 48 bits wide)
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*a0
	c = mulU64ToU128(a0, a0)

	// Compute p5 = a1*a4(doubled) + a2*a3*2
	d = addMulU128(d, a1, a4)
	d = addMulU128(d, a2*2, a3)

	// Extract u0 and recombine with tx so the fold uses R>>4
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*a1*2. a0 is doubled in place; the later p2 term
	// a0*a2 then also uses the doubled a0, as in the C code.
	a0 *= 2
	c = addMulU128(c, a0, a1)

	// Compute p6 = a2*a4(doubled) + a3*a3
	d = addMulU128(d, a2, a4)
	d = addMulU128(d, a3, a3)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0(doubled)*a2 + a1*a1
	c = addMulU128(c, a0, a2)
	c = addMulU128(c, a1, a1)

	// Compute p7 = a3*a4(doubled) = 2*a3*a4
	d = addMulU128(d, a3, a4)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Result has magnitude 1 but may not be fully reduced below p.
	r.magnitude = 1
	r.normalized = false
}
460
461 // inv computes the modular inverse of a field element using Fermat's little theorem
462 // This implements a^(p-2) mod p where p is the secp256k1 field prime
463 // Uses an optimized addition chain similar to sqrt() for efficiency.
464 //
465 // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
466 //
467 // Binary structure:
468 // - bits 255-33: 223 ones (= 2^223 - 1 shifted left by 33)
469 // - bit 32: 0
470 // - bits 31-12: 20 ones
471 // - bits 11-0: 110000101101 (0xC2D)
472 //
473 // This uses ~266 squarings + ~15 multiplications instead of ~256 + ~127 for naive binary exp.
474 func (r *FieldElement) inv(a *FieldElement) {
475 var aNorm FieldElement
476 aNorm = *a
477 aNorm.normalize()
478
479 // Build up powers using addition chain (same as sqrt)
480 // x^(2^k - 1) for k in {2, 3, 6, 9, 11, 22, 44, 88, 176, 220, 223}
481 var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement
482
483 // x2 = a^(2^2 - 1) = a^3
484 x2.sqr(&aNorm)
485 x2.mul(&x2, &aNorm)
486
487 // x3 = a^(2^3 - 1) = a^7
488 x3.sqr(&x2)
489 x3.mul(&x3, &aNorm)
490
491 // x6 = a^(2^6 - 1) = a^63
492 x6 = x3
493 for j := 0; j < 3; j++ {
494 x6.sqr(&x6)
495 }
496 x6.mul(&x6, &x3)
497
498 // x9 = a^(2^9 - 1) = a^511
499 x9 = x6
500 for j := 0; j < 3; j++ {
501 x9.sqr(&x9)
502 }
503 x9.mul(&x9, &x3)
504
505 // x11 = a^(2^11 - 1) = a^2047
506 x11 = x9
507 for j := 0; j < 2; j++ {
508 x11.sqr(&x11)
509 }
510 x11.mul(&x11, &x2)
511
512 // x22 = a^(2^22 - 1)
513 x22 = x11
514 for j := 0; j < 11; j++ {
515 x22.sqr(&x22)
516 }
517 x22.mul(&x22, &x11)
518
519 // x44 = a^(2^44 - 1)
520 x44 = x22
521 for j := 0; j < 22; j++ {
522 x44.sqr(&x44)
523 }
524 x44.mul(&x44, &x22)
525
526 // x88 = a^(2^88 - 1)
527 x88 = x44
528 for j := 0; j < 44; j++ {
529 x88.sqr(&x88)
530 }
531 x88.mul(&x88, &x44)
532
533 // x176 = a^(2^176 - 1)
534 x176 = x88
535 for j := 0; j < 88; j++ {
536 x176.sqr(&x176)
537 }
538 x176.mul(&x176, &x88)
539
540 // x220 = a^(2^220 - 1)
541 x220 = x176
542 for j := 0; j < 44; j++ {
543 x220.sqr(&x220)
544 }
545 x220.mul(&x220, &x44)
546
547 // x223 = a^(2^223 - 1)
548 x223 = x220
549 for j := 0; j < 3; j++ {
550 x223.sqr(&x223)
551 }
552 x223.mul(&x223, &x3)
553
554 // Now assemble p-2 = 2^256 - 2^32 - 979
555 // = (2^223 - 1) * 2^33 + (2^32 - 1) * 2^0 + ... (adjusted for the specific bit pattern)
556 //
557 // p-2 in binary (from MSB):
558 // 223 ones, 0, 20 ones, 00, 1, 0, 00, 1, 0, 1, 1, 0, 1
559 // [223 ones][0][20 ones][11-0 bits = 0xC2D = 110000101101]
560 //
561 // Step 1: Start with x223 and shift left by 33 bits
562 t1 = x223
563 for j := 0; j < 33; j++ {
564 t1.sqr(&t1)
565 }
566 // Now t1 = a^((2^223 - 1) * 2^33)
567
568 // Step 2: Add contribution for bits 31-12 (20 ones)
569 // We need x^(2^32 - 2^12) = x^((2^20 - 1) * 2^12)
570 // But we don't have x20 precomputed. Use x22 and adjust.
571 // Actually, let's use: x11 * 2^11 * x11 = x^((2^11-1)*2^11 + (2^11-1)) = x^(2^22 - 2^11 + 2^11 - 1) = x^(2^22-1) = x22
572 // We need x^(2^20 - 1). We can compute: x11 shifted by 9, then multiply by x9
573 // x^((2^11-1)*2^9) * x^(2^9-1) = x^(2^20 - 2^9 + 2^9 - 1) = x^(2^20 - 1)
574 var x20 FieldElement
575 x20 = x11
576 for j := 0; j < 9; j++ {
577 x20.sqr(&x20)
578 }
579 x20.mul(&x20, &x9)
580 // Now x20 = a^(2^20 - 1)
581
582 // Multiply into t1: t1 * x20 (but first shift x20 to correct position)
583 // Current t1 is at position 33 (exponent ends at bit 33)
584 // We need bits 31-12 = 20 ones, which is x^((2^20-1) * 2^12)
585 // So we need to shift t1 down? No, we're building the exponent.
586 //
587 // Let me reconsider. We have:
588 // t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
589 //
590 // p-2 = 2^256 - 2^32 - 979
591 //
592 // 2^256 - 2^33 vs 2^256 - 2^32 - 979
593 // We need to add: 2^32 - 979 = FFFFFC2D
594 //
595 // So: p-2 = (2^256 - 2^33) + (2^32 - 979)
596 // = t1_exp + 0xFFFFFC2D (as addition of exponents = multiplication)
597 //
598 // But that's not how exponentiation works! a^x * a^y = a^(x+y)
599 // So we need t1 * a^(0xFFFFFC2D)
600 //
601 // 0xFFFFFC2D = 4294966317
602 // This is still a big exponent to compute naively.
603 //
604 // Let me decompose 0xFFFFFC2D in terms of our precomputed powers.
605 // 0xFFFFFC2D = 0b 1111 1111 1111 1111 1111 1100 0010 1101
606 // = 2^32 - 979
607 // = (2^32 - 1) - 978
608 // = (2^32 - 1) - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 2^1
609 //
610 // Hmm, subtraction in exponent is division... that doesn't help.
611 //
612 // Let me think about this differently using the bit pattern directly.
613 // p-2 bits (from MSB to LSB):
614 // bit 255: 1
615 // ...
616 // bit 33: 1 (223 ones total in bits 255-33)
617 // bit 32: 0
618 // bit 31: 1
619 // ...
620 // bit 12: 1 (20 ones total in bits 31-12)
621 // bit 11: 0
622 // bit 10: 0
623 // bit 9: 1
624 // bit 8: 0
625 // bit 7: 0
626 // bit 6: 0
627 // bit 5: 1
628 // bit 4: 0
629 // bit 3: 1
630 // bit 2: 1
631 // bit 1: 0
632 // bit 0: 1
633 //
634 // Using sliding window from MSB:
635 // Start: t1 = a^1
636 // Process 223 ones in bits 255-33:
637 // t1 = a^(2^223 - 1) using x223 (done above via shift)
638 // After shifting 33 times: a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
639 //
640 // bit 32 = 0: square once
641 // t1 = a^(2^257 - 2^34) -- wait, this goes beyond 256 bits
642 //
643 // I think I'm overcomplicating this. Let me use a simpler sliding window approach.
644
645 // Alternative approach: build from LSB using the precomputed values
646 // Actually, let's use the approach from libsecp256k1's sqrt which builds from MSB.
647
648 // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
649 //
650 // We can write this as:
651 // ((((x223 << 23) * x22) << 5) * x2) << 3) * x3) << 2) * x2) << 1) << 1) * a
652 //
653 // Let me verify: starting with a^(2^223 - 1) = x223
654 //
655 // Actually, the libsecp256k1 modinv doesn't use addition chains - it uses safegcd.
656 // Let me check what the modular inverse in libsecp256k1 actually does.
657
658 // For now, let me implement a working addition chain that's still much faster than binary exp.
659
660 // p-2 structure analysis:
661 // Bits 255-33: 223 consecutive 1s
662 // Bit 32: 0
663 // Bits 31-12: 20 consecutive 1s (but we don't have x20, need to construct)
664 // Bits 11-0: 0xC2D = 0b110000101101
665
666 // Approach: sliding window from MSB
667 // 1. Start with x223 (bits 255-33)
668 // 2. Square for bit 32 (which is 0)
669 // 3. Square and multiply for bits 31-12 (20 ones)
670 // 4. Handle remaining bits 11-0
671
672 // t1 already = x223 = a^(2^223 - 1)
673
674 // Square 23 times for bits 255-33 positioning, then multiply by x22 for next block
675 // Wait, I'm confusing myself. Let me restart with a cleaner approach.
676
677 // Standard addition chain for computing a^e where e = p-2:
678 // We compute a^e by starting with a and repeatedly squaring and multiplying.
679 // The exponent p-2 has this binary form:
680 // 1{223}01{20}001000010110 1
681 // (223 ones)(zero)(20 ones)(0xC2D pattern)
682
683 // Efficient approach: compute x^((2^223-1) * 2^33 + (2^20-1) * 2^13 + specific_bits)
684 //
685 // Let's verify: (2^223-1)*2^33 = 2^256 - 2^33
686 // (2^20-1)*2^13 = 2^33 - 2^13
687 // Adding: 2^256 - 2^33 + 2^33 - 2^13 = 2^256 - 2^13
688 // But p-2 = 2^256 - 2^32 - 979, so this isn't quite right.
689
690 // Let me compute more carefully.
691 // The exponent as sum of weighted runs of 1s:
692 // bits 255-33 (223 ones): (2^223 - 1) * 2^33 = 2^256 - 2^33
693 // bits 31-12 (20 ones): (2^20 - 1) * 2^12 = 2^32 - 2^12
694 // bit 9: 2^9
695 // bit 5: 2^5
696 // bit 3: 2^3
697 // bit 2: 2^2
698 // bit 0: 2^0
699 //
700 // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
701 // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 (since 2^33 - 2^32 = 2^32)
702 //
703 // Hmm, -2^33 + 2^32 = -2^32, so:
704 // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
705 //
706 // And we need p-2 = 2^256 - 2^32 - 979
707 // So: -979 = -2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
708 // = -4096 + 512 + 32 + 8 + 4 + 1
709 // = -4096 + 557
710 // = -3539
711 //
712 // That's not -979. Let me re-examine the bit pattern of 0xFFFFFC2D.
713
714 // 0xC2D = 0b110000101101 (12 bits)
715 // bit 11: 1 → 2^11
716 // bit 10: 1 → 2^10
717 // bit 9: 0
718 // bit 8: 0
719 // bit 7: 0
720 // bit 6: 0
721 // bit 5: 1 → 2^5
722 // bit 4: 0
723 // bit 3: 1 → 2^3
724 // bit 2: 1 → 2^2
725 // bit 1: 0
726 // bit 0: 1 → 2^0
727
728 // So bits 11-0 contribute: 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0
729 // = 2048 + 1024 + 32 + 8 + 4 + 1
730 // = 3117
731
732 // Full p-2:
733 // bits 255-33 (223 ones): 2^256 - 2^33
734 // bits 31-12 (20 ones): 2^32 - 2^12
735 // bits 11-0: 3117
736 //
737 // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 3117
738 // = 2^256 - 2^32 - 2^12 + 3117
739 // = 2^256 - 2^32 - 4096 + 3117
740 // = 2^256 - 2^32 - 979 ✓
741
742 // Great! So the bit pattern is correct. Now I can build the addition chain.
743
744 // Build result from MSB using sliding window:
745 // t1 = x223 = a^(2^223 - 1)
746
747 // Shift left by 33 (square 33 times) and multiply by x22 for the next window?
748 // No wait, we have a gap (bit 32 = 0) and then 20 ones.
749
750 // Let me use a cleaner approach:
751 // 1. t1 = x223
752 // 2. t1 = t1^(2^33) = a^((2^223-1)*2^33) = a^(2^256 - 2^33)
753 // 3. t1 = t1 * x20 = a^(2^256 - 2^33 + 2^20 - 1) -- but we need (2^20-1)*2^12 not just 2^20-1
754
755 // Hmm, this is getting complicated. Let me try a different formulation.
756
757 // Instead of trying to be clever, just do windowed exponentiation:
758 // Split p-2 into windows and multiply.
759
760 // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
761 //
762 // From MSB, process in chunks matching our precomputed values:
763 // Window 1: bits 255-33 = 223 ones → use x223, then square 33 times
764 // After: a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
765 //
766 // Window 2: bits 32-13 = 0 followed by 20 ones
767 // We already squared 33 times, so we're at bit 32.
768 // bit 32 = 0: just square (no multiply)
769 // bits 31-12 = 20 ones
770 // But x20 isn't precomputed. Let me compute it.
771
772 // Hmm, I already computed x20 above. Let me restructure.
773
774 // Reset and do it properly:
775 t1 = x223
776 // t1 = a^(2^223 - 1)
777
778 // Square 33 times to shift the window
779 for j := 0; j < 33; j++ {
780 t1.sqr(&t1)
781 }
782 // t1 = a^((2^223-1) * 2^33)
783
784 // Multiply by x22 to add 22 more ones? No, we need exactly 20 ones.
785 // Actually, I already computed x20 above. Let's use it.
786 // Multiply by x20 shifted appropriately.
787 // But x20 at its base position is a^(2^20 - 1).
788 // We need a^((2^20 - 1) * 2^12) to contribute bits 31-12.
789 // So we need to shift x20 by 12.
790
791 // Current t1 exponent: (2^223 - 1) * 2^33 = 2^256 - 2^33
792 // We want to add: (2^20 - 1) * 2^12 = 2^32 - 2^12 - (2^12 - 1) = wait no
793 // (2^20 - 1) * 2^12 = 2^32 - 2^12
794
795 // So after adding: 2^256 - 2^33 + 2^32 - 2^12 = 2^256 - 2^32 - 2^12
796
797 // Wait, 2^33 - 2^32 = 2^32, so -2^33 + 2^32 = -2^32
798
799 // Now we need to add the remaining bits 11-0 = 0xC2D = 3117
800
801 // t1_final = t1 * a^((2^20-1)*2^12) * a^3117
802
803 // For a^((2^20-1)*2^12): square x20 12 times
804 var temp FieldElement
805 temp = x20
806 for j := 0; j < 12; j++ {
807 temp.sqr(&temp)
808 }
809 // temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12)
810
811 t1.mul(&t1, &temp)
812 // t1 = a^(2^256 - 2^33 + 2^32 - 2^12) = a^(2^256 - 2^32 - 2^12)
813
814 // Now add bits 11-0 = 0xC2D = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 = 3117
815 // We need a^3117
816
817 // 3117 = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0
818 // = 2048 + 1024 + 32 + 8 + 4 + 1
819
820 // Using our precomputed values:
821 // a^(2^11 - 1) = x11 = a^2047
822 // We need a^2048 = a^(2^11), which is a squared 11 times.
823 // Actually, let's build a^3117 more directly.
824
825 // a^3117 = a^(2^11) * a^(2^10) * a^(2^5) * a^(2^3) * a^(2^2) * a^(2^0)
826 // = (a^2048) * (a^1024) * (a^32) * (a^8) * (a^4) * a
827
828 // But computing each power separately and multiplying is expensive.
829 // Better: use sliding window on the bits.
830
831 // 3117 in binary: 110000101101
832 // From MSB: 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1
833
834 // Or we can decompose using our precomputed values:
835 // 3117 = 2047 + 1024 + 32 + 8 + 4 + 2 = 2047 + 1070
836 // Hmm, 2047 = x11, but 1070 is awkward.
837
838 // Let's just compute a^3117 with a small addition chain.
839 // 3117 = 0b110000101101
840
841 // Build a^3117:
842 var low FieldElement
843 low = aNorm
844 low.sqr(&low) // a^2
845 low.mul(&low, &aNorm) // a^3
846 low.sqr(&low) // a^6
847 low.sqr(&low) // a^12
848 low.mul(&low, &aNorm) // a^13
849 low.sqr(&low) // a^26
850 low.sqr(&low) // a^52
851 low.sqr(&low) // a^104
852 low.sqr(&low) // a^208
853 low.mul(&low, &aNorm) // a^209
854 low.sqr(&low) // a^418
855 low.sqr(&low) // a^836
856 low.mul(&low, &aNorm) // a^837
857 low.sqr(&low) // a^1674
858 low.mul(&low, &aNorm) // a^1675
859 low.sqr(&low) // a^3350
860 // Hmm, this is getting messy and doesn't give 3117.
861
862 // Let me use a simpler approach: compute a^3117 using binary method (small number)
863 // This adds only ~12 squarings and ~6 multiplications
864 low.setInt(1)
865 base := aNorm
866 e := uint32(3117)
867 for e > 0 {
868 if e&1 == 1 {
869 low.mul(&low, &base)
870 }
871 e >>= 1
872 if e > 0 {
873 base.sqr(&base)
874 }
875 }
876 // low = a^3117
877
878 // But wait, our temp and t1 computations used a shifted a. Let me reconsider.
879 // Actually no, the exponent additions are correct. We multiply t1 by a^(low_bits)
880 // where low_bits is computed from the original a.
881
882 // Hmm, but we shifted x20 by 12 bits already, contributing exponent (2^20-1)*2^12.
883 // And now low = a^3117.
884 // But 3117 spans bits 11-0, and we need to add this to the current exponent.
885
886 // Current t1 exponent: 2^256 - 2^32 - 2^12
887 // We want: 2^256 - 2^32 - 979
888 // So we need: -979 - (-2^12) = -979 + 4096 = 3117 ✓
889
890 // So t1 * low = a^(2^256 - 2^32 - 2^12 + 3117) = a^(2^256 - 2^32 - 979) = a^(p-2)
891
892 t1.mul(&t1, &low)
893
894 *r = t1
895 r.magnitude = 1
896 r.normalized = true
897 }
898
899 // sqrt computes the square root of a field element if it exists
900 // This follows the C secp256k1_fe_sqrt implementation exactly
901 func (r *FieldElement) sqrt(a *FieldElement) bool {
902 // Given that p is congruent to 3 mod 4, we can compute the square root of
903 // a mod p as the (p+1)/4'th power of a.
904 //
905 // As (p+1)/4 is an even number, it will have the same result for a and for
906 // (-a). Only one of these two numbers actually has a square root however,
907 // so we test at the end by squaring and comparing to the input.
908
909 var aNorm FieldElement
910 aNorm = *a
911
912 // Normalize input if magnitude is too high
913 if aNorm.magnitude > 8 {
914 aNorm.normalizeWeak()
915 } else {
916 aNorm.normalize()
917 }
918
919 // The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
920 // { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
921 // 1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
922
923 var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement
924
925 // x2 = a^3
926 x2.sqr(&aNorm)
927 x2.mul(&x2, &aNorm)
928
929 // x3 = a^7
930 x3.sqr(&x2)
931 x3.mul(&x3, &aNorm)
932
933 // x6 = a^63
934 x6 = x3
935 for j := 0; j < 3; j++ {
936 x6.sqr(&x6)
937 }
938 x6.mul(&x6, &x3)
939
940 // x9 = a^511
941 x9 = x6
942 for j := 0; j < 3; j++ {
943 x9.sqr(&x9)
944 }
945 x9.mul(&x9, &x3)
946
947 // x11 = a^2047
948 x11 = x9
949 for j := 0; j < 2; j++ {
950 x11.sqr(&x11)
951 }
952 x11.mul(&x11, &x2)
953
954 // x22 = a^4194303
955 x22 = x11
956 for j := 0; j < 11; j++ {
957 x22.sqr(&x22)
958 }
959 x22.mul(&x22, &x11)
960
961 // x44 = a^17592186044415
962 x44 = x22
963 for j := 0; j < 22; j++ {
964 x44.sqr(&x44)
965 }
966 x44.mul(&x44, &x22)
967
968 // x88 = a^72057594037927935
969 x88 = x44
970 for j := 0; j < 44; j++ {
971 x88.sqr(&x88)
972 }
973 x88.mul(&x88, &x44)
974
975 // x176 = a^1180591620717411303423
976 x176 = x88
977 for j := 0; j < 88; j++ {
978 x176.sqr(&x176)
979 }
980 x176.mul(&x176, &x88)
981
982 // x220 = a^172543658669764094685868767685
983 x220 = x176
984 for j := 0; j < 44; j++ {
985 x220.sqr(&x220)
986 }
987 x220.mul(&x220, &x44)
988
989 // x223 = a^13479973333575319897333507543509815336818572211270286240551805124607
990 x223 = x220
991 for j := 0; j < 3; j++ {
992 x223.sqr(&x223)
993 }
994 x223.mul(&x223, &x3)
995
996 // The final result is then assembled using a sliding window over the blocks.
997 t1 = x223
998 for j := 0; j < 23; j++ {
999 t1.sqr(&t1)
1000 }
1001 t1.mul(&t1, &x22)
1002 for j := 0; j < 6; j++ {
1003 t1.sqr(&t1)
1004 }
1005 t1.mul(&t1, &x2)
1006 t1.sqr(&t1)
1007 r.sqr(&t1)
1008
1009 // Check that a square root was actually calculated
1010 var check FieldElement
1011 check.sqr(r)
1012 check.normalize()
1013 aNorm.normalize()
1014
1015 ret := check.equal(&aNorm)
1016
1017 // If sqrt(a) doesn't exist, compute sqrt(-a) instead (as per field.h comment)
1018 if !ret {
1019 var negA FieldElement
1020 negA.negate(&aNorm, 1)
1021 negA.normalize()
1022
1023 t1 = x223
1024 for j := 0; j < 23; j++ {
1025 t1.sqr(&t1)
1026 }
1027 t1.mul(&t1, &x22)
1028 for j := 0; j < 6; j++ {
1029 t1.sqr(&t1)
1030 }
1031 t1.mul(&t1, &x2)
1032 t1.sqr(&t1)
1033 r.sqr(&t1)
1034
1035 check.sqr(r)
1036 check.normalize()
1037
1038 // Return whether sqrt(-a) exists
1039 return check.equal(&negA)
1040 }
1041
1042 return ret
1043 }
1044
1045 // isSquare checks if a field element is a quadratic residue
1046 func (a *FieldElement) isSquare() bool {
1047 // Use Legendre symbol: a^((p-1)/2) mod p
1048 // If result is 1, then a is a quadratic residue
1049
1050 var result FieldElement
1051 result = *a
1052
1053 // Compute a^((p-1)/2) - simplified implementation
1054 for i := 0; i < 127; i++ { // Approximate (p-1)/2 bit length
1055 result.sqr(&result)
1056 }
1057
1058 result.normalize()
1059 return result.equal(&FieldElementOne)
1060 }
1061
// half computes r = a/2 mod p.
//
// This follows the C secp256k1_fe_impl_half implementation exactly: when
// the value is odd, add p (which is odd) to make it even, then shift the
// whole 5x52 representation right by one bit. Everything is branch-free.
func (r *FieldElement) half(a *FieldElement) {
	*r = *a

	t0, t1, t2, t3, t4 := r.n[0], r.n[1], r.n[2], r.n[3], r.n[4]
	one := uint64(1)
	// In C: mask = -(t0 & one) >> 12. Go has no unsigned negation of a
	// bool-like value, so negate via int64: mask is 52 one-bits when t0 is
	// odd, 0 when even (all-ones >> 12 leaves exactly the 52-bit limb mask).
	mask := uint64(-int64(t0 & one)) >> 12

	// Conditionally add the field modulus p (in 5x52 limb form) if odd:
	// limb 0 of p is 0xFFFFEFFFFFC2F; limbs 1-3 are all ones (mask);
	// limb 4 is 48 one-bits (mask >> 4).
	t0 += 0xFFFFEFFFFFC2F & mask
	t1 += mask
	t2 += mask
	t3 += mask
	t4 += mask >> 4

	// Right shift by one with borrow propagation: the low bit of each limb
	// moves to bit 51 of the limb below (limbs are 52 bits wide, so after
	// the shift the incoming bit lands at position 51).
	r.n[0] = (t0 >> 1) + ((t1 & one) << 51)
	r.n[1] = (t1 >> 1) + ((t2 & one) << 51)
	r.n[2] = (t2 >> 1) + ((t3 & one) << 51)
	r.n[3] = (t3 >> 1) + ((t4 & one) << 51)
	r.n[4] = t4 >> 1

	// Update magnitude as per the C implementation: halving roughly halves
	// the magnitude, plus one for the conditionally-added modulus.
	r.magnitude = (r.magnitude >> 1) + 1
	r.normalized = false
}
1091