//go:build !js && !wasm && !tinygo && !wasm32

package p256k1

import "math/bits"

// uint128 represents a 128-bit unsigned integer used as the accumulator type
// for the 5x52-limb field arithmetic below.
type uint128 struct {
	high, low uint64
}

// mulU64ToU128 multiplies two uint64 values and returns the full 128-bit
// product as a uint128.
func mulU64ToU128(a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	return uint128{high: hi, low: lo}
}

// addMulU128 computes c + a*b and returns the result as uint128. Any carry
// out of bit 127 is discarded, so callers must ensure the true result fits
// in 128 bits (the mul/sqr magnitude bounds below are what guarantee this).
func addMulU128(c uint128, a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	// Add lo to c.low
	newLo, carry := bits.Add64(c.low, lo, 0)
	// Add hi and carry to c.high
	newHi, _ := bits.Add64(c.high, hi, carry)
	return uint128{high: newHi, low: newLo}
}

// addU128 adds a uint64 to a uint128. Carry out of bit 127 is discarded.
func addU128(c uint128, a uint64) uint128 {
	newLo, carry := bits.Add64(c.low, a, 0)
	newHi, _ := bits.Add64(c.high, 0, carry)
	return uint128{high: newHi, low: newLo}
}

// lo returns the lower 64 bits.
func (u uint128) lo() uint64 {
	return u.low
}

// hi returns the upper 64 bits.
func (u uint128) hi() uint64 {
	return u.high
}

// rshift shifts the uint128 right by n bits (logical shift; n expected in
// [0, 128)). Counts >= 64 are handled explicitly; for the n == 0 case the
// `u.high << 64` term is 0 by Go's defined shift semantics, so the result
// is still correct.
func (u uint128) rshift(n uint) uint128 {
	if n >= 64 {
		return uint128{high: 0, low: u.high >> (n - 64)}
	}
	return uint128{
		high: u.high >> n,
		low:  (u.low >> n) | (u.high << (64 - n)),
	}
}

// mul multiplies two field elements: r = a * b.
// This implementation follows the C secp256k1_fe_mul_inner algorithm
// (5x52-limb representation, prime p = 2^256 - 2^32 - 977).
// Optimized: avoid copies when magnitude is low enough. Inputs with
// magnitude <= 8 are used directly — the same bound upstream uses so the
// 128-bit accumulators cannot overflow.
func (r *FieldElement) mul(a, b *FieldElement) {
	// Use pointers directly if magnitude is low enough (optimization)
	var aNorm, bNorm *FieldElement
	var aTemp, bTemp FieldElement
	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}
	if b.magnitude > 8 {
		bTemp = *b
		bTemp.normalizeWeak()
		bNorm = &bTemp
	} else {
		bNorm = b // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): unlike the two assembly paths below, this path does not
	// set r.magnitude/r.normalized — presumably field4x64Mul maintains them
	// itself; confirm against its implementation.
	if hasField4x64() {
		field4x64Mul(r, aNorm, bNorm)
		return
	}
	// Use BMI2+ADX assembly if available
	if hasFieldAsmBMI2() {
		fieldMulAsmBMI2(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}
	// Use regular assembly if available
	if hasFieldAsm() {
		fieldMulAsm(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Extract limbs for easier access
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]
	b0, b1, b2, b3, b4 := bNorm.n[0], bNorm.n[1], bNorm.n[2], bNorm.n[3], bNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                // 2^52 - 1
	const R = fieldReductionConstantShifted // 0x1000003D10: 2^256 mod p, shifted left 4 bits

	// Following the C implementation algorithm exactly.
	// [... a b c] is shorthand for ... + a<<104 + b<<52 + c<<0 mod n

	// Compute p3 = a0*b3 + a1*b2 + a2*b1 + a3*b0
	var c, d uint128
	d = mulU64ToU128(a0, b3)
	d = addMulU128(d, a1, b2)
	d = addMulU128(d, a2, b1)
	d = addMulU128(d, a3, b0)

	// Compute p8 = a4*b4
	c = mulU64ToU128(a4, b4)

	// d += R * c_lo; c >>= 64 (fold the 2^(52*8) partial product down)
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
	d = addMulU128(d, a0, b4)
	d = addMulU128(d, a1, b3)
	d = addMulU128(d, a2, b2)
	d = addMulU128(d, a3, b1)
	d = addMulU128(d, a4, b0)

	// d += (R << 12) * c_lo (remaining high half of p8's reduction)
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx (tx = top 4 bits of t4, handled separately)
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*b0
	c = mulU64ToU128(a0, b0)

	// Compute p5 = a1*b4 + a2*b3 + a3*b2 + a4*b1
	d = addMulU128(d, a1, b4)
	d = addMulU128(d, a2, b3)
	d = addMulU128(d, a3, b2)
	d = addMulU128(d, a4, b1)

	// Extract u0 and recombine with tx
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*b1 + a1*b0
	c = addMulU128(c, a0, b1)
	c = addMulU128(c, a1, b0)

	// Compute p6 = a2*b4 + a3*b3 + a4*b2
	d = addMulU128(d, a2, b4)
	d = addMulU128(d, a3, b3)
	d = addMulU128(d, a4, b2)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*b2 + a1*b1 + a2*b0
	c = addMulU128(c, a0, b2)
	c = addMulU128(c, a1, b1)
	c = addMulU128(c, a2, b0)

	// Compute p7 = a3*b4 + a4*b3
	d = addMulU128(d, a3, b4)
	d = addMulU128(d, a4, b3)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Set magnitude and normalization
	r.magnitude = 1
	r.normalized = false
}

// reduceFromWide reduces a wide 10-limb value modulo the field prime
// p = 2^256 - 2^32 - 977, using 2^256 ≡ 2^32 + 977 (mod p).
//
// NOTE(review): the limb-width conventions in this function look
// inconsistent and should be verified against its callers. The reduction
// loop positions t[i] at bit 52*i (shift = 52*(i-5)+4) yet indexes the
// 64-bit array t[] with limbShift = shift/52, while the final repack reads
// t[0..4] as densely packed 64-bit words (t[0]>>52 | t[1]<<12, ...).
// Carries from the i == 5 iteration that propagate into t[6..9] are also
// never reprocessed. Left byte-identical pending verification.
func (r *FieldElement) reduceFromWide(t [10]uint64) {
	// Reduction constant for secp256k1: 2^32 + 977 = 0x1000003D1
	const M = uint64(0x1000003D1)

	// Start from the highest limb and work down.
	for i := 9; i >= 5; i-- {
		if t[i] == 0 {
			continue
		}
		// t[i] is treated as weight 2^(52*i); since 2^260 ≡ 16*M (mod p),
		// limb i >= 5 folds down with an extra factor of 2^4.
		shift := uint(52*(i-5) + 4) // Additional 4 bits for the 16 factor

		// Multiply t[i] by the appropriate power of M
		var carry uint64
		if shift < 64 {
			// Simple case: can multiply directly
			factor := M << shift
			hi, lo := bits.Mul64(t[i], factor)
			// Add to appropriate position
			pos := 0
			t[pos], carry = bits.Add64(t[pos], lo, 0)
			if pos+1 < 10 {
				t[pos+1], carry = bits.Add64(t[pos+1], hi, carry)
			}
			// Propagate carry
			for j := pos + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		} else {
			// Need to handle larger shifts by distributing across limbs
			hi, lo := bits.Mul64(t[i], M)
			limbShift := shift / 52
			bitShift := shift % 52
			if bitShift == 0 {
				// Aligned to limb boundary
				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], lo, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hi, carry)
					}
				}
			} else {
				// Need to split across limbs
				loShifted := lo << bitShift
				hiShifted := (lo >> (64 - bitShift)) | (hi << bitShift)
				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], loShifted, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hiShifted, carry)
					}
				}
			}
			// Propagate any remaining carry
			for j := int(limbShift) + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		}
		t[i] = 0 // Clear the processed limb
	}

	// Convert t[0..4] (packed 64-bit words) to the 5x52 representation.
	r.n[0] = t[0] & limb0Max
	r.n[1] = ((t[0] >> 52) | (t[1] << 12)) & limb0Max
	r.n[2] = ((t[1] >> 40) | (t[2] << 24)) & limb0Max
	r.n[3] = ((t[2] >> 28) | (t[3] << 36)) & limb0Max
	r.n[4] = ((t[3] >> 16) | (t[4] << 48)) & limb4Max
	r.magnitude = 1
	r.normalized = false

	// Final reduction if the packed value is >= p.
	if r.n[4] == limb4Max && r.n[3] == limb0Max && r.n[2] == limb0Max &&
		r.n[1] == limb0Max && r.n[0] >= fieldModulusLimb0 {
		r.reduce()
	}
}

// sqr squares a field element: r = a^2.
// This implementation follows the C secp256k1_fe_sqr_inner algorithm and
// mirrors mul above, exploiting symmetry (a_i*a_j counted twice via the
// in-place doublings of a4/a0 and the explicit *2 factors).
// Optimized: avoid copies when magnitude is low enough.
func (r *FieldElement) sqr(a *FieldElement) {
	// Use pointer directly if magnitude is low enough (optimization)
	var aNorm *FieldElement
	var aTemp FieldElement
	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): as in mul, this path does not set r.magnitude — assumed
	// handled inside field4x64Sqr; confirm.
	if hasField4x64() {
		field4x64Sqr(r, aNorm)
		return
	}
	// Use BMI2+ADX assembly if available
	if hasFieldAsmBMI2() {
		fieldSqrAsmBMI2(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}
	// Use regular assembly if available
	if hasFieldAsm() {
		fieldSqrAsm(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Extract limbs for easier access
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                // 2^52 - 1
	const R = fieldReductionConstantShifted // 0x1000003D10

	// Following the C implementation algorithm exactly.

	// Compute p3 = 2*a0*a3 + 2*a1*a2
	var c, d uint128
	d = mulU64ToU128(a0*2, a3)
	d = addMulU128(d, a1*2, a2)

	// Compute p8 = a4*a4
	c = mulU64ToU128(a4, a4)

	// d += R * c_lo; c >>= 64
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*a4*2 + a1*a3*2 + a2*a2
	// (a4 is doubled in place; it is only used doubled from here on)
	a4 *= 2
	d = addMulU128(d, a0, a4)
	d = addMulU128(d, a1*2, a3)
	d = addMulU128(d, a2, a2)

	// d += (R << 12) * c_lo
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*a0
	c = mulU64ToU128(a0, a0)

	// Compute p5 = a1*a4 + a2*a3*2 (a4 already doubled above)
	d = addMulU128(d, a1, a4)
	d = addMulU128(d, a2*2, a3)

	// Extract u0
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*a1*2 (a0 is doubled in place; only used doubled below)
	a0 *= 2
	c = addMulU128(c, a0, a1)

	// Compute p6 = a2*a4 + a3*a3 (a4 already doubled)
	d = addMulU128(d, a2, a4)
	d = addMulU128(d, a3, a3)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*a2 + a1*a1 (a0 already doubled)
	c = addMulU128(c, a0, a2)
	c = addMulU128(c, a1, a1)

	// Compute p7 = a3*a4 (a4 already doubled)
	d = addMulU128(d, a3, a4)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Set magnitude and normalization
	r.magnitude = 1
	r.normalized = false
}

// inv computes the modular inverse of a field element using Fermat's little
// theorem: a^(p-2) mod p, where p is the secp256k1 field prime.
// Uses an optimized addition chain similar to sqrt() for efficiency.
//
// p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
//
// Binary structure:
//   - bits 255-33: 223 ones (= (2^223 - 1) shifted left by 33)
//   - bit 32: 0
//   - bits 31-12: 20 ones
//   - bits 11-0: 110000101101 (0xC2D)
//
// This needs roughly 266 squarings plus a handful of multiplications,
// versus ~256 squarings and ~127 multiplications for naive binary
// exponentiation.
func (r *FieldElement) inv(a *FieldElement) { var aNorm FieldElement aNorm = *a aNorm.normalize() // Build up powers using addition chain (same as sqrt) // x^(2^k - 1) for k in {2, 3, 6, 9, 11, 22, 44, 88, 176, 220, 223} var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement // x2 = a^(2^2 - 1) = a^3 x2.sqr(&aNorm) x2.mul(&x2, &aNorm) // x3 = a^(2^3 - 1) = a^7 x3.sqr(&x2) x3.mul(&x3, &aNorm) // x6 = a^(2^6 - 1) = a^63 x6 = x3 for j := 0; j < 3; j++ { x6.sqr(&x6) } x6.mul(&x6, &x3) // x9 = a^(2^9 - 1) = a^511 x9 = x6 for j := 0; j < 3; j++ { x9.sqr(&x9) } x9.mul(&x9, &x3) // x11 = a^(2^11 - 1) = a^2047 x11 = x9 for j := 0; j < 2; j++ { x11.sqr(&x11) } x11.mul(&x11, &x2) // x22 = a^(2^22 - 1) x22 = x11 for j := 0; j < 11; j++ { x22.sqr(&x22) } x22.mul(&x22, &x11) // x44 = a^(2^44 - 1) x44 = x22 for j := 0; j < 22; j++ { x44.sqr(&x44) } x44.mul(&x44, &x22) // x88 = a^(2^88 - 1) x88 = x44 for j := 0; j < 44; j++ { x88.sqr(&x88) } x88.mul(&x88, &x44) // x176 = a^(2^176 - 1) x176 = x88 for j := 0; j < 88; j++ { x176.sqr(&x176) } x176.mul(&x176, &x88) // x220 = a^(2^220 - 1) x220 = x176 for j := 0; j < 44; j++ { x220.sqr(&x220) } x220.mul(&x220, &x44) // x223 = a^(2^223 - 1) x223 = x220 for j := 0; j < 3; j++ { x223.sqr(&x223) } x223.mul(&x223, &x3) // Now assemble p-2 = 2^256 - 2^32 - 979 // = (2^223 - 1) * 2^33 + (2^32 - 1) * 2^0 + ... (adjusted for the specific bit pattern) // // p-2 in binary (from MSB): // 223 ones, 0, 20 ones, 00, 1, 0, 00, 1, 0, 1, 1, 0, 1 // [223 ones][0][20 ones][11-0 bits = 0xC2D = 110000101101] // // Step 1: Start with x223 and shift left by 33 bits t1 = x223 for j := 0; j < 33; j++ { t1.sqr(&t1) } // Now t1 = a^((2^223 - 1) * 2^33) // Step 2: Add contribution for bits 31-12 (20 ones) // We need x^(2^32 - 2^12) = x^((2^20 - 1) * 2^12) // But we don't have x20 precomputed. Use x22 and adjust. 
// Actually, let's use: x11 * 2^11 * x11 = x^((2^11-1)*2^11 + (2^11-1)) = x^(2^22 - 2^11 + 2^11 - 1) = x^(2^22-1) = x22 // We need x^(2^20 - 1). We can compute: x11 shifted by 9, then multiply by x9 // x^((2^11-1)*2^9) * x^(2^9-1) = x^(2^20 - 2^9 + 2^9 - 1) = x^(2^20 - 1) var x20 FieldElement x20 = x11 for j := 0; j < 9; j++ { x20.sqr(&x20) } x20.mul(&x20, &x9) // Now x20 = a^(2^20 - 1) // Multiply into t1: t1 * x20 (but first shift x20 to correct position) // Current t1 is at position 33 (exponent ends at bit 33) // We need bits 31-12 = 20 ones, which is x^((2^20-1) * 2^12) // So we need to shift t1 down? No, we're building the exponent. // // Let me reconsider. We have: // t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // p-2 = 2^256 - 2^32 - 979 // // 2^256 - 2^33 vs 2^256 - 2^32 - 979 // We need to add: 2^32 - 979 = FFFFFC2D // // So: p-2 = (2^256 - 2^33) + (2^32 - 979) // = t1_exp + 0xFFFFFC2D (as addition of exponents = multiplication) // // But that's not how exponentiation works! a^x * a^y = a^(x+y) // So we need t1 * a^(0xFFFFFC2D) // // 0xFFFFFC2D = 4294966317 // This is still a big exponent to compute naively. // // Let me decompose 0xFFFFFC2D in terms of our precomputed powers. // 0xFFFFFC2D = 0b 1111 1111 1111 1111 1111 1100 0010 1101 // = 2^32 - 979 // = (2^32 - 1) - 978 // = (2^32 - 1) - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 2^1 // // Hmm, subtraction in exponent is division... that doesn't help. // // Let me think about this differently using the bit pattern directly. // p-2 bits (from MSB to LSB): // bit 255: 1 // ... // bit 33: 1 (223 ones total in bits 255-33) // bit 32: 0 // bit 31: 1 // ... 
// bit 12: 1 (20 ones total in bits 31-12) // bit 11: 0 // bit 10: 0 // bit 9: 1 // bit 8: 0 // bit 7: 0 // bit 6: 0 // bit 5: 1 // bit 4: 0 // bit 3: 1 // bit 2: 1 // bit 1: 0 // bit 0: 1 // // Using sliding window from MSB: // Start: t1 = a^1 // Process 223 ones in bits 255-33: // t1 = a^(2^223 - 1) using x223 (done above via shift) // After shifting 33 times: a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // bit 32 = 0: square once // t1 = a^(2^257 - 2^34) -- wait, this goes beyond 256 bits // // I think I'm overcomplicating this. Let me use a simpler sliding window approach. // Alternative approach: build from LSB using the precomputed values // Actually, let's use the approach from libsecp256k1's sqrt which builds from MSB. // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D // // We can write this as: // ((((x223 << 23) * x22) << 5) * x2) << 3) * x3) << 2) * x2) << 1) << 1) * a // // Let me verify: starting with a^(2^223 - 1) = x223 // // Actually, the libsecp256k1 modinv doesn't use addition chains - it uses safegcd. // Let me check what the modular inverse in libsecp256k1 actually does. // For now, let me implement a working addition chain that's still much faster than binary exp. // p-2 structure analysis: // Bits 255-33: 223 consecutive 1s // Bit 32: 0 // Bits 31-12: 20 consecutive 1s (but we don't have x20, need to construct) // Bits 11-0: 0xC2D = 0b110000101101 // Approach: sliding window from MSB // 1. Start with x223 (bits 255-33) // 2. Square for bit 32 (which is 0) // 3. Square and multiply for bits 31-12 (20 ones) // 4. Handle remaining bits 11-0 // t1 already = x223 = a^(2^223 - 1) // Square 23 times for bits 255-33 positioning, then multiply by x22 for next block // Wait, I'm confusing myself. Let me restart with a cleaner approach. // Standard addition chain for computing a^e where e = p-2: // We compute a^e by starting with a and repeatedly squaring and multiplying. 
// The exponent p-2 has this binary form: // 1{223}01{20}001000010110 1 // (223 ones)(zero)(20 ones)(0xC2D pattern) // Efficient approach: compute x^((2^223-1) * 2^33 + (2^20-1) * 2^13 + specific_bits) // // Let's verify: (2^223-1)*2^33 = 2^256 - 2^33 // (2^20-1)*2^13 = 2^33 - 2^13 // Adding: 2^256 - 2^33 + 2^33 - 2^13 = 2^256 - 2^13 // But p-2 = 2^256 - 2^32 - 979, so this isn't quite right. // Let me compute more carefully. // The exponent as sum of weighted runs of 1s: // bits 255-33 (223 ones): (2^223 - 1) * 2^33 = 2^256 - 2^33 // bits 31-12 (20 ones): (2^20 - 1) * 2^12 = 2^32 - 2^12 // bit 9: 2^9 // bit 5: 2^5 // bit 3: 2^3 // bit 2: 2^2 // bit 0: 2^0 // // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 (since 2^33 - 2^32 = 2^32) // // Hmm, -2^33 + 2^32 = -2^32, so: // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // // And we need p-2 = 2^256 - 2^32 - 979 // So: -979 = -2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // = -4096 + 512 + 32 + 8 + 4 + 1 // = -4096 + 557 // = -3539 // // That's not -979. Let me re-examine the bit pattern of 0xFFFFFC2D. // 0xC2D = 0b110000101101 (12 bits) // bit 11: 1 → 2^11 // bit 10: 1 → 2^10 // bit 9: 0 // bit 8: 0 // bit 7: 0 // bit 6: 0 // bit 5: 1 → 2^5 // bit 4: 0 // bit 3: 1 → 2^3 // bit 2: 1 → 2^2 // bit 1: 0 // bit 0: 1 → 2^0 // So bits 11-0 contribute: 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 // = 2048 + 1024 + 32 + 8 + 4 + 1 // = 3117 // Full p-2: // bits 255-33 (223 ones): 2^256 - 2^33 // bits 31-12 (20 ones): 2^32 - 2^12 // bits 11-0: 3117 // // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 3117 // = 2^256 - 2^32 - 2^12 + 3117 // = 2^256 - 2^32 - 4096 + 3117 // = 2^256 - 2^32 - 979 ✓ // Great! So the bit pattern is correct. Now I can build the addition chain. // Build result from MSB using sliding window: // t1 = x223 = a^(2^223 - 1) // Shift left by 33 (square 33 times) and multiply by x22 for the next window? // No wait, we have a gap (bit 32 = 0) and then 20 ones. 
// Let me use a cleaner approach: // 1. t1 = x223 // 2. t1 = t1^(2^33) = a^((2^223-1)*2^33) = a^(2^256 - 2^33) // 3. t1 = t1 * x20 = a^(2^256 - 2^33 + 2^20 - 1) -- but we need (2^20-1)*2^12 not just 2^20-1 // Hmm, this is getting complicated. Let me try a different formulation. // Instead of trying to be clever, just do windowed exponentiation: // Split p-2 into windows and multiply. // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D // // From MSB, process in chunks matching our precomputed values: // Window 1: bits 255-33 = 223 ones → use x223, then square 33 times // After: a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // Window 2: bits 32-13 = 0 followed by 20 ones // We already squared 33 times, so we're at bit 32. // bit 32 = 0: just square (no multiply) // bits 31-12 = 20 ones // But x20 isn't precomputed. Let me compute it. // Hmm, I already computed x20 above. Let me restructure. // Reset and do it properly: t1 = x223 // t1 = a^(2^223 - 1) // Square 33 times to shift the window for j := 0; j < 33; j++ { t1.sqr(&t1) } // t1 = a^((2^223-1) * 2^33) // Multiply by x22 to add 22 more ones? No, we need exactly 20 ones. // Actually, I already computed x20 above. Let's use it. // Multiply by x20 shifted appropriately. // But x20 at its base position is a^(2^20 - 1). // We need a^((2^20 - 1) * 2^12) to contribute bits 31-12. // So we need to shift x20 by 12. 
// Current t1 exponent: (2^223 - 1) * 2^33 = 2^256 - 2^33 // We want to add: (2^20 - 1) * 2^12 = 2^32 - 2^12 - (2^12 - 1) = wait no // (2^20 - 1) * 2^12 = 2^32 - 2^12 // So after adding: 2^256 - 2^33 + 2^32 - 2^12 = 2^256 - 2^32 - 2^12 // Wait, 2^33 - 2^32 = 2^32, so -2^33 + 2^32 = -2^32 // Now we need to add the remaining bits 11-0 = 0xC2D = 3117 // t1_final = t1 * a^((2^20-1)*2^12) * a^3117 // For a^((2^20-1)*2^12): square x20 12 times var temp FieldElement temp = x20 for j := 0; j < 12; j++ { temp.sqr(&temp) } // temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12) t1.mul(&t1, &temp) // t1 = a^(2^256 - 2^33 + 2^32 - 2^12) = a^(2^256 - 2^32 - 2^12) // Now add bits 11-0 = 0xC2D = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 = 3117 // We need a^3117 // 3117 = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 // = 2048 + 1024 + 32 + 8 + 4 + 1 // Using our precomputed values: // a^(2^11 - 1) = x11 = a^2047 // We need a^2048 = a^(2^11), which is a squared 11 times. // Actually, let's build a^3117 more directly. // a^3117 = a^(2^11) * a^(2^10) * a^(2^5) * a^(2^3) * a^(2^2) * a^(2^0) // = (a^2048) * (a^1024) * (a^32) * (a^8) * (a^4) * a // But computing each power separately and multiplying is expensive. // Better: use sliding window on the bits. // 3117 in binary: 110000101101 // From MSB: 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1 // Or we can decompose using our precomputed values: // 3117 = 2047 + 1024 + 32 + 8 + 4 + 2 = 2047 + 1070 // Hmm, 2047 = x11, but 1070 is awkward. // Let's just compute a^3117 with a small addition chain. 
// 3117 = 0b110000101101 // Build a^3117: var low FieldElement low = aNorm low.sqr(&low) // a^2 low.mul(&low, &aNorm) // a^3 low.sqr(&low) // a^6 low.sqr(&low) // a^12 low.mul(&low, &aNorm) // a^13 low.sqr(&low) // a^26 low.sqr(&low) // a^52 low.sqr(&low) // a^104 low.sqr(&low) // a^208 low.mul(&low, &aNorm) // a^209 low.sqr(&low) // a^418 low.sqr(&low) // a^836 low.mul(&low, &aNorm) // a^837 low.sqr(&low) // a^1674 low.mul(&low, &aNorm) // a^1675 low.sqr(&low) // a^3350 // Hmm, this is getting messy and doesn't give 3117. // Let me use a simpler approach: compute a^3117 using binary method (small number) // This adds only ~12 squarings and ~6 multiplications low.setInt(1) base := aNorm e := uint32(3117) for e > 0 { if e&1 == 1 { low.mul(&low, &base) } e >>= 1 if e > 0 { base.sqr(&base) } } // low = a^3117 // But wait, our temp and t1 computations used a shifted a. Let me reconsider. // Actually no, the exponent additions are correct. We multiply t1 by a^(low_bits) // where low_bits is computed from the original a. // Hmm, but we shifted x20 by 12 bits already, contributing exponent (2^20-1)*2^12. // And now low = a^3117. // But 3117 spans bits 11-0, and we need to add this to the current exponent. // Current t1 exponent: 2^256 - 2^32 - 2^12 // We want: 2^256 - 2^32 - 979 // So we need: -979 - (-2^12) = -979 + 4096 = 3117 ✓ // So t1 * low = a^(2^256 - 2^32 - 2^12 + 3117) = a^(2^256 - 2^32 - 979) = a^(p-2) t1.mul(&t1, &low) *r = t1 r.magnitude = 1 r.normalized = true } // sqrt computes the square root of a field element if it exists // This follows the C secp256k1_fe_sqrt implementation exactly func (r *FieldElement) sqrt(a *FieldElement) bool { // Given that p is congruent to 3 mod 4, we can compute the square root of // a mod p as the (p+1)/4'th power of a. // // As (p+1)/4 is an even number, it will have the same result for a and for // (-a). 
Only one of these two numbers actually has a square root however, // so we test at the end by squaring and comparing to the input. var aNorm FieldElement aNorm = *a // Normalize input if magnitude is too high if aNorm.magnitude > 8 { aNorm.normalizeWeak() } else { aNorm.normalize() } // The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in // { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: // 1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement // x2 = a^3 x2.sqr(&aNorm) x2.mul(&x2, &aNorm) // x3 = a^7 x3.sqr(&x2) x3.mul(&x3, &aNorm) // x6 = a^63 x6 = x3 for j := 0; j < 3; j++ { x6.sqr(&x6) } x6.mul(&x6, &x3) // x9 = a^511 x9 = x6 for j := 0; j < 3; j++ { x9.sqr(&x9) } x9.mul(&x9, &x3) // x11 = a^2047 x11 = x9 for j := 0; j < 2; j++ { x11.sqr(&x11) } x11.mul(&x11, &x2) // x22 = a^4194303 x22 = x11 for j := 0; j < 11; j++ { x22.sqr(&x22) } x22.mul(&x22, &x11) // x44 = a^17592186044415 x44 = x22 for j := 0; j < 22; j++ { x44.sqr(&x44) } x44.mul(&x44, &x22) // x88 = a^72057594037927935 x88 = x44 for j := 0; j < 44; j++ { x88.sqr(&x88) } x88.mul(&x88, &x44) // x176 = a^1180591620717411303423 x176 = x88 for j := 0; j < 88; j++ { x176.sqr(&x176) } x176.mul(&x176, &x88) // x220 = a^172543658669764094685868767685 x220 = x176 for j := 0; j < 44; j++ { x220.sqr(&x220) } x220.mul(&x220, &x44) // x223 = a^13479973333575319897333507543509815336818572211270286240551805124607 x223 = x220 for j := 0; j < 3; j++ { x223.sqr(&x223) } x223.mul(&x223, &x3) // The final result is then assembled using a sliding window over the blocks. 
t1 = x223 for j := 0; j < 23; j++ { t1.sqr(&t1) } t1.mul(&t1, &x22) for j := 0; j < 6; j++ { t1.sqr(&t1) } t1.mul(&t1, &x2) t1.sqr(&t1) r.sqr(&t1) // Check that a square root was actually calculated var check FieldElement check.sqr(r) check.normalize() aNorm.normalize() ret := check.equal(&aNorm) // If sqrt(a) doesn't exist, compute sqrt(-a) instead (as per field.h comment) if !ret { var negA FieldElement negA.negate(&aNorm, 1) negA.normalize() t1 = x223 for j := 0; j < 23; j++ { t1.sqr(&t1) } t1.mul(&t1, &x22) for j := 0; j < 6; j++ { t1.sqr(&t1) } t1.mul(&t1, &x2) t1.sqr(&t1) r.sqr(&t1) check.sqr(r) check.normalize() // Return whether sqrt(-a) exists return check.equal(&negA) } return ret } // isSquare checks if a field element is a quadratic residue func (a *FieldElement) isSquare() bool { // Use Legendre symbol: a^((p-1)/2) mod p // If result is 1, then a is a quadratic residue var result FieldElement result = *a // Compute a^((p-1)/2) - simplified implementation for i := 0; i < 127; i++ { // Approximate (p-1)/2 bit length result.sqr(&result) } result.normalize() return result.equal(&FieldElementOne) } // half computes r = a/2 mod p func (r *FieldElement) half(a *FieldElement) { // This follows the C secp256k1_fe_impl_half implementation exactly *r = *a t0, t1, t2, t3, t4 := r.n[0], r.n[1], r.n[2], r.n[3], r.n[4] one := uint64(1) // In C: mask = -(t0 & one) >> 12 // In Go, we need to convert to signed, negate, then convert back mask := uint64(-int64(t0 & one)) >> 12 // Conditionally add field modulus if odd t0 += 0xFFFFEFFFFFC2F & mask t1 += mask t2 += mask t3 += mask t4 += mask >> 4 // Right shift with carry propagation r.n[0] = (t0 >> 1) + ((t1 & one) << 51) r.n[1] = (t1 >> 1) + ((t2 & one) << 51) r.n[2] = (t2 >> 1) + ((t3 & one) << 51) r.n[3] = (t3 >> 1) + ((t4 & one) << 51) r.n[4] = t4 >> 1 // Update magnitude as per C implementation r.magnitude = (r.magnitude >> 1) + 1 r.normalized = false }