//go:build !js && !wasm && !tinygo && !wasm32

package p256k1

import "math/bits"

// uint128 represents a 128-bit unsigned integer used as the accumulator type
// for the 5x52-limb field arithmetic below.
type uint128 struct {
	high, low uint64
}

// mulU64ToU128 multiplies two uint64 values and returns the full 128-bit
// product as a uint128.
func mulU64ToU128(a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	return uint128{high: hi, low: lo}
}

// addMulU128 computes c + a*b and returns the result as uint128. Any carry
// out of bit 127 is discarded, so callers must ensure the true result fits
// in 128 bits (the mul/sqr magnitude bounds below are what guarantee this).
func addMulU128(c uint128, a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	// Add lo to c.low
	newLo, carry := bits.Add64(c.low, lo, 0)
	// Add hi and carry to c.high
	newHi, _ := bits.Add64(c.high, hi, carry)
	return uint128{high: newHi, low: newLo}
}

// addU128 adds a uint64 to a uint128. Carry out of bit 127 is discarded.
func addU128(c uint128, a uint64) uint128 {
	newLo, carry := bits.Add64(c.low, a, 0)
	newHi, _ := bits.Add64(c.high, 0, carry)
	return uint128{high: newHi, low: newLo}
}

// lo returns the lower 64 bits.
func (u uint128) lo() uint64 {
	return u.low
}

// hi returns the upper 64 bits.
func (u uint128) hi() uint64 {
	return u.high
}

// rshift shifts the uint128 right by n bits (logical shift; n expected in
// [0, 128)). Counts >= 64 are handled explicitly; for the n == 0 case the
// `u.high << 64` term is 0 by Go's defined shift semantics, so the result
// is still correct.
func (u uint128) rshift(n uint) uint128 {
	if n >= 64 {
		return uint128{high: 0, low: u.high >> (n - 64)}
	}
	return uint128{
		high: u.high >> n,
		low:  (u.low >> n) | (u.high << (64 - n)),
	}
}

// mul multiplies two field elements: r = a * b.
// This implementation follows the C secp256k1_fe_mul_inner algorithm
// (5x52-limb representation, prime p = 2^256 - 2^32 - 977).
// Optimized: avoid copies when magnitude is low enough. Inputs with
// magnitude <= 8 are used directly — the same bound upstream uses so the
// 128-bit accumulators cannot overflow.
func (r *FieldElement) mul(a, b *FieldElement) {
	// Use pointers directly if magnitude is low enough (optimization)
	var aNorm, bNorm *FieldElement
	var aTemp, bTemp FieldElement
	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}
	if b.magnitude > 8 {
		bTemp = *b
		bTemp.normalizeWeak()
		bNorm = &bTemp
	} else {
		bNorm = b // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): unlike the two assembly paths below, this path does not
	// set r.magnitude/r.normalized — presumably field4x64Mul maintains them
	// itself; confirm against its implementation.
	if hasField4x64() {
		field4x64Mul(r, aNorm, bNorm)
		return
	}
	// Use BMI2+ADX assembly if available
	if hasFieldAsmBMI2() {
		fieldMulAsmBMI2(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}
	// Use regular assembly if available
	if hasFieldAsm() {
		fieldMulAsm(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Extract limbs for easier access
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]
	b0, b1, b2, b3, b4 := bNorm.n[0], bNorm.n[1], bNorm.n[2], bNorm.n[3], bNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                // 2^52 - 1
	const R = fieldReductionConstantShifted // 0x1000003D10: 2^256 mod p, shifted left 4 bits

	// Following the C implementation algorithm exactly.
	// [... a b c] is shorthand for ... + a<<104 + b<<52 + c<<0 mod n

	// Compute p3 = a0*b3 + a1*b2 + a2*b1 + a3*b0
	var c, d uint128
	d = mulU64ToU128(a0, b3)
	d = addMulU128(d, a1, b2)
	d = addMulU128(d, a2, b1)
	d = addMulU128(d, a3, b0)

	// Compute p8 = a4*b4
	c = mulU64ToU128(a4, b4)

	// d += R * c_lo; c >>= 64 (fold the 2^(52*8) partial product down)
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
	d = addMulU128(d, a0, b4)
	d = addMulU128(d, a1, b3)
	d = addMulU128(d, a2, b2)
	d = addMulU128(d, a3, b1)
	d = addMulU128(d, a4, b0)

	// d += (R << 12) * c_lo (remaining high half of p8's reduction)
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx (tx = top 4 bits of t4, handled separately)
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*b0
	c = mulU64ToU128(a0, b0)

	// Compute p5 = a1*b4 + a2*b3 + a3*b2 + a4*b1
	d = addMulU128(d, a1, b4)
	d = addMulU128(d, a2, b3)
	d = addMulU128(d, a3, b2)
	d = addMulU128(d, a4, b1)

	// Extract u0 and recombine with tx
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*b1 + a1*b0
	c = addMulU128(c, a0, b1)
	c = addMulU128(c, a1, b0)

	// Compute p6 = a2*b4 + a3*b3 + a4*b2
	d = addMulU128(d, a2, b4)
	d = addMulU128(d, a3, b3)
	d = addMulU128(d, a4, b2)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*b2 + a1*b1 + a2*b0
	c = addMulU128(c, a0, b2)
	c = addMulU128(c, a1, b1)
	c = addMulU128(c, a2, b0)

	// Compute p7 = a3*b4 + a4*b3
	d = addMulU128(d, a3, b4)
	d = addMulU128(d, a4, b3)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Set magnitude and normalization
	r.magnitude = 1
	r.normalized = false
}

// reduceFromWide reduces a wide 10-limb value modulo the field prime
// p = 2^256 - 2^32 - 977, using 2^256 ≡ 2^32 + 977 (mod p).
//
// NOTE(review): the limb-width conventions in this function look
// inconsistent and should be verified against its callers. The reduction
// loop positions t[i] at bit 52*i (shift = 52*(i-5)+4) yet indexes the
// 64-bit array t[] with limbShift = shift/52, while the final repack reads
// t[0..4] as densely packed 64-bit words (t[0]>>52 | t[1]<<12, ...).
// Carries from the i == 5 iteration that propagate into t[6..9] are also
// never reprocessed. Left byte-identical pending verification.
func (r *FieldElement) reduceFromWide(t [10]uint64) {
	// Reduction constant for secp256k1: 2^32 + 977 = 0x1000003D1
	const M = uint64(0x1000003D1)

	// Start from the highest limb and work down.
	for i := 9; i >= 5; i-- {
		if t[i] == 0 {
			continue
		}
		// t[i] is treated as weight 2^(52*i); since 2^260 ≡ 16*M (mod p),
		// limb i >= 5 folds down with an extra factor of 2^4.
		shift := uint(52*(i-5) + 4) // Additional 4 bits for the 16 factor

		// Multiply t[i] by the appropriate power of M
		var carry uint64
		if shift < 64 {
			// Simple case: can multiply directly
			factor := M << shift
			hi, lo := bits.Mul64(t[i], factor)
			// Add to appropriate position
			pos := 0
			t[pos], carry = bits.Add64(t[pos], lo, 0)
			if pos+1 < 10 {
				t[pos+1], carry = bits.Add64(t[pos+1], hi, carry)
			}
			// Propagate carry
			for j := pos + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		} else {
			// Need to handle larger shifts by distributing across limbs
			hi, lo := bits.Mul64(t[i], M)
			limbShift := shift / 52
			bitShift := shift % 52
			if bitShift == 0 {
				// Aligned to limb boundary
				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], lo, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hi, carry)
					}
				}
			} else {
				// Need to split across limbs
				loShifted := lo << bitShift
				hiShifted := (lo >> (64 - bitShift)) | (hi << bitShift)
				if limbShift < 10 {
					t[limbShift], carry = bits.Add64(t[limbShift], loShifted, 0)
					if limbShift+1 < 10 {
						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hiShifted, carry)
					}
				}
			}
			// Propagate any remaining carry
			for j := int(limbShift) + 2; j < 10 && carry != 0; j++ {
				t[j], carry = bits.Add64(t[j], 0, carry)
			}
		}
		t[i] = 0 // Clear the processed limb
	}

	// Convert t[0..4] (packed 64-bit words) to the 5x52 representation.
	r.n[0] = t[0] & limb0Max
	r.n[1] = ((t[0] >> 52) | (t[1] << 12)) & limb0Max
	r.n[2] = ((t[1] >> 40) | (t[2] << 24)) & limb0Max
	r.n[3] = ((t[2] >> 28) | (t[3] << 36)) & limb0Max
	r.n[4] = ((t[3] >> 16) | (t[4] << 48)) & limb4Max
	r.magnitude = 1
	r.normalized = false

	// Final reduction if the packed value is >= p.
	if r.n[4] == limb4Max && r.n[3] == limb0Max && r.n[2] == limb0Max &&
		r.n[1] == limb0Max && r.n[0] >= fieldModulusLimb0 {
		r.reduce()
	}
}

// sqr squares a field element: r = a^2.
// This implementation follows the C secp256k1_fe_sqr_inner algorithm and
// mirrors mul above, exploiting symmetry (a_i*a_j counted twice via the
// in-place doublings of a4/a0 and the explicit *2 factors).
// Optimized: avoid copies when magnitude is low enough.
func (r *FieldElement) sqr(a *FieldElement) {
	// Use pointer directly if magnitude is low enough (optimization)
	var aNorm *FieldElement
	var aTemp FieldElement
	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): as in mul, this path does not set r.magnitude — assumed
	// handled inside field4x64Sqr; confirm.
	if hasField4x64() {
		field4x64Sqr(r, aNorm)
		return
	}
	// Use BMI2+ADX assembly if available
	if hasFieldAsmBMI2() {
		fieldSqrAsmBMI2(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}
	// Use regular assembly if available
	if hasFieldAsm() {
		fieldSqrAsm(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Extract limbs for easier access
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]

	const M = 0xFFFFFFFFFFFFF                // 2^52 - 1
	const R = fieldReductionConstantShifted // 0x1000003D10

	// Following the C implementation algorithm exactly.

	// Compute p3 = 2*a0*a3 + 2*a1*a2
	var c, d uint128
	d = mulU64ToU128(a0*2, a3)
	d = addMulU128(d, a1*2, a2)

	// Compute p8 = a4*a4
	c = mulU64ToU128(a4, a4)

	// d += R * c_lo; c >>= 64
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*a4*2 + a1*a3*2 + a2*a2
	// (a4 is doubled in place; it is only used doubled from here on)
	a4 *= 2
	d = addMulU128(d, a0, a4)
	d = addMulU128(d, a1*2, a3)
	d = addMulU128(d, a2, a2)

	// d += (R << 12) * c_lo
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4 and tx
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4)

	// Compute p0 = a0*a0
	c = mulU64ToU128(a0, a0)

	// Compute p5 = a1*a4 + a2*a3*2 (a4 already doubled above)
	d = addMulU128(d, a1, a4)
	d = addMulU128(d, a2*2, a3)

	// Extract u0
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*a1*2 (a0 is doubled in place; only used doubled below)
	a0 *= 2
	c = addMulU128(c, a0, a1)

	// Compute p6 = a2*a4 + a3*a3 (a4 already doubled)
	d = addMulU128(d, a2, a4)
	d = addMulU128(d, a3, a3)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*a2 + a1*a1 (a0 already doubled)
	c = addMulU128(c, a0, a2)
	c = addMulU128(c, a1, a1)

	// Compute p7 = a3*a4 (a4 already doubled)
	d = addMulU128(d, a3, a4)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Set magnitude and normalization
	r.magnitude = 1
	r.normalized = false
}

// inv computes the modular inverse of a field element using Fermat's little
// theorem: a^(p-2) mod p, where p is the secp256k1 field prime.
// Uses an optimized addition chain similar to sqrt() for efficiency.
//
// p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
//
// Binary structure:
//   - bits 255-33: 223 ones (= (2^223 - 1) shifted left by 33)
//   - bit 32: 0
//   - bits 31-12: 20 ones
//   - bits 11-0: 110000101101 (0xC2D)
//
// This needs roughly 266 squarings plus a handful of multiplications,
// versus ~256 squarings and ~127 multiplications for naive binary
// exponentiation.
func (r *FieldElement) inv(a *FieldElement) { var aNorm FieldElement aNorm = *a aNorm.normalize() // Build up powers using addition chain (same as sqrt) // x^(2^k - 1) for k in {2, 3, 6, 9, 11, 22, 44, 88, 176, 220, 223} var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement // x2 = a^(2^2 - 1) = a^3 x2.sqr(&aNorm) x2.mul(&x2, &aNorm) // x3 = a^(2^3 - 1) = a^7 x3.sqr(&x2) x3.mul(&x3, &aNorm) // x6 = a^(2^6 - 1) = a^63 x6 = x3 for j := 0; j < 3; j++ { x6.sqr(&x6) } x6.mul(&x6, &x3) // x9 = a^(2^9 - 1) = a^511 x9 = x6 for j := 0; j < 3; j++ { x9.sqr(&x9) } x9.mul(&x9, &x3) // x11 = a^(2^11 - 1) = a^2047 x11 = x9 for j := 0; j < 2; j++ { x11.sqr(&x11) } x11.mul(&x11, &x2) // x22 = a^(2^22 - 1) x22 = x11 for j := 0; j < 11; j++ { x22.sqr(&x22) } x22.mul(&x22, &x11) // x44 = a^(2^44 - 1) x44 = x22 for j := 0; j < 22; j++ { x44.sqr(&x44) } x44.mul(&x44, &x22) // x88 = a^(2^88 - 1) x88 = x44 for j := 0; j < 44; j++ { x88.sqr(&x88) } x88.mul(&x88, &x44) // x176 = a^(2^176 - 1) x176 = x88 for j := 0; j < 88; j++ { x176.sqr(&x176) } x176.mul(&x176, &x88) // x220 = a^(2^220 - 1) x220 = x176 for j := 0; j < 44; j++ { x220.sqr(&x220) } x220.mul(&x220, &x44) // x223 = a^(2^223 - 1) x223 = x220 for j := 0; j < 3; j++ { x223.sqr(&x223) } x223.mul(&x223, &x3) // Now assemble p-2 = 2^256 - 2^32 - 979 // = (2^223 - 1) * 2^33 + (2^32 - 1) * 2^0 + ... (adjusted for the specific bit pattern) // // p-2 in binary (from MSB): // 223 ones, 0, 20 ones, 00, 1, 0, 00, 1, 0, 1, 1, 0, 1 // [223 ones][0][20 ones][11-0 bits = 0xC2D = 110000101101] // // Step 1: Start with x223 and shift left by 33 bits t1 = x223 for j := 0; j < 33; j++ { t1.sqr(&t1) } // Now t1 = a^((2^223 - 1) * 2^33) // Step 2: Add contribution for bits 31-12 (20 ones) // We need x^(2^32 - 2^12) = x^((2^20 - 1) * 2^12) // But we don't have x20 precomputed. Use x22 and adjust. 
// Actually, let's use: x11 * 2^11 * x11 = x^((2^11-1)*2^11 + (2^11-1)) = x^(2^22 - 2^11 + 2^11 - 1) = x^(2^22-1) = x22 // We need x^(2^20 - 1). We can compute: x11 shifted by 9, then multiply by x9 // x^((2^11-1)*2^9) * x^(2^9-1) = x^(2^20 - 2^9 + 2^9 - 1) = x^(2^20 - 1) var x20 FieldElement x20 = x11 for j := 0; j < 9; j++ { x20.sqr(&x20) } x20.mul(&x20, &x9) // Now x20 = a^(2^20 - 1) // Multiply into t1: t1 * x20 (but first shift x20 to correct position) // Current t1 is at position 33 (exponent ends at bit 33) // We need bits 31-12 = 20 ones, which is x^((2^20-1) * 2^12) // So we need to shift t1 down? No, we're building the exponent. // // Let me reconsider. We have: // t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // p-2 = 2^256 - 2^32 - 979 // // 2^256 - 2^33 vs 2^256 - 2^32 - 979 // We need to add: 2^32 - 979 = FFFFFC2D // // So: p-2 = (2^256 - 2^33) + (2^32 - 979) // = t1_exp + 0xFFFFFC2D (as addition of exponents = multiplication) // // But that's not how exponentiation works! a^x * a^y = a^(x+y) // So we need t1 * a^(0xFFFFFC2D) // // 0xFFFFFC2D = 4294966317 // This is still a big exponent to compute naively. // // Let me decompose 0xFFFFFC2D in terms of our precomputed powers. // 0xFFFFFC2D = 0b 1111 1111 1111 1111 1111 1100 0010 1101 // = 2^32 - 979 // = (2^32 - 1) - 978 // = (2^32 - 1) - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 2^1 // // Hmm, subtraction in exponent is division... that doesn't help. // // Let me think about this differently using the bit pattern directly. // p-2 bits (from MSB to LSB): // bit 255: 1 // ... // bit 33: 1 (223 ones total in bits 255-33) // bit 32: 0 // bit 31: 1 // ... 
// bit 12: 1 (20 ones total in bits 31-12) // bit 11: 0 // bit 10: 0 // bit 9: 1 // bit 8: 0 // bit 7: 0 // bit 6: 0 // bit 5: 1 // bit 4: 0 // bit 3: 1 // bit 2: 1 // bit 1: 0 // bit 0: 1 // // Using sliding window from MSB: // Start: t1 = a^1 // Process 223 ones in bits 255-33: // t1 = a^(2^223 - 1) using x223 (done above via shift) // After shifting 33 times: a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // bit 32 = 0: square once // t1 = a^(2^257 - 2^34) -- wait, this goes beyond 256 bits // // I think I'm overcomplicating this. Let me use a simpler sliding window approach. // Alternative approach: build from LSB using the precomputed values // Actually, let's use the approach from libsecp256k1's sqrt which builds from MSB. // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D // // We can write this as: // ((((x223 << 23) * x22) << 5) * x2) << 3) * x3) << 2) * x2) << 1) << 1) * a // // Let me verify: starting with a^(2^223 - 1) = x223 // // Actually, the libsecp256k1 modinv doesn't use addition chains - it uses safegcd. // Let me check what the modular inverse in libsecp256k1 actually does. // For now, let me implement a working addition chain that's still much faster than binary exp. // p-2 structure analysis: // Bits 255-33: 223 consecutive 1s // Bit 32: 0 // Bits 31-12: 20 consecutive 1s (but we don't have x20, need to construct) // Bits 11-0: 0xC2D = 0b110000101101 // Approach: sliding window from MSB // 1. Start with x223 (bits 255-33) // 2. Square for bit 32 (which is 0) // 3. Square and multiply for bits 31-12 (20 ones) // 4. Handle remaining bits 11-0 // t1 already = x223 = a^(2^223 - 1) // Square 23 times for bits 255-33 positioning, then multiply by x22 for next block // Wait, I'm confusing myself. Let me restart with a cleaner approach. // Standard addition chain for computing a^e where e = p-2: // We compute a^e by starting with a and repeatedly squaring and multiplying. 
// The exponent p-2 has this binary form: // 1{223}01{20}001000010110 1 // (223 ones)(zero)(20 ones)(0xC2D pattern) // Efficient approach: compute x^((2^223-1) * 2^33 + (2^20-1) * 2^13 + specific_bits) // // Let's verify: (2^223-1)*2^33 = 2^256 - 2^33 // (2^20-1)*2^13 = 2^33 - 2^13 // Adding: 2^256 - 2^33 + 2^33 - 2^13 = 2^256 - 2^13 // But p-2 = 2^256 - 2^32 - 979, so this isn't quite right. // Let me compute more carefully. // The exponent as sum of weighted runs of 1s: // bits 255-33 (223 ones): (2^223 - 1) * 2^33 = 2^256 - 2^33 // bits 31-12 (20 ones): (2^20 - 1) * 2^12 = 2^32 - 2^12 // bit 9: 2^9 // bit 5: 2^5 // bit 3: 2^3 // bit 2: 2^2 // bit 0: 2^0 // // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 (since 2^33 - 2^32 = 2^32) // // Hmm, -2^33 + 2^32 = -2^32, so: // = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // // And we need p-2 = 2^256 - 2^32 - 979 // So: -979 = -2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0 // = -4096 + 512 + 32 + 8 + 4 + 1 // = -4096 + 557 // = -3539 // // That's not -979. Let me re-examine the bit pattern of 0xFFFFFC2D. // 0xC2D = 0b110000101101 (12 bits) // bit 11: 1 → 2^11 // bit 10: 1 → 2^10 // bit 9: 0 // bit 8: 0 // bit 7: 0 // bit 6: 0 // bit 5: 1 → 2^5 // bit 4: 0 // bit 3: 1 → 2^3 // bit 2: 1 → 2^2 // bit 1: 0 // bit 0: 1 → 2^0 // So bits 11-0 contribute: 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 // = 2048 + 1024 + 32 + 8 + 4 + 1 // = 3117 // Full p-2: // bits 255-33 (223 ones): 2^256 - 2^33 // bits 31-12 (20 ones): 2^32 - 2^12 // bits 11-0: 3117 // // Sum = 2^256 - 2^33 + 2^32 - 2^12 + 3117 // = 2^256 - 2^32 - 2^12 + 3117 // = 2^256 - 2^32 - 4096 + 3117 // = 2^256 - 2^32 - 979 ✓ // Great! So the bit pattern is correct. Now I can build the addition chain. // Build result from MSB using sliding window: // t1 = x223 = a^(2^223 - 1) // Shift left by 33 (square 33 times) and multiply by x22 for the next window? // No wait, we have a gap (bit 32 = 0) and then 20 ones. 
// Let me use a cleaner approach: // 1. t1 = x223 // 2. t1 = t1^(2^33) = a^((2^223-1)*2^33) = a^(2^256 - 2^33) // 3. t1 = t1 * x20 = a^(2^256 - 2^33 + 2^20 - 1) -- but we need (2^20-1)*2^12 not just 2^20-1 // Hmm, this is getting complicated. Let me try a different formulation. // Instead of trying to be clever, just do windowed exponentiation: // Split p-2 into windows and multiply. // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D // // From MSB, process in chunks matching our precomputed values: // Window 1: bits 255-33 = 223 ones → use x223, then square 33 times // After: a^((2^223-1) * 2^33) = a^(2^256 - 2^33) // // Window 2: bits 32-13 = 0 followed by 20 ones // We already squared 33 times, so we're at bit 32. // bit 32 = 0: just square (no multiply) // bits 31-12 = 20 ones // But x20 isn't precomputed. Let me compute it. // Hmm, I already computed x20 above. Let me restructure. // Reset and do it properly: t1 = x223 // t1 = a^(2^223 - 1) // Square 33 times to shift the window for j := 0; j < 33; j++ { t1.sqr(&t1) } // t1 = a^((2^223-1) * 2^33) // Multiply by x22 to add 22 more ones? No, we need exactly 20 ones. // Actually, I already computed x20 above. Let's use it. // Multiply by x20 shifted appropriately. // But x20 at its base position is a^(2^20 - 1). // We need a^((2^20 - 1) * 2^12) to contribute bits 31-12. // So we need to shift x20 by 12. 
// Current t1 exponent: (2^223 - 1) * 2^33 = 2^256 - 2^33 // We want to add: (2^20 - 1) * 2^12 = 2^32 - 2^12 - (2^12 - 1) = wait no // (2^20 - 1) * 2^12 = 2^32 - 2^12 // So after adding: 2^256 - 2^33 + 2^32 - 2^12 = 2^256 - 2^32 - 2^12 // Wait, 2^33 - 2^32 = 2^32, so -2^33 + 2^32 = -2^32 // Now we need to add the remaining bits 11-0 = 0xC2D = 3117 // t1_final = t1 * a^((2^20-1)*2^12) * a^3117 // For a^((2^20-1)*2^12): square x20 12 times var temp FieldElement temp = x20 for j := 0; j < 12; j++ { temp.sqr(&temp) } // temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12) t1.mul(&t1, &temp) // t1 = a^(2^256 - 2^33 + 2^32 - 2^12) = a^(2^256 - 2^32 - 2^12) // Now add bits 11-0 = 0xC2D = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 = 3117 // We need a^3117 // 3117 = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 // = 2048 + 1024 + 32 + 8 + 4 + 1 // Using our precomputed values: // a^(2^11 - 1) = x11 = a^2047 // We need a^2048 = a^(2^11), which is a squared 11 times. // Actually, let's build a^3117 more directly. // a^3117 = a^(2^11) * a^(2^10) * a^(2^5) * a^(2^3) * a^(2^2) * a^(2^0) // = (a^2048) * (a^1024) * (a^32) * (a^8) * (a^4) * a // But computing each power separately and multiplying is expensive. // Better: use sliding window on the bits. // 3117 in binary: 110000101101 // From MSB: 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1 // Or we can decompose using our precomputed values: // 3117 = 2047 + 1024 + 32 + 8 + 4 + 2 = 2047 + 1070 // Hmm, 2047 = x11, but 1070 is awkward. // Let's just compute a^3117 with a small addition chain. 
// 3117 = 0b110000101101 // Build a^3117: var low FieldElement low = aNorm low.sqr(&low) // a^2 low.mul(&low, &aNorm) // a^3 low.sqr(&low) // a^6 low.sqr(&low) // a^12 low.mul(&low, &aNorm) // a^13 low.sqr(&low) // a^26 low.sqr(&low) // a^52 low.sqr(&low) // a^104 low.sqr(&low) // a^208 low.mul(&low, &aNorm) // a^209 low.sqr(&low) // a^418 low.sqr(&low) // a^836 low.mul(&low, &aNorm) // a^837 low.sqr(&low) // a^1674 low.mul(&low, &aNorm) // a^1675 low.sqr(&low) // a^3350 // Hmm, this is getting messy and doesn't give 3117. // Let me use a simpler approach: compute a^3117 using binary method (small number) // This adds only ~12 squarings and ~6 multiplications low.setInt(1) base := aNorm e := uint32(3117) for e > 0 { if e&1 == 1 { low.mul(&low, &base) } e >>= 1 if e > 0 { base.sqr(&base) } } // low = a^3117 // But wait, our temp and t1 computations used a shifted a. Let me reconsider. // Actually no, the exponent additions are correct. We multiply t1 by a^(low_bits) // where low_bits is computed from the original a. // Hmm, but we shifted x20 by 12 bits already, contributing exponent (2^20-1)*2^12. // And now low = a^3117. // But 3117 spans bits 11-0, and we need to add this to the current exponent. // Current t1 exponent: 2^256 - 2^32 - 2^12 // We want: 2^256 - 2^32 - 979 // So we need: -979 - (-2^12) = -979 + 4096 = 3117 ✓ // So t1 * low = a^(2^256 - 2^32 - 2^12 + 3117) = a^(2^256 - 2^32 - 979) = a^(p-2) t1.mul(&t1, &low) *r = t1 r.magnitude = 1 r.normalized = true } // sqrt computes the square root of a field element if it exists // This follows the C secp256k1_fe_sqrt implementation exactly func (r *FieldElement) sqrt(a *FieldElement) bool { // Given that p is congruent to 3 mod 4, we can compute the square root of // a mod p as the (p+1)/4'th power of a. // // As (p+1)/4 is an even number, it will have the same result for a and for // (-a). 
Only one of these two numbers actually has a square root however, // so we test at the end by squaring and comparing to the input. var aNorm FieldElement aNorm = *a // Normalize input if magnitude is too high if aNorm.magnitude > 8 { aNorm.normalizeWeak() } else { aNorm.normalize() } // The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in // { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: // 1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement // x2 = a^3 x2.sqr(&aNorm) x2.mul(&x2, &aNorm) // x3 = a^7 x3.sqr(&x2) x3.mul(&x3, &aNorm) // x6 = a^63 x6 = x3 for j := 0; j < 3; j++ { x6.sqr(&x6) } x6.mul(&x6, &x3) // x9 = a^511 x9 = x6 for j := 0; j < 3; j++ { x9.sqr(&x9) } x9.mul(&x9, &x3) // x11 = a^2047 x11 = x9 for j := 0; j < 2; j++ { x11.sqr(&x11) } x11.mul(&x11, &x2) // x22 = a^4194303 x22 = x11 for j := 0; j < 11; j++ { x22.sqr(&x22) } x22.mul(&x22, &x11) // x44 = a^17592186044415 x44 = x22 for j := 0; j < 22; j++ { x44.sqr(&x44) } x44.mul(&x44, &x22) // x88 = a^72057594037927935 x88 = x44 for j := 0; j < 44; j++ { x88.sqr(&x88) } x88.mul(&x88, &x44) // x176 = a^1180591620717411303423 x176 = x88 for j := 0; j < 88; j++ { x176.sqr(&x176) } x176.mul(&x176, &x88) // x220 = a^172543658669764094685868767685 x220 = x176 for j := 0; j < 44; j++ { x220.sqr(&x220) } x220.mul(&x220, &x44) // x223 = a^13479973333575319897333507543509815336818572211270286240551805124607 x223 = x220 for j := 0; j < 3; j++ { x223.sqr(&x223) } x223.mul(&x223, &x3) // The final result is then assembled using a sliding window over the blocks. 
t1 = x223 for j := 0; j < 23; j++ { t1.sqr(&t1) } t1.mul(&t1, &x22) for j := 0; j < 6; j++ { t1.sqr(&t1) } t1.mul(&t1, &x2) t1.sqr(&t1) r.sqr(&t1) // Check that a square root was actually calculated var check FieldElement check.sqr(r) check.normalize() aNorm.normalize() ret := check.equal(&aNorm) // If sqrt(a) doesn't exist, compute sqrt(-a) instead (as per field.h comment) if !ret { var negA FieldElement negA.negate(&aNorm, 1) negA.normalize() t1 = x223 for j := 0; j < 23; j++ { t1.sqr(&t1) } t1.mul(&t1, &x22) for j := 0; j < 6; j++ { t1.sqr(&t1) } t1.mul(&t1, &x2) t1.sqr(&t1) r.sqr(&t1) check.sqr(r) check.normalize() // Return whether sqrt(-a) exists return check.equal(&negA) } return ret } // isSquare checks if a field element is a quadratic residue func (a *FieldElement) isSquare() bool { // Use Legendre symbol: a^((p-1)/2) mod p // If result is 1, then a is a quadratic residue var result FieldElement result = *a // Compute a^((p-1)/2) - simplified implementation for i := 0; i < 127; i++ { // Approximate (p-1)/2 bit length result.sqr(&result) } result.normalize() return result.equal(&FieldElementOne) } // half computes r = a/2 mod p func (r *FieldElement) half(a *FieldElement) { // This follows the C secp256k1_fe_impl_half implementation exactly *r = *a t0, t1, t2, t3, t4 := r.n[0], r.n[1], r.n[2], r.n[3], r.n[4] one := uint64(1) // In C: mask = -(t0 & one) >> 12 // In Go, we need to convert to signed, negate, then convert back mask := uint64(-int64(t0 & one)) >> 12 // Conditionally add field modulus if odd t0 += 0xFFFFEFFFFFC2F & mask t1 += mask t2 += mask t3 += mask t4 += mask >> 4 // Right shift with carry propagation r.n[0] = (t0 >> 1) + ((t1 & one) << 51) r.n[1] = (t1 >> 1) + ((t2 & one) << 51) r.n[2] = (t2 >> 1) + ((t3 & one) << 51) r.n[3] = (t3 >> 1) + ((t4 & one) << 51) r.n[4] = t4 >> 1 // Update magnitude as per C implementation r.magnitude = (r.magnitude >> 1) + 1 r.normalized = false }