//go:build amd64 && !purego

package p256k1

import (
	"crypto/subtle"
	"errors"
	"math/bits"
	"unsafe"
)

// errField4x64InvalidLen is returned by SetB32 when the input slice is not
// exactly 32 bytes long.
var errField4x64InvalidLen = errors.New("field element must be 32 bytes")

// Field4x64 represents a field element using 4x64-bit limbs with lazy reduction.
// This representation allows fewer partial products (16 vs 25 for 5x52) and
// deferred reduction for sequences of additions.
//
// The value is: n[0] + n[1]*2^64 + n[2]*2^128 + n[3]*2^192
//
// Magnitude tracks how many additions have occurred since the last reduction.
// Multiplication requires magnitude ≤ 1 (fully reduced inputs).
type Field4x64 struct {
	// Little-endian limbs: value = n[0] + n[1]*2^64 + n[2]*2^128 + n[3]*2^192.
	n          [4]uint64
	magnitude  int  // 1 = normalized, increases with additions
	normalized bool // whether the field element is fully normalized
}

// secp256k1 field prime: p = 2^256 - 2^32 - 977
// p = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
var field4x64Prime = [4]uint64{
	0xFFFFFFFEFFFFFC2F, // n[0]
	0xFFFFFFFFFFFFFFFF, // n[1]
	0xFFFFFFFFFFFFFFFF, // n[2]
	0xFFFFFFFFFFFFFFFF, // n[3]
}

// 2^256 mod p = 2^32 + 977 = 0x1000003D1
const field4x64R = uint64(0x1000003D1)

// SetZero sets f to zero.
func (f *Field4x64) SetZero() *Field4x64 {
	f.n[0], f.n[1], f.n[2], f.n[3] = 0, 0, 0, 0
	f.magnitude = 0
	f.normalized = true
	return f
}

// SetOne sets f to one.
func (f *Field4x64) SetOne() *Field4x64 {
	f.n[0], f.n[1], f.n[2], f.n[3] = 1, 0, 0, 0
	f.magnitude = 1
	f.normalized = true
	return f
}

// Set copies a into f, including the magnitude/normalized metadata.
func (f *Field4x64) Set(a *Field4x64) *Field4x64 {
	f.n = a.n
	f.magnitude = a.magnitude
	f.normalized = a.normalized
	return f
}

// IsZero returns true if f is zero.
//
// NOTE(review): this compares raw limbs only — the caller must normalize f
// first, otherwise an unreduced representation of zero (e.g. the value p)
// is reported as non-zero.
func (f *Field4x64) IsZero() bool {
	// Must be normalized first
	return f.n[0] == 0 && f.n[1] == 0 && f.n[2] == 0 && f.n[3] == 0
}

// Equal returns true if f equals a.
//
// NOTE(review): raw limb comparison — both operands must be normalized for
// this to be a correct field comparison.
func (f *Field4x64) Equal(a *Field4x64) bool {
	// Both must be normalized
	return f.n[0] == a.n[0] && f.n[1] == a.n[1] && f.n[2] == a.n[2] && f.n[3] == a.n[3]
}

// Reduce reduces f modulo p, setting magnitude to 1.
// Uses the identity: 2^256 ≡ 2^32 + 977 (mod p) func (f *Field4x64) Reduce() *Field4x64 { // Fast path: already normalized if f.normalized { return f } n0, n1, n2, n3 := f.n[0], f.n[1], f.n[2], f.n[3] // Reduce: if value >= p, subtract p // Check if n >= p by comparing from high limb var c uint64 var borrow uint64 // Subtract p and check for borrow t0, borrow := bits.Sub64(n0, field4x64Prime[0], 0) t1, borrow := bits.Sub64(n1, field4x64Prime[1], borrow) t2, borrow := bits.Sub64(n2, field4x64Prime[2], borrow) t3, borrow := bits.Sub64(n3, field4x64Prime[3], borrow) // If no borrow, result is valid (n >= p), use subtracted value // If borrow, n < p, keep original c = borrow - 1 // c = 0xFFFF... if no borrow, 0 if borrow f.n[0] = (t0 & c) | (n0 & ^c) f.n[1] = (t1 & c) | (n1 & ^c) f.n[2] = (t2 & c) | (n2 & ^c) f.n[3] = (t3 & c) | (n3 & ^c) f.magnitude = 1 f.normalized = true return f } // Add computes f = a + b (mod p) with lazy reduction. func (f *Field4x64) Add(a, b *Field4x64) *Field4x64 { var carry uint64 f.n[0], carry = bits.Add64(a.n[0], b.n[0], 0) f.n[1], carry = bits.Add64(a.n[1], b.n[1], carry) f.n[2], carry = bits.Add64(a.n[2], b.n[2], carry) f.n[3], carry = bits.Add64(a.n[3], b.n[3], carry) // If carry, we overflowed 2^256, reduce by adding R = 2^256 mod p if carry != 0 { f.n[0], carry = bits.Add64(f.n[0], field4x64R, 0) f.n[1], carry = bits.Add64(f.n[1], 0, carry) f.n[2], carry = bits.Add64(f.n[2], 0, carry) f.n[3], _ = bits.Add64(f.n[3], 0, carry) } f.magnitude = a.magnitude + b.magnitude f.normalized = false if f.magnitude > 8 { // Reduce if magnitude gets too high f.Reduce() } return f } // Sub computes f = a - b (mod p). 
func (f *Field4x64) Sub(a, b *Field4x64) *Field4x64 { var borrow uint64 f.n[0], borrow = bits.Sub64(a.n[0], b.n[0], 0) f.n[1], borrow = bits.Sub64(a.n[1], b.n[1], borrow) f.n[2], borrow = bits.Sub64(a.n[2], b.n[2], borrow) f.n[3], borrow = bits.Sub64(a.n[3], b.n[3], borrow) // If borrow, we went negative, add p back if borrow != 0 { var carry uint64 f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0) f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry) f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry) f.n[3], _ = bits.Add64(f.n[3], field4x64Prime[3], carry) } f.magnitude = 1 f.normalized = false return f } // Negate computes f = -a (mod p). // Handles inputs that may be in range [0, 2p). func (f *Field4x64) Negate(a *Field4x64) *Field4x64 { // -a = p - a (when a != 0) if a.IsZero() { return f.SetZero() } // Compute p - a. If a >= p, this will underflow (borrow != 0). var borrow uint64 f.n[0], borrow = bits.Sub64(field4x64Prime[0], a.n[0], 0) f.n[1], borrow = bits.Sub64(field4x64Prime[1], a.n[1], borrow) f.n[2], borrow = bits.Sub64(field4x64Prime[2], a.n[2], borrow) f.n[3], borrow = bits.Sub64(field4x64Prime[3], a.n[3], borrow) // If there was a borrow, a >= p. We need to compute p - (a - p) = 2p - a. // But since we computed p - a which underflowed, the result is 2^256 + p - a = 2^256 - (a - p). // We need to add p to get: 2^256 + 2p - a. Then subtract 2^256 (which is implicit). // Actually: if a in [p, 2p), then -a = p - a is negative, and we wrapped to 2^256 + p - a. // We want the result in [0, p). Since a < 2p, we have a - p < p, so -(a-p) = p - (a-p) = 2p - a. // The wrapped result 2^256 + p - a needs to be converted to 2p - a by adding p (mod 2^256). 
if borrow != 0 { var carry uint64 f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0) f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry) f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry) f.n[3], _ = bits.Add64(f.n[3], field4x64Prime[3], carry) } f.magnitude = 1 f.normalized = false return f } // MulPureGo computes f = a * b (mod p) using pure Go. // This is the reference implementation; assembly version is faster. func (f *Field4x64) MulPureGo(a, b *Field4x64) *Field4x64 { // Ensure inputs are reduced if a.magnitude > 1 { a.Reduce() } if b.magnitude > 1 { b.Reduce() } // Compute 512-bit product using schoolbook multiplication // Use 128-bit accumulators (represented as hi:lo pairs) // Note: bits.Mul64 returns (hi, lo) - high word first! var r [8]uint64 // For each output limb r[k], accumulate all products a[i]*b[j] where i+j=k // Use proper carry propagation throughout var c0, c1, c2 uint64 // Three-word accumulator (c2:c1:c0) var hi, lo uint64 // Column 0: a0*b0 hi, lo = bits.Mul64(a.n[0], b.n[0]) c0 = lo c1 = hi c2 = 0 r[0] = c0 c0 = c1 c1 = c2 c2 = 0 // Column 1: a0*b1 + a1*b0 hi, lo = bits.Mul64(a.n[0], b.n[1]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[1], b.n[0]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi r[1] = c0 c0 = c1 c1 = c2 c2 = 0 // Column 2: a0*b2 + a1*b1 + a2*b0 hi, lo = bits.Mul64(a.n[0], b.n[2]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[1], b.n[1]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[2], b.n[0]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi r[2] = c0 c0 = c1 c1 = c2 c2 = 0 // Column 3: a0*b3 + a1*b2 + a2*b1 + a3*b0 hi, lo = bits.Mul64(a.n[0], b.n[3]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[1], b.n[2]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = 
bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[2], b.n[1]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[3], b.n[0]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi r[3] = c0 c0 = c1 c1 = c2 c2 = 0 // Column 4: a1*b3 + a2*b2 + a3*b1 hi, lo = bits.Mul64(a.n[1], b.n[3]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[2], b.n[2]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[3], b.n[1]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi r[4] = c0 c0 = c1 c1 = c2 c2 = 0 // Column 5: a2*b3 + a3*b2 hi, lo = bits.Mul64(a.n[2], b.n[3]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi hi, lo = bits.Mul64(a.n[3], b.n[2]) c0, lo = bits.Add64(c0, lo, 0) c1, hi = bits.Add64(c1, hi, lo) c2 += hi r[5] = c0 c0 = c1 c1 = c2 // Column 6: a3*b3 hi, lo = bits.Mul64(a.n[3], b.n[3]) c0, lo = bits.Add64(c0, lo, 0) c1, _ = bits.Add64(c1, hi, lo) r[6] = c0 r[7] = c1 // Reduce 512-bit result modulo p f.reduce512(&r) return f } // reduce512 reduces a 512-bit value to 256 bits modulo p. 
// Uses the identity: 2^256 ≡ 2^32 + 977 (mod p) func (f *Field4x64) reduce512(r *[8]uint64) { // High part (r[4..7]) needs to be multiplied by R = 2^32 + 977 and added to low part // Step 1: Compute r[4..7] * R and add to r[0..3] // R = 0x1000003D1 // Note: bits.Mul64 returns (hi, lo) var hi, lo, carry uint64 var t [5]uint64 // Accumulator for r[4..7] * R // r[4] * R hi, lo = bits.Mul64(r[4], field4x64R) t[0] = lo t[1] = hi // r[5] * R hi, lo = bits.Mul64(r[5], field4x64R) t[1], carry = bits.Add64(t[1], lo, 0) t[2] = hi + carry // r[6] * R hi, lo = bits.Mul64(r[6], field4x64R) t[2], carry = bits.Add64(t[2], lo, 0) t[3] = hi + carry // r[7] * R hi, lo = bits.Mul64(r[7], field4x64R) t[3], carry = bits.Add64(t[3], lo, 0) t[4] = hi + carry // Add t to r[0..3] var c uint64 f.n[0], c = bits.Add64(r[0], t[0], 0) f.n[1], c = bits.Add64(r[1], t[1], c) f.n[2], c = bits.Add64(r[2], t[2], c) f.n[3], c = bits.Add64(r[3], t[3], c) // Handle overflow from t[4] + carry overflow := t[4] + c // If there's overflow, multiply it by R and add again if overflow != 0 { hi, lo = bits.Mul64(overflow, field4x64R) f.n[0], c = bits.Add64(f.n[0], lo, 0) f.n[1], c = bits.Add64(f.n[1], hi, c) f.n[2], c = bits.Add64(f.n[2], 0, c) f.n[3], c = bits.Add64(f.n[3], 0, c) // One more reduction if still overflowing (very rare) if c != 0 { f.n[0], c = bits.Add64(f.n[0], field4x64R, 0) f.n[1], c = bits.Add64(f.n[1], 0, c) f.n[2], c = bits.Add64(f.n[2], 0, c) f.n[3], _ = bits.Add64(f.n[3], 0, c) } } f.magnitude = 1 f.Reduce() // Final reduction to ensure result < p } // Sqr computes f = a^2 (mod p). // Uses optimized assembly that exploits squaring symmetry. func (f *Field4x64) Sqr(a *Field4x64) *Field4x64 { // Ensure input is reduced if a.magnitude > 1 { a.Reduce() } field4x64SqrAsm(&f.n, &a.n) f.magnitude = 1 f.normalized = false return f } // Mul computes f = a * b (mod p). // Uses BMI2 MULX assembly for high performance. 
func (f *Field4x64) Mul(a, b *Field4x64) *Field4x64 { // Ensure inputs are reduced if a.magnitude > 1 { a.Reduce() } if b.magnitude > 1 { b.Reduce() } field4x64MulAsm(&f.n, &a.n, &b.n) f.magnitude = 1 f.normalized = false return f } // SetB32 sets f from a 32-byte big-endian representation. func (f *Field4x64) SetB32(b []byte) error { if len(b) != 32 { return errField4x64InvalidLen } // Big-endian: b[0] is most significant f.n[3] = uint64(b[0])<<56 | uint64(b[1])<<48 | uint64(b[2])<<40 | uint64(b[3])<<32 | uint64(b[4])<<24 | uint64(b[5])<<16 | uint64(b[6])<<8 | uint64(b[7]) f.n[2] = uint64(b[8])<<56 | uint64(b[9])<<48 | uint64(b[10])<<40 | uint64(b[11])<<32 | uint64(b[12])<<24 | uint64(b[13])<<16 | uint64(b[14])<<8 | uint64(b[15]) f.n[1] = uint64(b[16])<<56 | uint64(b[17])<<48 | uint64(b[18])<<40 | uint64(b[19])<<32 | uint64(b[20])<<24 | uint64(b[21])<<16 | uint64(b[22])<<8 | uint64(b[23]) f.n[0] = uint64(b[24])<<56 | uint64(b[25])<<48 | uint64(b[26])<<40 | uint64(b[27])<<32 | uint64(b[28])<<24 | uint64(b[29])<<16 | uint64(b[30])<<8 | uint64(b[31]) f.magnitude = 1 f.normalized = false return nil } // GetB32 writes f to a 32-byte big-endian representation. 
func (f *Field4x64) GetB32(b []byte) { if len(b) != 32 { panic("GetB32: buffer must be 32 bytes") } // Ensure normalized if f.magnitude > 1 { f.Reduce() } // Big-endian: b[0] is most significant b[0] = byte(f.n[3] >> 56) b[1] = byte(f.n[3] >> 48) b[2] = byte(f.n[3] >> 40) b[3] = byte(f.n[3] >> 32) b[4] = byte(f.n[3] >> 24) b[5] = byte(f.n[3] >> 16) b[6] = byte(f.n[3] >> 8) b[7] = byte(f.n[3]) b[8] = byte(f.n[2] >> 56) b[9] = byte(f.n[2] >> 48) b[10] = byte(f.n[2] >> 40) b[11] = byte(f.n[2] >> 32) b[12] = byte(f.n[2] >> 24) b[13] = byte(f.n[2] >> 16) b[14] = byte(f.n[2] >> 8) b[15] = byte(f.n[2]) b[16] = byte(f.n[1] >> 56) b[17] = byte(f.n[1] >> 48) b[18] = byte(f.n[1] >> 40) b[19] = byte(f.n[1] >> 32) b[20] = byte(f.n[1] >> 24) b[21] = byte(f.n[1] >> 16) b[22] = byte(f.n[1] >> 8) b[23] = byte(f.n[1]) b[24] = byte(f.n[0] >> 56) b[25] = byte(f.n[0] >> 48) b[26] = byte(f.n[0] >> 40) b[27] = byte(f.n[0] >> 32) b[28] = byte(f.n[0] >> 24) b[29] = byte(f.n[0] >> 16) b[30] = byte(f.n[0] >> 8) b[31] = byte(f.n[0]) } // ============================================================================ // FieldElement-compatible methods (lowercase for internal API compatibility) // ============================================================================ // normalize normalizes f to canonical form (same as Reduce). func (f *Field4x64) normalize() { f.Reduce() } // normalizeWeak performs weak normalization (magnitude <= 1). func (f *Field4x64) normalizeWeak() { if f.magnitude <= 1 { return } f.Reduce() } // isZero returns true if f is zero (must be normalized). func (f *Field4x64) isZero() bool { if !f.normalized { panic("field element must be normalized") } return f.n[0] == 0 && f.n[1] == 0 && f.n[2] == 0 && f.n[3] == 0 } // isOdd returns true if f is odd (must be normalized). func (f *Field4x64) isOdd() bool { if !f.normalized { panic("field element must be normalized") } return f.n[0]&1 == 1 } // normalizesToZeroVar checks if f normalizes to zero (variable-time). 
func (f *Field4x64) normalizesToZeroVar() bool { var t Field4x64 t = *f t.normalize() return t.isZero() } // equal returns true if f equals a (both must be normalized). func (f *Field4x64) equal(a *Field4x64) bool { if !f.normalized || !a.normalized { panic("field elements must be normalized for comparison") } return subtle.ConstantTimeCompare( (*[32]byte)(unsafe.Pointer(&f.n[0]))[:32], (*[32]byte)(unsafe.Pointer(&a.n[0]))[:32], ) == 1 } // setInt sets f to a small integer value. func (f *Field4x64) setInt(a int) { if a < 0 || a > 0x7FFF { panic("value out of range") } f.n[0] = uint64(a) f.n[1] = 0 f.n[2] = 0 f.n[3] = 0 if a == 0 { f.magnitude = 0 } else { f.magnitude = 1 } f.normalized = true } // clear clears f to prevent leaking sensitive information. func (f *Field4x64) clear() { f.n[0], f.n[1], f.n[2], f.n[3] = 0, 0, 0, 0 f.magnitude = 0 f.normalized = true } // negate negates f: r = -a with given magnitude. func (f *Field4x64) negate(a *Field4x64, m int) { if m < 0 || m > 31 { panic("magnitude out of range") } // For 4x64, we use a simpler approach: just compute p - a // The magnitude parameter is for compatibility with 5x52 f.Negate(a) f.magnitude = m + 1 f.normalized = false } // add adds a to f in place: f += a. func (f *Field4x64) add(a *Field4x64) { var carry uint64 f.n[0], carry = bits.Add64(f.n[0], a.n[0], 0) f.n[1], carry = bits.Add64(f.n[1], a.n[1], carry) f.n[2], carry = bits.Add64(f.n[2], a.n[2], carry) f.n[3], carry = bits.Add64(f.n[3], a.n[3], carry) if carry != 0 { f.n[0], carry = bits.Add64(f.n[0], field4x64R, 0) f.n[1], carry = bits.Add64(f.n[1], 0, carry) f.n[2], carry = bits.Add64(f.n[2], 0, carry) f.n[3], _ = bits.Add64(f.n[3], 0, carry) } f.magnitude += a.magnitude f.normalized = false if f.magnitude > 8 { f.Reduce() } } // sub subtracts a from f in place: f -= a. func (f *Field4x64) sub(a *Field4x64) { var negA Field4x64 negA.negate(a, a.magnitude) f.add(&negA) } // mulInt multiplies f by a small integer. 
func (f *Field4x64) mulInt(a int) { if a < 0 || a > 32 { panic("multiplier out of range") } ua := uint64(a) // Multiply each limb var carry uint64 var hi uint64 hi, f.n[0] = bits.Mul64(f.n[0], ua) carry = hi hi, f.n[1] = bits.Mul64(f.n[1], ua) f.n[1], carry = bits.Add64(f.n[1], carry, 0) carry = hi + carry hi, f.n[2] = bits.Mul64(f.n[2], ua) f.n[2], carry = bits.Add64(f.n[2], carry, 0) carry = hi + carry hi, f.n[3] = bits.Mul64(f.n[3], ua) f.n[3], carry = bits.Add64(f.n[3], carry, 0) carry = hi + carry // Handle overflow if carry != 0 { var c uint64 hi, lo := bits.Mul64(carry, field4x64R) f.n[0], c = bits.Add64(f.n[0], lo, 0) f.n[1], c = bits.Add64(f.n[1], hi, c) f.n[2], c = bits.Add64(f.n[2], 0, c) f.n[3], _ = bits.Add64(f.n[3], 0, c) } f.magnitude *= a f.normalized = false } // cmov conditionally moves a into f if flag is non-zero. func (f *Field4x64) cmov(a *Field4x64, flag int) { mask := uint64(-(int64(flag) & 1)) f.n[0] ^= mask & (f.n[0] ^ a.n[0]) f.n[1] ^= mask & (f.n[1] ^ a.n[1]) f.n[2] ^= mask & (f.n[2] ^ a.n[2]) f.n[3] ^= mask & (f.n[3] ^ a.n[3]) if flag != 0 { f.magnitude = a.magnitude f.normalized = a.normalized } } // toStorage converts f to storage format. func (f *Field4x64) toStorage(s *FieldElementStorage) { if !f.normalized { f.normalize() } s.n = f.n } // fromStorage converts from storage format to f. func (f *Field4x64) fromStorage(s *FieldElementStorage) { f.n = s.n f.magnitude = 1 f.normalized = false } // setB32 sets f from 32-byte big-endian representation (lowercase version). func (f *Field4x64) setB32(b []byte) error { return f.SetB32(b) } // getB32 writes f to 32-byte big-endian representation (lowercase version). func (f *Field4x64) getB32(b []byte) { f.GetB32(b) } // mul multiplies two field elements: f = a * b (lowercase version). func (f *Field4x64) mul(a, b *Field4x64) { f.Mul(a, b) } // sqr squares a field element: f = a^2 (lowercase version). 
func (f *Field4x64) sqr(a *Field4x64) {
	f.Sqr(a)
}

// inv computes modular inverse using Fermat's little theorem: f = a^(p-2).
// Uses an optimized addition chain for efficiency.
//
// p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
//
// Binary structure:
//   - bits 255-33: 223 ones
//   - bit 32: 0
//   - bits 31-12: 20 ones
//   - bits 11-0: 0xC2D
//
// The exponent is assembled as (2^223-1)*2^33 + (2^20-1)*2^12 + 0xC2D,
// which equals 2^256 - 2^32 - 979 = p - 2.
func (f *Field4x64) inv(a *Field4x64) {
	var aNorm Field4x64
	aNorm = *a
	aNorm.normalize()

	// Build up powers of the form a^(2^k - 1) using an addition chain
	// (same ladder as sqrt).
	var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 Field4x64

	// x2 = a^(2^2 - 1) = a^3
	x2.sqr(&aNorm)
	x2.mul(&x2, &aNorm)

	// x3 = a^(2^3 - 1) = a^7
	x3.sqr(&x2)
	x3.mul(&x3, &aNorm)

	// x6 = a^(2^6 - 1) = a^63
	x6 = x3
	for j := 0; j < 3; j++ {
		x6.sqr(&x6)
	}
	x6.mul(&x6, &x3)

	// x9 = a^(2^9 - 1) = a^511
	x9 = x6
	for j := 0; j < 3; j++ {
		x9.sqr(&x9)
	}
	x9.mul(&x9, &x3)

	// x11 = a^(2^11 - 1) = a^2047
	x11 = x9
	for j := 0; j < 2; j++ {
		x11.sqr(&x11)
	}
	x11.mul(&x11, &x2)

	// x22 = a^(2^22 - 1)
	x22 = x11
	for j := 0; j < 11; j++ {
		x22.sqr(&x22)
	}
	x22.mul(&x22, &x11)

	// x44 = a^(2^44 - 1)
	x44 = x22
	for j := 0; j < 22; j++ {
		x44.sqr(&x44)
	}
	x44.mul(&x44, &x22)

	// x88 = a^(2^88 - 1)
	x88 = x44
	for j := 0; j < 44; j++ {
		x88.sqr(&x88)
	}
	x88.mul(&x88, &x44)

	// x176 = a^(2^176 - 1)
	x176 = x88
	for j := 0; j < 88; j++ {
		x176.sqr(&x176)
	}
	x176.mul(&x176, &x88)

	// x220 = a^(2^220 - 1)
	x220 = x176
	for j := 0; j < 44; j++ {
		x220.sqr(&x220)
	}
	x220.mul(&x220, &x44)

	// x223 = a^(2^223 - 1)
	x223 = x220
	for j := 0; j < 3; j++ {
		x223.sqr(&x223)
	}
	x223.mul(&x223, &x3)

	// x20 = a^(2^20 - 1) (needed for bits 31-12)
	var x20 Field4x64
	x20 = x11
	for j := 0; j < 9; j++ {
		x20.sqr(&x20)
	}
	x20.mul(&x20, &x9)

	// Assemble p-2:
	// Start with x223 and shift by 33 bits.
	t1 = x223
	for j := 0; j < 33; j++ {
		t1.sqr(&t1)
	}
	// t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33)

	// Add contribution for bits 31-12 (20 ones): x20 shifted by 12.
	var temp Field4x64
	temp = x20
	for j := 0; j < 12; j++ {
		temp.sqr(&temp)
	}
	// temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12)
	t1.mul(&t1, &temp)
	// t1 = a^(2^256 - 2^32 - 2^12)

	// Add bits 11-0 = 0xC2D = 3117.
	// Compute a^3117 using binary exponentiation (small exponent).
	var low, base Field4x64
	low.setInt(1)
	base = aNorm
	e := uint32(3117)
	for e > 0 {
		if e&1 == 1 {
			low.mul(&low, &base)
		}
		e >>= 1
		if e > 0 {
			base.sqr(&base)
		}
	}
	// low = a^3117
	t1.mul(&t1, &low)
	// t1 = a^(2^256 - 2^32 - 979) = a^(p-2)

	*f = t1
	f.magnitude = 1
	// NOTE(review): this asserts the final product is already canonical
	// (< p). That holds only if mul's output is fully reduced — confirm
	// against the assembly kernel; otherwise a normalize() would be safer.
	f.normalized = true
}

// sqrt computes square root of f if it exists.
// For p ≡ 3 (mod 4), sqrt(a) = a^((p+1)/4); returns true iff the candidate
// squares back to a (i.e. a is a quadratic residue).
func (f *Field4x64) sqrt(a *Field4x64) bool {
	// Normalize input.
	var aNorm Field4x64
	aNorm = *a
	if aNorm.magnitude > 8 {
		aNorm.normalizeWeak()
	} else {
		aNorm.normalize()
	}

	// Same a^(2^k - 1) ladder as inv.
	var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 Field4x64

	// x2 = a^3
	x2.sqr(&aNorm)
	x2.mul(&x2, &aNorm)

	// x3 = a^7
	x3.sqr(&x2)
	x3.mul(&x3, &aNorm)

	// x6 = a^63
	x6 = x3
	for j := 0; j < 3; j++ {
		x6.sqr(&x6)
	}
	x6.mul(&x6, &x3)

	// x9 = a^511
	x9 = x6
	for j := 0; j < 3; j++ {
		x9.sqr(&x9)
	}
	x9.mul(&x9, &x3)

	// x11 = a^2047
	x11 = x9
	for j := 0; j < 2; j++ {
		x11.sqr(&x11)
	}
	x11.mul(&x11, &x2)

	// x22 = a^4194303
	x22 = x11
	for j := 0; j < 11; j++ {
		x22.sqr(&x22)
	}
	x22.mul(&x22, &x11)

	// x44 = a^17592186044415
	x44 = x22
	for j := 0; j < 22; j++ {
		x44.sqr(&x44)
	}
	x44.mul(&x44, &x22)

	// x88
	x88 = x44
	for j := 0; j < 44; j++ {
		x88.sqr(&x88)
	}
	x88.mul(&x88, &x44)

	// x176
	x176 = x88
	for j := 0; j < 88; j++ {
		x176.sqr(&x176)
	}
	x176.mul(&x176, &x88)

	// x220
	x220 = x176
	for j := 0; j < 44; j++ {
		x220.sqr(&x220)
	}
	x220.mul(&x220, &x44)

	// x223
	x223 = x220
	for j := 0; j < 3; j++ {
		x223.sqr(&x223)
	}
	x223.mul(&x223, &x3)

	// Final assembly of the (p+1)/4 exponent.
	t1 = x223
	for j := 0; j < 23; j++ {
		t1.sqr(&t1)
	}
	t1.mul(&t1, &x22)
	for j := 0; j < 6; j++ {
		t1.sqr(&t1)
	}
	t1.mul(&t1, &x2)
	t1.sqr(&t1)
	f.sqr(&t1)

	// Verify: check that f^2 == a.
	var check Field4x64
	check.sqr(f)
	check.normalize()
	aNorm.normalize()
	return check.equal(&aNorm)
}

// half computes f = a/2 (mod p).
// If a is odd, p (which is odd) is added first so the sum is even; the carry
// out of that addition is preserved in `overflow` and becomes the top bit
// after the 1-bit right shift.
func (f *Field4x64) half(a *Field4x64) {
	*f = *a

	// If odd, add p first (so result is even).
	// We need to preserve the overflow bit for the right shift.
	var overflow uint64
	if f.n[0]&1 == 1 {
		var carry uint64
		f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0)
		f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry)
		f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry)
		f.n[3], overflow = bits.Add64(f.n[3], field4x64Prime[3], carry)
	}

	// Right shift by 1, with the overflow bit becoming the top bit.
	f.n[0] = (f.n[0] >> 1) | (f.n[1] << 63)
	f.n[1] = (f.n[1] >> 1) | (f.n[2] << 63)
	f.n[2] = (f.n[2] >> 1) | (f.n[3] << 63)
	f.n[3] = (f.n[3] >> 1) | (overflow << 63)

	// Heuristic magnitude bookkeeping: halving roughly halves the excess.
	f.magnitude = (f.magnitude >> 1) + 1
	f.normalized = false
}