package gnarl

// Montgomery field arithmetic over Z_P for the Gnarl signature scheme.
//
// Representation: 4×uint64 little-endian limbs in Montgomery form.
// A value a is stored as aR mod P, where R = 2^256.
// Multiplication: montMul(aR, bR) = abR mod P (one R cancels).
//
// P is a 216-bit prime (27 bytes), but we use 256-bit Montgomery form
// internally for uniformity with the CIOS algorithm. The top ~40 bits
// of limb[3] are always zero.
//
// P ≡ 2 mod 3, P mod 5 = 2, 5 is QNR, N = P+1 = 6Q where Q is prime.

import "math/bits"

// fe is a field element in Montgomery form: 4 uint64 limbs, little-endian.
type fe [4]uint64

// Prime P limbs (little-endian).
// P hex = 0x9563a6b6d81bb9b02e5e5d121e79ccd682cc9931f9791e0f9ee4f5
// P bits = 216
var pLimbs = fe{
	0x31f9791e0f9ee4f5,
	0x121e79ccd682cc99,
	0xb6d81bb9b02e5e5d,
	0x00000000009563a6,
}

// R mod P (the Montgomery form of 1).
var feOne = fe{
	0x761ae156c8e8e77c,
	0x023540a7764c229f,
	0x315e327188dada36,
	0x00000000001012bc,
}

// R^2 mod P (used to convert normal form → Montgomery form).
var feR2 = fe{
	0xf56927577991be00,
	0xf6eb880a87ed3681,
	0x18d8b12f949488c0,
	0x0000000000684a62,
}

// feZero is the zero element (Montgomery form of 0 is 0).
var feZero fe

// pPrime = -P^{-1} mod 2^64, the Montgomery reduction constant.
const pPrime uint64 = 0xf07d39ef3ea058a3

// Tonelli-Shanks constants for square root.
// P - 1 = 2^s * tsOddQ, where s = 2.
const tsS = 2

// tsOddQ = (P-1) / 4, the odd part of P-1.
var tsOddQ = fe{
	0x4c7e5e4783e7b93d,
	0x44879e7335a0b326,
	0xadb606ee6c0b9797,
	0x00000000002558e9,
}

// tsQm1h = (tsOddQ - 1) / 2, used in Tonelli-Shanks.
var tsQm1h = fe{
	0x263f2f23c1f3dc9e,
	0xa243cf399ad05993,
	0xd6db03773605cbcb,
	0x000000000012ac74,
}

// thirdPNorm = (P-1)/3 in normal (non-Montgomery) form, for y-coordinate
// 3-way canonicalization. With P ≡ 2 mod 3, the torus has index 6 = 2×3,
// and we canonicalize y to [0, P/3) instead of [0, P/2).
var thirdPNorm = fe{
	0xbb53285f5a8a4c51,
	0xb0b4d3444780eedd,
	0x3cf2b3e8900f74c9,
	0x000000000031cbe2,
}

// halfPNorm = (P-1)/2 in normal (non-Montgomery) form, for alternate checks.
var halfPNorm = fe{
	0x98fcbc8f07cf727a,
	0x890f3ce66b41664c,
	0x5b6c0ddcd8172f2e,
	0x00000000004ab1d3,
}

// pMinus2 = P - 2, used for Fermat inversion.
var pMinus2 = fe{
	0x31f9791e0f9ee4f3,
	0x121e79ccd682cc99,
	0xb6d81bb9b02e5e5d,
	0x00000000009563a6,
}

// feSet copies src into dst.
func feSet(dst, src *fe) {
	*dst = *src
}

// feIsZero returns 1 if a == 0 (constant-time).
func feIsZero(a *fe) int {
	z := a[0] | a[1] | a[2] | a[3]
	z = (z | (0 - z)) >> 63
	return 1 - int(z)
}

// feEqual returns 1 if a == b (constant-time).
func feEqual(a, b *fe) int {
	z := (a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | (a[3] ^ b[3])
	z = (z | (0 - z)) >> 63
	return 1 - int(z)
}

// feReduce reduces a to [0, P) if a >= P. Constant-time.
func feReduce(a *fe) {
	d0, borrow := bits.Sub64(a[0], pLimbs[0], 0)
	d1, borrow := bits.Sub64(a[1], pLimbs[1], borrow)
	d2, borrow := bits.Sub64(a[2], pLimbs[2], borrow)
	d3, borrow := bits.Sub64(a[3], pLimbs[3], borrow)

	mask := uint64(0) - borrow
	a[0] = (a[0] & mask) | (d0 & ^mask)
	a[1] = (a[1] & mask) | (d1 & ^mask)
	a[2] = (a[2] & mask) | (d2 & ^mask)
	a[3] = (a[3] & mask) | (d3 & ^mask)
}

// feAdd computes r = a + b mod P.
func feAdd(r, a, b *fe) {
	var carry uint64
	r[0], carry = bits.Add64(a[0], b[0], 0)
	r[1], carry = bits.Add64(a[1], b[1], carry)
	r[2], carry = bits.Add64(a[2], b[2], carry)
	r[3], carry = bits.Add64(a[3], b[3], carry)

	d0, borrow := bits.Sub64(r[0], pLimbs[0], 0)
	d1, borrow := bits.Sub64(r[1], pLimbs[1], borrow)
	d2, borrow := bits.Sub64(r[2], pLimbs[2], borrow)
	d3, borrow := bits.Sub64(r[3], pLimbs[3], borrow)

	underflow := borrow &^ carry
	mask := uint64(0) - underflow
	r[0] = (r[0] & mask) | (d0 & ^mask)
	r[1] = (r[1] & mask) | (d1 & ^mask)
	r[2] = (r[2] & mask) | (d2 & ^mask)
	r[3] = (r[3] & mask) | (d3 & ^mask)
}

// feSub computes r = a - b mod P.
func feSub(r, a, b *fe) {
	var borrow uint64
	r[0], borrow = bits.Sub64(a[0], b[0], 0)
	r[1], borrow = bits.Sub64(a[1], b[1], borrow)
	r[2], borrow = bits.Sub64(a[2], b[2], borrow)
	r[3], borrow = bits.Sub64(a[3], b[3], borrow)

	mask := uint64(0) - borrow
	var carry uint64
	r[0], carry = bits.Add64(r[0], pLimbs[0]&mask, 0)
	r[1], carry = bits.Add64(r[1], pLimbs[1]&mask, carry)
	r[2], carry = bits.Add64(r[2], pLimbs[2]&mask, carry)
	r[3], _ = bits.Add64(r[3], pLimbs[3]&mask, carry)
}

// feNeg computes r = -a mod P.
func feNeg(r, a *fe) {
	z := a[0] | a[1] | a[2] | a[3]
	nonzero := z | (0 - z)
	mask := 0 - (nonzero >> 63)

	var borrow uint64
	r[0], borrow = bits.Sub64(pLimbs[0], a[0], 0)
	r[1], borrow = bits.Sub64(pLimbs[1], a[1], borrow)
	r[2], borrow = bits.Sub64(pLimbs[2], a[2], borrow)
	r[3], _ = bits.Sub64(pLimbs[3], a[3], borrow)
	r[0] &= mask
	r[1] &= mask
	r[2] &= mask
	r[3] &= mask
}

// montMulGeneric computes r = a * b * R^{-1} mod P using fully-unrolled CIOS.
//
// All carry propagation uses bits.Add64 to avoid silent overflow on hi += c.
func montMulGeneric(r, a, b *fe) {
	var t0, t1, t2, t3, t4 uint64
	var hi, lo, c0, c1, m, carry uint64

	// ---- i = 0 ----
	// Part A: t += a[0] * b
	hi, lo = bits.Mul64(a[0], b[0])
	t0, c0 = bits.Add64(lo, t0, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(a[0], b[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[0], b[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[0], b[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t3, c0 = bits.Add64(lo, carry, 0)
	t4, _ = bits.Add64(hi, c1, c0)

	// Part B: Montgomery reduction
	m = t0 * pPrime
	hi, lo = bits.Mul64(m, pLimbs[0])
	_, c0 = bits.Add64(t0, lo, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(m, pLimbs[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t0, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)
	t3, c0 = bits.Add64(t4, carry, 0)
	t4 = c0

	// ---- i = 1 ----
	hi, lo = bits.Mul64(a[1], b[0])
	t0, c0 = bits.Add64(lo, t0, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(a[1], b[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[1], b[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[1], b[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t3, c0 = bits.Add64(lo, carry, 0)
	hi, _ = bits.Add64(hi, c1, c0)
	t4 += hi

	m = t0 * pPrime
	hi, lo = bits.Mul64(m, pLimbs[0])
	_, c0 = bits.Add64(t0, lo, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(m, pLimbs[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t0, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)
	t3, c0 = bits.Add64(t4, carry, 0)
	t4 = c0

	// ---- i = 2 ----
	hi, lo = bits.Mul64(a[2], b[0])
	t0, c0 = bits.Add64(lo, t0, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(a[2], b[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[2], b[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[2], b[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t3, c0 = bits.Add64(lo, carry, 0)
	hi, _ = bits.Add64(hi, c1, c0)
	t4 += hi

	m = t0 * pPrime
	hi, lo = bits.Mul64(m, pLimbs[0])
	_, c0 = bits.Add64(t0, lo, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(m, pLimbs[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t0, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)
	t3, c0 = bits.Add64(t4, carry, 0)
	t4 = c0

	// ---- i = 3 ----
	hi, lo = bits.Mul64(a[3], b[0])
	t0, c0 = bits.Add64(lo, t0, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(a[3], b[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[3], b[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(a[3], b[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t3, c0 = bits.Add64(lo, carry, 0)
	hi, _ = bits.Add64(hi, c1, c0)
	t4 += hi

	m = t0 * pPrime
	hi, lo = bits.Mul64(m, pLimbs[0])
	_, c0 = bits.Add64(t0, lo, 0)
	carry, _ = bits.Add64(hi, 0, c0)

	hi, lo = bits.Mul64(m, pLimbs[1])
	lo, c0 = bits.Add64(lo, t1, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t0, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[2])
	lo, c0 = bits.Add64(lo, t2, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t1, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)

	hi, lo = bits.Mul64(m, pLimbs[3])
	lo, c0 = bits.Add64(lo, t3, 0)
	hi, c1 = bits.Add64(hi, 0, c0)
	t2, c0 = bits.Add64(lo, carry, 0)
	carry, _ = bits.Add64(hi, c1, c0)
	t3, c0 = bits.Add64(t4, carry, 0)
	t4 = c0

	r[0] = t0
	r[1] = t1
	r[2] = t2
	r[3] = t3
	feReduce(r)
}

// montSquareGeneric computes r = a^2 * R^{-1} mod P.
func montSquareGeneric(r, a *fe) {
	montMulGeneric(r, a, a)
}

// feToMont converts a normal-form value to Montgomery form: aR mod P.
func feToMont(r, a *fe) {
	montMul(r, a, &feR2)
}

// feFromMont converts a Montgomery-form value to normal form: a * R^{-1} mod P.
func feFromMont(r, a *fe) {
	one := fe{1, 0, 0, 0}
	montMul(r, a, &one)
}

// feFromSmall converts a small integer to Montgomery form.
func feFromSmall(r *fe, v int64) {
	if v >= 0 {
		*r = fe{uint64(v), 0, 0, 0}
	} else {
		*r = pLimbs
		var borrow uint64
		r[0], borrow = bits.Sub64(r[0], uint64(-v), 0)
		r[1], borrow = bits.Sub64(r[1], 0, borrow)
		r[2], borrow = bits.Sub64(r[2], 0, borrow)
		r[3], _ = bits.Sub64(r[3], 0, borrow)
	}
	feToMont(r, r)
}

// fieldLen is the byte length of a serialized field element (27 bytes for 216-bit P).
const fieldLen = 27

// feFromBytes27 sets r from a 27-byte big-endian encoding and converts to Montgomery form.
// Returns false if the value >= P.
func feFromBytes27(r *fe, b []byte) bool {
	if len(b) < 27 {
		*r = feZero
		return false
	}
	// 27 bytes = 216 bits. Big-endian layout:
	// b[0..2] → low 24 bits of limb[3] (bits 192..215)
	// b[3..10] → limb[2] (bits 128..191)
	// b[11..18] → limb[1] (bits 64..127)
	// b[19..26] → limb[0] (bits 0..63)
	r[3] = uint64(b[0])<<16 | uint64(b[1])<<8 | uint64(b[2])
	r[2] = uint64(b[3])<<56 | uint64(b[4])<<48 | uint64(b[5])<<40 | uint64(b[6])<<32 |
		uint64(b[7])<<24 | uint64(b[8])<<16 | uint64(b[9])<<8 | uint64(b[10])
	r[1] = uint64(b[11])<<56 | uint64(b[12])<<48 | uint64(b[13])<<40 | uint64(b[14])<<32 |
		uint64(b[15])<<24 | uint64(b[16])<<16 | uint64(b[17])<<8 | uint64(b[18])
	r[0] = uint64(b[19])<<56 | uint64(b[20])<<48 | uint64(b[21])<<40 | uint64(b[22])<<32 |
		uint64(b[23])<<24 | uint64(b[24])<<16 | uint64(b[25])<<8 | uint64(b[26])

	// Check >= P.
	d0, borrow := bits.Sub64(r[0], pLimbs[0], 0)
	d1, borrow := bits.Sub64(r[1], pLimbs[1], borrow)
	_, borrow = bits.Sub64(r[2], pLimbs[2], borrow)
	_, borrow = bits.Sub64(r[3], pLimbs[3], borrow)
	_ = d0
	_ = d1
	if borrow == 0 {
		*r = feZero
		return false
	}

	feToMont(r, r)
	return true
}

// feToBytes27 writes a Montgomery-form element to a 27-byte big-endian buffer.
func feToBytes27(b []byte, a *fe) {
	var norm fe
	feFromMont(&norm, a)
	// limb[3] low 24 bits → b[0..2]
	b[0] = byte(norm[3] >> 16)
	b[1] = byte(norm[3] >> 8)
	b[2] = byte(norm[3])
	// limb[2] → b[3..10]
	b[3] = byte(norm[2] >> 56)
	b[4] = byte(norm[2] >> 48)
	b[5] = byte(norm[2] >> 40)
	b[6] = byte(norm[2] >> 32)
	b[7] = byte(norm[2] >> 24)
	b[8] = byte(norm[2] >> 16)
	b[9] = byte(norm[2] >> 8)
	b[10] = byte(norm[2])
	// limb[1] → b[11..18]
	b[11] = byte(norm[1] >> 56)
	b[12] = byte(norm[1] >> 48)
	b[13] = byte(norm[1] >> 40)
	b[14] = byte(norm[1] >> 32)
	b[15] = byte(norm[1] >> 24)
	b[16] = byte(norm[1] >> 16)
	b[17] = byte(norm[1] >> 8)
	b[18] = byte(norm[1])
	// limb[0] → b[19..26]
	b[19] = byte(norm[0] >> 56)
	b[20] = byte(norm[0] >> 48)
	b[21] = byte(norm[0] >> 40)
	b[22] = byte(norm[0] >> 32)
	b[23] = byte(norm[0] >> 24)
	b[24] = byte(norm[0] >> 16)
	b[25] = byte(norm[0] >> 8)
	b[26] = byte(norm[0])
}

// fePow computes r = base^exp where exp is a 256-bit exponent in little-endian limbs
// (plain integer, NOT Montgomery). base is in Montgomery form, result in Montgomery form.
func fePow(r, base *fe, exp *fe) {
	var result fe
	feSet(&result, &feOne)
	var b fe
	feSet(&b, base)

	for i := 0; i < 4; i++ {
		word := exp[i]
		for bit := 0; bit < 64; bit++ {
			if word&1 == 1 {
				montMul(&result, &result, &b)
			}
			montSquare(&b, &b)
			word >>= 1
		}
	}
	feSet(r, &result)
}

// feInv computes r = a^{-1} in Montgomery form via Fermat: a^{P-2}.
func feInv(r, a *fe) {
	fePow(r, a, &pMinus2)
}

// feSqrt computes r = sqrt(a) mod P in Montgomery form using Tonelli-Shanks.
// Returns false if a is not a quadratic residue.
//
// P - 1 = 2^2 * q where q is odd (tsS = 2).
// Non-residue: 5 (QNR by construction of P).
func feSqrt(r, a *fe) bool {
	if feIsZero(a) == 1 {
		*r = feZero
		return true
	}

	var feFive fe
	feFive[0] = 5
	feToMont(&feFive, &feFive)

	// t = a^q
	var t fe
	fePow(&t, a, &tsOddQ)

	// rr = a^{(q+1)/2} = a * a^{(q-1)/2}
	var rr, tmp fe
	fePow(&tmp, a, &tsQm1h)
	montMul(&rr, &tmp, a)

	// c = 5^q
	var c fe
	fePow(&c, &feFive, &tsOddQ)

	m := tsS

	for {
		if feEqual(&t, &feOne) == 1 {
			feSet(r, &rr)
			return true
		}

		var b fe
		feSet(&b, &t)
		i := 0
		for j := 1; j < m; j++ {
			montSquare(&b, &b)
			if feEqual(&b, &feOne) == 1 {
				i = j
				break
			}
		}
		if i == 0 {
			return false
		}

		feSet(&b, &c)
		for j := 0; j < m-i-1; j++ {
			montSquare(&b, &b)
		}

		montMul(&rr, &rr, &b)
		montSquare(&c, &b)
		montMul(&t, &t, &c)
		m = i
	}
}

// feIsGtThirdP returns 1 if a (in NORMAL form, not Montgomery) is > (P-1)/3.
// Used for 3-way canonicalization of torus y-coordinate.
func feIsGtThirdP(a *fe) int {
	for i := 3; i >= 0; i-- {
		if a[i] > thirdPNorm[i] {
			return 1
		}
		if a[i] < thirdPNorm[i] {
			return 0
		}
	}
	return 0 // equal → not strictly greater
}

// feIsGtHalfP returns 1 if a (in NORMAL form, not Montgomery) is > (P-1)/2.
func feIsGtHalfP(a *fe) int {
	for i := 3; i >= 0; i-- {
		if a[i] > halfPNorm[i] {
			return 1
		}
		if a[i] < halfPNorm[i] {
			return 0
		}
	}
	return 0
}