package crypto

// Radix-3 Number-Theoretic Transform (NTT) over Z_271 for trinary Hamadryad.
//
// The trinary analogue of the radix-2 NTT in ntt.go. Transforms a
// length-27 polynomial from coefficient form to evaluation form over Z_271,
// enabling O(n log n) polynomial multiplication via pointwise multiply.
//
// For trinary Hamadryad: n=27 = 3^3, p=271.
//
// 271 is prime and 270 = 271-1 = 2 × 5 × 3^3, so Z_271* has order 270.
// g = 6 is a primitive root (generator) of Z_271*.
// psi = 6^(270/54) = 6^5 = 188 is a primitive 54th root of unity.
//
// For polynomials mod (x^27 + 1), we use the negacyclic NTT:
// pre-multiply coefficients by powers of psi (a 54th root),
// then apply a standard 27-point NTT with omega = psi^2 = 114
// (a primitive 27th root of unity).
//
// The radix-3 butterfly uses the primitive cube root of unity
// w3 = omega^9 = 242, which satisfies 1 + w3 + w3^2 = 0 (mod 271).

// GnarlP is the ring modulus for the trinary Hamadryad.
const GnarlP = 271

// GnarlN is the ring dimension: degree of x^27 + 1.
const GnarlN = 27

// mod271 reduces x into [0, 270]. x must be in [0, 73441).
// Uses Barrett reduction: q = (x * 483) >> 17, r = x - q*271.
// Magic constant: 483 = floor(2^17 / 271). All arithmetic fits in uint32.
// Max intermediate: 73170 * 483 = 35,341,110 < 2^32.
//
//go:nosplit
func mod271(x uint32) uint16 {
	q := (x * 483) >> 17
	r := x - q*271
	if r >= 271 {
		r -= 271
	}
	return uint16(r)
}

// mod271s reduces a signed int into [0, 270]. Used only in test/rare paths.
func mod271s(x int) uint16 {
	x %= GnarlP
	if x < 0 {
		x += GnarlP
	}
	return uint16(x)
}

// powMod271 computes base^exp mod 271 via binary exponentiation.
func powMod271(base, exp int) uint16 {
	result := 1
	b := base % GnarlP
	if b < 0 {
		b += GnarlP
	}
	for e := exp; e > 0; e >>= 1 {
		if e&1 == 1 {
			result = (result * b) % GnarlP
		}
		b = (b * b) % GnarlP
	}
	return uint16(result)
}

// invMod271 computes the modular inverse of a mod 271.
// Uses Fermat's little theorem: a^(-1) = a^(p-2) mod p.
func invMod271(a uint16) uint16 {
	return powMod271(int(a), GnarlP-2)
}

// digitRev3 reverses the base-3 digits of x with the given number of digits.
// For n=27 = 3^3, we use 3 digits.
func digitRev3(x, digits int) int {
	r := 0
	for range digits {
		r = r*3 + x%3
		x /= 3
	}
	return r
}

// Pre-computed tables for the radix-3 NTT, filled by initNTT27Tables.
var (
	// psiPows27[i] = psi^i mod 271, for i=0..53.
	// psi = 188 (primitive 54th root of unity).
	psiPows27 [54]uint16

	// psiInvPows27[i] = psi^(-i) mod 271.
	psiInvPows27 [54]uint16

	// invN27 = 27^(-1) mod 271 = 261.
	invN27 uint16

	ntt27TablesReady bool
)

// initNTT27Tables populates the radix-3 NTT lookup tables.
func initNTT27Tables() {
	if ntt27TablesReady {
		return
	}
	const psi = 188 // primitive 54th root of unity in Z_271

	psiPows27[0] = 1
	for i := 1; i < 54; i++ {
		psiPows27[i] = mod271(uint32(psiPows27[i-1]) * psi)
	}

	psiInv := invMod271(psi)
	psiInvPows27[0] = 1
	for i := 1; i < 54; i++ {
		psiInvPows27[i] = mod271(uint32(psiInvPows27[i-1]) * uint32(psiInv))
	}

	invN27 = invMod271(uint16(GnarlN))
	ntt27TablesReady = true
}

func init() {
	initNTT27Tables()
}

// ntt27 computes the forward negacyclic NTT of a length-27 polynomial over Z_271.
//
// The negacyclic NTT evaluates the polynomial at the roots of (x^27 + 1),
// which are psi^(2k+1) for k=0..26. Implementation:
//  1. Pre-multiply: a[i] *= psi^i
//  2. Radix-3 DIT NTT with omega = psi^2 (primitive 27th root of unity)
//
// The radix-3 butterfly for indices [j, j+m, j+2m] with twiddle w:
//
//	a0' = a0 + w*a1 + w^2*a2
//	a1' = a0 + w*a1*w3 + w^2*a2*w3^2
//	a2' = a0 + w*a1*w3^2 + w^2*a2*w3
//
// where w3 = omega^(27/3) = omega^9 = 242 is the primitive cube root of unity.
func ntt27(a *[GnarlN]uint16) {
	// Fully-unrolled negacyclic NTT over Z_271, n=27 = 3^3.
	// Three stages of radix-3 butterflies with precomputed twiddles.

	// Step 1: pre-multiply by psi^i for negacyclic.
	// psi = 188, psiPows27 = [1,188,114,23,259,183,258,266,144,243,156,60,
	//   169,65,25,93,140,33,242,239,217,146,77,113,106,145,160]
	// a[0] *= 1 (no-op)
	a[1] = mod271(uint32(a[1]) * 188)
	a[2] = mod271(uint32(a[2]) * 114)
	a[3] = mod271(uint32(a[3]) * 23)
	a[4] = mod271(uint32(a[4]) * 259)
	a[5] = mod271(uint32(a[5]) * 183)
	a[6] = mod271(uint32(a[6]) * 258)
	a[7] = mod271(uint32(a[7]) * 266)
	a[8] = mod271(uint32(a[8]) * 144)
	a[9] = mod271(uint32(a[9]) * 243)
	a[10] = mod271(uint32(a[10]) * 156)
	a[11] = mod271(uint32(a[11]) * 60)
	a[12] = mod271(uint32(a[12]) * 169)
	a[13] = mod271(uint32(a[13]) * 65)
	a[14] = mod271(uint32(a[14]) * 25)
	a[15] = mod271(uint32(a[15]) * 93)
	a[16] = mod271(uint32(a[16]) * 140)
	a[17] = mod271(uint32(a[17]) * 33)
	a[18] = mod271(uint32(a[18]) * 242)
	a[19] = mod271(uint32(a[19]) * 239)
	a[20] = mod271(uint32(a[20]) * 217)
	a[21] = mod271(uint32(a[21]) * 146)
	a[22] = mod271(uint32(a[22]) * 77)
	a[23] = mod271(uint32(a[23]) * 113)
	a[24] = mod271(uint32(a[24]) * 106)
	a[25] = mod271(uint32(a[25]) * 145)
	a[26] = mod271(uint32(a[26]) * 160)

	// Step 2: digit-reversal permutation (9 swaps).
	a[1], a[9] = a[9], a[1]
	a[2], a[18] = a[18], a[2]
	a[4], a[12] = a[12], a[4]
	a[5], a[21] = a[21], a[5]
	a[7], a[15] = a[15], a[7]
	a[8], a[24] = a[24], a[8]
	a[11], a[19] = a[19], a[11]
	a[14], a[22] = a[22], a[14]
	a[17], a[25] = a[25], a[17]

	// Step 3: radix-3 butterfly stages (fully unrolled).
	// w3 = 242, w3sq = 28. All twiddles precomputed.

	// --- Stage 0: stride=1, groupSize=3, all tw1=tw2=1 ---
	// 9 butterflies, each: b0=a0+a1+a2, b1=a0+242*a1+28*a2, b2=a0+28*a1+242*a2
	nttBfly1(a, 0, 1, 2)
	nttBfly1(a, 3, 4, 5)
	nttBfly1(a, 6, 7, 8)
	nttBfly1(a, 9, 10, 11)
	nttBfly1(a, 12, 13, 14)
	nttBfly1(a, 15, 16, 17)
	nttBfly1(a, 18, 19, 20)
	nttBfly1(a, 21, 22, 23)
	nttBfly1(a, 24, 25, 26)

	// --- Stage 1: stride=3, groupSize=9, twiddleStep=3 ---
	// j=0: tw1=1, tw2=1 (same as stage 0 kernel)
	nttBfly1(a, 0, 3, 6)
	nttBfly1(a, 9, 12, 15)
	nttBfly1(a, 18, 21, 24)
	// j=1: tw1=258, tw2=169
	nttBfly(a, 1, 4, 7, 258, 169)
	nttBfly(a, 10, 13, 16, 258, 169)
	nttBfly(a, 19, 22, 25, 258, 169)
	// j=2: tw1=169, tw2=106
	nttBfly(a, 2, 5, 8, 169, 106)
	nttBfly(a, 11, 14, 17, 169, 106)
	nttBfly(a, 20, 23, 26, 169, 106)

	// --- Stage 2: stride=9, groupSize=27, twiddleStep=1 ---
	// j=0: tw1=1, tw2=1
	nttBfly1(a, 0, 9, 18)
	// j=1: tw1=114, tw2=259
	nttBfly(a, 1, 10, 19, 114, 259)
	// j=2: tw1=259, tw2=144
	nttBfly(a, 2, 11, 20, 259, 144)
	// j=3: tw1=258, tw2=169
	nttBfly(a, 3, 12, 21, 258, 169)
	// j=4: tw1=144, tw2=140
	nttBfly(a, 4, 13, 22, 144, 140)
	// j=5: tw1=156, tw2=217
	nttBfly(a, 5, 14, 23, 156, 217)
	// j=6: tw1=169, tw2=106
	nttBfly(a, 6, 15, 24, 169, 106)
	// j=7: tw1=25, tw2=83
	nttBfly(a, 7, 16, 25, 25, 83)
	// j=8: tw1=140, tw2=88
	nttBfly(a, 8, 17, 26, 140, 88)
}

// nttBfly1 performs a radix-3 butterfly with trivial twiddles (tw1=tw2=1).
// b0 = a0 + a1 + a2, b1 = a0 + 242*a1 + 28*a2, b2 = a0 + 28*a1 + 242*a2
//
//go:nosplit
func nttBfly1(a *[27]uint16, i0, i1, i2 int) {
	v0 := uint32(a[i0])
	v1 := uint32(a[i1])
	v2 := uint32(a[i2])
	a[i0] = mod271(v0 + v1 + v2)
	a[i1] = mod271(v0 + uint32(mod271(v1*242)) + uint32(mod271(v2*28)))
	a[i2] = mod271(v0 + uint32(mod271(v1*28)) + uint32(mod271(v2*242)))
}

// nttBfly performs a radix-3 butterfly with given twiddle factors tw1, tw2.
//
//go:nosplit
func nttBfly(a *[27]uint16, i0, i1, i2 int, tw1, tw2 uint32) {
	v0 := uint32(a[i0])
	a1tw := uint32(mod271(uint32(a[i1]) * tw1))
	a2tw := uint32(mod271(uint32(a[i2]) * tw2))
	a[i0] = mod271(v0 + a1tw + a2tw)
	a[i1] = mod271(v0 + uint32(mod271(a1tw*242)) + uint32(mod271(a2tw*28)))
	a[i2] = mod271(v0 + uint32(mod271(a1tw*28)) + uint32(mod271(a2tw*242)))
}

// intt27 computes the inverse negacyclic NTT, recovering coefficients.
//
// Uses the Decimation-In-Frequency (DIF) structure: butterflies top-down
// (largest group first), followed by digit-reversal permutation.
// This is the natural inverse of the forward DIT NTT.
func intt27(a *[GnarlN]uint16) {
	// Fully-unrolled inverse negacyclic NTT (DIF, top-down).

	// --- Stage 2: groupSize=27, stride=9, twiddleStep=1 ---
	inttBfly1(a, 0, 9, 18)
	inttBfly(a, 1, 10, 19, 126, 158)
	inttBfly(a, 2, 11, 20, 158, 32)
	inttBfly(a, 3, 12, 21, 125, 178)
	inttBfly(a, 4, 13, 22, 32, 211)
	inttBfly(a, 5, 14, 23, 238, 5)
	inttBfly(a, 6, 15, 24, 178, 248)
	inttBfly(a, 7, 16, 25, 206, 160)
	inttBfly(a, 8, 17, 26, 211, 77)

	// --- Stage 1: groupSize=9, stride=3, twiddleStep=3 ---
	inttBfly1(a, 0, 3, 6)
	inttBfly(a, 1, 4, 7, 125, 178)
	inttBfly(a, 2, 5, 8, 178, 248)
	inttBfly1(a, 9, 12, 15)
	inttBfly(a, 10, 13, 16, 125, 178)
	inttBfly(a, 11, 14, 17, 178, 248)
	inttBfly1(a, 18, 21, 24)
	inttBfly(a, 19, 22, 25, 125, 178)
	inttBfly(a, 20, 23, 26, 178, 248)

	// --- Stage 0: groupSize=3, stride=1, twiddleStep=9 (all trivial) ---
	inttBfly1(a, 0, 1, 2)
	inttBfly1(a, 3, 4, 5)
	inttBfly1(a, 6, 7, 8)
	inttBfly1(a, 9, 10, 11)
	inttBfly1(a, 12, 13, 14)
	inttBfly1(a, 15, 16, 17)
	inttBfly1(a, 18, 19, 20)
	inttBfly1(a, 21, 22, 23)
	inttBfly1(a, 24, 25, 26)

	// Digit-reversal permutation (9 swaps).
	a[1], a[9] = a[9], a[1]
	a[2], a[18] = a[18], a[2]
	a[4], a[12] = a[12], a[4]
	a[5], a[21] = a[21], a[5]
	a[7], a[15] = a[15], a[7]
	a[8], a[24] = a[24], a[8]
	a[11], a[19] = a[19], a[11]
	a[14], a[22] = a[22], a[14]
	a[17], a[25] = a[25], a[17]

	// Post-multiply: fused invN27 * psiInvPows27[i] for each coefficient.
	a[0] = mod271(uint32(a[0]) * 261)
	a[1] = mod271(uint32(a[1]) * 245)
	a[2] = mod271(uint32(a[2]) * 95)
	a[3] = mod271(uint32(a[3]) * 247)
	a[4] = mod271(uint32(a[4]) * 46)
	a[5] = mod271(uint32(a[5]) * 228)
	a[6] = mod271(uint32(a[6]) * 105)
	a[7] = mod271(uint32(a[7]) * 2)
	a[8] = mod271(uint32(a[8]) * 222)
	a[9] = mod271(uint32(a[9]) * 252)
	a[10] = mod271(uint32(a[10]) * 59)
	a[11] = mod271(uint32(a[11]) * 45)
	a[12] = mod271(uint32(a[12]) * 117)
	a[13] = mod271(uint32(a[13]) * 250)
	a[14] = mod271(uint32(a[14]) * 108)
	a[15] = mod271(uint32(a[15]) * 64)
	a[16] = mod271(uint32(a[16]) * 58)
	a[17] = mod271(uint32(a[17]) * 205)
	a[18] = mod271(uint32(a[18]) * 262)
	a[19] = mod271(uint32(a[19]) * 85)
	a[20] = mod271(uint32(a[20]) * 221)
	a[21] = mod271(uint32(a[21]) * 141)
	a[22] = mod271(uint32(a[22]) * 204)
	a[23] = mod271(uint32(a[23]) * 151)
	a[24] = mod271(uint32(a[24]) * 230)
	a[25] = mod271(uint32(a[25]) * 56)
	a[26] = mod271(uint32(a[26]) * 254)
}

// inttBfly1 performs an inverse radix-3 DIF butterfly with trivial twiddles (tw1inv=tw2inv=1).
// b0 = a0 + a1 + a2, b1 = a0 + 28*a1 + 242*a2, b2 = a0 + 242*a1 + 28*a2
//
//go:nosplit
func inttBfly1(a *[27]uint16, i0, i1, i2 int) {
	v0 := uint32(a[i0])
	v1 := uint32(a[i1])
	v2 := uint32(a[i2])
	a[i0] = mod271(v0 + v1 + v2)
	a[i1] = mod271(v0 + uint32(mod271(v1*28)) + uint32(mod271(v2*242)))
	a[i2] = mod271(v0 + uint32(mod271(v1*242)) + uint32(mod271(v2*28)))
}

// inttBfly performs an inverse radix-3 DIF butterfly with given twiddle factors.
// After the butterfly, b1 *= tw1inv, b2 *= tw2inv.
//
//go:nosplit
func inttBfly(a *[27]uint16, i0, i1, i2 int, tw1inv, tw2inv uint32) {
	v0 := uint32(a[i0])
	v1 := uint32(a[i1])
	v2 := uint32(a[i2])
	a[i0] = mod271(v0 + v1 + v2)
	b1 := mod271(v0 + uint32(mod271(v1*28)) + uint32(mod271(v2*242)))
	b2 := mod271(v0 + uint32(mod271(v1*242)) + uint32(mod271(v2*28)))
	a[i1] = mod271(uint32(b1) * tw1inv)
	a[i2] = mod271(uint32(b2) * tw2inv)
}