package gnarl // Montgomery field arithmetic over Z_P for the Gnarl signature scheme. // // Representation: 4×uint64 little-endian limbs in Montgomery form. // A value a is stored as aR mod P, where R = 2^256. // Multiplication: montMul(aR, bR) = abR mod P (one R cancels). // // P is a 216-bit prime (27 bytes), but we use 256-bit Montgomery form // internally for uniformity with the CIOS algorithm. The top ~40 bits // of limb[3] are always zero. // // P ≡ 2 mod 3, P mod 5 = 2, 5 is QNR, N = P+1 = 6Q where Q is prime. import "math/bits" // fe is a field element in Montgomery form: 4 uint64 limbs, little-endian. type fe [4]uint64 // Prime P limbs (little-endian). // P hex = 0x9563a6b6d81bb9b02e5e5d121e79ccd682cc9931f9791e0f9ee4f5 // P bits = 216 var pLimbs = fe{ 0x31f9791e0f9ee4f5, 0x121e79ccd682cc99, 0xb6d81bb9b02e5e5d, 0x00000000009563a6, } // R mod P (the Montgomery form of 1). var feOne = fe{ 0x761ae156c8e8e77c, 0x023540a7764c229f, 0x315e327188dada36, 0x00000000001012bc, } // R^2 mod P (used to convert normal form → Montgomery form). var feR2 = fe{ 0xf56927577991be00, 0xf6eb880a87ed3681, 0x18d8b12f949488c0, 0x0000000000684a62, } // feZero is the zero element (Montgomery form of 0 is 0). var feZero fe // pPrime = -P^{-1} mod 2^64, the Montgomery reduction constant. const pPrime uint64 = 0xf07d39ef3ea058a3 // Tonelli-Shanks constants for square root. // P - 1 = 2^s * tsOddQ, where s = 2. const tsS = 2 // tsOddQ = (P-1) / 4, the odd part of P-1. var tsOddQ = fe{ 0x4c7e5e4783e7b93d, 0x44879e7335a0b326, 0xadb606ee6c0b9797, 0x00000000002558e9, } // tsQm1h = (tsOddQ - 1) / 2, used in Tonelli-Shanks. var tsQm1h = fe{ 0x263f2f23c1f3dc9e, 0xa243cf399ad05993, 0xd6db03773605cbcb, 0x000000000012ac74, } // thirdPNorm = (P-1)/3 in normal (non-Montgomery) form, for y-coordinate // 3-way canonicalization. With P ≡ 2 mod 3, the torus has index 6 = 2×3, // and we canonicalize y to [0, P/3) instead of [0, P/2). var thirdPNorm = fe{ 0xbb53285f5a8a4c51, 0xb0b4d3444780eedd, 0x3cf2b3e8900f74c9, 0x000000000031cbe2, } // halfPNorm = (P-1)/2 in normal (non-Montgomery) form, for alternate checks. var halfPNorm = fe{ 0x98fcbc8f07cf727a, 0x890f3ce66b41664c, 0x5b6c0ddcd8172f2e, 0x00000000004ab1d3, } // pMinus2 = P - 2, used for Fermat inversion. var pMinus2 = fe{ 0x31f9791e0f9ee4f3, 0x121e79ccd682cc99, 0xb6d81bb9b02e5e5d, 0x00000000009563a6, } // feSet copies src into dst. func feSet(dst, src *fe) { *dst = *src } // feIsZero returns 1 if a == 0 (constant-time). func feIsZero(a *fe) int { z := a[0] | a[1] | a[2] | a[3] z = (z | (0 - z)) >> 63 return 1 - int(z) } // feEqual returns 1 if a == b (constant-time). func feEqual(a, b *fe) int { z := (a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | (a[3] ^ b[3]) z = (z | (0 - z)) >> 63 return 1 - int(z) } // feReduce reduces a to [0, P) if a >= P. Constant-time. func feReduce(a *fe) { d0, borrow := bits.Sub64(a[0], pLimbs[0], 0) d1, borrow := bits.Sub64(a[1], pLimbs[1], borrow) d2, borrow := bits.Sub64(a[2], pLimbs[2], borrow) d3, borrow := bits.Sub64(a[3], pLimbs[3], borrow) mask := uint64(0) - borrow a[0] = (a[0] & mask) | (d0 & ^mask) a[1] = (a[1] & mask) | (d1 & ^mask) a[2] = (a[2] & mask) | (d2 & ^mask) a[3] = (a[3] & mask) | (d3 & ^mask) } // feAdd computes r = a + b mod P. func feAdd(r, a, b *fe) { var carry uint64 r[0], carry = bits.Add64(a[0], b[0], 0) r[1], carry = bits.Add64(a[1], b[1], carry) r[2], carry = bits.Add64(a[2], b[2], carry) r[3], carry = bits.Add64(a[3], b[3], carry) d0, borrow := bits.Sub64(r[0], pLimbs[0], 0) d1, borrow := bits.Sub64(r[1], pLimbs[1], borrow) d2, borrow := bits.Sub64(r[2], pLimbs[2], borrow) d3, borrow := bits.Sub64(r[3], pLimbs[3], borrow) underflow := borrow &^ carry mask := uint64(0) - underflow r[0] = (r[0] & mask) | (d0 & ^mask) r[1] = (r[1] & mask) | (d1 & ^mask) r[2] = (r[2] & mask) | (d2 & ^mask) r[3] = (r[3] & mask) | (d3 & ^mask) } // feSub computes r = a - b mod P. func feSub(r, a, b *fe) { var borrow uint64 r[0], borrow = bits.Sub64(a[0], b[0], 0) r[1], borrow = bits.Sub64(a[1], b[1], borrow) r[2], borrow = bits.Sub64(a[2], b[2], borrow) r[3], borrow = bits.Sub64(a[3], b[3], borrow) mask := uint64(0) - borrow var carry uint64 r[0], carry = bits.Add64(r[0], pLimbs[0]&mask, 0) r[1], carry = bits.Add64(r[1], pLimbs[1]&mask, carry) r[2], carry = bits.Add64(r[2], pLimbs[2]&mask, carry) r[3], _ = bits.Add64(r[3], pLimbs[3]&mask, carry) } // feNeg computes r = -a mod P. func feNeg(r, a *fe) { z := a[0] | a[1] | a[2] | a[3] nonzero := z | (0 - z) mask := 0 - (nonzero >> 63) var borrow uint64 r[0], borrow = bits.Sub64(pLimbs[0], a[0], 0) r[1], borrow = bits.Sub64(pLimbs[1], a[1], borrow) r[2], borrow = bits.Sub64(pLimbs[2], a[2], borrow) r[3], _ = bits.Sub64(pLimbs[3], a[3], borrow) r[0] &= mask r[1] &= mask r[2] &= mask r[3] &= mask } // montMulGeneric computes r = a * b * R^{-1} mod P using fully-unrolled CIOS. // // All carry propagation uses bits.Add64 to avoid silent overflow on hi += c. func montMulGeneric(r, a, b *fe) { var t0, t1, t2, t3, t4 uint64 var hi, lo, c0, c1, m, carry uint64 // ---- i = 0 ---- // Part A: t += a[0] * b hi, lo = bits.Mul64(a[0], b[0]) t0, c0 = bits.Add64(lo, t0, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(a[0], b[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[0], b[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[0], b[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t3, c0 = bits.Add64(lo, carry, 0) t4, _ = bits.Add64(hi, c1, c0) // Part B: Montgomery reduction m = t0 * pPrime hi, lo = bits.Mul64(m, pLimbs[0]) _, c0 = bits.Add64(t0, lo, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(m, pLimbs[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t0, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) t3, c0 = bits.Add64(t4, carry, 0) t4 = c0 // ---- i = 1 ---- hi, lo = bits.Mul64(a[1], b[0]) t0, c0 = bits.Add64(lo, t0, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(a[1], b[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[1], b[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[1], b[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t3, c0 = bits.Add64(lo, carry, 0) hi, _ = bits.Add64(hi, c1, c0) t4 += hi m = t0 * pPrime hi, lo = bits.Mul64(m, pLimbs[0]) _, c0 = bits.Add64(t0, lo, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(m, pLimbs[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t0, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) t3, c0 = bits.Add64(t4, carry, 0) t4 = c0 // ---- i = 2 ---- hi, lo = bits.Mul64(a[2], b[0]) t0, c0 = bits.Add64(lo, t0, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(a[2], b[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[2], b[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[2], b[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t3, c0 = bits.Add64(lo, carry, 0) hi, _ = bits.Add64(hi, c1, c0) t4 += hi m = t0 * pPrime hi, lo = bits.Mul64(m, pLimbs[0]) _, c0 = bits.Add64(t0, lo, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(m, pLimbs[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t0, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) t3, c0 = bits.Add64(t4, carry, 0) t4 = c0 // ---- i = 3 ---- hi, lo = bits.Mul64(a[3], b[0]) t0, c0 = bits.Add64(lo, t0, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(a[3], b[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[3], b[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(a[3], b[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t3, c0 = bits.Add64(lo, carry, 0) hi, _ = bits.Add64(hi, c1, c0) t4 += hi m = t0 * pPrime hi, lo = bits.Mul64(m, pLimbs[0]) _, c0 = bits.Add64(t0, lo, 0) carry, _ = bits.Add64(hi, 0, c0) hi, lo = bits.Mul64(m, pLimbs[1]) lo, c0 = bits.Add64(lo, t1, 0) hi, c1 = bits.Add64(hi, 0, c0) t0, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[2]) lo, c0 = bits.Add64(lo, t2, 0) hi, c1 = bits.Add64(hi, 0, c0) t1, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) hi, lo = bits.Mul64(m, pLimbs[3]) lo, c0 = bits.Add64(lo, t3, 0) hi, c1 = bits.Add64(hi, 0, c0) t2, c0 = bits.Add64(lo, carry, 0) carry, _ = bits.Add64(hi, c1, c0) t3, c0 = bits.Add64(t4, carry, 0) t4 = c0 r[0] = t0 r[1] = t1 r[2] = t2 r[3] = t3 feReduce(r) } // montSquareGeneric computes r = a^2 * R^{-1} mod P. func montSquareGeneric(r, a *fe) { montMulGeneric(r, a, a) } // feToMont converts a normal-form value to Montgomery form: aR mod P. func feToMont(r, a *fe) { montMul(r, a, &feR2) } // feFromMont converts a Montgomery-form value to normal form: a * R^{-1} mod P. func feFromMont(r, a *fe) { one := fe{1, 0, 0, 0} montMul(r, a, &one) } // feFromSmall converts a small integer to Montgomery form. func feFromSmall(r *fe, v int64) { if v >= 0 { *r = fe{uint64(v), 0, 0, 0} } else { *r = pLimbs var borrow uint64 r[0], borrow = bits.Sub64(r[0], uint64(-v), 0) r[1], borrow = bits.Sub64(r[1], 0, borrow) r[2], borrow = bits.Sub64(r[2], 0, borrow) r[3], _ = bits.Sub64(r[3], 0, borrow) } feToMont(r, r) } // fieldLen is the byte length of a serialized field element (27 bytes for 216-bit P). const fieldLen = 27 // feFromBytes27 sets r from a 27-byte big-endian encoding and converts to Montgomery form. // Returns false if the value >= P. func feFromBytes27(r *fe, b []byte) bool { if len(b) < 27 { *r = feZero return false } // 27 bytes = 216 bits. Big-endian layout: // b[0..2] → low 24 bits of limb[3] (bits 192..215) // b[3..10] → limb[2] (bits 128..191) // b[11..18] → limb[1] (bits 64..127) // b[19..26] → limb[0] (bits 0..63) r[3] = uint64(b[0])<<16 | uint64(b[1])<<8 | uint64(b[2]) r[2] = uint64(b[3])<<56 | uint64(b[4])<<48 | uint64(b[5])<<40 | uint64(b[6])<<32 | uint64(b[7])<<24 | uint64(b[8])<<16 | uint64(b[9])<<8 | uint64(b[10]) r[1] = uint64(b[11])<<56 | uint64(b[12])<<48 | uint64(b[13])<<40 | uint64(b[14])<<32 | uint64(b[15])<<24 | uint64(b[16])<<16 | uint64(b[17])<<8 | uint64(b[18]) r[0] = uint64(b[19])<<56 | uint64(b[20])<<48 | uint64(b[21])<<40 | uint64(b[22])<<32 | uint64(b[23])<<24 | uint64(b[24])<<16 | uint64(b[25])<<8 | uint64(b[26]) // Check >= P. d0, borrow := bits.Sub64(r[0], pLimbs[0], 0) d1, borrow := bits.Sub64(r[1], pLimbs[1], borrow) _, borrow = bits.Sub64(r[2], pLimbs[2], borrow) _, borrow = bits.Sub64(r[3], pLimbs[3], borrow) _ = d0 _ = d1 if borrow == 0 { *r = feZero return false } feToMont(r, r) return true } // feToBytes27 writes a Montgomery-form element to a 27-byte big-endian buffer. func feToBytes27(b []byte, a *fe) { var norm fe feFromMont(&norm, a) // limb[3] low 24 bits → b[0..2] b[0] = byte(norm[3] >> 16) b[1] = byte(norm[3] >> 8) b[2] = byte(norm[3]) // limb[2] → b[3..10] b[3] = byte(norm[2] >> 56) b[4] = byte(norm[2] >> 48) b[5] = byte(norm[2] >> 40) b[6] = byte(norm[2] >> 32) b[7] = byte(norm[2] >> 24) b[8] = byte(norm[2] >> 16) b[9] = byte(norm[2] >> 8) b[10] = byte(norm[2]) // limb[1] → b[11..18] b[11] = byte(norm[1] >> 56) b[12] = byte(norm[1] >> 48) b[13] = byte(norm[1] >> 40) b[14] = byte(norm[1] >> 32) b[15] = byte(norm[1] >> 24) b[16] = byte(norm[1] >> 16) b[17] = byte(norm[1] >> 8) b[18] = byte(norm[1]) // limb[0] → b[19..26] b[19] = byte(norm[0] >> 56) b[20] = byte(norm[0] >> 48) b[21] = byte(norm[0] >> 40) b[22] = byte(norm[0] >> 32) b[23] = byte(norm[0] >> 24) b[24] = byte(norm[0] >> 16) b[25] = byte(norm[0] >> 8) b[26] = byte(norm[0]) } // fePow computes r = base^exp where exp is a 256-bit exponent in little-endian limbs // (plain integer, NOT Montgomery). base is in Montgomery form, result in Montgomery form. func fePow(r, base *fe, exp *fe) { var result fe feSet(&result, &feOne) var b fe feSet(&b, base) for i := 0; i < 4; i++ { word := exp[i] for bit := 0; bit < 64; bit++ { if word&1 == 1 { montMul(&result, &result, &b) } montSquare(&b, &b) word >>= 1 } } feSet(r, &result) } // feInv computes r = a^{-1} in Montgomery form via Fermat: a^{P-2}. func feInv(r, a *fe) { fePow(r, a, &pMinus2) } // feSqrt computes r = sqrt(a) mod P in Montgomery form using Tonelli-Shanks. // Returns false if a is not a quadratic residue. // // P - 1 = 2^2 * q where q is odd (tsS = 2). // Non-residue: 5 (QNR by construction of P). func feSqrt(r, a *fe) bool { if feIsZero(a) == 1 { *r = feZero return true } var feFive fe feFive[0] = 5 feToMont(&feFive, &feFive) // t = a^q var t fe fePow(&t, a, &tsOddQ) // rr = a^{(q+1)/2} = a * a^{(q-1)/2} var rr, tmp fe fePow(&tmp, a, &tsQm1h) montMul(&rr, &tmp, a) // c = 5^q var c fe fePow(&c, &feFive, &tsOddQ) m := tsS for { if feEqual(&t, &feOne) == 1 { feSet(r, &rr) return true } var b fe feSet(&b, &t) i := 0 for j := 1; j < m; j++ { montSquare(&b, &b) if feEqual(&b, &feOne) == 1 { i = j break } } if i == 0 { return false } feSet(&b, &c) for j := 0; j < m-i-1; j++ { montSquare(&b, &b) } montMul(&rr, &rr, &b) montSquare(&c, &b) montMul(&t, &t, &c) m = i } } // feIsGtThirdP returns 1 if a (in NORMAL form, not Montgomery) is > (P-1)/3. // Used for 3-way canonicalization of torus y-coordinate. func feIsGtThirdP(a *fe) int { for i := 3; i >= 0; i-- { if a[i] > thirdPNorm[i] { return 1 } if a[i] < thirdPNorm[i] { return 0 } } return 0 // equal → not strictly greater } // feIsGtHalfP returns 1 if a (in NORMAL form, not Montgomery) is > (P-1)/2. func feIsGtHalfP(a *fe) int { for i := 3; i >= 0; i-- { if a[i] > halfPNorm[i] { return 1 } if a[i] < halfPNorm[i] { return 0 } } return 0 }