//go:build amd64 && !purego

#include "textflag.h"

// Montgomery multiplication constants for the Gnarl 216-bit prime.
// P = 0x9563a6_b6d81bb9b02e5e5d_121e79ccd682cc99_31f9791e0f9ee4f5
// pPrime = -P^{-1} mod 2^64 = 0xf07d39ef3ea058a3

DATA p0<>+0(SB)/8, $0x31f9791e0f9ee4f5
DATA p1<>+0(SB)/8, $0x121e79ccd682cc99
DATA p2<>+0(SB)/8, $0xb6d81bb9b02e5e5d
DATA p3<>+0(SB)/8, $0x00000000009563a6
DATA pp<>+0(SB)/8, $0xf07d39ef3ea058a3
GLOBL p0<>(SB), RODATA|NOPTR, $8
GLOBL p1<>(SB), RODATA|NOPTR, $8
GLOBL p2<>(SB), RODATA|NOPTR, $8
GLOBL p3<>(SB), RODATA|NOPTR, $8
GLOBL pp<>(SB), RODATA|NOPTR, $8

// func montMul(r, a, b *fe)
//
// 4-limb CIOS Montgomery multiplication.
// r = a * b * R^{-1} mod P where R = 2^256.
//
// Register allocation:
//   BX = r pointer
//   SI = a pointer
//   DI = b pointer
//   R8,R9,R10,R11 = t0,t1,t2,t3 (accumulator)
//   R12 = t4 (overflow limb)
//   R13 = current a[i] or m value
//   R14 = carry accumulator
//   AX,DX = MULQ operands/results
TEXT ·montMul(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DI

	// Zero accumulator.
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

	// ---- i = 0: t += a[0] * b ----
	MOVQ 0(SI), R13       // R13 = a[0]

	MOVQ R13, AX
	MULQ 0(DI)            // DX:AX = a[0] * b[0]
	ADDQ AX, R8           // t0 += lo
	ADCQ $0, DX
	MOVQ DX, R14          // R14 = carry

	MOVQ R13, AX
	MULQ 8(DI)            // DX:AX = a[0] * b[1]
	ADDQ R14, R9          // t1 += prev carry
	ADCQ $0, DX
	ADDQ AX, R9           // t1 += lo
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 16(DI)           // DX:AX = a[0] * b[2]
	ADDQ R14, R10         // t2 += prev carry
	ADCQ $0, DX
	ADDQ AX, R10          // t2 += lo
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 24(DI)           // DX:AX = a[0] * b[3]
	ADDQ R14, R11         // t3 += prev carry
	ADCQ $0, DX
	ADDQ AX, R11          // t3 += lo
	ADCQ DX, R12          // t4 += hi

	// Montgomery reduction: m = t0 * pPrime; t += m * P; shift right.
	MOVQ R8, AX
	MULQ pp<>(SB)         // DX:AX = t0 * pPrime
	MOVQ AX, R13          // R13 = m (only low 64 bits matter)

	MOVQ R13, AX
	MULQ p0<>(SB)         // DX:AX = m * P[0]
	ADDQ AX, R8           // t0 += lo (cancels to 0 mod 2^64)
	ADCQ $0, DX
	MOVQ DX, R14          // R14 = carry

	MOVQ R13, AX
	MULQ p1<>(SB)         // DX:AX = m * P[1]
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p2<>(SB)         // DX:AX = m * P[2]
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p3<>(SB)         // DX:AX = m * P[3]
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	// Shift: t0=t1, t1=t2, t2=t3, t3=t4, t4=0
	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 1: t += a[1] * b ----
	MOVQ 8(SI), R13

	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	// Montgomery reduction i=1
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13

	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 2: t += a[2] * b ----
	MOVQ 16(SI), R13

	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	// Montgomery reduction i=2
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13

	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 3: t += a[3] * b ----
	MOVQ 24(SI), R13

	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	// Montgomery reduction i=3
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13

	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14

	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12

	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11

	// Conditional subtraction: if t >= P, t -= P.
	MOVQ R8, AX
	MOVQ R9, CX
	MOVQ R10, DX
	MOVQ R11, R13

	SUBQ p0<>(SB), AX
	SBBQ p1<>(SB), CX
	SBBQ p2<>(SB), DX
	SBBQ p3<>(SB), R13

	// If borrow (CF=1), keep original t; else use t-P.
	CMOVQCS R8, AX
	CMOVQCS R9, CX
	CMOVQCS R10, DX
	CMOVQCS R11, R13

	MOVQ AX, 0(BX)
	MOVQ CX, 8(BX)
	MOVQ DX, 16(BX)
	MOVQ R13, 24(BX)
	RET

// func montSquare(r, a *fe)
//
// Computes r = a^2 * R^{-1} mod P. Inlined CIOS with b = a.
TEXT ·montSquare(SB), NOSPLIT, $0-16
	MOVQ r+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ SI, DI           // b = a

	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

	// ---- i = 0 ----
	MOVQ 0(SI), R13
	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13
	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 1 ----
	MOVQ 8(SI), R13
	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13
	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 2 ----
	MOVQ 16(SI), R13
	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13
	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11
	XORQ R12, R12

	// ---- i = 3 ----
	MOVQ 24(SI), R13
	MOVQ R13, AX
	MULQ 0(DI)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 8(DI)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 16(DI)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ 24(DI)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R8, AX
	MULQ pp<>(SB)
	MOVQ AX, R13
	MOVQ R13, AX
	MULQ p0<>(SB)
	ADDQ AX, R8
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p1<>(SB)
	ADDQ R14, R9
	ADCQ $0, DX
	ADDQ AX, R9
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p2<>(SB)
	ADDQ R14, R10
	ADCQ $0, DX
	ADDQ AX, R10
	ADCQ $0, DX
	MOVQ DX, R14
	MOVQ R13, AX
	MULQ p3<>(SB)
	ADDQ R14, R11
	ADCQ $0, DX
	ADDQ AX, R11
	ADCQ DX, R12
	MOVQ R9, R8
	MOVQ R10, R9
	MOVQ R11, R10
	MOVQ R12, R11

	// Conditional subtraction.
	MOVQ R8, AX
	MOVQ R9, CX
	MOVQ R10, DX
	MOVQ R11, R13
	SUBQ p0<>(SB), AX
	SBBQ p1<>(SB), CX
	SBBQ p2<>(SB), DX
	SBBQ p3<>(SB), R13
	CMOVQCS R8, AX
	CMOVQCS R9, CX
	CMOVQCS R10, DX
	CMOVQCS R11, R13
	MOVQ AX, 0(BX)
	MOVQ CX, 8(BX)
	MOVQ DX, 16(BX)
	MOVQ R13, 24(BX)
	RET
