field_amd64.s raw

   1  //go:build amd64
   2  
   3  #include "textflag.h"
   4  
   5  // Field prime p = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
   6  DATA fieldP<>+0x00(SB)/8, $0xFFFFFFFEFFFFFC2F
   7  DATA fieldP<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   8  DATA fieldP<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   9  DATA fieldP<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
  10  GLOBL fieldP<>(SB), RODATA|NOPTR, $32
  11  
  12  // 2^256 - p = 2^32 + 977 = 0x1000003D1
  13  DATA fieldPC<>+0x00(SB)/8, $0x1000003D1
  14  DATA fieldPC<>+0x08(SB)/8, $0x0000000000000000
  15  DATA fieldPC<>+0x10(SB)/8, $0x0000000000000000
  16  DATA fieldPC<>+0x18(SB)/8, $0x0000000000000000
  17  GLOBL fieldPC<>(SB), RODATA|NOPTR, $32
  18  
  19  // func FieldAddAVX2(r, a, b *FieldElement)
  20  // Adds two 256-bit field elements mod p.
  21  TEXT ·FieldAddAVX2(SB), NOSPLIT, $0-24
  22  	MOVQ r+0(FP), DI
  23  	MOVQ a+8(FP), SI
  24  	MOVQ b+16(FP), DX
  25  
  26  	// Load a
  27  	MOVQ 0(SI), AX
  28  	MOVQ 8(SI), BX
  29  	MOVQ 16(SI), CX
  30  	MOVQ 24(SI), R8
  31  
  32  	// Add b with carry chain
  33  	ADDQ 0(DX), AX
  34  	ADCQ 8(DX), BX
  35  	ADCQ 16(DX), CX
  36  	ADCQ 24(DX), R8
  37  
  38  	// Save carry
  39  	SETCS R9B
  40  
  41  	// Store preliminary result
  42  	MOVQ AX, 0(DI)
  43  	MOVQ BX, 8(DI)
  44  	MOVQ CX, 16(DI)
  45  	MOVQ R8, 24(DI)
  46  
  47  	// Check if we need to reduce
  48  	TESTB R9B, R9B
  49  	JNZ field_reduce
  50  
  51  	// Compare with p (from high to low)
  52  	// p.Hi = 0xFFFFFFFFFFFFFFFF (all limbs except first)
  53  	// p.Lo = 0xFFFFFFFEFFFFFC2F
  54  	MOVQ $0xFFFFFFFFFFFFFFFF, R10
  55  	CMPQ R8, R10
  56  	JB field_done
  57  	JA field_reduce
  58  	CMPQ CX, R10
  59  	JB field_done
  60  	JA field_reduce
  61  	CMPQ BX, R10
  62  	JB field_done
  63  	JA field_reduce
  64  	MOVQ fieldP<>+0x00(SB), R10
  65  	CMPQ AX, R10
  66  	JB field_done
  67  
  68  field_reduce:
  69  	// Subtract p by adding 2^256 - p = 0x1000003D1
  70  	MOVQ 0(DI), AX
  71  	MOVQ 8(DI), BX
  72  	MOVQ 16(DI), CX
  73  	MOVQ 24(DI), R8
  74  
  75  	MOVQ fieldPC<>+0x00(SB), R10
  76  	ADDQ R10, AX
  77  	ADCQ $0, BX
  78  	ADCQ $0, CX
  79  	ADCQ $0, R8
  80  
  81  	MOVQ AX, 0(DI)
  82  	MOVQ BX, 8(DI)
  83  	MOVQ CX, 16(DI)
  84  	MOVQ R8, 24(DI)
  85  
  86  field_done:
  87  	VZEROUPPER
  88  	RET
  89  
  90  // func FieldSubAVX2(r, a, b *FieldElement)
  91  // Subtracts two 256-bit field elements mod p.
  92  TEXT ·FieldSubAVX2(SB), NOSPLIT, $0-24
  93  	MOVQ r+0(FP), DI
  94  	MOVQ a+8(FP), SI
  95  	MOVQ b+16(FP), DX
  96  
  97  	// Load a
  98  	MOVQ 0(SI), AX
  99  	MOVQ 8(SI), BX
 100  	MOVQ 16(SI), CX
 101  	MOVQ 24(SI), R8
 102  
 103  	// Subtract b with borrow chain
 104  	SUBQ 0(DX), AX
 105  	SBBQ 8(DX), BX
 106  	SBBQ 16(DX), CX
 107  	SBBQ 24(DX), R8
 108  
 109  	// Save borrow
 110  	SETCS R9B
 111  
 112  	// Store preliminary result
 113  	MOVQ AX, 0(DI)
 114  	MOVQ BX, 8(DI)
 115  	MOVQ CX, 16(DI)
 116  	MOVQ R8, 24(DI)
 117  
 118  	// If borrow, add p back
 119  	TESTB R9B, R9B
 120  	JZ field_sub_done
 121  
 122  	// Add p from memory
 123  	MOVQ fieldP<>+0x00(SB), R10
 124  	ADDQ R10, AX
 125  	MOVQ fieldP<>+0x08(SB), R10
 126  	ADCQ R10, BX
 127  	MOVQ fieldP<>+0x10(SB), R10
 128  	ADCQ R10, CX
 129  	MOVQ fieldP<>+0x18(SB), R10
 130  	ADCQ R10, R8
 131  
 132  	MOVQ AX, 0(DI)
 133  	MOVQ BX, 8(DI)
 134  	MOVQ CX, 16(DI)
 135  	MOVQ R8, 24(DI)
 136  
 137  field_sub_done:
 138  	VZEROUPPER
 139  	RET
 140  
 141  // func FieldMulAVX2(r, a, b *FieldElement)
 142  // Multiplies two 256-bit field elements mod p.
 143  TEXT ·FieldMulAVX2(SB), NOSPLIT, $64-24
 144  	MOVQ r+0(FP), DI
 145  	MOVQ a+8(FP), SI
 146  	MOVQ b+16(FP), DX
 147  
 148  	// Load a limbs
 149  	MOVQ 0(SI), R8      // a0
 150  	MOVQ 8(SI), R9      // a1
 151  	MOVQ 16(SI), R10    // a2
 152  	MOVQ 24(SI), R11    // a3
 153  
 154  	// Store b pointer
 155  	MOVQ DX, R12
 156  
 157  	// Initialize 512-bit product on stack
 158  	XORQ AX, AX
 159  	MOVQ AX, 0(SP)
 160  	MOVQ AX, 8(SP)
 161  	MOVQ AX, 16(SP)
 162  	MOVQ AX, 24(SP)
 163  	MOVQ AX, 32(SP)
 164  	MOVQ AX, 40(SP)
 165  	MOVQ AX, 48(SP)
 166  	MOVQ AX, 56(SP)
 167  
 168  	// Schoolbook multiplication (same as scalar, but with field reduction)
 169  	// a0 * b[0..3]
 170  	MOVQ R8, AX
 171  	MULQ 0(R12)
 172  	MOVQ AX, 0(SP)
 173  	MOVQ DX, R13
 174  
 175  	MOVQ R8, AX
 176  	MULQ 8(R12)
 177  	ADDQ R13, AX
 178  	ADCQ $0, DX
 179  	MOVQ AX, 8(SP)
 180  	MOVQ DX, R13
 181  
 182  	MOVQ R8, AX
 183  	MULQ 16(R12)
 184  	ADDQ R13, AX
 185  	ADCQ $0, DX
 186  	MOVQ AX, 16(SP)
 187  	MOVQ DX, R13
 188  
 189  	MOVQ R8, AX
 190  	MULQ 24(R12)
 191  	ADDQ R13, AX
 192  	ADCQ $0, DX
 193  	MOVQ AX, 24(SP)
 194  	MOVQ DX, 32(SP)
 195  
 196  	// a1 * b[0..3]
 197  	MOVQ R9, AX
 198  	MULQ 0(R12)
 199  	ADDQ AX, 8(SP)
 200  	ADCQ DX, 16(SP)
 201  	ADCQ $0, 24(SP)
 202  	ADCQ $0, 32(SP)
 203  
 204  	MOVQ R9, AX
 205  	MULQ 8(R12)
 206  	ADDQ AX, 16(SP)
 207  	ADCQ DX, 24(SP)
 208  	ADCQ $0, 32(SP)
 209  
 210  	MOVQ R9, AX
 211  	MULQ 16(R12)
 212  	ADDQ AX, 24(SP)
 213  	ADCQ DX, 32(SP)
 214  	ADCQ $0, 40(SP)
 215  
 216  	MOVQ R9, AX
 217  	MULQ 24(R12)
 218  	ADDQ AX, 32(SP)
 219  	ADCQ DX, 40(SP)
 220  
 221  	// a2 * b[0..3]
 222  	MOVQ R10, AX
 223  	MULQ 0(R12)
 224  	ADDQ AX, 16(SP)
 225  	ADCQ DX, 24(SP)
 226  	ADCQ $0, 32(SP)
 227  	ADCQ $0, 40(SP)
 228  
 229  	MOVQ R10, AX
 230  	MULQ 8(R12)
 231  	ADDQ AX, 24(SP)
 232  	ADCQ DX, 32(SP)
 233  	ADCQ $0, 40(SP)
 234  
 235  	MOVQ R10, AX
 236  	MULQ 16(R12)
 237  	ADDQ AX, 32(SP)
 238  	ADCQ DX, 40(SP)
 239  	ADCQ $0, 48(SP)
 240  
 241  	MOVQ R10, AX
 242  	MULQ 24(R12)
 243  	ADDQ AX, 40(SP)
 244  	ADCQ DX, 48(SP)
 245  
 246  	// a3 * b[0..3]
 247  	MOVQ R11, AX
 248  	MULQ 0(R12)
 249  	ADDQ AX, 24(SP)
 250  	ADCQ DX, 32(SP)
 251  	ADCQ $0, 40(SP)
 252  	ADCQ $0, 48(SP)
 253  
 254  	MOVQ R11, AX
 255  	MULQ 8(R12)
 256  	ADDQ AX, 32(SP)
 257  	ADCQ DX, 40(SP)
 258  	ADCQ $0, 48(SP)
 259  
 260  	MOVQ R11, AX
 261  	MULQ 16(R12)
 262  	ADDQ AX, 40(SP)
 263  	ADCQ DX, 48(SP)
 264  	ADCQ $0, 56(SP)
 265  
 266  	MOVQ R11, AX
 267  	MULQ 24(R12)
 268  	ADDQ AX, 48(SP)
 269  	ADCQ DX, 56(SP)
 270  
 271  	// Now reduce 512-bit product mod p
 272  	// Using 2^256 ≡ 2^32 + 977 (mod p)
 273  
 274  	// high = [32(SP), 40(SP), 48(SP), 56(SP)]
 275  	// low = [0(SP), 8(SP), 16(SP), 24(SP)]
 276  	// result = low + high * (2^32 + 977)
 277  
 278  	// Multiply high * 0x1000003D1
 279  	MOVQ fieldPC<>+0x00(SB), R13
 280  
 281  	MOVQ 32(SP), AX
 282  	MULQ R13
 283  	MOVQ AX, R8     // reduction[0]
 284  	MOVQ DX, R14    // carry
 285  
 286  	MOVQ 40(SP), AX
 287  	MULQ R13
 288  	ADDQ R14, AX
 289  	ADCQ $0, DX
 290  	MOVQ AX, R9     // reduction[1]
 291  	MOVQ DX, R14
 292  
 293  	MOVQ 48(SP), AX
 294  	MULQ R13
 295  	ADDQ R14, AX
 296  	ADCQ $0, DX
 297  	MOVQ AX, R10    // reduction[2]
 298  	MOVQ DX, R14
 299  
 300  	MOVQ 56(SP), AX
 301  	MULQ R13
 302  	ADDQ R14, AX
 303  	ADCQ $0, DX
 304  	MOVQ AX, R11    // reduction[3]
 305  	MOVQ DX, R14    // reduction[4] (overflow)
 306  
 307  	// Add low + reduction
 308  	ADDQ 0(SP), R8
 309  	ADCQ 8(SP), R9
 310  	ADCQ 16(SP), R10
 311  	ADCQ 24(SP), R11
 312  	ADCQ $0, R14    // Capture any carry into R14
 313  
 314  	// If R14 is non-zero, reduce again
 315  	TESTQ R14, R14
 316  	JZ field_mul_check
 317  
 318  	// R14 * 0x1000003D1
 319  	MOVQ R14, AX
 320  	MULQ R13
 321  	ADDQ AX, R8
 322  	ADCQ DX, R9
 323  	ADCQ $0, R10
 324  	ADCQ $0, R11
 325  
 326  field_mul_check:
 327  	// Check if result >= p and reduce if needed
 328  	MOVQ $0xFFFFFFFFFFFFFFFF, R15
 329  	CMPQ R11, R15
 330  	JB field_mul_store
 331  	JA field_mul_reduce2
 332  	CMPQ R10, R15
 333  	JB field_mul_store
 334  	JA field_mul_reduce2
 335  	CMPQ R9, R15
 336  	JB field_mul_store
 337  	JA field_mul_reduce2
 338  	MOVQ fieldP<>+0x00(SB), R15
 339  	CMPQ R8, R15
 340  	JB field_mul_store
 341  
 342  field_mul_reduce2:
 343  	MOVQ fieldPC<>+0x00(SB), R15
 344  	ADDQ R15, R8
 345  	ADCQ $0, R9
 346  	ADCQ $0, R10
 347  	ADCQ $0, R11
 348  
 349  field_mul_store:
 350  	MOVQ r+0(FP), DI
 351  	MOVQ R8, 0(DI)
 352  	MOVQ R9, 8(DI)
 353  	MOVQ R10, 16(DI)
 354  	MOVQ R11, 24(DI)
 355  
 356  	VZEROUPPER
 357  	RET
 358  
 359  // func FieldSqrAVX2(r, a *FieldElement)
 360  // Squares a 256-bit field element mod p.
 361  // For now, just calls FieldMulAVX2(r, a, a)
 362  TEXT ·FieldSqrAVX2(SB), NOSPLIT, $24-16
 363  	MOVQ r+0(FP), AX
 364  	MOVQ a+8(FP), BX
 365  	MOVQ AX, 0(SP)
 366  	MOVQ BX, 8(SP)
 367  	MOVQ BX, 16(SP)
 368  	CALL ·FieldMulAVX2(SB)
 369  	RET
 370