field_amd64.s raw
1 //go:build amd64
2
3 #include "textflag.h"
4
// fieldP<> holds the field modulus
//   p = 2^256 - 2^32 - 977
//     = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
// as four 64-bit limbs, least-significant limb first.
DATA fieldP<>+0(SB)/8, $0xFFFFFFFEFFFFFC2F
DATA fieldP<>+8(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA fieldP<>+16(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA fieldP<>+24(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL fieldP<>(SB), RODATA|NOPTR, $32
11
// fieldPC<> holds c = 2^256 - p = 2^32 + 977 = 0x1000003D1 as four 64-bit
// limbs, least-significant limb first. Only the lowest limb is non-zero.
// Because 2^256 ≡ c (mod p), adding c and discarding the carry out of the
// top limb is the same as subtracting p.
DATA fieldPC<>+0(SB)/8, $0x00000001000003D1
DATA fieldPC<>+8(SB)/8, $0
DATA fieldPC<>+16(SB)/8, $0
DATA fieldPC<>+24(SB)/8, $0
GLOBL fieldPC<>(SB), RODATA|NOPTR, $32
18
// func FieldAddAVX2(r, a, b *FieldElement)
//
// Computes r = a + b (mod p). Each operand is read as four 64-bit limbs,
// least-significant limb first, at byte offsets 0, 8, 16 and 24.
//
// The 256-bit sum S = a + b is formed first. It is replaced by S + c
// (c = 2^256 - p, carry out discarded) when either
//   - the addition itself carried out of the top limb, or
//   - S >= p, which is detected as S + c carrying out of the top limb.
// For inputs below p this leaves r in [0, p).
//
// Both inputs are fully read before r is written, so r may alias a or b.
// The selection is done with CMOV rather than branches.
TEXT ·FieldAddAVX2(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// S = a + b in R8..R11 (low to high).
	MOVQ 0(SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	ADDQ 0(DX), R8
	ADCQ 8(DX), R9
	ADCQ 16(DX), R10
	ADCQ 24(DX), R11
	SBBQ SI, SI                       // SI = -1 if a + b carried out, else 0

	// T = S + c in AX, BX, CX, DX (the a and b pointers are no longer needed).
	MOVQ R8, AX
	MOVQ R9, BX
	MOVQ R10, CX
	MOVQ R11, DX
	ADDQ fieldPC<>+0x00(SB), AX
	ADCQ $0, BX
	ADCQ $0, CX
	ADCQ $0, DX
	SBBQ R12, R12                     // R12 = -1 if S + c carried out (S >= p)

	// Take T when either carry occurred; ORQ leaves ZF clear in that case.
	ORQ R12, SI
	CMOVQNE AX, R8
	CMOVQNE BX, R9
	CMOVQNE CX, R10
	CMOVQNE DX, R11

	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// No vector registers are used here; VZEROUPPER is kept as in the
	// existing routine.
	VZEROUPPER
	RET
89
// func FieldSubAVX2(r, a, b *FieldElement)
//
// Computes r = a - b (mod p). Each operand is read as four 64-bit limbs,
// least-significant limb first, at byte offsets 0, 8, 16 and 24.
//
// The 256-bit difference is formed with a borrow chain. If the subtraction
// borrowed out of the top limb (a < b), p is added back and the carry out
// of that addition is discarded. For inputs below p this leaves r in [0, p).
//
// Both inputs are fully read before r is written, so r may alias a or b.
// The correction is applied with a mask rather than a branch.
TEXT ·FieldSubAVX2(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// D = a - b in R8..R11 (low to high).
	MOVQ 0(SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	SUBQ 0(DX), R8
	SBBQ 8(DX), R9
	SBBQ 16(DX), R10
	SBBQ 24(DX), R11
	SBBQ AX, AX                       // AX = -1 if the subtraction borrowed, else 0

	// Addend = p when a borrow occurred, 0 otherwise.
	MOVQ fieldP<>+0x00(SB), BX
	MOVQ fieldP<>+0x08(SB), CX
	MOVQ fieldP<>+0x10(SB), DX
	MOVQ fieldP<>+0x18(SB), SI
	ANDQ AX, BX
	ANDQ AX, CX
	ANDQ AX, DX
	ANDQ AX, SI

	ADDQ BX, R8
	ADCQ CX, R9
	ADCQ DX, R10
	ADCQ SI, R11

	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// No vector registers are used here; VZEROUPPER is kept as in the
	// existing routine.
	VZEROUPPER
	RET
140
// func FieldMulAVX2(r, a, b *FieldElement)
//
// Computes r = a * b (mod p). Each operand is read as four 64-bit limbs,
// least-significant limb first, at byte offsets 0, 8, 16 and 24.
//
// Stage 1 - 512-bit product t[0..7], kept in the 64-byte stack frame.
//   Operand-scanning schoolbook multiplication. For each limb a[i] a single
//   carry word (CX) runs across the row:
//       (hi, lo)  = a[i]*b[j] + t[i+j] + carry
//       t[i+j]    = lo
//       carry     = hi
//   and the final carry of the row becomes t[i+4]. The sum never exceeds
//   (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, so hi cannot overflow and no carry
//   is ever dropped.
//
// Stage 2 - reduction using 2^256 ≡ c (mod p), c = 0x1000003D1.
//   fold 1: (R14, R11..R8) = t[0..3] + t[4..7]*c      (R14 <= c)
//   fold 2: (R11..R8)     += R14*c                    (may carry out)
//   fold 3: if fold 2 carried out, add c once more. The wrapped value is
//           then below 2^67, so this addition cannot carry again.
//   final : if the value is >= p (value + c carries out), replace it with
//           value + c mod 2^256, i.e. value - p.
//
// The result is in [0, p) for any 256-bit inputs. b is only read and r is
// only written at the very end, so r may alias a or b.
// The routine is branch-free. Scratch: AX, BX, CX, DX, SI, DI, R8-R11,
// R13, R14.

// FIELD_MUL_STEP(ai, boff, toff):
//   t[toff] (stack) += ai * b[boff] + CX;  CX = high word of the sum.
//   Expects BX = b pointer. Clobbers AX, DX.
#define FIELD_MUL_STEP(ai, boff, toff) \
	MOVQ ai, AX;        \
	MULQ boff(BX);      \
	ADDQ CX, AX;        \
	ADCQ $0, DX;        \
	ADDQ toff(SP), AX;  \
	ADCQ $0, DX;        \
	MOVQ AX, toff(SP);  \
	MOVQ DX, CX

TEXT ·FieldMulAVX2(SB), NOSPLIT, $64-24
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), BX                 // MULQ clobbers DX, so b lives in BX

	MOVQ 0(SI), R8                    // a0
	MOVQ 8(SI), R9                    // a1
	MOVQ 16(SI), R10                  // a2
	MOVQ 24(SI), R11                  // a3

	// t[0..3] = 0; t[4..7] are written by the row carries below.
	XORQ CX, CX
	MOVQ CX, 0(SP)
	MOVQ CX, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ CX, 24(SP)

	// Row 0: t[0..4] += a0 * b   (CX is already 0)
	FIELD_MUL_STEP(R8, 0, 0)
	FIELD_MUL_STEP(R8, 8, 8)
	FIELD_MUL_STEP(R8, 16, 16)
	FIELD_MUL_STEP(R8, 24, 24)
	MOVQ CX, 32(SP)

	// Row 1: t[1..5] += a1 * b
	XORQ CX, CX
	FIELD_MUL_STEP(R9, 0, 8)
	FIELD_MUL_STEP(R9, 8, 16)
	FIELD_MUL_STEP(R9, 16, 24)
	FIELD_MUL_STEP(R9, 24, 32)
	MOVQ CX, 40(SP)

	// Row 2: t[2..6] += a2 * b
	XORQ CX, CX
	FIELD_MUL_STEP(R10, 0, 16)
	FIELD_MUL_STEP(R10, 8, 24)
	FIELD_MUL_STEP(R10, 16, 32)
	FIELD_MUL_STEP(R10, 24, 40)
	MOVQ CX, 48(SP)

	// Row 3: t[3..7] += a3 * b
	XORQ CX, CX
	FIELD_MUL_STEP(R11, 0, 24)
	FIELD_MUL_STEP(R11, 8, 32)
	FIELD_MUL_STEP(R11, 16, 40)
	FIELD_MUL_STEP(R11, 24, 48)
	MOVQ CX, 56(SP)

	// ---- Reduction -------------------------------------------------------
	MOVQ fieldPC<>+0x00(SB), R13      // c = 2^256 - p

	// Fold 1a: (R14, R11, R10, R9, R8) = t[4..7] * c
	MOVQ 32(SP), AX
	MULQ R13
	MOVQ AX, R8
	MOVQ DX, R14

	MOVQ 40(SP), AX
	MULQ R13
	ADDQ R14, AX
	ADCQ $0, DX
	MOVQ AX, R9
	MOVQ DX, R14

	MOVQ 48(SP), AX
	MULQ R13
	ADDQ R14, AX
	ADCQ $0, DX
	MOVQ AX, R10
	MOVQ DX, R14

	MOVQ 56(SP), AX
	MULQ R13
	ADDQ R14, AX
	ADCQ $0, DX
	MOVQ AX, R11
	MOVQ DX, R14                      // fifth limb of t[4..7]*c, below c

	// Fold 1b: add the low half t[0..3]; the carry joins the fifth limb.
	ADDQ 0(SP), R8
	ADCQ 8(SP), R9
	ADCQ 16(SP), R10
	ADCQ 24(SP), R11
	ADCQ $0, R14

	// Fold 2: fold the fifth limb back in (adds 0 when R14 == 0).
	MOVQ R14, AX
	MULQ R13                          // DX:AX = R14 * c, at most c^2 < 2^67
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	ADCQ $0, R11

	// Fold 3: a carry out here is another 2^256 ≡ c that must not be lost.
	SBBQ AX, AX                       // AX = -1 if fold 2 carried out, else 0
	ANDQ R13, AX                      // AX = c or 0
	ADDQ AX, R8
	ADCQ $0, R9
	ADCQ $0, R10
	ADCQ $0, R11

	// Final conditional subtraction of p:
	// value >= p  <=>  value + c carries out of 256 bits.
	MOVQ R8, AX
	MOVQ R9, BX
	MOVQ R10, CX
	MOVQ R11, DX
	ADDQ R13, AX
	ADCQ $0, BX
	ADCQ $0, CX
	ADCQ $0, DX
	CMOVQCS AX, R8
	CMOVQCS BX, R9
	CMOVQCS CX, R10
	CMOVQCS DX, R11

	MOVQ r+0(FP), DI
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// No vector registers are used here; VZEROUPPER is kept as in the
	// existing routine.
	VZEROUPPER
	RET
358
// func FieldSqrAVX2(r, a *FieldElement)
//
// Computes r = a * a (mod p) by forwarding to FieldMulAVX2(r, a, a).
// The 24-byte frame holds the three pointer arguments for the callee.
TEXT ·FieldSqrAVX2(SB), NOSPLIT, $24-16
	MOVQ a+8(FP), CX
	MOVQ r+0(FP), DX
	MOVQ DX, 0(SP)                    // callee arg 0: r
	MOVQ CX, 8(SP)                    // callee arg 1: a
	MOVQ CX, 16(SP)                   // callee arg 2: a again
	CALL ·FieldMulAVX2(SB)
	RET
370