field_4x64_amd64.s raw
1 //go:build amd64 && !purego
2
3 #include "textflag.h"
4
5 // Field multiplication for secp256k1 using 4x64-bit limbs with BMI2 instructions.
6 // Uses MULX for flag-free multiplication.
7 //
8 // The field element is represented as 4 limbs of 64 bits each:
9 // n[0..3] where value = n[0] + n[1]*2^64 + n[2]*2^128 + n[3]*2^192
10 //
11 // Field prime p = 2^256 - 2^32 - 977
12 // Reduction constant R = 2^256 mod p = 2^32 + 977 = 0x1000003D1
13 //
// func field4x64MulAsm(r, a, b *[4]uint64)
//
// Multiplies two 256-bit values held as four little-endian 64-bit limbs and
// folds the 512-bit product back to 256 bits using 2^256 ≡ R (mod p), where
// R = 0x1000003D1. The value stored in r is congruent to a*b modulo p and
// fits in 256 bits; no final conditional subtraction of p is performed here,
// so r is not necessarily the canonical representative below p.
//
// Requires BMI2 (MULX).
//
// Aliasing: a is loaded into registers and b is copied into the local frame
// before the first store to r, so r may point to the same array as a and/or b.
//
// Register use:
//   DI        r pointer
//   R8..R11   a0..a3 (recycled once the corresponding limb is no longer needed)
//   DX        implicit MULX multiplicand (a limb of b, later the constant R)
//   AX, BX    lo/hi halves of the current partial product
//   R12..R15, SI   three-word column accumulator, rotated between columns
// BP is deliberately left untouched: it is the frame pointer and is
// callee-save in Go assembly.
TEXT ·field4x64MulAsm(SB), NOSPLIT, $32-24
	MOVQ r+0(FP), DI // result pointer
	MOVQ a+8(FP), SI // a pointer (SI is reused as an accumulator after the loads)
	MOVQ b+16(FP), CX // b pointer (only needed for the snapshot below)

	// Load a[0..3]
	MOVQ 0(SI), R8 // a0
	MOVQ 8(SI), R9 // a1
	MOVQ 16(SI), R10 // a2
	MOVQ 24(SI), R11 // a3

	// Snapshot b[0..3] into the local frame. The low result limbs are stored
	// to r while later columns still need limbs of b, so reading b through its
	// pointer would pick up overwritten data when r and b are the same array.
	MOVQ 0(CX), AX
	MOVQ 8(CX), BX
	MOVQ 16(CX), R12
	MOVQ 24(CX), R13
	MOVQ AX, b0-32(SP)
	MOVQ BX, b1-24(SP)
	MOVQ R12, b2-16(SP)
	MOVQ R13, b3-8(SP)

	// Column 0: a0*b0
	MOVQ AX, DX // b0 into DX for MULX
	MULXQ R8, R12, R13 // a0*b0 -> R13:R12 (hi:lo)

	// Column 1: a0*b1 + a1*b0
	MOVQ b1-24(SP), DX // b1
	MULXQ R8, AX, BX // a0*b1 -> BX:AX
	ADDQ AX, R13
	ADCQ $0, BX // cannot overflow: hi half of a 64x64 product is <= 2^64-2

	MOVQ b0-32(SP), DX // b0
	MULXQ R9, AX, R14 // a1*b0 -> R14:AX
	XORQ R15, R15 // zero the carry word before the add chain sets CF
	ADDQ AX, R13
	ADCQ BX, R14
	ADCQ $0, R15

	// Column 2: a0*b2 + a1*b1 + a2*b0
	// Accumulator is SI:R15:R14; carry is captured after every product.
	XORQ SI, SI

	MOVQ b2-16(SP), DX // b2
	MULXQ R8, AX, BX // a0*b2 -> BX:AX
	ADDQ AX, R14
	ADCQ BX, R15
	ADCQ $0, SI

	MOVQ b1-24(SP), DX // b1
	MULXQ R9, AX, BX // a1*b1 -> BX:AX
	ADDQ AX, R14
	ADCQ BX, R15
	ADCQ $0, SI

	MOVQ b0-32(SP), DX // b0
	MULXQ R10, AX, BX // a2*b0 -> BX:AX
	ADDQ AX, R14
	ADCQ BX, R15
	ADCQ $0, SI

	// Columns 0-2 are final: park them in r and free R12-R14.
	MOVQ R12, 0(DI) // r0
	MOVQ R13, 8(DI) // r1
	MOVQ R14, 16(DI) // r2

	// Column 3: a0*b3 + a1*b2 + a2*b1 + a3*b0
	// Accumulator is R14:R13:R12.
	MOVQ R15, R12
	MOVQ SI, R13
	XORQ R14, R14

	MOVQ b3-8(SP), DX // b3
	MULXQ R8, AX, BX // a0*b3 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ b2-16(SP), DX // b2
	MULXQ R9, AX, BX // a1*b2 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ b1-24(SP), DX // b1
	MULXQ R10, AX, BX // a2*b1 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ b0-32(SP), DX // b0
	MULXQ R11, AX, BX // a3*b0 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ R12, 24(DI) // r3

	// Column 4: a1*b3 + a2*b2 + a3*b1
	// Shift the accumulator down one word; R12 becomes r4.
	MOVQ R13, R12
	MOVQ R14, R13
	XORQ R14, R14

	MOVQ b3-8(SP), DX // b3
	MULXQ R9, AX, BX // a1*b3 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ b2-16(SP), DX // b2
	MULXQ R10, AX, BX // a2*b2 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ b1-24(SP), DX // b1
	MULXQ R11, AX, BX // a3*b1 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	// Column 5: a2*b3 + a3*b2
	// Accumulator is R8:SI:R15 (a0 in R8 is no longer needed); R15 becomes r5.
	MOVQ R13, R15
	MOVQ R14, SI
	XORQ R8, R8

	MOVQ b3-8(SP), DX // b3
	MULXQ R10, AX, BX // a2*b3 -> BX:AX
	ADDQ AX, R15
	ADCQ BX, SI
	ADCQ $0, R8

	MOVQ b2-16(SP), DX // b2
	MULXQ R11, AX, BX // a3*b2 -> BX:AX
	ADDQ AX, R15
	ADCQ BX, SI
	ADCQ $0, R8

	// Column 6: a3*b3
	// a1 and a2 are dead, so R9/R10 take r6/r7. The full product is below
	// 2^512, so the last ADCQ cannot carry out.
	MOVQ SI, R9
	MOVQ R8, R10

	MOVQ b3-8(SP), DX // b3
	MULXQ R11, AX, BX // a3*b3 -> BX:AX
	ADDQ AX, R9
	ADCQ BX, R10

	// Product layout at this point:
	//   r[0..3] in memory at DI
	//   r[4] = R12, r[5] = R15, r[6] = R9, r[7] = R10

	// === Reduction: fold r[4..7] * R into r[0..3], R = 0x1000003D1 ===
	// t = r[4..7] * R is computed as a 5-limb value t0..t4.
	MOVQ $0x1000003D1, DX // R stays in DX for every MULX below

	// t0 = lo(r4 * R); the high half carries into t1
	MULXQ R12, R8, R11 // r4 * R -> R11:R8 (hi:lo)

	// t1 = lo(r5 * R) + hi(r4 * R)
	MULXQ R15, AX, BX // r5 * R -> BX:AX
	ADDQ R11, AX
	ADCQ $0, BX
	MOVQ AX, R11 // t1
	MOVQ BX, R12 // carry into t2

	// t2 = lo(r6 * R) + carry
	MULXQ R9, AX, BX // r6 * R -> BX:AX
	ADDQ R12, AX
	ADCQ $0, BX
	MOVQ AX, R12 // t2
	MOVQ BX, R13 // carry into t3

	// t3 = lo(r7 * R) + carry
	MULXQ R10, AX, BX // r7 * R -> BX:AX
	ADDQ R13, AX
	ADCQ $0, BX
	MOVQ AX, R13 // t3
	MOVQ BX, R14 // t4 (overflow limb)

	// r[0..3] += t[0..3]; the carry out joins t4
	ADDQ R8, 0(DI)
	ADCQ R11, 8(DI)
	ADCQ R12, 16(DI)
	ADCQ R13, 24(DI)
	ADCQ $0, R14

	// Anything in t4 is a multiple of 2^256 and needs a second fold.
	TESTQ R14, R14
	JZ done

	MULXQ R14, AX, BX // t4 * R -> BX:AX (DX still holds R)
	ADDQ AX, 0(DI)
	ADCQ BX, 8(DI)
	ADCQ $0, 16(DI)
	ADCQ $0, 24(DI)

	// A carry out of the second fold is one more 2^256, i.e. one more R.
	JNC done
	MOVQ $0x1000003D1, AX
	ADDQ AX, 0(DI)
	ADCQ $0, 8(DI)
	ADCQ $0, 16(DI)
	ADCQ $0, 24(DI)

done:
	RET
216
// func field4x64SqrAsm(r, a *[4]uint64)
//
// Squares a 256-bit value held as four little-endian 64-bit limbs and folds
// the 512-bit square back to 256 bits using 2^256 ≡ R (mod p), where
// R = 0x1000003D1. The value stored in r is congruent to a*a modulo p and
// fits in 256 bits; no final conditional subtraction of p is performed here,
// so r is not necessarily the canonical representative below p.
//
// Symmetry a[i]*a[j] = a[j]*a[i] is exploited by computing each cross
// product once and adding it into its column twice.
//
// Requires BMI2 (MULX).
//
// Aliasing: all of a is loaded into registers before the first store to r,
// so r and a may point to the same array.
//
// Register use:
//   DI        r pointer
//   R8..R11   a0..a3 (recycled once the corresponding limb is no longer needed)
//   DX        implicit MULX multiplicand (a limb of a, later the constant R)
//   AX, BX    lo/hi halves of the current partial product
//   R12..R15, SI   three-word column accumulator, rotated between columns
// BP is deliberately left untouched: it is the frame pointer and is
// callee-save in Go assembly, and this function has no frame in which the
// assembler would save it.
TEXT ·field4x64SqrAsm(SB), NOSPLIT, $0-16
	MOVQ r+0(FP), DI // result pointer
	MOVQ a+8(FP), SI // a pointer (SI is reused as an accumulator after the loads)

	// Load a[0..3]
	MOVQ 0(SI), R8 // a0
	MOVQ 8(SI), R9 // a1
	MOVQ 16(SI), R10 // a2
	MOVQ 24(SI), R11 // a3

	// Column 0: a0*a0
	MOVQ R8, DX
	MULXQ R8, R12, R13 // a0*a0 -> R13:R12 (hi:lo)

	// Column 1: 2*a0*a1, added into R15:R14:R13
	MOVQ R9, DX // a1
	MULXQ R8, AX, BX // a0*a1 -> BX:AX
	XORQ R14, R14
	XORQ R15, R15
	ADDQ AX, R13
	ADCQ BX, R14
	ADCQ $0, R15
	ADDQ AX, R13 // second copy of the cross product
	ADCQ BX, R14
	ADCQ $0, R15

	// Column 2: 2*a0*a2 + a1*a1
	// Accumulator is SI:R15:R14; carry is captured after every addition.
	XORQ SI, SI

	MOVQ R10, DX // a2
	MULXQ R8, AX, BX // a0*a2 -> BX:AX
	ADDQ AX, R14
	ADCQ BX, R15
	ADCQ $0, SI
	ADDQ AX, R14 // second copy of the cross product
	ADCQ BX, R15
	ADCQ $0, SI

	MOVQ R9, DX // a1
	MULXQ R9, AX, BX // a1*a1 -> BX:AX
	ADDQ AX, R14
	ADCQ BX, R15
	ADCQ $0, SI

	// Columns 0-2 are final: park them in r and free R12-R14.
	MOVQ R12, 0(DI) // r0
	MOVQ R13, 8(DI) // r1
	MOVQ R14, 16(DI) // r2

	// Column 3: 2*a0*a3 + 2*a1*a2
	// Accumulator is R14:R13:R12.
	MOVQ R15, R12
	MOVQ SI, R13
	XORQ R14, R14

	MOVQ R11, DX // a3
	MULXQ R8, AX, BX // a0*a3 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14
	ADDQ AX, R12 // second copy of the cross product
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ R10, DX // a2
	MULXQ R9, AX, BX // a1*a2 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14
	ADDQ AX, R12 // second copy of the cross product
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ R12, 24(DI) // r3

	// Column 4: 2*a1*a3 + a2*a2
	// Shift the accumulator down one word; R12 becomes r4.
	MOVQ R13, R12
	MOVQ R14, R13
	XORQ R14, R14

	MOVQ R11, DX // a3
	MULXQ R9, AX, BX // a1*a3 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14
	ADDQ AX, R12 // second copy of the cross product
	ADCQ BX, R13
	ADCQ $0, R14

	MOVQ R10, DX // a2
	MULXQ R10, AX, BX // a2*a2 -> BX:AX
	ADDQ AX, R12
	ADCQ BX, R13
	ADCQ $0, R14

	// Column 5: 2*a2*a3
	// Accumulator is R8:SI:R15 (a0 in R8 is no longer needed); R15 becomes r5.
	MOVQ R13, R15
	MOVQ R14, SI
	XORQ R8, R8

	MOVQ R11, DX // a3
	MULXQ R10, AX, BX // a2*a3 -> BX:AX
	ADDQ AX, R15
	ADCQ BX, SI
	ADCQ $0, R8
	ADDQ AX, R15 // second copy of the cross product
	ADCQ BX, SI
	ADCQ $0, R8

	// Column 6: a3*a3
	// a1 and a2 are dead, so R9/R10 take r6/r7. The full square is below
	// 2^512, so the last ADCQ cannot carry out.
	MOVQ SI, R9
	MOVQ R8, R10

	MOVQ R11, DX // a3
	MULXQ R11, AX, BX // a3*a3 -> BX:AX
	ADDQ AX, R9
	ADCQ BX, R10

	// Square layout at this point:
	//   r[0..3] in memory at DI
	//   r[4] = R12, r[5] = R15, r[6] = R9, r[7] = R10

	// === Reduction: fold r[4..7] * R into r[0..3], R = 0x1000003D1 ===
	// t = r[4..7] * R is computed as a 5-limb value t0..t4.
	MOVQ $0x1000003D1, DX // R stays in DX for every MULX below

	// t0 = lo(r4 * R); the high half carries into t1
	MULXQ R12, R8, R11 // r4 * R -> R11:R8 (hi:lo)

	// t1 = lo(r5 * R) + hi(r4 * R)
	MULXQ R15, AX, BX // r5 * R -> BX:AX
	ADDQ R11, AX
	ADCQ $0, BX
	MOVQ AX, R11 // t1
	MOVQ BX, R12 // carry into t2

	// t2 = lo(r6 * R) + carry
	MULXQ R9, AX, BX // r6 * R -> BX:AX
	ADDQ R12, AX
	ADCQ $0, BX
	MOVQ AX, R12 // t2
	MOVQ BX, R13 // carry into t3

	// t3 = lo(r7 * R) + carry
	MULXQ R10, AX, BX // r7 * R -> BX:AX
	ADDQ R13, AX
	ADCQ $0, BX
	MOVQ AX, R13 // t3
	MOVQ BX, R14 // t4 (overflow limb)

	// r[0..3] += t[0..3]; the carry out joins t4
	ADDQ R8, 0(DI)
	ADCQ R11, 8(DI)
	ADCQ R12, 16(DI)
	ADCQ R13, 24(DI)
	ADCQ $0, R14

	// Anything in t4 is a multiple of 2^256 and needs a second fold.
	TESTQ R14, R14
	JZ sqr_done

	MULXQ R14, AX, BX // t4 * R -> BX:AX (DX still holds R)
	ADDQ AX, 0(DI)
	ADCQ BX, 8(DI)
	ADCQ $0, 16(DI)
	ADCQ $0, 24(DI)

	// A carry out of the second fold is one more 2^256, i.e. one more R.
	JNC sqr_done
	MOVQ $0x1000003D1, AX
	ADDQ AX, 0(DI)
	ADCQ $0, 8(DI)
	ADCQ $0, 16(DI)
	ADCQ $0, 24(DI)

sqr_done:
	RET
395