// scalar_amd64.s: secp256k1 scalar arithmetic for AMD64.
//go:build amd64

#include "textflag.h"

// Constants for scalar reduction.
// n = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
// (the secp256k1 group order), stored as four little-endian 64-bit limbs.
DATA p256k1ScalarN<>+0x00(SB)/8, $0xBFD25E8CD0364141
DATA p256k1ScalarN<>+0x08(SB)/8, $0xBAAEDCE6AF48A03B
DATA p256k1ScalarN<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFE
DATA p256k1ScalarN<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL p256k1ScalarN<>(SB), RODATA|NOPTR, $32

// 2^256 - n (adding this is equivalent to subtracting n mod 2^256)
// NC0 = 0x402DA1732FC9BEBF
// NC1 = 0x4551231950B75FC4
// NC2 = 1
DATA p256k1ScalarNC<>+0x00(SB)/8, $0x402DA1732FC9BEBF
DATA p256k1ScalarNC<>+0x08(SB)/8, $0x4551231950B75FC4
DATA p256k1ScalarNC<>+0x10(SB)/8, $0x0000000000000001
DATA p256k1ScalarNC<>+0x18(SB)/8, $0x0000000000000000
GLOBL p256k1ScalarNC<>(SB), RODATA|NOPTR, $32
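
// A quick cross-check of the tables above (a sketch: this is a hypothetical
// test helper, not part of this package's build): recompute 2^256 - n with
// math/big and compare it against the NC limbs.
//
//	import (
//		"math/big"
//		"testing"
//	)
//
//	func TestScalarNC(t *testing.T) {
//		n, _ := new(big.Int).SetString(
//			"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16)
//		nc := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 256), n)
//		// NC2:NC1:NC0 = 1 : 4551231950B75FC4 : 402DA1732FC9BEBF
//		want, _ := new(big.Int).SetString("14551231950B75FC4402DA1732FC9BEBF", 16)
//		if nc.Cmp(want) != 0 {
//			t.Fatalf("2^256 - n = %x, want %x", nc, want)
//		}
//	}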

// func scalarAddAVX2(r, a, b *Scalar)
// Computes r = (a + b) mod n: a 256-bit add with carry chain followed by a
// single conditional reduction.
TEXT ·scalarAddAVX2(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Load a into registers; b is consumed via memory operands below
	MOVQ 0(SI), AX  // a.d[0]
	MOVQ 8(SI), BX  // a.d[1]
	MOVQ 16(SI), CX // a.d[2]
	MOVQ 24(SI), R8 // a.d[3]

	// Add b with carry chain
	ADDQ 0(DX), AX  // a.d[0] + b.d[0]
	ADCQ 8(DX), BX  // a.d[1] + b.d[1] + carry
	ADCQ 16(DX), CX // a.d[2] + b.d[2] + carry
	ADCQ 24(DX), R8 // a.d[3] + b.d[3] + carry

	// Save carry flag
	SETCS R9B

	// Store preliminary result
	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

	// Check if we need to reduce (carry set or result >= n)
	TESTB R9B, R9B
	JNZ add_reduce

	// Compare with n, most significant limb first
	MOVQ $0xFFFFFFFFFFFFFFFF, R10 // n[3]
	CMPQ R8, R10
	JB add_done
	JA add_reduce
	MOVQ p256k1ScalarN<>+0x10(SB), R10 // n[2]
	CMPQ CX, R10
	JB add_done
	JA add_reduce
	MOVQ p256k1ScalarN<>+0x08(SB), R10 // n[1]
	CMPQ BX, R10
	JB add_done
	JA add_reduce
	MOVQ p256k1ScalarN<>+0x00(SB), R10 // n[0]
	CMPQ AX, R10
	JB add_done
	// Fall through when the result equals n exactly: it reduces to zero.

add_reduce:
	// Add 2^256 - n (equivalent to subtracting n mod 2^256)
	MOVQ 0(DI), AX
	MOVQ 8(DI), BX
	MOVQ 16(DI), CX
	MOVQ 24(DI), R8

	MOVQ p256k1ScalarNC<>+0x00(SB), R10
	ADDQ R10, AX
	MOVQ p256k1ScalarNC<>+0x08(SB), R10
	ADCQ R10, BX
	MOVQ p256k1ScalarNC<>+0x10(SB), R10
	ADCQ R10, CX
	MOVQ p256k1ScalarNC<>+0x18(SB), R10
	ADCQ R10, R8

	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

add_done:
	VZEROUPPER // no YMM state is used above; this is purely defensive
	RET
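
// For reference, the same add-then-conditionally-reduce flow in pure Go
// (a sketch, assuming Scalar holds four little-endian 64-bit limbs in d,
// matching the loads above; gteN and addNC are hypothetical helpers for
// the compare and the "add 2^256 - n" step):
//
//	import "math/bits"
//
//	func scalarAddRef(r, a, b *Scalar) {
//		d0, c := bits.Add64(a.d[0], b.d[0], 0)
//		d1, c := bits.Add64(a.d[1], b.d[1], c)
//		d2, c := bits.Add64(a.d[2], b.d[2], c)
//		d3, c := bits.Add64(a.d[3], b.d[3], c)
//		r.d = [4]uint64{d0, d1, d2, d3}
//		if c != 0 || gteN(r) { // same test as the SETCS/CMPQ chain above
//			addNC(r) // add 2^256 - n, i.e. subtract n mod 2^256
//		}
//	}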

// func scalarSubAVX2(r, a, b *Scalar)
// Computes r = (a - b) mod n: a 256-bit subtract with borrow chain, adding
// n back when the subtraction wraps below zero.
TEXT ·scalarSubAVX2(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Load a
	MOVQ 0(SI), AX
	MOVQ 8(SI), BX
	MOVQ 16(SI), CX
	MOVQ 24(SI), R8

	// Subtract b with borrow chain
	SUBQ 0(DX), AX
	SBBQ 8(DX), BX
	SBBQ 16(DX), CX
	SBBQ 24(DX), R8

	// Save borrow flag (CF holds the borrow after SUB/SBB)
	SETCS R9B

	// Store preliminary result
	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

	// If we borrowed, add n back to land in [0, n)
	TESTB R9B, R9B
	JZ sub_done

	// Add n
	MOVQ p256k1ScalarN<>+0x00(SB), R10
	ADDQ R10, AX
	MOVQ p256k1ScalarN<>+0x08(SB), R10
	ADCQ R10, BX
	MOVQ p256k1ScalarN<>+0x10(SB), R10
	ADCQ R10, CX
	MOVQ p256k1ScalarN<>+0x18(SB), R10
	ADCQ R10, R8

	MOVQ AX, 0(DI)
	MOVQ BX, 8(DI)
	MOVQ CX, 16(DI)
	MOVQ R8, 24(DI)

sub_done:
	VZEROUPPER
	RET
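
// The matching pure-Go sketch for subtraction (same Scalar assumption as
// above; addN is a hypothetical helper that adds n back after a borrow):
//
//	func scalarSubRef(r, a, b *Scalar) {
//		d0, brw := bits.Sub64(a.d[0], b.d[0], 0)
//		d1, brw := bits.Sub64(a.d[1], b.d[1], brw)
//		d2, brw := bits.Sub64(a.d[2], b.d[2], brw)
//		d3, brw := bits.Sub64(a.d[3], b.d[3], brw)
//		r.d = [4]uint64{d0, d1, d2, d3}
//		if brw != 0 { // wrapped below zero: add n back into [0, n)
//			addN(r)
//		}
//	}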

// func scalarMulAVX2(r, a, b *Scalar)
// Computes r = (a * b) mod n: a schoolbook 256x256 -> 512-bit multiply,
// then the reduction algorithm from bitcoin-core secp256k1, followed exactly.
TEXT ·scalarMulAVX2(SB), NOSPLIT, $128-24
	MOVQ r+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DX

	// Load a limbs
	MOVQ 0(SI), R8   // a0
	MOVQ 8(SI), R9   // a1
	MOVQ 16(SI), R10 // a2
	MOVQ 24(SI), R11 // a3

	// Keep the b pointer for the multiply rows
	MOVQ DX, R12

	// Compute the 512-bit product using schoolbook multiplication.
	// The product lives on the stack at 0(SP)..56(SP) (8 limbs: l0..l7).
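	//
	// Each row below adds a_i * b into the product at limb offset i, so
	// l[k] accumulates every partial product a_i * b_j with i + j = k.
	// One row step in Go terms (a sketch using math/bits; l1..l4 and c
	// are assumed uint64 variables):
	//
	//	hi, lo := bits.Mul64(a1, b0)  // 64x64 -> 128-bit partial product
	//	l1, c = bits.Add64(l1, lo, 0) // ADDQ AX, 8(SP)
	//	l2, c = bits.Add64(l2, hi, c) // ADCQ DX, 16(SP)
	//	l3, c = bits.Add64(l3, 0, c)  // ADCQ $0, 24(SP)
	//	l4, _ = bits.Add64(l4, 0, c)  // ADCQ $0, 32(SP)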

	// Initialize product to zero
	XORQ AX, AX
	MOVQ AX, 0(SP)  // l0
	MOVQ AX, 8(SP)  // l1
	MOVQ AX, 16(SP) // l2
	MOVQ AX, 24(SP) // l3
	MOVQ AX, 32(SP) // l4
	MOVQ AX, 40(SP) // l5
	MOVQ AX, 48(SP) // l6
	MOVQ AX, 56(SP) // l7

	// Multiply a0 * b[0..3]
	MOVQ R8, AX
	MULQ 0(R12) // a0 * b0
	MOVQ AX, 0(SP)
	MOVQ DX, R13 // carry

	MOVQ R8, AX
	MULQ 8(R12) // a0 * b1
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 8(SP)
	MOVQ DX, R13

	MOVQ R8, AX
	MULQ 16(R12) // a0 * b2
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 16(SP)
	MOVQ DX, R13

	MOVQ R8, AX
	MULQ 24(R12) // a0 * b3
	ADDQ R13, AX
	ADCQ $0, DX
	MOVQ AX, 24(SP)
	MOVQ DX, 32(SP)

	// Multiply a1 * b[0..3] and add
	MOVQ R9, AX
	MULQ 0(R12) // a1 * b0
	ADDQ AX, 8(SP)
	ADCQ DX, 16(SP)
	ADCQ $0, 24(SP)
	ADCQ $0, 32(SP)

	MOVQ R9, AX
	MULQ 8(R12) // a1 * b1
	ADDQ AX, 16(SP)
	ADCQ DX, 24(SP)
	ADCQ $0, 32(SP)

	MOVQ R9, AX
	MULQ 16(R12) // a1 * b2
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R9, AX
	MULQ 24(R12) // a1 * b3
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)

	// Multiply a2 * b[0..3] and add
	MOVQ R10, AX
	MULQ 0(R12) // a2 * b0
	ADDQ AX, 16(SP)
	ADCQ DX, 24(SP)
	ADCQ $0, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R10, AX
	MULQ 8(R12) // a2 * b1
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)

	MOVQ R10, AX
	MULQ 16(R12) // a2 * b2
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R10, AX
	MULQ 24(R12) // a2 * b3
	ADDQ AX, 40(SP)
	ADCQ DX, 48(SP)

	// Multiply a3 * b[0..3] and add
	MOVQ R11, AX
	MULQ 0(R12) // a3 * b0
	ADDQ AX, 24(SP)
	ADCQ DX, 32(SP)
	ADCQ $0, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R11, AX
	MULQ 8(R12) // a3 * b1
	ADDQ AX, 32(SP)
	ADCQ DX, 40(SP)
	ADCQ $0, 48(SP)

	MOVQ R11, AX
	MULQ 16(R12) // a3 * b2
	ADDQ AX, 40(SP)
	ADCQ DX, 48(SP)
	ADCQ $0, 56(SP)

	MOVQ R11, AX
	MULQ 24(R12) // a3 * b3
	ADDQ AX, 48(SP)
	ADCQ DX, 56(SP)

	// The full 512-bit product now sits in 0(SP)..56(SP) as l[0..7].
	// Reduce it using the exact algorithm from bitcoin-core secp256k1:
	//
	// Phase 1: reduce 512 bits into 385 bits.
	// m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C,
	// where n[0..3] here names the high 256 bits l[4..7].
	//
	// NC0 = 0x402DA1732FC9BEBF
	// NC1 = 0x4551231950B75FC4
	// NC2 = 1
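	//
	// The muladd/sumadd/extract steps named in the comments below are
	// bitcoin-core's accumulator primitives over (c0, c1, c2). In
	// Go-flavoured terms (a sketch; c0, c1, c2, k assumed uint64):
	//
	//	// muladd(a, b): add the 128-bit product a*b into the accumulator
	//	hi, lo := bits.Mul64(a, b)
	//	c0, k = bits.Add64(c0, lo, 0)
	//	c1, k = bits.Add64(c1, hi, k)
	//	c2 += k // sumadd(a) is the same with lo = a, hi = 0
	//
	//	// extract(): emit c0 and shift the accumulator down one limb
	//	out, c0, c1, c2 = c0, c1, c2, 0
	//
	// The *_fast variants omit c2 where it is known to stay zero.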

	// Load the high product limbs (l4..l7, called n0..n3 below)
	MOVQ 32(SP), R8  // n0 = l4
	MOVQ 40(SP), R9  // n1 = l5
	MOVQ 48(SP), R10 // n2 = l6
	MOVQ 56(SP), R11 // n3 = l7

	// Load constants
	MOVQ $0x402DA1732FC9BEBF, R12 // NC0
	MOVQ $0x4551231950B75FC4, R13 // NC1

	// Stack slots 64(SP)..112(SP) hold the intermediate m values.
	// The 160-bit accumulator mirrors the C code:
	// c0 (R14), c1 (R15), c2 (spilled to 120(SP))

	// === m0 ===
	// c0 = l[0], c1 = 0
	// muladd_fast(n0, NC0): hi,lo = n0*NC0; c0 += lo; c1 += hi + carry
	// m0 = extract_fast() = c0; c0 = c1; c1 = 0
	MOVQ 0(SP), R14 // c0 = l0
	XORQ R15, R15 // c1 = 0
	MOVQ R8, AX
	MULQ R12 // DX:AX = n0 * NC0
	ADDQ AX, R14 // c0 += lo
	ADCQ DX, R15 // c1 += hi + carry
	MOVQ R14, 64(SP) // m0 = c0
	MOVQ R15, R14 // c0 = c1
	XORQ R15, R15 // c1 = 0
	MOVQ $0, 120(SP) // c2 = 0

	// === m1 ===
	// sumadd_fast(l[1])
	// muladd(n1, NC0)
	// muladd(n0, NC1)
	// m1 = extract()
	ADDQ 8(SP), R14 // c0 += l1
	ADCQ $0, R15 // c1 += carry

	MOVQ R9, AX
	MULQ R12 // DX:AX = n1 * NC0
	ADDQ AX, R14 // c0 += lo
	ADCQ DX, R15 // c1 += hi + carry
	ADCQ $0, 120(SP) // c2 += carry

	MOVQ R8, AX
	MULQ R13 // DX:AX = n0 * NC1
	ADDQ AX, R14 // c0 += lo
	ADCQ DX, R15 // c1 += hi + carry
	ADCQ $0, 120(SP) // c2 += carry

	MOVQ R14, 72(SP) // m1 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2
	MOVQ $0, 120(SP) // c2 = 0

	// === m2 ===
	// sumadd(l[2])
	// muladd(n2, NC0)
	// muladd(n1, NC1)
	// sumadd(n0) (because NC2 = 1)
	// m2 = extract()
	ADDQ 16(SP), R14 // c0 += l2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R12 // DX:AX = n2 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R9, AX
	MULQ R13 // DX:AX = n1 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R8, R14 // c0 += n0 (n0 * NC2 = n0 * 1)
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 80(SP) // m2 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2
	MOVQ $0, 120(SP) // c2 = 0

	// === m3 ===
	// sumadd(l[3])
	// muladd(n3, NC0)
	// muladd(n2, NC1)
	// sumadd(n1)
	// m3 = extract()
	ADDQ 24(SP), R14 // c0 += l3
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R11, AX
	MULQ R12 // DX:AX = n3 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R13 // DX:AX = n2 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R9, R14 // c0 += n1
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 88(SP) // m3 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2
	MOVQ $0, 120(SP) // c2 = 0

	// === m4 ===
	// muladd(n3, NC1)
	// sumadd(n2)
	// m4 = extract()
	MOVQ R11, AX
	MULQ R13 // DX:AX = n3 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R10, R14 // c0 += n2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 96(SP) // m4 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2

	// === m5 ===
	// sumadd_fast(n3)
	// m5 = extract_fast()
	ADDQ R11, R14 // c0 += n3
	ADCQ $0, R15 // c1 += carry

	MOVQ R14, 104(SP) // m5 = c0
	MOVQ R15, R14 // c0 = c1

	// === m6 ===
	// m6 = c0 (the C code keeps only the low 32 bits here; we keep the
	// full 64-bit register for simplicity, its upper bits being zero)
	MOVQ R14, 112(SP) // m6 = c0

	// Phase 2: reduce 385 bits into 258 bits.
	// p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C
	// m4 and m5 are full 64-bit limbs; m6 is small enough to fit 32 bits.

	// Load m values
	MOVQ 96(SP), R8   // m4
	MOVQ 104(SP), R9  // m5
	MOVQ 112(SP), R10 // m6

	// === p0 ===
	// c0 = m0, c1 = 0
	// muladd_fast(m4, NC0)
	// p0 = extract_fast()
	MOVQ 64(SP), R14 // c0 = m0
	XORQ R15, R15 // c1 = 0

	MOVQ R8, AX
	MULQ R12 // DX:AX = m4 * NC0
	ADDQ AX, R14
	ADCQ DX, R15

	MOVQ R14, 64(SP) // p0 = c0 (reusing the m0 slot)
	MOVQ R15, R14 // c0 = c1
	XORQ R15, R15 // c1 = 0
	MOVQ $0, 120(SP) // c2 = 0

	// === p1 ===
	// sumadd_fast(m1)
	// muladd(m5, NC0)
	// muladd(m4, NC1)
	// p1 = extract()
	ADDQ 72(SP), R14 // c0 += m1
	ADCQ $0, R15

	MOVQ R9, AX
	MULQ R12 // DX:AX = m5 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R8, AX
	MULQ R13 // DX:AX = m4 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 72(SP) // p1 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2
	MOVQ $0, 120(SP) // c2 = 0

	// === p2 ===
	// sumadd(m2)
	// muladd(m6, NC0)
	// muladd(m5, NC1)
	// sumadd(m4)
	// p2 = extract()
	ADDQ 80(SP), R14 // c0 += m2
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R10, AX
	MULQ R12 // DX:AX = m6 * NC0
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	MOVQ R9, AX
	MULQ R13 // DX:AX = m5 * NC1
	ADDQ AX, R14
	ADCQ DX, R15
	ADCQ $0, 120(SP)

	ADDQ R8, R14 // c0 += m4
	ADCQ $0, R15
	ADCQ $0, 120(SP)

	MOVQ R14, 80(SP) // p2 = c0
	MOVQ R15, R14 // c0 = c1
	MOVQ 120(SP), R15 // c1 = c2

	// === p3 ===
	// sumadd_fast(m3)
	// muladd_fast(m6, NC1)
	// sumadd_fast(m5)
	// p3 = extract_fast()
	ADDQ 88(SP), R14 // c0 += m3
	ADCQ $0, R15

	MOVQ R10, AX
	MULQ R13 // DX:AX = m6 * NC1
	ADDQ AX, R14
	ADCQ DX, R15

	ADDQ R9, R14 // c0 += m5
	ADCQ $0, R15

	MOVQ R14, 88(SP) // p3 = c0

	// === p4 ===
	// p4 = c1 + m6
	ADDQ R15, R10 // p4 = c1 + m6
	MOVQ R10, 96(SP) // p4 (reusing the m4 slot)

	// Phase 3: reduce 258 bits into 256 bits.
	// r[0..3] = p[0..3] + p[4] * SECP256K1_N_C,
	// then check for overflow and reduce once more if needed.
	// This phase is plain 128-bit multiply-accumulate arithmetic:
	// t = p0 + p4 * NC0, and so on limb by limb.
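	//
	// In Go terms (a sketch; p0..p4 assumed uint64, NC0/NC1 as above;
	// carry cannot overflow because hi <= 2^64 - 2):
	//
	//	hi, lo := bits.Mul64(p4, NC0)
	//	r0, k := bits.Add64(p0, lo, 0)
	//	carry := hi + k
	//	// then r1 = p1 + p4*NC1 + carry, r2 = p2 + p4 + carry,
	//	// and r3 = p3 + carry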
	MOVQ 96(SP), R11 // p4

	// r0 = (p0 + p4 * NC0) mod 2^64, carry into the next limb
	MOVQ R11, AX
	MULQ R12 // DX:AX = p4 * NC0
	ADDQ 64(SP), AX // AX = p0 + lo
	ADCQ $0, DX // DX = hi + carry
	MOVQ AX, R8 // r0
	MOVQ DX, R14 // carry

	// r1 = p1 + p4 * NC1 + carry
	MOVQ R11, AX
	MULQ R13 // DX:AX = p4 * NC1
	ADDQ R14, AX // AX += carry
	ADCQ $0, DX
	ADDQ 72(SP), AX // AX += p1
	ADCQ $0, DX
	MOVQ AX, R9 // r1
	MOVQ DX, R14 // carry

	// r2 = p2 + p4 * NC2 + carry = p2 + p4 + carry
	MOVQ 80(SP), AX
	ADDQ R14, AX // AX = p2 + carry
	MOVQ $0, DX
	ADCQ $0, DX
	ADDQ R11, AX // AX += p4 (NC2 = 1)
	ADCQ $0, DX
	MOVQ AX, R10 // r2
	MOVQ DX, R14 // carry

	// r3 = p3 + carry
	MOVQ 88(SP), AX
	ADDQ R14, AX
	SETCS R14B // final carry
	MOVQ AX, R11 // r3

	// Check if we need to reduce (carry set or result >= n)
	TESTB R14B, R14B
	JNZ mul_do_final_reduce

	// Compare with n, most significant limb first
	MOVQ $0xFFFFFFFFFFFFFFFF, R15 // n[3]
	CMPQ R11, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xFFFFFFFFFFFFFFFE, R15 // n[2]
	CMPQ R10, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xBAAEDCE6AF48A03B, R15 // n[1]
	CMPQ R9, R15
	JB mul_store_result
	JA mul_do_final_reduce
	MOVQ $0xBFD25E8CD0364141, R15 // n[0]
	CMPQ R8, R15
	JB mul_store_result
	// Fall through when the result equals n exactly.

mul_do_final_reduce:
	// Add 2^256 - n
	ADDQ R12, R8 // r0 += NC0
	ADCQ R13, R9 // r1 += NC1
	ADCQ $1, R10 // r2 += NC2 = 1
	ADCQ $0, R11 // r3 += 0

mul_store_result:
	// Store result
	MOVQ r+0(FP), DI
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	VZEROUPPER
	RET
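
// End-to-end, the multiply can be cross-checked against math/big in tests
// (a sketch; scalarToBig and bigToScalar are hypothetical helpers that
// convert between the four little-endian limbs and a big.Int):
//
//	func mulModNRef(r, a, b *Scalar) {
//		n, _ := new(big.Int).SetString(
//			"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16)
//		prod := new(big.Int).Mul(scalarToBig(a), scalarToBig(b))
//		bigToScalar(r, prod.Mod(prod, n))
//	}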