field_amd64.s raw
1 //go:build amd64 && !purego
2
3 #include "textflag.h"
4
// Read-only constants used by the Montgomery routines in this file.
//
// The field modulus (the Gnarl prime, 216 bits) is
//   P = 0x9563a6_b6d81bb9b02e5e5d_121e79ccd682cc99_31f9791e0f9ee4f5
// and is stored as four 64-bit limbs, least significant first (p0..p3).
// pp holds pPrime = -P^{-1} mod 2^64 = 0xf07d39ef3ea058a3.
//
// Each constant is its own file-local 8-byte symbol so that it can be used
// directly as a memory operand.

// P[0]: bits 0..63.
DATA p0<>+0(SB)/8, $0x31f9791e0f9ee4f5
GLOBL p0<>(SB), RODATA|NOPTR, $8

// P[1]: bits 64..127.
DATA p1<>+0(SB)/8, $0x121e79ccd682cc99
GLOBL p1<>(SB), RODATA|NOPTR, $8

// P[2]: bits 128..191.
DATA p2<>+0(SB)/8, $0xb6d81bb9b02e5e5d
GLOBL p2<>(SB), RODATA|NOPTR, $8

// P[3]: bits 192..215 (upper 40 bits of the limb are zero).
DATA p3<>+0(SB)/8, $0x00000000009563a6
GLOBL p3<>(SB), RODATA|NOPTR, $8

// pPrime, the per-limb Montgomery reduction factor.
DATA pp<>+0(SB)/8, $0xf07d39ef3ea058a3
GLOBL pp<>(SB), RODATA|NOPTR, $8
19
// Helper macros for montMul. They are #undef'd after the function body.
//
// MM_MULADD(w0, w1, w2, w3):
//   (R12:R11:R10:R9:R8) += R13 * (w3:w2:w1:w0)
// R13 is the 64-bit multiplier and w0..w3 are the limbs of the multiplicand,
// least significant first. R14 carries the high half of each partial product
// into the next limb; the final high half (plus carry) is folded into R12.
// Clobbers AX, DX, R14 and the flags.
#define MM_MULADD(w0, w1, w2, w3) \
	MOVQ R13, AX; \
	MULQ w0; \
	ADDQ AX, R8; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ w1; \
	ADDQ R14, R9; \
	ADCQ $0, DX; \
	ADDQ AX, R9; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ w2; \
	ADDQ R14, R10; \
	ADCQ $0, DX; \
	ADDQ AX, R10; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ w3; \
	ADDQ R14, R11; \
	ADCQ $0, DX; \
	ADDQ AX, R11; \
	ADCQ DX, R12

// MM_ROUND(ai): one CIOS outer-loop iteration for the limb of a at operand ai.
//   1. t += ai * b                    (b is read through DI)
//   2. m  = t0 * pPrime mod 2^64      (only the low half of the product is kept)
//   3. t += m * P                     (makes t0 zero mod 2^64)
//   4. t >>= 64                       (t0..t3 <- t1..t4)
// R12 (t4) is NOT cleared here; the caller does that between rounds.
#define MM_ROUND(ai) \
	MOVQ ai, R13; \
	MM_MULADD(0(DI), 8(DI), 16(DI), 24(DI)); \
	MOVQ R8, AX; \
	MULQ pp<>(SB); \
	MOVQ AX, R13; \
	MOVQ R13, AX; \
	MM_MULADD_TAIL; \
	MOVQ R9, R8; \
	MOVQ R10, R9; \
	MOVQ R11, R10; \
	MOVQ R12, R11

// MM_MULADD_TAIL: t += R13 * P, entered with AX already holding R13.
// Same carry chain as MM_MULADD, specialised to the modulus limbs.
#define MM_MULADD_TAIL \
	MULQ p0<>(SB); \
	ADDQ AX, R8; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ p1<>(SB); \
	ADDQ R14, R9; \
	ADCQ $0, DX; \
	ADDQ AX, R9; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ p2<>(SB); \
	ADDQ R14, R10; \
	ADCQ $0, DX; \
	ADDQ AX, R10; \
	ADCQ $0, DX; \
	MOVQ DX, R14; \
	MOVQ R13, AX; \
	MULQ p3<>(SB); \
	ADDQ R14, R11; \
	ADCQ $0, DX; \
	ADDQ AX, R11; \
	ADCQ DX, R12

// func montMul(r, a, b *fe)
//
// Montgomery product using the 4-limb CIOS (coarsely integrated operand
// scanning) method:
//
//   r = a * b * R^{-1} mod P,   R = 2^256.
//
// a and b are each read as four 64-bit limbs at offsets 0, 8, 16, 24
// (least significant first); r receives four limbs in the same layout.
// All stores to r happen after the last load from a and b, so r may alias
// either input.
//
// NOTE(review): the single conditional subtraction at the end, and the
// absence of a sixth accumulator limb, assume a and b are already reduced
// (< P) - confirm against callers.
//
// Registers:
//   BX            r
//   SI, DI        a, b
//   R8..R11       accumulator limbs t0..t3
//   R12           accumulator overflow limb t4
//   R13           current multiplier (a[i], then m)
//   R14           carry between limbs
//   AX, DX        MULQ operand / result
//   CX            scratch in the final subtraction
TEXT ·montMul(SB), NOSPLIT, $0-24
	MOVQ r+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), DI

	// t = 0
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

	// Four outer rounds, one per limb of a. After each of the first three
	// the overflow limb has been shifted down into R11, so R12 restarts at 0.
	MM_ROUND(0(SI))
	XORQ R12, R12

	MM_ROUND(8(SI))
	XORQ R12, R12

	MM_ROUND(16(SI))
	XORQ R12, R12

	MM_ROUND(24(SI))

	// Final reduction: compute t - P into (R13:DX:CX:AX). A borrow out of
	// the top limb (CF = 1) means t < P, in which case the CMOVs restore
	// the unsubtracted value. No branch is taken on the comparison result.
	MOVQ R8, AX
	MOVQ R9, CX
	MOVQ R10, DX
	MOVQ R11, R13

	SUBQ p0<>(SB), AX
	SBBQ p1<>(SB), CX
	SBBQ p2<>(SB), DX
	SBBQ p3<>(SB), R13

	CMOVQCS R8, AX
	CMOVQCS R9, CX
	CMOVQCS R10, DX
	CMOVQCS R11, R13

	// Write the result limbs.
	MOVQ AX, 0(BX)
	MOVQ CX, 8(BX)
	MOVQ DX, 16(BX)
	MOVQ R13, 24(BX)
	RET

#undef MM_ROUND
#undef MM_MULADD_TAIL
#undef MM_MULADD
356
// Helper macros for montSquare. They are #undef'd after the function body.
//
// SQ_ACC_FIRST(w): t0 += R13 * w; R14 = carry into the next limb.
#define SQ_ACC_FIRST(w) \
	MOVQ R13, AX; \
	MULQ w; \
	ADDQ AX, R8; \
	ADCQ $0, DX; \
	MOVQ DX, R14

// SQ_ACC_MID(w, t): t += R14 + R13 * w; R14 = carry into the next limb.
#define SQ_ACC_MID(w, t) \
	MOVQ R13, AX; \
	MULQ w; \
	ADDQ R14, t; \
	ADCQ $0, DX; \
	ADDQ AX, t; \
	ADCQ $0, DX; \
	MOVQ DX, R14

// SQ_ACC_LAST(w): t3 += R14 + R13 * w; the high half and carry go into t4.
#define SQ_ACC_LAST(w) \
	MOVQ R13, AX; \
	MULQ w; \
	ADDQ R14, R11; \
	ADCQ $0, DX; \
	ADDQ AX, R11; \
	ADCQ DX, R12

// SQ_ROUND(ai): one CIOS iteration for the limb of a at operand ai:
// accumulate ai * a (second operand read through DI), derive
// m = t0 * pPrime mod 2^64, accumulate m * P, then shift the accumulator
// down one limb. R12 is left for the caller to clear.
#define SQ_ROUND(ai) \
	MOVQ ai, R13; \
	SQ_ACC_FIRST(0(DI)); \
	SQ_ACC_MID(8(DI), R9); \
	SQ_ACC_MID(16(DI), R10); \
	SQ_ACC_LAST(24(DI)); \
	MOVQ R8, AX; \
	MULQ pp<>(SB); \
	MOVQ AX, R13; \
	SQ_ACC_FIRST(p0<>(SB)); \
	SQ_ACC_MID(p1<>(SB), R9); \
	SQ_ACC_MID(p2<>(SB), R10); \
	SQ_ACC_LAST(p3<>(SB)); \
	MOVQ R9, R8; \
	MOVQ R10, R9; \
	MOVQ R11, R10; \
	MOVQ R12, R11

// func montSquare(r, a *fe)
//
// Montgomery square: r = a^2 * R^{-1} mod P, with R = 2^256 (four 64-bit
// limbs per round, four rounds).
//
// This is the generic CIOS multiplication with both operands pointing at a
// (DI is set equal to SI); no squaring-specific shortcuts are taken. a is
// read as four 64-bit limbs at offsets 0..24, least significant first, and
// r is written only after the last read of a, so r may alias a.
//
// NOTE(review): as with the multiply routine, a single trailing conditional
// subtraction assumes a < P - confirm against callers.
//
// Registers: BX = r; SI = DI = a; R8..R11 = t0..t3; R12 = t4;
// R13 = multiplier (a[i] or m); R14 = inter-limb carry; AX, DX = MULQ;
// CX = scratch in the final subtraction.
TEXT ·montSquare(SB), NOSPLIT, $0-16
	MOVQ r+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ SI, DI              // second operand is a itself

	// t = 0
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

	// One round per limb of a; t4 restarts at zero after each shift.
	SQ_ROUND(0(SI))
	XORQ R12, R12

	SQ_ROUND(8(SI))
	XORQ R12, R12

	SQ_ROUND(16(SI))
	XORQ R12, R12

	SQ_ROUND(24(SI))

	// Branch-free final reduction: take t - P unless the subtraction
	// borrows (CF = 1), in which case keep t.
	MOVQ R8, AX
	MOVQ R9, CX
	MOVQ R10, DX
	MOVQ R11, R13

	SUBQ p0<>(SB), AX
	SBBQ p1<>(SB), CX
	SBBQ p2<>(SB), DX
	SBBQ p3<>(SB), R13

	CMOVQCS R8, AX
	CMOVQCS R9, CX
	CMOVQCS R10, DX
	CMOVQCS R11, R13

	// Write the result limbs.
	MOVQ AX, 0(BX)
	MOVQ CX, 8(BX)
	MOVQ DX, 16(BX)
	MOVQ R13, 24(BX)
	RET

#undef SQ_ROUND
#undef SQ_ACC_LAST
#undef SQ_ACC_MID
#undef SQ_ACC_FIRST
632