field_4x64.go raw
1 //go:build amd64 && !purego
2
3 package p256k1
4
5 import (
6 "crypto/subtle"
7 "errors"
8 "math/bits"
9 "unsafe"
10 )
11
12 var errField4x64InvalidLen = errors.New("field element must be 32 bytes")
13
14 // Field4x64 represents a field element using 4x64-bit limbs with lazy reduction.
15 // This representation allows fewer partial products (16 vs 25 for 5x52) and
16 // deferred reduction for sequences of additions.
17 //
18 // The value is: n[0] + n[1]*2^64 + n[2]*2^128 + n[3]*2^192
19 //
20 // Magnitude tracks how many additions have occurred since the last reduction.
21 // Multiplication requires magnitude ≤ 1 (fully reduced inputs).
22 type Field4x64 struct {
23 n [4]uint64
24 magnitude int // 1 = normalized, increases with additions
25 normalized bool // whether the field element is fully normalized
26 }
27
28 // secp256k1 field prime: p = 2^256 - 2^32 - 977
29 // p = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
30 var field4x64Prime = [4]uint64{
31 0xFFFFFFFEFFFFFC2F, // n[0]
32 0xFFFFFFFFFFFFFFFF, // n[1]
33 0xFFFFFFFFFFFFFFFF, // n[2]
34 0xFFFFFFFFFFFFFFFF, // n[3]
35 }
36
37 // 2^256 mod p = 2^32 + 977 = 0x1000003D1
38 const field4x64R = uint64(0x1000003D1)
39
40 // SetZero sets f to zero.
41 func (f *Field4x64) SetZero() *Field4x64 {
42 f.n[0], f.n[1], f.n[2], f.n[3] = 0, 0, 0, 0
43 f.magnitude = 0
44 f.normalized = true
45 return f
46 }
47
48 // SetOne sets f to one.
49 func (f *Field4x64) SetOne() *Field4x64 {
50 f.n[0], f.n[1], f.n[2], f.n[3] = 1, 0, 0, 0
51 f.magnitude = 1
52 f.normalized = true
53 return f
54 }
55
56 // Set copies a into f.
57 func (f *Field4x64) Set(a *Field4x64) *Field4x64 {
58 f.n = a.n
59 f.magnitude = a.magnitude
60 f.normalized = a.normalized
61 return f
62 }
63
64 // IsZero returns true if f is zero.
65 func (f *Field4x64) IsZero() bool {
66 // Must be normalized first
67 return f.n[0] == 0 && f.n[1] == 0 && f.n[2] == 0 && f.n[3] == 0
68 }
69
70 // Equal returns true if f equals a.
71 func (f *Field4x64) Equal(a *Field4x64) bool {
72 // Both must be normalized
73 return f.n[0] == a.n[0] && f.n[1] == a.n[1] && f.n[2] == a.n[2] && f.n[3] == a.n[3]
74 }
75
76 // Reduce reduces f modulo p, setting magnitude to 1.
77 // Uses the identity: 2^256 ≡ 2^32 + 977 (mod p)
78 func (f *Field4x64) Reduce() *Field4x64 {
79 // Fast path: already normalized
80 if f.normalized {
81 return f
82 }
83
84 n0, n1, n2, n3 := f.n[0], f.n[1], f.n[2], f.n[3]
85
86 // Reduce: if value >= p, subtract p
87 // Check if n >= p by comparing from high limb
88 var c uint64
89 var borrow uint64
90
91 // Subtract p and check for borrow
92 t0, borrow := bits.Sub64(n0, field4x64Prime[0], 0)
93 t1, borrow := bits.Sub64(n1, field4x64Prime[1], borrow)
94 t2, borrow := bits.Sub64(n2, field4x64Prime[2], borrow)
95 t3, borrow := bits.Sub64(n3, field4x64Prime[3], borrow)
96
97 // If no borrow, result is valid (n >= p), use subtracted value
98 // If borrow, n < p, keep original
99 c = borrow - 1 // c = 0xFFFF... if no borrow, 0 if borrow
100
101 f.n[0] = (t0 & c) | (n0 & ^c)
102 f.n[1] = (t1 & c) | (n1 & ^c)
103 f.n[2] = (t2 & c) | (n2 & ^c)
104 f.n[3] = (t3 & c) | (n3 & ^c)
105 f.magnitude = 1
106 f.normalized = true
107
108 return f
109 }
110
111 // Add computes f = a + b (mod p) with lazy reduction.
112 func (f *Field4x64) Add(a, b *Field4x64) *Field4x64 {
113 var carry uint64
114
115 f.n[0], carry = bits.Add64(a.n[0], b.n[0], 0)
116 f.n[1], carry = bits.Add64(a.n[1], b.n[1], carry)
117 f.n[2], carry = bits.Add64(a.n[2], b.n[2], carry)
118 f.n[3], carry = bits.Add64(a.n[3], b.n[3], carry)
119
120 // If carry, we overflowed 2^256, reduce by adding R = 2^256 mod p
121 if carry != 0 {
122 f.n[0], carry = bits.Add64(f.n[0], field4x64R, 0)
123 f.n[1], carry = bits.Add64(f.n[1], 0, carry)
124 f.n[2], carry = bits.Add64(f.n[2], 0, carry)
125 f.n[3], _ = bits.Add64(f.n[3], 0, carry)
126 }
127
128 f.magnitude = a.magnitude + b.magnitude
129 f.normalized = false
130 if f.magnitude > 8 { // Reduce if magnitude gets too high
131 f.Reduce()
132 }
133
134 return f
135 }
136
137 // Sub computes f = a - b (mod p).
138 func (f *Field4x64) Sub(a, b *Field4x64) *Field4x64 {
139 var borrow uint64
140
141 f.n[0], borrow = bits.Sub64(a.n[0], b.n[0], 0)
142 f.n[1], borrow = bits.Sub64(a.n[1], b.n[1], borrow)
143 f.n[2], borrow = bits.Sub64(a.n[2], b.n[2], borrow)
144 f.n[3], borrow = bits.Sub64(a.n[3], b.n[3], borrow)
145
146 // If borrow, we went negative, add p back
147 if borrow != 0 {
148 var carry uint64
149 f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0)
150 f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry)
151 f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry)
152 f.n[3], _ = bits.Add64(f.n[3], field4x64Prime[3], carry)
153 }
154
155 f.magnitude = 1
156 f.normalized = false
157 return f
158 }
159
160 // Negate computes f = -a (mod p).
161 // Handles inputs that may be in range [0, 2p).
162 func (f *Field4x64) Negate(a *Field4x64) *Field4x64 {
163 // -a = p - a (when a != 0)
164 if a.IsZero() {
165 return f.SetZero()
166 }
167
168 // Compute p - a. If a >= p, this will underflow (borrow != 0).
169 var borrow uint64
170 f.n[0], borrow = bits.Sub64(field4x64Prime[0], a.n[0], 0)
171 f.n[1], borrow = bits.Sub64(field4x64Prime[1], a.n[1], borrow)
172 f.n[2], borrow = bits.Sub64(field4x64Prime[2], a.n[2], borrow)
173 f.n[3], borrow = bits.Sub64(field4x64Prime[3], a.n[3], borrow)
174
175 // If there was a borrow, a >= p. We need to compute p - (a - p) = 2p - a.
176 // But since we computed p - a which underflowed, the result is 2^256 + p - a = 2^256 - (a - p).
177 // We need to add p to get: 2^256 + 2p - a. Then subtract 2^256 (which is implicit).
178 // Actually: if a in [p, 2p), then -a = p - a is negative, and we wrapped to 2^256 + p - a.
179 // We want the result in [0, p). Since a < 2p, we have a - p < p, so -(a-p) = p - (a-p) = 2p - a.
180 // The wrapped result 2^256 + p - a needs to be converted to 2p - a by adding p (mod 2^256).
181 if borrow != 0 {
182 var carry uint64
183 f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0)
184 f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry)
185 f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry)
186 f.n[3], _ = bits.Add64(f.n[3], field4x64Prime[3], carry)
187 }
188
189 f.magnitude = 1
190 f.normalized = false
191 return f
192 }
193
194 // MulPureGo computes f = a * b (mod p) using pure Go.
195 // This is the reference implementation; assembly version is faster.
196 func (f *Field4x64) MulPureGo(a, b *Field4x64) *Field4x64 {
197 // Ensure inputs are reduced
198 if a.magnitude > 1 {
199 a.Reduce()
200 }
201 if b.magnitude > 1 {
202 b.Reduce()
203 }
204
205 // Compute 512-bit product using schoolbook multiplication
206 // Use 128-bit accumulators (represented as hi:lo pairs)
207 // Note: bits.Mul64 returns (hi, lo) - high word first!
208 var r [8]uint64
209
210 // For each output limb r[k], accumulate all products a[i]*b[j] where i+j=k
211 // Use proper carry propagation throughout
212
213 var c0, c1, c2 uint64 // Three-word accumulator (c2:c1:c0)
214 var hi, lo uint64
215
216 // Column 0: a0*b0
217 hi, lo = bits.Mul64(a.n[0], b.n[0])
218 c0 = lo
219 c1 = hi
220 c2 = 0
221 r[0] = c0
222 c0 = c1
223 c1 = c2
224 c2 = 0
225
226 // Column 1: a0*b1 + a1*b0
227 hi, lo = bits.Mul64(a.n[0], b.n[1])
228 c0, lo = bits.Add64(c0, lo, 0)
229 c1, hi = bits.Add64(c1, hi, lo)
230 c2 += hi
231
232 hi, lo = bits.Mul64(a.n[1], b.n[0])
233 c0, lo = bits.Add64(c0, lo, 0)
234 c1, hi = bits.Add64(c1, hi, lo)
235 c2 += hi
236
237 r[1] = c0
238 c0 = c1
239 c1 = c2
240 c2 = 0
241
242 // Column 2: a0*b2 + a1*b1 + a2*b0
243 hi, lo = bits.Mul64(a.n[0], b.n[2])
244 c0, lo = bits.Add64(c0, lo, 0)
245 c1, hi = bits.Add64(c1, hi, lo)
246 c2 += hi
247
248 hi, lo = bits.Mul64(a.n[1], b.n[1])
249 c0, lo = bits.Add64(c0, lo, 0)
250 c1, hi = bits.Add64(c1, hi, lo)
251 c2 += hi
252
253 hi, lo = bits.Mul64(a.n[2], b.n[0])
254 c0, lo = bits.Add64(c0, lo, 0)
255 c1, hi = bits.Add64(c1, hi, lo)
256 c2 += hi
257
258 r[2] = c0
259 c0 = c1
260 c1 = c2
261 c2 = 0
262
263 // Column 3: a0*b3 + a1*b2 + a2*b1 + a3*b0
264 hi, lo = bits.Mul64(a.n[0], b.n[3])
265 c0, lo = bits.Add64(c0, lo, 0)
266 c1, hi = bits.Add64(c1, hi, lo)
267 c2 += hi
268
269 hi, lo = bits.Mul64(a.n[1], b.n[2])
270 c0, lo = bits.Add64(c0, lo, 0)
271 c1, hi = bits.Add64(c1, hi, lo)
272 c2 += hi
273
274 hi, lo = bits.Mul64(a.n[2], b.n[1])
275 c0, lo = bits.Add64(c0, lo, 0)
276 c1, hi = bits.Add64(c1, hi, lo)
277 c2 += hi
278
279 hi, lo = bits.Mul64(a.n[3], b.n[0])
280 c0, lo = bits.Add64(c0, lo, 0)
281 c1, hi = bits.Add64(c1, hi, lo)
282 c2 += hi
283
284 r[3] = c0
285 c0 = c1
286 c1 = c2
287 c2 = 0
288
289 // Column 4: a1*b3 + a2*b2 + a3*b1
290 hi, lo = bits.Mul64(a.n[1], b.n[3])
291 c0, lo = bits.Add64(c0, lo, 0)
292 c1, hi = bits.Add64(c1, hi, lo)
293 c2 += hi
294
295 hi, lo = bits.Mul64(a.n[2], b.n[2])
296 c0, lo = bits.Add64(c0, lo, 0)
297 c1, hi = bits.Add64(c1, hi, lo)
298 c2 += hi
299
300 hi, lo = bits.Mul64(a.n[3], b.n[1])
301 c0, lo = bits.Add64(c0, lo, 0)
302 c1, hi = bits.Add64(c1, hi, lo)
303 c2 += hi
304
305 r[4] = c0
306 c0 = c1
307 c1 = c2
308 c2 = 0
309
310 // Column 5: a2*b3 + a3*b2
311 hi, lo = bits.Mul64(a.n[2], b.n[3])
312 c0, lo = bits.Add64(c0, lo, 0)
313 c1, hi = bits.Add64(c1, hi, lo)
314 c2 += hi
315
316 hi, lo = bits.Mul64(a.n[3], b.n[2])
317 c0, lo = bits.Add64(c0, lo, 0)
318 c1, hi = bits.Add64(c1, hi, lo)
319 c2 += hi
320
321 r[5] = c0
322 c0 = c1
323 c1 = c2
324
325 // Column 6: a3*b3
326 hi, lo = bits.Mul64(a.n[3], b.n[3])
327 c0, lo = bits.Add64(c0, lo, 0)
328 c1, _ = bits.Add64(c1, hi, lo)
329
330 r[6] = c0
331 r[7] = c1
332
333 // Reduce 512-bit result modulo p
334 f.reduce512(&r)
335
336 return f
337 }
338
339 // reduce512 reduces a 512-bit value to 256 bits modulo p.
340 // Uses the identity: 2^256 ≡ 2^32 + 977 (mod p)
341 func (f *Field4x64) reduce512(r *[8]uint64) {
342 // High part (r[4..7]) needs to be multiplied by R = 2^32 + 977 and added to low part
343
344 // Step 1: Compute r[4..7] * R and add to r[0..3]
345 // R = 0x1000003D1
346 // Note: bits.Mul64 returns (hi, lo)
347
348 var hi, lo, carry uint64
349 var t [5]uint64 // Accumulator for r[4..7] * R
350
351 // r[4] * R
352 hi, lo = bits.Mul64(r[4], field4x64R)
353 t[0] = lo
354 t[1] = hi
355
356 // r[5] * R
357 hi, lo = bits.Mul64(r[5], field4x64R)
358 t[1], carry = bits.Add64(t[1], lo, 0)
359 t[2] = hi + carry
360
361 // r[6] * R
362 hi, lo = bits.Mul64(r[6], field4x64R)
363 t[2], carry = bits.Add64(t[2], lo, 0)
364 t[3] = hi + carry
365
366 // r[7] * R
367 hi, lo = bits.Mul64(r[7], field4x64R)
368 t[3], carry = bits.Add64(t[3], lo, 0)
369 t[4] = hi + carry
370
371 // Add t to r[0..3]
372 var c uint64
373 f.n[0], c = bits.Add64(r[0], t[0], 0)
374 f.n[1], c = bits.Add64(r[1], t[1], c)
375 f.n[2], c = bits.Add64(r[2], t[2], c)
376 f.n[3], c = bits.Add64(r[3], t[3], c)
377
378 // Handle overflow from t[4] + carry
379 overflow := t[4] + c
380
381 // If there's overflow, multiply it by R and add again
382 if overflow != 0 {
383 hi, lo = bits.Mul64(overflow, field4x64R)
384 f.n[0], c = bits.Add64(f.n[0], lo, 0)
385 f.n[1], c = bits.Add64(f.n[1], hi, c)
386 f.n[2], c = bits.Add64(f.n[2], 0, c)
387 f.n[3], c = bits.Add64(f.n[3], 0, c)
388
389 // One more reduction if still overflowing (very rare)
390 if c != 0 {
391 f.n[0], c = bits.Add64(f.n[0], field4x64R, 0)
392 f.n[1], c = bits.Add64(f.n[1], 0, c)
393 f.n[2], c = bits.Add64(f.n[2], 0, c)
394 f.n[3], _ = bits.Add64(f.n[3], 0, c)
395 }
396 }
397
398 f.magnitude = 1
399 f.Reduce() // Final reduction to ensure result < p
400 }
401
402 // Sqr computes f = a^2 (mod p).
403 // Uses optimized assembly that exploits squaring symmetry.
404 func (f *Field4x64) Sqr(a *Field4x64) *Field4x64 {
405 // Ensure input is reduced
406 if a.magnitude > 1 {
407 a.Reduce()
408 }
409 field4x64SqrAsm(&f.n, &a.n)
410 f.magnitude = 1
411 f.normalized = false
412 return f
413 }
414
415 // Mul computes f = a * b (mod p).
416 // Uses BMI2 MULX assembly for high performance.
417 func (f *Field4x64) Mul(a, b *Field4x64) *Field4x64 {
418 // Ensure inputs are reduced
419 if a.magnitude > 1 {
420 a.Reduce()
421 }
422 if b.magnitude > 1 {
423 b.Reduce()
424 }
425 field4x64MulAsm(&f.n, &a.n, &b.n)
426 f.magnitude = 1
427 f.normalized = false
428 return f
429 }
430
431 // SetB32 sets f from a 32-byte big-endian representation.
432 func (f *Field4x64) SetB32(b []byte) error {
433 if len(b) != 32 {
434 return errField4x64InvalidLen
435 }
436
437 // Big-endian: b[0] is most significant
438 f.n[3] = uint64(b[0])<<56 | uint64(b[1])<<48 | uint64(b[2])<<40 | uint64(b[3])<<32 |
439 uint64(b[4])<<24 | uint64(b[5])<<16 | uint64(b[6])<<8 | uint64(b[7])
440 f.n[2] = uint64(b[8])<<56 | uint64(b[9])<<48 | uint64(b[10])<<40 | uint64(b[11])<<32 |
441 uint64(b[12])<<24 | uint64(b[13])<<16 | uint64(b[14])<<8 | uint64(b[15])
442 f.n[1] = uint64(b[16])<<56 | uint64(b[17])<<48 | uint64(b[18])<<40 | uint64(b[19])<<32 |
443 uint64(b[20])<<24 | uint64(b[21])<<16 | uint64(b[22])<<8 | uint64(b[23])
444 f.n[0] = uint64(b[24])<<56 | uint64(b[25])<<48 | uint64(b[26])<<40 | uint64(b[27])<<32 |
445 uint64(b[28])<<24 | uint64(b[29])<<16 | uint64(b[30])<<8 | uint64(b[31])
446
447 f.magnitude = 1
448 f.normalized = false
449 return nil
450 }
451
452 // GetB32 writes f to a 32-byte big-endian representation.
453 func (f *Field4x64) GetB32(b []byte) {
454 if len(b) != 32 {
455 panic("GetB32: buffer must be 32 bytes")
456 }
457
458 // Ensure normalized
459 if f.magnitude > 1 {
460 f.Reduce()
461 }
462
463 // Big-endian: b[0] is most significant
464 b[0] = byte(f.n[3] >> 56)
465 b[1] = byte(f.n[3] >> 48)
466 b[2] = byte(f.n[3] >> 40)
467 b[3] = byte(f.n[3] >> 32)
468 b[4] = byte(f.n[3] >> 24)
469 b[5] = byte(f.n[3] >> 16)
470 b[6] = byte(f.n[3] >> 8)
471 b[7] = byte(f.n[3])
472 b[8] = byte(f.n[2] >> 56)
473 b[9] = byte(f.n[2] >> 48)
474 b[10] = byte(f.n[2] >> 40)
475 b[11] = byte(f.n[2] >> 32)
476 b[12] = byte(f.n[2] >> 24)
477 b[13] = byte(f.n[2] >> 16)
478 b[14] = byte(f.n[2] >> 8)
479 b[15] = byte(f.n[2])
480 b[16] = byte(f.n[1] >> 56)
481 b[17] = byte(f.n[1] >> 48)
482 b[18] = byte(f.n[1] >> 40)
483 b[19] = byte(f.n[1] >> 32)
484 b[20] = byte(f.n[1] >> 24)
485 b[21] = byte(f.n[1] >> 16)
486 b[22] = byte(f.n[1] >> 8)
487 b[23] = byte(f.n[1])
488 b[24] = byte(f.n[0] >> 56)
489 b[25] = byte(f.n[0] >> 48)
490 b[26] = byte(f.n[0] >> 40)
491 b[27] = byte(f.n[0] >> 32)
492 b[28] = byte(f.n[0] >> 24)
493 b[29] = byte(f.n[0] >> 16)
494 b[30] = byte(f.n[0] >> 8)
495 b[31] = byte(f.n[0])
496 }
497
498 // ============================================================================
499 // FieldElement-compatible methods (lowercase for internal API compatibility)
500 // ============================================================================
501
502 // normalize normalizes f to canonical form (same as Reduce).
503 func (f *Field4x64) normalize() {
504 f.Reduce()
505 }
506
507 // normalizeWeak performs weak normalization (magnitude <= 1).
508 func (f *Field4x64) normalizeWeak() {
509 if f.magnitude <= 1 {
510 return
511 }
512 f.Reduce()
513 }
514
515 // isZero returns true if f is zero (must be normalized).
516 func (f *Field4x64) isZero() bool {
517 if !f.normalized {
518 panic("field element must be normalized")
519 }
520 return f.n[0] == 0 && f.n[1] == 0 && f.n[2] == 0 && f.n[3] == 0
521 }
522
523 // isOdd returns true if f is odd (must be normalized).
524 func (f *Field4x64) isOdd() bool {
525 if !f.normalized {
526 panic("field element must be normalized")
527 }
528 return f.n[0]&1 == 1
529 }
530
531 // normalizesToZeroVar checks if f normalizes to zero (variable-time).
532 func (f *Field4x64) normalizesToZeroVar() bool {
533 var t Field4x64
534 t = *f
535 t.normalize()
536 return t.isZero()
537 }
538
539 // equal returns true if f equals a (both must be normalized).
540 func (f *Field4x64) equal(a *Field4x64) bool {
541 if !f.normalized || !a.normalized {
542 panic("field elements must be normalized for comparison")
543 }
544 return subtle.ConstantTimeCompare(
545 (*[32]byte)(unsafe.Pointer(&f.n[0]))[:32],
546 (*[32]byte)(unsafe.Pointer(&a.n[0]))[:32],
547 ) == 1
548 }
549
550 // setInt sets f to a small integer value.
551 func (f *Field4x64) setInt(a int) {
552 if a < 0 || a > 0x7FFF {
553 panic("value out of range")
554 }
555 f.n[0] = uint64(a)
556 f.n[1] = 0
557 f.n[2] = 0
558 f.n[3] = 0
559 if a == 0 {
560 f.magnitude = 0
561 } else {
562 f.magnitude = 1
563 }
564 f.normalized = true
565 }
566
567 // clear clears f to prevent leaking sensitive information.
568 func (f *Field4x64) clear() {
569 f.n[0], f.n[1], f.n[2], f.n[3] = 0, 0, 0, 0
570 f.magnitude = 0
571 f.normalized = true
572 }
573
574 // negate negates f: r = -a with given magnitude.
575 func (f *Field4x64) negate(a *Field4x64, m int) {
576 if m < 0 || m > 31 {
577 panic("magnitude out of range")
578 }
579 // For 4x64, we use a simpler approach: just compute p - a
580 // The magnitude parameter is for compatibility with 5x52
581 f.Negate(a)
582 f.magnitude = m + 1
583 f.normalized = false
584 }
585
586 // add adds a to f in place: f += a.
587 func (f *Field4x64) add(a *Field4x64) {
588 var carry uint64
589 f.n[0], carry = bits.Add64(f.n[0], a.n[0], 0)
590 f.n[1], carry = bits.Add64(f.n[1], a.n[1], carry)
591 f.n[2], carry = bits.Add64(f.n[2], a.n[2], carry)
592 f.n[3], carry = bits.Add64(f.n[3], a.n[3], carry)
593
594 if carry != 0 {
595 f.n[0], carry = bits.Add64(f.n[0], field4x64R, 0)
596 f.n[1], carry = bits.Add64(f.n[1], 0, carry)
597 f.n[2], carry = bits.Add64(f.n[2], 0, carry)
598 f.n[3], _ = bits.Add64(f.n[3], 0, carry)
599 }
600
601 f.magnitude += a.magnitude
602 f.normalized = false
603 if f.magnitude > 8 {
604 f.Reduce()
605 }
606 }
607
608 // sub subtracts a from f in place: f -= a.
609 func (f *Field4x64) sub(a *Field4x64) {
610 var negA Field4x64
611 negA.negate(a, a.magnitude)
612 f.add(&negA)
613 }
614
615 // mulInt multiplies f by a small integer.
616 func (f *Field4x64) mulInt(a int) {
617 if a < 0 || a > 32 {
618 panic("multiplier out of range")
619 }
620 ua := uint64(a)
621
622 // Multiply each limb
623 var carry uint64
624 var hi uint64
625
626 hi, f.n[0] = bits.Mul64(f.n[0], ua)
627 carry = hi
628
629 hi, f.n[1] = bits.Mul64(f.n[1], ua)
630 f.n[1], carry = bits.Add64(f.n[1], carry, 0)
631 carry = hi + carry
632
633 hi, f.n[2] = bits.Mul64(f.n[2], ua)
634 f.n[2], carry = bits.Add64(f.n[2], carry, 0)
635 carry = hi + carry
636
637 hi, f.n[3] = bits.Mul64(f.n[3], ua)
638 f.n[3], carry = bits.Add64(f.n[3], carry, 0)
639 carry = hi + carry
640
641 // Handle overflow
642 if carry != 0 {
643 var c uint64
644 hi, lo := bits.Mul64(carry, field4x64R)
645 f.n[0], c = bits.Add64(f.n[0], lo, 0)
646 f.n[1], c = bits.Add64(f.n[1], hi, c)
647 f.n[2], c = bits.Add64(f.n[2], 0, c)
648 f.n[3], _ = bits.Add64(f.n[3], 0, c)
649 }
650
651 f.magnitude *= a
652 f.normalized = false
653 }
654
655 // cmov conditionally moves a into f if flag is non-zero.
656 func (f *Field4x64) cmov(a *Field4x64, flag int) {
657 mask := uint64(-(int64(flag) & 1))
658 f.n[0] ^= mask & (f.n[0] ^ a.n[0])
659 f.n[1] ^= mask & (f.n[1] ^ a.n[1])
660 f.n[2] ^= mask & (f.n[2] ^ a.n[2])
661 f.n[3] ^= mask & (f.n[3] ^ a.n[3])
662
663 if flag != 0 {
664 f.magnitude = a.magnitude
665 f.normalized = a.normalized
666 }
667 }
668
669 // toStorage converts f to storage format.
670 func (f *Field4x64) toStorage(s *FieldElementStorage) {
671 if !f.normalized {
672 f.normalize()
673 }
674 s.n = f.n
675 }
676
677 // fromStorage converts from storage format to f.
678 func (f *Field4x64) fromStorage(s *FieldElementStorage) {
679 f.n = s.n
680 f.magnitude = 1
681 f.normalized = false
682 }
683
684 // setB32 sets f from 32-byte big-endian representation (lowercase version).
685 func (f *Field4x64) setB32(b []byte) error {
686 return f.SetB32(b)
687 }
688
689 // getB32 writes f to 32-byte big-endian representation (lowercase version).
690 func (f *Field4x64) getB32(b []byte) {
691 f.GetB32(b)
692 }
693
694 // mul multiplies two field elements: f = a * b (lowercase version).
695 func (f *Field4x64) mul(a, b *Field4x64) {
696 f.Mul(a, b)
697 }
698
699 // sqr squares a field element: f = a^2 (lowercase version).
700 func (f *Field4x64) sqr(a *Field4x64) {
701 f.Sqr(a)
702 }
703
704 // inv computes modular inverse using Fermat's little theorem: f = a^(p-2).
705 // Uses an optimized addition chain for efficiency.
706 //
707 // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
708 //
709 // Binary structure:
710 // - bits 255-33: 223 ones
711 // - bit 32: 0
712 // - bits 31-12: 20 ones
713 // - bits 11-0: 0xC2D
714 func (f *Field4x64) inv(a *Field4x64) {
715 var aNorm Field4x64
716 aNorm = *a
717 aNorm.normalize()
718
719 // Build up powers using addition chain (same as sqrt)
720 var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 Field4x64
721
722 // x2 = a^(2^2 - 1) = a^3
723 x2.sqr(&aNorm)
724 x2.mul(&x2, &aNorm)
725
726 // x3 = a^(2^3 - 1) = a^7
727 x3.sqr(&x2)
728 x3.mul(&x3, &aNorm)
729
730 // x6 = a^(2^6 - 1) = a^63
731 x6 = x3
732 for j := 0; j < 3; j++ {
733 x6.sqr(&x6)
734 }
735 x6.mul(&x6, &x3)
736
737 // x9 = a^(2^9 - 1) = a^511
738 x9 = x6
739 for j := 0; j < 3; j++ {
740 x9.sqr(&x9)
741 }
742 x9.mul(&x9, &x3)
743
744 // x11 = a^(2^11 - 1) = a^2047
745 x11 = x9
746 for j := 0; j < 2; j++ {
747 x11.sqr(&x11)
748 }
749 x11.mul(&x11, &x2)
750
751 // x22 = a^(2^22 - 1)
752 x22 = x11
753 for j := 0; j < 11; j++ {
754 x22.sqr(&x22)
755 }
756 x22.mul(&x22, &x11)
757
758 // x44 = a^(2^44 - 1)
759 x44 = x22
760 for j := 0; j < 22; j++ {
761 x44.sqr(&x44)
762 }
763 x44.mul(&x44, &x22)
764
765 // x88 = a^(2^88 - 1)
766 x88 = x44
767 for j := 0; j < 44; j++ {
768 x88.sqr(&x88)
769 }
770 x88.mul(&x88, &x44)
771
772 // x176 = a^(2^176 - 1)
773 x176 = x88
774 for j := 0; j < 88; j++ {
775 x176.sqr(&x176)
776 }
777 x176.mul(&x176, &x88)
778
779 // x220 = a^(2^220 - 1)
780 x220 = x176
781 for j := 0; j < 44; j++ {
782 x220.sqr(&x220)
783 }
784 x220.mul(&x220, &x44)
785
786 // x223 = a^(2^223 - 1)
787 x223 = x220
788 for j := 0; j < 3; j++ {
789 x223.sqr(&x223)
790 }
791 x223.mul(&x223, &x3)
792
793 // x20 = a^(2^20 - 1) (needed for bits 31-12)
794 var x20 Field4x64
795 x20 = x11
796 for j := 0; j < 9; j++ {
797 x20.sqr(&x20)
798 }
799 x20.mul(&x20, &x9)
800
801 // Assemble p-2:
802 // Start with x223 and shift by 33 bits
803 t1 = x223
804 for j := 0; j < 33; j++ {
805 t1.sqr(&t1)
806 }
807 // t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
808
809 // Add contribution for bits 31-12 (20 ones): x20 shifted by 12
810 var temp Field4x64
811 temp = x20
812 for j := 0; j < 12; j++ {
813 temp.sqr(&temp)
814 }
815 // temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12)
816
817 t1.mul(&t1, &temp)
818 // t1 = a^(2^256 - 2^32 - 2^12)
819
820 // Add bits 11-0 = 0xC2D = 3117
821 // Compute a^3117 using binary exponentiation (small number)
822 var low, base Field4x64
823 low.setInt(1)
824 base = aNorm
825 e := uint32(3117)
826 for e > 0 {
827 if e&1 == 1 {
828 low.mul(&low, &base)
829 }
830 e >>= 1
831 if e > 0 {
832 base.sqr(&base)
833 }
834 }
835 // low = a^3117
836
837 t1.mul(&t1, &low)
838 // t1 = a^(2^256 - 2^32 - 979) = a^(p-2)
839
840 *f = t1
841 f.magnitude = 1
842 f.normalized = true
843 }
844
845 // sqrt computes square root of f if it exists.
846 func (f *Field4x64) sqrt(a *Field4x64) bool {
847 // Normalize input
848 var aNorm Field4x64
849 aNorm = *a
850 if aNorm.magnitude > 8 {
851 aNorm.normalizeWeak()
852 } else {
853 aNorm.normalize()
854 }
855
856 // For p ≡ 3 (mod 4), sqrt(a) = a^((p+1)/4)
857 // Use addition chain for efficiency
858 var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 Field4x64
859
860 // x2 = a^3
861 x2.sqr(&aNorm)
862 x2.mul(&x2, &aNorm)
863
864 // x3 = a^7
865 x3.sqr(&x2)
866 x3.mul(&x3, &aNorm)
867
868 // x6 = a^63
869 x6 = x3
870 for j := 0; j < 3; j++ {
871 x6.sqr(&x6)
872 }
873 x6.mul(&x6, &x3)
874
875 // x9 = a^511
876 x9 = x6
877 for j := 0; j < 3; j++ {
878 x9.sqr(&x9)
879 }
880 x9.mul(&x9, &x3)
881
882 // x11 = a^2047
883 x11 = x9
884 for j := 0; j < 2; j++ {
885 x11.sqr(&x11)
886 }
887 x11.mul(&x11, &x2)
888
889 // x22 = a^4194303
890 x22 = x11
891 for j := 0; j < 11; j++ {
892 x22.sqr(&x22)
893 }
894 x22.mul(&x22, &x11)
895
896 // x44 = a^17592186044415
897 x44 = x22
898 for j := 0; j < 22; j++ {
899 x44.sqr(&x44)
900 }
901 x44.mul(&x44, &x22)
902
903 // x88
904 x88 = x44
905 for j := 0; j < 44; j++ {
906 x88.sqr(&x88)
907 }
908 x88.mul(&x88, &x44)
909
910 // x176
911 x176 = x88
912 for j := 0; j < 88; j++ {
913 x176.sqr(&x176)
914 }
915 x176.mul(&x176, &x88)
916
917 // x220
918 x220 = x176
919 for j := 0; j < 44; j++ {
920 x220.sqr(&x220)
921 }
922 x220.mul(&x220, &x44)
923
924 // x223
925 x223 = x220
926 for j := 0; j < 3; j++ {
927 x223.sqr(&x223)
928 }
929 x223.mul(&x223, &x3)
930
931 // Final assembly
932 t1 = x223
933 for j := 0; j < 23; j++ {
934 t1.sqr(&t1)
935 }
936 t1.mul(&t1, &x22)
937 for j := 0; j < 6; j++ {
938 t1.sqr(&t1)
939 }
940 t1.mul(&t1, &x2)
941 t1.sqr(&t1)
942 f.sqr(&t1)
943
944 // Verify: check that f^2 == a
945 var check Field4x64
946 check.sqr(f)
947 check.normalize()
948 aNorm.normalize()
949
950 return check.equal(&aNorm)
951 }
952
953 // half computes f = a/2 (mod p).
954 func (f *Field4x64) half(a *Field4x64) {
955 *f = *a
956
957 // If odd, add p first (so result is even)
958 // We need to preserve the overflow bit for the right shift
959 var overflow uint64
960 if f.n[0]&1 == 1 {
961 var carry uint64
962 f.n[0], carry = bits.Add64(f.n[0], field4x64Prime[0], 0)
963 f.n[1], carry = bits.Add64(f.n[1], field4x64Prime[1], carry)
964 f.n[2], carry = bits.Add64(f.n[2], field4x64Prime[2], carry)
965 f.n[3], overflow = bits.Add64(f.n[3], field4x64Prime[3], carry)
966 }
967
968 // Right shift by 1, with overflow bit becoming the top bit
969 f.n[0] = (f.n[0] >> 1) | (f.n[1] << 63)
970 f.n[1] = (f.n[1] >> 1) | (f.n[2] << 63)
971 f.n[2] = (f.n[2] >> 1) | (f.n[3] << 63)
972 f.n[3] = (f.n[3] >> 1) | (overflow << 63)
973
974 f.magnitude = (f.magnitude >> 1) + 1
975 f.normalized = false
976 }
977