// ecdh_amd64.go
1 //go:build amd64 && !purego
2
3 package p256k1
4
5 // AMD64-optimized Strauss+GLV+wNAF multiplication using Group4x64
6 // This provides significant speedup for verification and ECDH operations
7
8 // ecmultStraussWNAFGLV4x64 computes r = q * a using Strauss algorithm with GLV
9 // and Field4x64 operations for maximum performance on AMD64 with BMI2.
10 func ecmultStraussWNAFGLV4x64(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
11 if a.isInfinity() {
12 r.setInfinity()
13 return
14 }
15
16 if q.isZero() {
17 r.setInfinity()
18 return
19 }
20
21 // Split scalar using GLV endomorphism: q = q1 + q2*λ
22 // Also get the transformed points p1 = a, p2 = λ*a
23 var q1, q2 Scalar
24 var p1, p2 GroupElementAffine
25 ecmultEndoSplit(&q1, &q2, &p1, &p2, q, a)
26
27 // Normalize scalars if high
28 neg1 := q1.isHigh()
29 if neg1 {
30 q1.negate(&q1)
31 }
32 neg2 := q2.isHigh()
33 if neg2 {
34 q2.negate(&q2)
35 }
36
37 // Build odd multiples tables in 4x64 format
38 var p1Jac GroupElementJacobian
39 p1Jac.setGE(&p1)
40
41 var preA1_4x64 [glvTableSize]GroupElement4x64Affine
42 buildOddMultiplesTable4x64(&preA1_4x64, &p1Jac)
43
44 // Build table for p2 (λ*a)
45 var p2Jac GroupElementJacobian
46 p2Jac.setGE(&p2)
47
48 var preA2_4x64 [glvTableSize]GroupElement4x64Affine
49 buildOddMultiplesTable4x64(&preA2_4x64, &p2Jac)
50
51 // Convert scalars to wNAF representation
52 var wnaf1, wnaf2 [257]int8
53
54 bits1 := q1.wNAF(&wnaf1, glvWNAFW)
55 bits2 := q2.wNAF(&wnaf2, glvWNAFW)
56
57 // Find the maximum bit position
58 maxBits := bits1
59 if bits2 > maxBits {
60 maxBits = bits2
61 }
62
63 // Perform the Strauss algorithm using 4x64 operations
64 var r4x64 GroupElement4x64Jacobian
65 r4x64.setInfinity()
66
67 for i := maxBits - 1; i >= 0; i-- {
68 // Double the result
69 if !r4x64.isInfinity() {
70 r4x64.double(&r4x64)
71 }
72
73 // Add contribution from q1
74 if i < bits1 && wnaf1[i] != 0 {
75 var pt GroupElement4x64Affine
76 n := int(wnaf1[i])
77
78 var idx int
79 if n > 0 {
80 idx = (n - 1) / 2
81 } else {
82 idx = (-n - 1) / 2
83 }
84
85 if idx < glvTableSize {
86 pt = preA1_4x64[idx]
87 // Negate if wNAF digit is negative
88 if n < 0 {
89 pt.negate(&pt)
90 }
91 // Negate if q1 was negated during normalization
92 if neg1 {
93 pt.negate(&pt)
94 }
95
96 if r4x64.isInfinity() {
97 r4x64.setGE(&pt)
98 } else {
99 r4x64.addGE(&r4x64, &pt)
100 }
101 }
102 }
103
104 // Add contribution from q2
105 if i < bits2 && wnaf2[i] != 0 {
106 var pt GroupElement4x64Affine
107 n := int(wnaf2[i])
108
109 var idx int
110 if n > 0 {
111 idx = (n - 1) / 2
112 } else {
113 idx = (-n - 1) / 2
114 }
115
116 if idx < glvTableSize {
117 pt = preA2_4x64[idx]
118 // Negate if wNAF digit is negative
119 if n < 0 {
120 pt.negate(&pt)
121 }
122 // Negate if q2 was negated during normalization
123 if neg2 {
124 pt.negate(&pt)
125 }
126
127 if r4x64.isInfinity() {
128 r4x64.setGE(&pt)
129 } else {
130 r4x64.addGE(&r4x64, &pt)
131 }
132 }
133 }
134 }
135
136 // Convert result back to standard format
137 r4x64.ToGroupElementJacobian(r)
138 }
139
140 // buildOddMultiplesTable4x64 builds a precomputation table in 4x64 format
141 func buildOddMultiplesTable4x64(pre *[glvTableSize]GroupElement4x64Affine, a *GroupElementJacobian) {
142 // Build odd multiples in 4x64 Jacobian coordinates
143 var a4x64 GroupElement4x64Jacobian
144 a4x64.FromGroupElementJacobian(a)
145
146 var preJac [glvTableSize]GroupElement4x64Jacobian
147
148 // pre[0] = a (which is 1*a)
149 preJac[0] = a4x64
150
151 if glvTableSize > 1 {
152 // Compute 2*a
153 var twoA GroupElement4x64Jacobian
154 twoA.double(&a4x64)
155
156 // Build odd multiples: pre[i] = pre[i-1] + 2*a for i >= 1
157 for i := 1; i < glvTableSize; i++ {
158 preJac[i].addVar(&preJac[i-1], &twoA)
159 }
160 }
161
162 // Batch convert to affine (much more efficient than individual conversions)
163 batchNormalize4x64(pre[:], preJac[:])
164 }
165
166 // Ecmult4x64 computes r = q * a using 4x64 optimized operations
167 // This is the AMD64-specific optimized version
168 func Ecmult4x64(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) {
169 if a.isInfinity() {
170 r.setInfinity()
171 return
172 }
173
174 if q.isZero() {
175 r.setInfinity()
176 return
177 }
178
179 // Convert to affine for GLV multiplication
180 var aAff GroupElementAffine
181 aAff.setGEJ(a)
182
183 // Use optimized 4x64 GLV+Strauss+wNAF multiplication
184 ecmultStraussWNAFGLV4x64(r, &aAff, q)
185 }
186
187 // ecmultStraussCombined4x64 computes r = na*a + ng*G using 4x64 operations
188 // This is the AMD64-optimized combined Strauss algorithm
189 func ecmultStraussCombined4x64(r *GroupElementJacobian, a *GroupElementJacobian, na, ng *Scalar) {
190 // Ensure 4x64 generator tables are initialized
191 initGen4x64Tables()
192
193 // Split na using GLV: na = na1 + na2*λ (scalar split only)
194 var na1, na2 Scalar
195 scalarSplitLambda(&na1, &na2, na)
196
197 // Split ng using GLV: ng = ng1 + ng2*λ
198 var ng1, ng2 Scalar
199 scalarSplitLambda(&ng1, &ng2, ng)
200
201 // Compute p1 = a, p2 = λ*a directly in Jacobian coordinates
202 // This avoids the expensive Jacobian→Affine conversion
203 var p1Jac, p2Jac GroupElementJacobian
204 p1Jac = *a
205 p2Jac.mulLambda(a)
206
207 // Normalize all scalars to be "low" (not high)
208 // If scalar is high, negate both scalar and point
209 if na1.isHigh() {
210 na1.negate(&na1)
211 p1Jac.negate(&p1Jac)
212 }
213 if na2.isHigh() {
214 na2.negate(&na2)
215 p2Jac.negate(&p2Jac)
216 }
217 negNg1 := ng1.isHigh()
218 if negNg1 {
219 ng1.negate(&ng1)
220 }
221 negNg2 := ng2.isHigh()
222 if negNg2 {
223 ng2.negate(&ng2)
224 }
225
226 // Build precomputed tables for a and λ*a in 4x64 format
227 // buildOddMultiplesTable4x64 handles Jacobian→Affine conversion internally
228 var preA1_4x64 [glvTableSize]GroupElement4x64Affine
229 buildOddMultiplesTable4x64(&preA1_4x64, &p1Jac)
230
231 var preA2_4x64 [glvTableSize]GroupElement4x64Affine
232 buildOddMultiplesTable4x64(&preA2_4x64, &p2Jac)
233
234 // Convert all four scalars to wNAF
235 var wnafNa1, wnafNa2 [257]int8
236 var wnafNg1, wnafNg2 [257]int8
237
238 bitsNa1 := na1.wNAF(&wnafNa1, glvWNAFW)
239 bitsNa2 := na2.wNAF(&wnafNa2, glvWNAFW)
240 bitsNg1 := ng1.wNAF(&wnafNg1, genWindowSize)
241 bitsNg2 := ng2.wNAF(&wnafNg2, genWindowSize)
242
243 // Find maximum bit position across all four
244 maxBits := bitsNa1
245 if bitsNa2 > maxBits {
246 maxBits = bitsNa2
247 }
248 if bitsNg1 > maxBits {
249 maxBits = bitsNg1
250 }
251 if bitsNg2 > maxBits {
252 maxBits = bitsNg2
253 }
254
255 // Combined Strauss loop using 4x64 operations
256 var r4x64 GroupElement4x64Jacobian
257 r4x64.setInfinity()
258
259 for i := maxBits - 1; i >= 0; i-- {
260 // Double once (shared across all four multiplications)
261 if !r4x64.isInfinity() {
262 r4x64.double(&r4x64)
263 }
264
265 // Add contribution from na1 (using preA1_4x64 table)
266 if i < bitsNa1 && wnafNa1[i] != 0 {
267 var pt GroupElement4x64Affine
268 n := int(wnafNa1[i])
269 var idx int
270 if n > 0 {
271 idx = (n - 1) / 2
272 } else {
273 idx = (-n - 1) / 2
274 }
275 if idx < glvTableSize {
276 pt = preA1_4x64[idx]
277 if n < 0 {
278 pt.negate(&pt)
279 }
280 // Note: scalar normalization is handled at table-build time
281 // by negating p1Jac if na1 was high
282 if r4x64.isInfinity() {
283 r4x64.setGE(&pt)
284 } else {
285 r4x64.addGE(&r4x64, &pt)
286 }
287 }
288 }
289
290 // Add contribution from na2 (using preA2_4x64 table)
291 if i < bitsNa2 && wnafNa2[i] != 0 {
292 var pt GroupElement4x64Affine
293 n := int(wnafNa2[i])
294 var idx int
295 if n > 0 {
296 idx = (n - 1) / 2
297 } else {
298 idx = (-n - 1) / 2
299 }
300 if idx < glvTableSize {
301 pt = preA2_4x64[idx]
302 if n < 0 {
303 pt.negate(&pt)
304 }
305 // Note: scalar normalization is handled at table-build time
306 // by negating p2Jac if na2 was high
307 if r4x64.isInfinity() {
308 r4x64.setGE(&pt)
309 } else {
310 r4x64.addGE(&r4x64, &pt)
311 }
312 }
313 }
314
315 // Add contribution from ng1 (using preGenG4x64 table)
316 if i < bitsNg1 && wnafNg1[i] != 0 {
317 var pt GroupElement4x64Affine
318 n := int(wnafNg1[i])
319 var idx int
320 if n > 0 {
321 idx = (n - 1) / 2
322 } else {
323 idx = (-n - 1) / 2
324 }
325 if idx < genTableSize {
326 pt = preGenG4x64[idx]
327 if n < 0 {
328 pt.negate(&pt)
329 }
330 if negNg1 {
331 pt.negate(&pt)
332 }
333 if r4x64.isInfinity() {
334 r4x64.setGE(&pt)
335 } else {
336 r4x64.addGE(&r4x64, &pt)
337 }
338 }
339 }
340
341 // Add contribution from ng2 (using preGenLambdaG4x64 table)
342 if i < bitsNg2 && wnafNg2[i] != 0 {
343 var pt GroupElement4x64Affine
344 n := int(wnafNg2[i])
345 var idx int
346 if n > 0 {
347 idx = (n - 1) / 2
348 } else {
349 idx = (-n - 1) / 2
350 }
351 if idx < genTableSize {
352 pt = preGenLambdaG4x64[idx]
353 if n < 0 {
354 pt.negate(&pt)
355 }
356 if negNg2 {
357 pt.negate(&pt)
358 }
359 if r4x64.isInfinity() {
360 r4x64.setGE(&pt)
361 } else {
362 r4x64.addGE(&r4x64, &pt)
363 }
364 }
365 }
366 }
367
368 // Convert result back to standard format
369 r4x64.ToGroupElementJacobian(r)
370 }
371
372 // EcmultCombined4x64 is the AMD64-optimized version of EcmultCombined
373 func EcmultCombined4x64(r *GroupElementJacobian, a *GroupElementJacobian, na, ng *Scalar) {
374 // Handle edge cases
375 naZero := na == nil || na.isZero()
376 ngZero := ng == nil || ng.isZero()
377 aInf := a == nil || a.isInfinity()
378
379 if naZero && ngZero {
380 r.setInfinity()
381 return
382 }
383
384 if naZero || aInf {
385 EcmultGen(r, ng)
386 return
387 }
388
389 if ngZero {
390 var aAff GroupElementAffine
391 aAff.setGEJ(a)
392 ecmultStraussWNAFGLV4x64(r, &aAff, na)
393 return
394 }
395
396 ecmultStraussCombined4x64(r, a, na, ng)
397 }
398
399