// ecdh_amd64.go

   1  //go:build amd64 && !purego
   2  
   3  package p256k1
   4  
   5  // AMD64-optimized Strauss+GLV+wNAF multiplication using Group4x64
   6  // This provides significant speedup for verification and ECDH operations
   7  
// ecmultStraussWNAFGLV4x64 computes r = q * a using Strauss algorithm with GLV
// and Field4x64 operations for maximum performance on AMD64 with BMI2.
//
// Outline:
//  1. Split q via the GLV endomorphism into q1 + q2*λ, obtaining the
//     transformed points p1 = a and p2 = λ*a (ecmultEndoSplit).
//  2. Normalize q1/q2: a "high" scalar is replaced by its negation and a
//     flag is recorded so the point lookups below can compensate.
//  3. Precompute odd-multiple tables for p1 and p2 in 4x64 affine form.
//  4. Convert both scalars to width-glvWNAFW wNAF digit strings.
//  5. Run a single shared double-and-add loop over both digit strings.
//
// r is set to the point at infinity when a is infinity or q is zero.
func ecmultStraussWNAFGLV4x64(r *GroupElementJacobian, a *GroupElementAffine, q *Scalar) {
	// Degenerate inputs: infinity * q is infinity.
	if a.isInfinity() {
		r.setInfinity()
		return
	}

	// ... and a * 0 is infinity.
	if q.isZero() {
		r.setInfinity()
		return
	}

	// Split scalar using GLV endomorphism: q = q1 + q2*λ
	// Also get the transformed points p1 = a, p2 = λ*a
	var q1, q2 Scalar
	var p1, p2 GroupElementAffine
	ecmultEndoSplit(&q1, &q2, &p1, &p2, q, a)

	// Normalize scalars if high. Negating the scalar here and negating the
	// looked-up table point in the main loop (via neg1/neg2) leaves the
	// overall product unchanged.
	neg1 := q1.isHigh()
	if neg1 {
		q1.negate(&q1)
	}
	neg2 := q2.isHigh()
	if neg2 {
		q2.negate(&q2)
	}

	// Build odd multiples tables in 4x64 format: entry i holds (2i+1)*p1.
	var p1Jac GroupElementJacobian
	p1Jac.setGE(&p1)

	var preA1_4x64 [glvTableSize]GroupElement4x64Affine
	buildOddMultiplesTable4x64(&preA1_4x64, &p1Jac)

	// Build table for p2 (λ*a)
	var p2Jac GroupElementJacobian
	p2Jac.setGE(&p2)

	var preA2_4x64 [glvTableSize]GroupElement4x64Affine
	buildOddMultiplesTable4x64(&preA2_4x64, &p2Jac)

	// Convert scalars to wNAF representation. wNAF digits are zero or odd,
	// so each nonzero digit maps directly onto the odd-multiples tables.
	var wnaf1, wnaf2 [257]int8

	bits1 := q1.wNAF(&wnaf1, glvWNAFW)
	bits2 := q2.wNAF(&wnaf2, glvWNAFW)

	// Find the maximum bit position so one loop covers both digit strings.
	maxBits := bits1
	if bits2 > maxBits {
		maxBits = bits2
	}

	// Perform the Strauss algorithm using 4x64 operations: scan digit
	// positions from most to least significant, sharing one doubling per
	// position across both scalars.
	var r4x64 GroupElement4x64Jacobian
	r4x64.setInfinity()

	for i := maxBits - 1; i >= 0; i-- {
		// Double the result (a no-op while the accumulator is still infinity).
		if !r4x64.isInfinity() {
			r4x64.double(&r4x64)
		}

		// Add contribution from q1
		if i < bits1 && wnaf1[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnaf1[i])

			// Map odd digit ±(2k+1) to table index k.
			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}

			// Defensive bound check; valid wNAF digits for width glvWNAFW
			// should always land inside the table.
			if idx < glvTableSize {
				pt = preA1_4x64[idx]
				// Negate if wNAF digit is negative
				if n < 0 {
					pt.negate(&pt)
				}
				// Negate if q1 was negated during normalization
				// (a second negation cancels the first, restoring the
				// original digit's sign).
				if neg1 {
					pt.negate(&pt)
				}

				// First addition initializes the accumulator directly,
				// since Jacobian addition with infinity is a special case.
				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}

		// Add contribution from q2 — identical digit handling, using the
		// λ*a table and the neg2 flag.
		if i < bits2 && wnaf2[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnaf2[i])

			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}

			if idx < glvTableSize {
				pt = preA2_4x64[idx]
				// Negate if wNAF digit is negative
				if n < 0 {
					pt.negate(&pt)
				}
				// Negate if q2 was negated during normalization
				if neg2 {
					pt.negate(&pt)
				}

				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}
	}

	// Convert result back to standard format
	r4x64.ToGroupElementJacobian(r)
}
 139  
 140  // buildOddMultiplesTable4x64 builds a precomputation table in 4x64 format
 141  func buildOddMultiplesTable4x64(pre *[glvTableSize]GroupElement4x64Affine, a *GroupElementJacobian) {
 142  	// Build odd multiples in 4x64 Jacobian coordinates
 143  	var a4x64 GroupElement4x64Jacobian
 144  	a4x64.FromGroupElementJacobian(a)
 145  
 146  	var preJac [glvTableSize]GroupElement4x64Jacobian
 147  
 148  	// pre[0] = a (which is 1*a)
 149  	preJac[0] = a4x64
 150  
 151  	if glvTableSize > 1 {
 152  		// Compute 2*a
 153  		var twoA GroupElement4x64Jacobian
 154  		twoA.double(&a4x64)
 155  
 156  		// Build odd multiples: pre[i] = pre[i-1] + 2*a for i >= 1
 157  		for i := 1; i < glvTableSize; i++ {
 158  			preJac[i].addVar(&preJac[i-1], &twoA)
 159  		}
 160  	}
 161  
 162  	// Batch convert to affine (much more efficient than individual conversions)
 163  	batchNormalize4x64(pre[:], preJac[:])
 164  }
 165  
 166  // Ecmult4x64 computes r = q * a using 4x64 optimized operations
 167  // This is the AMD64-specific optimized version
 168  func Ecmult4x64(r *GroupElementJacobian, a *GroupElementJacobian, q *Scalar) {
 169  	if a.isInfinity() {
 170  		r.setInfinity()
 171  		return
 172  	}
 173  
 174  	if q.isZero() {
 175  		r.setInfinity()
 176  		return
 177  	}
 178  
 179  	// Convert to affine for GLV multiplication
 180  	var aAff GroupElementAffine
 181  	aAff.setGEJ(a)
 182  
 183  	// Use optimized 4x64 GLV+Strauss+wNAF multiplication
 184  	ecmultStraussWNAFGLV4x64(r, &aAff, q)
 185  }
 186  
// ecmultStraussCombined4x64 computes r = na*a + ng*G using 4x64 operations.
// This is the AMD64-optimized combined Strauss algorithm.
//
// Both scalars are GLV-split, yielding four half-scalars processed in one
// shared double-and-add loop. Negation to low-form scalars is handled two
// different ways, deliberately:
//   - na1/na2: the per-call tables for a and λ*a are built from already
//     negated points, so no per-lookup negation is needed;
//   - ng1/ng2: the generator tables are fixed precomputed data, so the
//     negation flags (negNg1/negNg2) are applied at lookup time instead.
//
// Callers are expected to have excluded the degenerate cases (zero scalars,
// a at infinity) — see EcmultCombined4x64.
func ecmultStraussCombined4x64(r *GroupElementJacobian, a *GroupElementJacobian, na, ng *Scalar) {
	// Ensure 4x64 generator tables are initialized
	initGen4x64Tables()

	// Split na using GLV: na = na1 + na2*λ (scalar split only)
	var na1, na2 Scalar
	scalarSplitLambda(&na1, &na2, na)

	// Split ng using GLV: ng = ng1 + ng2*λ
	var ng1, ng2 Scalar
	scalarSplitLambda(&ng1, &ng2, ng)

	// Compute p1 = a, p2 = λ*a directly in Jacobian coordinates
	// This avoids the expensive Jacobian→Affine conversion
	var p1Jac, p2Jac GroupElementJacobian
	p1Jac = *a
	p2Jac.mulLambda(a)

	// Normalize all scalars to be "low" (not high)
	// If scalar is high, negate both scalar and point, which keeps the
	// product na1*p1 (resp. na2*p2) unchanged.
	if na1.isHigh() {
		na1.negate(&na1)
		p1Jac.negate(&p1Jac)
	}
	if na2.isHigh() {
		na2.negate(&na2)
		p2Jac.negate(&p2Jac)
	}
	// For the generator halves the tables are fixed, so remember the
	// negations and apply them per table lookup in the loop below.
	negNg1 := ng1.isHigh()
	if negNg1 {
		ng1.negate(&ng1)
	}
	negNg2 := ng2.isHigh()
	if negNg2 {
		ng2.negate(&ng2)
	}

	// Build precomputed tables for a and λ*a in 4x64 format
	// buildOddMultiplesTable4x64 handles Jacobian→Affine conversion internally
	var preA1_4x64 [glvTableSize]GroupElement4x64Affine
	buildOddMultiplesTable4x64(&preA1_4x64, &p1Jac)

	var preA2_4x64 [glvTableSize]GroupElement4x64Affine
	buildOddMultiplesTable4x64(&preA2_4x64, &p2Jac)

	// Convert all four scalars to wNAF. Note the different window widths:
	// glvWNAFW for the per-call point tables, genWindowSize for the larger
	// fixed generator tables.
	var wnafNa1, wnafNa2 [257]int8
	var wnafNg1, wnafNg2 [257]int8

	bitsNa1 := na1.wNAF(&wnafNa1, glvWNAFW)
	bitsNa2 := na2.wNAF(&wnafNa2, glvWNAFW)
	bitsNg1 := ng1.wNAF(&wnafNg1, genWindowSize)
	bitsNg2 := ng2.wNAF(&wnafNg2, genWindowSize)

	// Find maximum bit position across all four
	maxBits := bitsNa1
	if bitsNa2 > maxBits {
		maxBits = bitsNa2
	}
	if bitsNg1 > maxBits {
		maxBits = bitsNg1
	}
	if bitsNg2 > maxBits {
		maxBits = bitsNg2
	}

	// Combined Strauss loop using 4x64 operations: one doubling per digit
	// position, shared by all four digit strings.
	var r4x64 GroupElement4x64Jacobian
	r4x64.setInfinity()

	for i := maxBits - 1; i >= 0; i-- {
		// Double once (shared across all four multiplications)
		if !r4x64.isInfinity() {
			r4x64.double(&r4x64)
		}

		// Add contribution from na1 (using preA1_4x64 table).
		// wNAF digits are zero or odd; ±(2k+1) maps to table index k.
		if i < bitsNa1 && wnafNa1[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnafNa1[i])
			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}
			if idx < glvTableSize {
				pt = preA1_4x64[idx]
				if n < 0 {
					pt.negate(&pt)
				}
				// Note: scalar normalization is handled at table-build time
				// by negating p1Jac if na1 was high
				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}

		// Add contribution from na2 (using preA2_4x64 table)
		if i < bitsNa2 && wnafNa2[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnafNa2[i])
			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}
			if idx < glvTableSize {
				pt = preA2_4x64[idx]
				if n < 0 {
					pt.negate(&pt)
				}
				// Note: scalar normalization is handled at table-build time
				// by negating p2Jac if na2 was high
				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}

		// Add contribution from ng1 (using preGenG4x64 table).
		// The generator tables are fixed, so the high-scalar negation is
		// applied here via negNg1 rather than at table-build time.
		if i < bitsNg1 && wnafNg1[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnafNg1[i])
			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}
			if idx < genTableSize {
				pt = preGenG4x64[idx]
				if n < 0 {
					pt.negate(&pt)
				}
				if negNg1 {
					pt.negate(&pt)
				}
				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}

		// Add contribution from ng2 (using preGenLambdaG4x64 table)
		if i < bitsNg2 && wnafNg2[i] != 0 {
			var pt GroupElement4x64Affine
			n := int(wnafNg2[i])
			var idx int
			if n > 0 {
				idx = (n - 1) / 2
			} else {
				idx = (-n - 1) / 2
			}
			if idx < genTableSize {
				pt = preGenLambdaG4x64[idx]
				if n < 0 {
					pt.negate(&pt)
				}
				if negNg2 {
					pt.negate(&pt)
				}
				if r4x64.isInfinity() {
					r4x64.setGE(&pt)
				} else {
					r4x64.addGE(&r4x64, &pt)
				}
			}
		}
	}

	// Convert result back to standard format
	r4x64.ToGroupElementJacobian(r)
}
 371  
 372  // EcmultCombined4x64 is the AMD64-optimized version of EcmultCombined
 373  func EcmultCombined4x64(r *GroupElementJacobian, a *GroupElementJacobian, na, ng *Scalar) {
 374  	// Handle edge cases
 375  	naZero := na == nil || na.isZero()
 376  	ngZero := ng == nil || ng.isZero()
 377  	aInf := a == nil || a.isInfinity()
 378  
 379  	if naZero && ngZero {
 380  		r.setInfinity()
 381  		return
 382  	}
 383  
 384  	if naZero || aInf {
 385  		EcmultGen(r, ng)
 386  		return
 387  	}
 388  
 389  	if ngZero {
 390  		var aAff GroupElementAffine
 391  		aAff.setGEJ(a)
 392  		ecmultStraussWNAFGLV4x64(r, &aAff, na)
 393  		return
 394  	}
 395  
 396  	ecmultStraussCombined4x64(r, a, na, ng)
 397  }
 398  
 399