//go:build amd64 && !purego package p256k1 // ============================================================================= // AMD64-optimized Generator Precomputation using Group4x64 // ============================================================================= // // This file contains optimized generator multiplication using the faster // Group4x64 field representation with BMI2 MULX instructions. // Precomputed tables in 4x64 format for faster operations var ( // preGenG4x64 contains odd multiples of G in 4x64 format preGenG4x64 [genTableSize]GroupElement4x64Affine // preGenLambdaG4x64 contains odd multiples of λ*G in 4x64 format preGenLambdaG4x64 [genTableSize]GroupElement4x64Affine // gen4x64TablesInitialized tracks whether the 4x64 tables have been computed gen4x64TablesInitialized bool ) // initGen4x64Tables converts the precomputed tables to 4x64 format func initGen4x64Tables() { if gen4x64TablesInitialized { return } // Ensure base tables are initialized first initGenTables() // Convert preGenG to 4x64 format for i := 0; i < genTableSize; i++ { preGenG4x64[i].FromGroupElementAffine(&preGenG[i]) } // Convert preGenLambdaG to 4x64 format for i := 0; i < genTableSize; i++ { preGenLambdaG4x64[i].FromGroupElementAffine(&preGenLambdaG[i]) } gen4x64TablesInitialized = true } // ecmultGenGLV4x64 computes r = k * G using 4x64 optimized operations // This is significantly faster than the generic version on AMD64 func ecmultGenGLV4x64(r *GroupElementJacobian, k *Scalar) { if k.isZero() { r.setInfinity() return } // Ensure tables are initialized initGen4x64Tables() // Split scalar using GLV: k = k1 + k2*λ var k1, k2 Scalar scalarSplitLambda(&k1, &k2, k) // Normalize k1 and k2 to be "low" (not high) neg1 := k1.isHigh() if neg1 { k1.negate(&k1) } neg2 := k2.isHigh() if neg2 { k2.negate(&k2) } // Convert to wNAF var wnaf1, wnaf2 [257]int8 bits1 := k1.wNAF(&wnaf1, genWindowSize) bits2 := k2.wNAF(&wnaf2, genWindowSize) // Find maximum bit position maxBits := bits1 if bits2 > maxBits { maxBits = bits2 } // Perform Strauss algorithm using 4x64 operations var r4x64 GroupElement4x64Jacobian r4x64.setInfinity() for i := maxBits - 1; i >= 0; i-- { // Double the result if !r4x64.isInfinity() { r4x64.double(&r4x64) } // Add contribution from k1 (using preGenG4x64 table) if i < bits1 && wnaf1[i] != 0 { var pt GroupElement4x64Affine n := int(wnaf1[i]) var idx int if n > 0 { idx = (n - 1) / 2 } else { idx = (-n - 1) / 2 } if idx < genTableSize { pt = preGenG4x64[idx] // Negate if wNAF digit is negative if n < 0 { pt.negate(&pt) } // Negate if k1 was negated during normalization if neg1 { pt.negate(&pt) } if r4x64.isInfinity() { r4x64.setGE(&pt) } else { r4x64.addGE(&r4x64, &pt) } } } // Add contribution from k2 (using preGenLambdaG4x64 table) if i < bits2 && wnaf2[i] != 0 { var pt GroupElement4x64Affine n := int(wnaf2[i]) var idx int if n > 0 { idx = (n - 1) / 2 } else { idx = (-n - 1) / 2 } if idx < genTableSize { pt = preGenLambdaG4x64[idx] // Negate if wNAF digit is negative if n < 0 { pt.negate(&pt) } // Negate if k2 was negated during normalization if neg2 { pt.negate(&pt) } if r4x64.isInfinity() { r4x64.setGE(&pt) } else { r4x64.addGE(&r4x64, &pt) } } } } // Convert result back to standard format r4x64.ToGroupElementJacobian(r) } // EcmultGen computes r = k * G using the fastest available method // On AMD64, this uses the optimized 4x64 implementation func EcmultGen(r *GroupElementJacobian, k *Scalar) { ecmultGenGLV4x64(r, k) } // EcmultGenGLV is the public interface for fast generator multiplication // r = k * G func EcmultGenGLV(r *GroupElementJacobian, k *Scalar) { ecmultGenGLV4x64(r, k) }