field_mul.go raw

   1  //go:build !js && !wasm && !tinygo && !wasm32
   2  
   3  package p256k1
   4  
   5  import "math/bits"
   6  
   7  // uint128 represents a 128-bit unsigned integer for field arithmetic
   8  type uint128 struct {
   9  	high, low uint64
  10  }
  11  
  12  // mulU64ToU128 multiplies two uint64 values and returns a uint128
  13  func mulU64ToU128(a, b uint64) uint128 {
  14  	hi, lo := bits.Mul64(a, b)
  15  	return uint128{high: hi, low: lo}
  16  }
  17  
  18  // addMulU128 computes c + a*b and returns the result as uint128
  19  func addMulU128(c uint128, a, b uint64) uint128 {
  20  	hi, lo := bits.Mul64(a, b)
  21  	
  22  	// Add lo to c.low
  23  	newLo, carry := bits.Add64(c.low, lo, 0)
  24  	
  25  	// Add hi and carry to c.high
  26  	newHi, _ := bits.Add64(c.high, hi, carry)
  27  	
  28  	return uint128{high: newHi, low: newLo}
  29  }
  30  
  31  // addU128 adds a uint64 to a uint128
  32  func addU128(c uint128, a uint64) uint128 {
  33  	newLo, carry := bits.Add64(c.low, a, 0)
  34  	newHi, _ := bits.Add64(c.high, 0, carry)
  35  	return uint128{high: newHi, low: newLo}
  36  }
  37  
  38  // lo returns the lower 64 bits
  39  func (u uint128) lo() uint64 {
  40  	return u.low
  41  }
  42  
  43  // hi returns the upper 64 bits
  44  func (u uint128) hi() uint64 {
  45  	return u.high
  46  }
  47  
  48  // rshift shifts the uint128 right by n bits
  49  func (u uint128) rshift(n uint) uint128 {
  50  	if n >= 64 {
  51  		return uint128{high: 0, low: u.high >> (n - 64)}
  52  	}
  53  	return uint128{
  54  		high: u.high >> n,
  55  		low: (u.low >> n) | (u.high << (64 - n)),
  56  	}
  57  }
  58  
  59  // mul multiplies two field elements: r = a * b
  60  // This implementation follows the C secp256k1_fe_mul_inner algorithm
  61  // Optimized: avoid copies when magnitude is low enough
// mul multiplies two field elements: r = a * b.
//
// This is a direct port of the C secp256k1_fe_mul_inner algorithm for the
// 5x52-bit limb representation. On the portable path the result has
// magnitude 1 and is not normalized (fully reduced below p is not
// guaranteed).
//
// Inputs with magnitude above 8 are weakly normalized into a local copy
// first (the C algorithm's overflow analysis assumes magnitude <= 8);
// lower-magnitude inputs are used in place to avoid the copy.
//
// The statement order below is carry-critical: each addMulU128/rshift pair
// mirrors one line of the C code, so do not reorder.
func (r *FieldElement) mul(a, b *FieldElement) {
	// Use pointers directly if magnitude is low enough (avoids a 40-byte copy).
	var aNorm, bNorm *FieldElement
	var aTemp, bTemp FieldElement

	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	if b.magnitude > 8 {
		bTemp = *b
		bTemp.normalizeWeak()
		bNorm = &bTemp
	} else {
		bNorm = b // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): unlike the assembly paths below, this path does not set
	// r.magnitude/r.normalized here — presumably field4x64Mul handles that
	// itself; confirm against its implementation.
	if hasField4x64() {
		field4x64Mul(r, aNorm, bNorm)
		return
	}

	// Use BMI2+ADX assembly if available.
	if hasFieldAsmBMI2() {
		fieldMulAsmBMI2(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Use regular assembly if available.
	if hasFieldAsm() {
		fieldMulAsm(r, aNorm, bNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Portable fallback. Extract limbs for easier access.
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]
	b0, b1, b2, b3, b4 := bNorm.n[0], bNorm.n[1], bNorm.n[2], bNorm.n[3], bNorm.n[4]

	const M = 0xFFFFFFFFFFFFF     // 2^52 - 1 (52-bit limb mask)
	const R = fieldReductionConstantShifted // 0x1000003D10 = (2^32 + 977) << 4; 2^256 ≡ R >> 4 (mod p)

	// Following the C implementation algorithm exactly.
	// [... a b c] is shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
	// c accumulates the low partial products, d the high ones.

	// Compute p3 = a0*b3 + a1*b2 + a2*b1 + a3*b0
	var c, d uint128
	d = mulU64ToU128(a0, b3)
	d = addMulU128(d, a1, b2)
	d = addMulU128(d, a2, b1)
	d = addMulU128(d, a3, b0)

	// Compute p8 = a4*b4 (the topmost partial product, folded in via R)
	c = mulU64ToU128(a4, b4)

	// d += R * c_lo; c >>= 64  (reduce the 2^(52*8) term into position 3)
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 (result limb 3, pending final carries) and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
	d = addMulU128(d, a0, b4)
	d = addMulU128(d, a1, b3)
	d = addMulU128(d, a2, b2)
	d = addMulU128(d, a3, b1)
	d = addMulU128(d, a4, b0)

	// d += (R << 12) * c_lo  (remaining high bits of p8)
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4; tx holds its top 4 bits, which belong with u0 below
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4) // keep low 48 bits only

	// Compute p0 = a0*b0
	c = mulU64ToU128(a0, b0)

	// Compute p5 = a1*b4 + a2*b3 + a3*b2 + a4*b1
	d = addMulU128(d, a1, b4)
	d = addMulU128(d, a2, b3)
	d = addMulU128(d, a3, b2)
	d = addMulU128(d, a4, b1)

	// Extract u0 and recombine with tx (bits displaced from t4)
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)  (fold the 2^256 overflow back into limb 0)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = a0*b1 + a1*b0
	c = addMulU128(c, a0, b1)
	c = addMulU128(c, a1, b0)

	// Compute p6 = a2*b4 + a3*b3 + a4*b2
	d = addMulU128(d, a2, b4)
	d = addMulU128(d, a3, b3)
	d = addMulU128(d, a4, b2)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = a0*b2 + a1*b1 + a2*b0
	c = addMulU128(c, a0, b2)
	c = addMulU128(c, a1, b1)
	c = addMulU128(c, a2, b0)

	// Compute p7 = a3*b4 + a4*b3
	d = addMulU128(d, a3, b4)
	d = addMulU128(d, a4, b3)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3  (remaining high bits plus saved limb 3)
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4] — may exceed 48 bits by the carry; that is what magnitude 1 allows
	r.n[4] = c.lo() + t4

	// Result has magnitude 1 but is not guaranteed to be < p.
	r.magnitude = 1
	r.normalized = false
}
 219  
 220  // reduceFromWide reduces a 520-bit (10 limb) value modulo the field prime
 221  func (r *FieldElement) reduceFromWide(t [10]uint64) {
 222  	// The field prime is p = 2^256 - 2^32 - 977 = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
 223  	// We use the fact that 2^256 ≡ 2^32 + 977 (mod p)
 224  	
 225  	// First, handle the upper limbs (t[5] through t[9])
 226  	// Each represents a multiple of 2^(52*i) where i >= 5
 227  	
 228  	// Reduction constant for secp256k1: 2^32 + 977 = 0x1000003D1
 229  	const M = uint64(0x1000003D1)
 230  	
 231  	// Start from the highest limb and work down
 232  	for i := 9; i >= 5; i-- {
 233  		if t[i] == 0 {
 234  			continue
 235  		}
 236  		
 237  		// t[i] * 2^(52*i) ≡ t[i] * 2^(52*(i-5)) * 2^(52*5) ≡ t[i] * 2^(52*(i-5)) * 2^260
 238  		// Since 2^256 ≡ M (mod p), we have 2^260 ≡ 2^4 * M ≡ 16 * M (mod p)
 239  		
 240  		// For i=5: 2^260 ≡ 16*M (mod p)
 241  		// For i=6: 2^312 ≡ 2^52 * 16*M ≡ 2^56 * M (mod p)
 242  		// etc.
 243  		
 244  		shift := uint(52 * (i - 5) + 4) // Additional 4 bits for the 16 factor
 245  		
 246  		// Multiply t[i] by the appropriate power of M
 247  		var carry uint64
 248  		if shift < 64 {
 249  			// Simple case: can multiply directly
 250  			factor := M << shift
 251  			hi, lo := bits.Mul64(t[i], factor)
 252  			
 253  			// Add to appropriate position
 254  			pos := 0
 255  			t[pos], carry = bits.Add64(t[pos], lo, 0)
 256  			if pos+1 < 10 {
 257  				t[pos+1], carry = bits.Add64(t[pos+1], hi, carry)
 258  			}
 259  			
 260  			// Propagate carry
 261  			for j := pos + 2; j < 10 && carry != 0; j++ {
 262  				t[j], carry = bits.Add64(t[j], 0, carry)
 263  			}
 264  		} else {
 265  			// Need to handle larger shifts by distributing across limbs
 266  			hi, lo := bits.Mul64(t[i], M)
 267  			limbShift := shift / 52
 268  			bitShift := shift % 52
 269  			
 270  			if bitShift == 0 {
 271  				// Aligned to limb boundary
 272  				if limbShift < 10 {
 273  					t[limbShift], carry = bits.Add64(t[limbShift], lo, 0)
 274  					if limbShift+1 < 10 {
 275  						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hi, carry)
 276  					}
 277  				}
 278  			} else {
 279  				// Need to split across limbs
 280  				loShifted := lo << bitShift
 281  				hiShifted := (lo >> (64 - bitShift)) | (hi << bitShift)
 282  				
 283  				if limbShift < 10 {
 284  					t[limbShift], carry = bits.Add64(t[limbShift], loShifted, 0)
 285  					if limbShift+1 < 10 {
 286  						t[limbShift+1], carry = bits.Add64(t[limbShift+1], hiShifted, carry)
 287  					}
 288  				}
 289  			}
 290  			
 291  			// Propagate any remaining carry
 292  			for j := int(limbShift) + 2; j < 10 && carry != 0; j++ {
 293  				t[j], carry = bits.Add64(t[j], 0, carry)
 294  			}
 295  		}
 296  		
 297  		t[i] = 0 // Clear the processed limb
 298  	}
 299  	
 300  	// Now we have a value in t[0..4] that may still be >= p
 301  	// Convert to 5x52 format and normalize
 302  	r.n[0] = t[0] & limb0Max
 303  	r.n[1] = ((t[0] >> 52) | (t[1] << 12)) & limb0Max
 304  	r.n[2] = ((t[1] >> 40) | (t[2] << 24)) & limb0Max
 305  	r.n[3] = ((t[2] >> 28) | (t[3] << 36)) & limb0Max
 306  	r.n[4] = ((t[3] >> 16) | (t[4] << 48)) & limb4Max
 307  	
 308  	r.magnitude = 1
 309  	r.normalized = false
 310  	
 311  	// Final reduction if needed
 312  	if r.n[4] == limb4Max && r.n[3] == limb0Max && r.n[2] == limb0Max && 
 313  	   r.n[1] == limb0Max && r.n[0] >= fieldModulusLimb0 {
 314  		r.reduce()
 315  	}
 316  }
 317  
 318  // sqr squares a field element: r = a^2
 319  // This implementation follows the C secp256k1_fe_sqr_inner algorithm
 320  // Optimized: avoid copies when magnitude is low enough
// sqr squares a field element: r = a^2.
//
// This is a direct port of the C secp256k1_fe_sqr_inner algorithm for the
// 5x52-bit limb representation; it is the squaring specialization of mul
// (symmetric cross terms are computed once and doubled). On the portable
// path the result has magnitude 1 and is not normalized.
//
// An input with magnitude above 8 is weakly normalized into a local copy
// first; otherwise it is used in place.
//
// WARNING: the locals a4 and a0 are doubled in place partway through
// (a4 *= 2, a0 *= 2) and the doubled values are deliberately reused by the
// later partial products, exactly as in the C code. Do not reorder.
func (r *FieldElement) sqr(a *FieldElement) {
	// Use pointer directly if magnitude is low enough (avoids a copy).
	var aNorm *FieldElement
	var aTemp FieldElement

	if a.magnitude > 8 {
		aTemp = *a
		aTemp.normalizeWeak()
		aNorm = &aTemp
	} else {
		aNorm = a // Use directly, no copy needed
	}

	// Use 4x64 BMI2 implementation if available (fastest on amd64).
	// NOTE(review): this path does not set r.magnitude/r.normalized here —
	// presumably field4x64Sqr handles that itself; confirm.
	if hasField4x64() {
		field4x64Sqr(r, aNorm)
		return
	}

	// Use BMI2+ADX assembly if available.
	if hasFieldAsmBMI2() {
		fieldSqrAsmBMI2(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Use regular assembly if available.
	if hasFieldAsm() {
		fieldSqrAsm(r, aNorm)
		r.magnitude = 1
		r.normalized = false
		return
	}

	// Portable fallback. Extract limbs for easier access.
	a0, a1, a2, a3, a4 := aNorm.n[0], aNorm.n[1], aNorm.n[2], aNorm.n[3], aNorm.n[4]

	const M = 0xFFFFFFFFFFFFF     // 2^52 - 1 (52-bit limb mask)
	const R = fieldReductionConstantShifted // 0x1000003D10 = (2^32 + 977) << 4

	// Following the C implementation algorithm exactly.

	// Compute p3 = 2*a0*a3 + 2*a1*a2
	var c, d uint128
	d = mulU64ToU128(a0*2, a3)
	d = addMulU128(d, a1*2, a2)

	// Compute p8 = a4*a4 (topmost partial product, folded in via R)
	c = mulU64ToU128(a4, a4)

	// d += R * c_lo; c >>= 64
	d = addMulU128(d, R, c.lo())
	c = c.rshift(64)

	// Extract t3 (result limb 3, pending final carries) and shift d
	t3 := d.lo() & M
	d = d.rshift(52)

	// Compute p4 = a0*a4*2 + a1*a3*2 + a2*a2.
	// a4 is doubled in place; all later uses of a4 rely on this.
	a4 *= 2
	d = addMulU128(d, a0, a4)
	d = addMulU128(d, a1*2, a3)
	d = addMulU128(d, a2, a2)

	// d += (R << 12) * c_lo  (remaining high bits of p8)
	d = addMulU128(d, R<<12, c.lo())

	// Extract t4; tx holds its top 4 bits, which belong with u0 below
	t4 := d.lo() & M
	d = d.rshift(52)
	tx := t4 >> 48
	t4 &= (M >> 4) // keep low 48 bits only

	// Compute p0 = a0*a0
	c = mulU64ToU128(a0, a0)

	// Compute p5 = 2*a1*a4 + 2*a2*a3 (a4 already doubled above)
	d = addMulU128(d, a1, a4)
	d = addMulU128(d, a2*2, a3)

	// Extract u0 and recombine with tx
	u0 := d.lo() & M
	d = d.rshift(52)
	u0 = (u0 << 4) | tx

	// c += u0 * (R >> 4)  (fold the 2^256 overflow back into limb 0)
	c = addMulU128(c, u0, R>>4)

	// r[0]
	r.n[0] = c.lo() & M
	c = c.rshift(52)

	// Compute p1 = 2*a0*a1.
	// a0 is doubled in place; the p2 line below reuses the doubled a0.
	a0 *= 2
	c = addMulU128(c, a0, a1)

	// Compute p6 = 2*a2*a4 + a3*a3 (a4 already doubled)
	d = addMulU128(d, a2, a4)
	d = addMulU128(d, a3, a3)

	// c += R * (d & M); d >>= 52
	c = addMulU128(c, R, d.lo()&M)
	d = d.rshift(52)

	// r[1]
	r.n[1] = c.lo() & M
	c = c.rshift(52)

	// Compute p2 = 2*a0*a2 + a1*a1 (a0 already doubled)
	c = addMulU128(c, a0, a2)
	c = addMulU128(c, a1, a1)

	// Compute p7 = 2*a3*a4 (a4 already doubled)
	d = addMulU128(d, a3, a4)

	// c += R * d_lo; d >>= 64
	c = addMulU128(c, R, d.lo())
	d = d.rshift(64)

	// r[2]
	r.n[2] = c.lo() & M
	c = c.rshift(52)

	// c += (R << 12) * d_lo + t3
	c = addMulU128(c, R<<12, d.lo())
	c = addU128(c, t3)

	// r[3]
	r.n[3] = c.lo() & M
	c = c.rshift(52)

	// r[4]
	r.n[4] = c.lo() + t4

	// Result has magnitude 1 but is not guaranteed to be < p.
	r.magnitude = 1
	r.normalized = false
}
 460  
 461  // inv computes the modular inverse of a field element using Fermat's little theorem
 462  // This implements a^(p-2) mod p where p is the secp256k1 field prime
 463  // Uses an optimized addition chain similar to sqrt() for efficiency.
 464  //
 465  // p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
 466  //
 467  // Binary structure:
 468  //   - bits 255-33: 223 ones (= 2^223 - 1 shifted left by 33)
 469  //   - bit 32: 0
 470  //   - bits 31-12: 20 ones
 471  //   - bits 11-0: 110000101101 (0xC2D)
 472  //
 473  // This uses ~266 squarings + ~15 multiplications instead of ~256 + ~127 for naive binary exp.
 474  func (r *FieldElement) inv(a *FieldElement) {
 475  	var aNorm FieldElement
 476  	aNorm = *a
 477  	aNorm.normalize()
 478  
 479  	// Build up powers using addition chain (same as sqrt)
 480  	// x^(2^k - 1) for k in {2, 3, 6, 9, 11, 22, 44, 88, 176, 220, 223}
 481  	var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement
 482  
 483  	// x2 = a^(2^2 - 1) = a^3
 484  	x2.sqr(&aNorm)
 485  	x2.mul(&x2, &aNorm)
 486  
 487  	// x3 = a^(2^3 - 1) = a^7
 488  	x3.sqr(&x2)
 489  	x3.mul(&x3, &aNorm)
 490  
 491  	// x6 = a^(2^6 - 1) = a^63
 492  	x6 = x3
 493  	for j := 0; j < 3; j++ {
 494  		x6.sqr(&x6)
 495  	}
 496  	x6.mul(&x6, &x3)
 497  
 498  	// x9 = a^(2^9 - 1) = a^511
 499  	x9 = x6
 500  	for j := 0; j < 3; j++ {
 501  		x9.sqr(&x9)
 502  	}
 503  	x9.mul(&x9, &x3)
 504  
 505  	// x11 = a^(2^11 - 1) = a^2047
 506  	x11 = x9
 507  	for j := 0; j < 2; j++ {
 508  		x11.sqr(&x11)
 509  	}
 510  	x11.mul(&x11, &x2)
 511  
 512  	// x22 = a^(2^22 - 1)
 513  	x22 = x11
 514  	for j := 0; j < 11; j++ {
 515  		x22.sqr(&x22)
 516  	}
 517  	x22.mul(&x22, &x11)
 518  
 519  	// x44 = a^(2^44 - 1)
 520  	x44 = x22
 521  	for j := 0; j < 22; j++ {
 522  		x44.sqr(&x44)
 523  	}
 524  	x44.mul(&x44, &x22)
 525  
 526  	// x88 = a^(2^88 - 1)
 527  	x88 = x44
 528  	for j := 0; j < 44; j++ {
 529  		x88.sqr(&x88)
 530  	}
 531  	x88.mul(&x88, &x44)
 532  
 533  	// x176 = a^(2^176 - 1)
 534  	x176 = x88
 535  	for j := 0; j < 88; j++ {
 536  		x176.sqr(&x176)
 537  	}
 538  	x176.mul(&x176, &x88)
 539  
 540  	// x220 = a^(2^220 - 1)
 541  	x220 = x176
 542  	for j := 0; j < 44; j++ {
 543  		x220.sqr(&x220)
 544  	}
 545  	x220.mul(&x220, &x44)
 546  
 547  	// x223 = a^(2^223 - 1)
 548  	x223 = x220
 549  	for j := 0; j < 3; j++ {
 550  		x223.sqr(&x223)
 551  	}
 552  	x223.mul(&x223, &x3)
 553  
 554  	// Now assemble p-2 = 2^256 - 2^32 - 979
 555  	// = (2^223 - 1) * 2^33 + (2^32 - 1) * 2^0 + ... (adjusted for the specific bit pattern)
 556  	//
 557  	// p-2 in binary (from MSB):
 558  	//   223 ones, 0, 20 ones, 00, 1, 0, 00, 1, 0, 1, 1, 0, 1
 559  	//   [223 ones][0][20 ones][11-0 bits = 0xC2D = 110000101101]
 560  	//
 561  	// Step 1: Start with x223 and shift left by 33 bits
 562  	t1 = x223
 563  	for j := 0; j < 33; j++ {
 564  		t1.sqr(&t1)
 565  	}
 566  	// Now t1 = a^((2^223 - 1) * 2^33)
 567  
 568  	// Step 2: Add contribution for bits 31-12 (20 ones)
 569  	// We need x^(2^32 - 2^12) = x^((2^20 - 1) * 2^12)
 570  	// But we don't have x20 precomputed. Use x22 and adjust.
 571  	// Actually, let's use: x11 * 2^11 * x11 = x^((2^11-1)*2^11 + (2^11-1)) = x^(2^22 - 2^11 + 2^11 - 1) = x^(2^22-1) = x22
 572  	// We need x^(2^20 - 1). We can compute: x11 shifted by 9, then multiply by x9
 573  	// x^((2^11-1)*2^9) * x^(2^9-1) = x^(2^20 - 2^9 + 2^9 - 1) = x^(2^20 - 1)
 574  	var x20 FieldElement
 575  	x20 = x11
 576  	for j := 0; j < 9; j++ {
 577  		x20.sqr(&x20)
 578  	}
 579  	x20.mul(&x20, &x9)
 580  	// Now x20 = a^(2^20 - 1)
 581  
 582  	// Multiply into t1: t1 * x20 (but first shift x20 to correct position)
 583  	// Current t1 is at position 33 (exponent ends at bit 33)
 584  	// We need bits 31-12 = 20 ones, which is x^((2^20-1) * 2^12)
 585  	// So we need to shift t1 down? No, we're building the exponent.
 586  	//
 587  	// Let me reconsider. We have:
 588  	// t1 = a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
 589  	//
 590  	// p-2 = 2^256 - 2^32 - 979
 591  	//
 592  	// 2^256 - 2^33 vs 2^256 - 2^32 - 979
 593  	// We need to add: 2^32 - 979 = FFFFFC2D
 594  	//
 595  	// So: p-2 = (2^256 - 2^33) + (2^32 - 979)
 596  	//         = t1_exp + 0xFFFFFC2D (as addition of exponents = multiplication)
 597  	//
 598  	// But that's not how exponentiation works! a^x * a^y = a^(x+y)
 599  	// So we need t1 * a^(0xFFFFFC2D)
 600  	//
 601  	// 0xFFFFFC2D = 4294966317
 602  	// This is still a big exponent to compute naively.
 603  	//
 604  	// Let me decompose 0xFFFFFC2D in terms of our precomputed powers.
 605  	// 0xFFFFFC2D = 0b 1111 1111 1111 1111 1111 1100 0010 1101
 606  	//            = 2^32 - 979
 607  	//            = (2^32 - 1) - 978
 608  	//            = (2^32 - 1) - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 2^1
 609  	//
 610  	// Hmm, subtraction in exponent is division... that doesn't help.
 611  	//
 612  	// Let me think about this differently using the bit pattern directly.
 613  	// p-2 bits (from MSB to LSB):
 614  	//   bit 255: 1
 615  	//   ...
 616  	//   bit 33: 1  (223 ones total in bits 255-33)
 617  	//   bit 32: 0
 618  	//   bit 31: 1
 619  	//   ...
 620  	//   bit 12: 1  (20 ones total in bits 31-12)
 621  	//   bit 11: 0
 622  	//   bit 10: 0
 623  	//   bit 9: 1
 624  	//   bit 8: 0
 625  	//   bit 7: 0
 626  	//   bit 6: 0
 627  	//   bit 5: 1
 628  	//   bit 4: 0
 629  	//   bit 3: 1
 630  	//   bit 2: 1
 631  	//   bit 1: 0
 632  	//   bit 0: 1
 633  	//
 634  	// Using sliding window from MSB:
 635  	// Start: t1 = a^1
 636  	// Process 223 ones in bits 255-33:
 637  	//   t1 = a^(2^223 - 1) using x223 (done above via shift)
 638  	//   After shifting 33 times: a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
 639  	//
 640  	// bit 32 = 0: square once
 641  	//   t1 = a^(2^257 - 2^34) -- wait, this goes beyond 256 bits
 642  	//
 643  	// I think I'm overcomplicating this. Let me use a simpler sliding window approach.
 644  
 645  	// Alternative approach: build from LSB using the precomputed values
 646  	// Actually, let's use the approach from libsecp256k1's sqrt which builds from MSB.
 647  
 648  	// p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
 649  	//
 650  	// We can write this as:
 651  	// ((((x223 << 23) * x22) << 5) * x2) << 3) * x3) << 2) * x2) << 1) << 1) * a
 652  	//
 653  	// Let me verify: starting with a^(2^223 - 1) = x223
 654  	//
 655  	// Actually, the libsecp256k1 modinv doesn't use addition chains - it uses safegcd.
 656  	// Let me check what the modular inverse in libsecp256k1 actually does.
 657  
 658  	// For now, let me implement a working addition chain that's still much faster than binary exp.
 659  
 660  	// p-2 structure analysis:
 661  	// Bits 255-33: 223 consecutive 1s
 662  	// Bit 32: 0
 663  	// Bits 31-12: 20 consecutive 1s (but we don't have x20, need to construct)
 664  	// Bits 11-0: 0xC2D = 0b110000101101
 665  
 666  	// Approach: sliding window from MSB
 667  	// 1. Start with x223 (bits 255-33)
 668  	// 2. Square for bit 32 (which is 0)
 669  	// 3. Square and multiply for bits 31-12 (20 ones)
 670  	// 4. Handle remaining bits 11-0
 671  
 672  	// t1 already = x223 = a^(2^223 - 1)
 673  
 674  	// Square 23 times for bits 255-33 positioning, then multiply by x22 for next block
 675  	// Wait, I'm confusing myself. Let me restart with a cleaner approach.
 676  
 677  	// Standard addition chain for computing a^e where e = p-2:
 678  	// We compute a^e by starting with a and repeatedly squaring and multiplying.
 679  	// The exponent p-2 has this binary form:
 680  	//   1{223}01{20}001000010110 1
 681  	//   (223 ones)(zero)(20 ones)(0xC2D pattern)
 682  
 683  	// Efficient approach: compute x^((2^223-1) * 2^33 + (2^20-1) * 2^13 + specific_bits)
 684  	//
 685  	// Let's verify: (2^223-1)*2^33 = 2^256 - 2^33
 686  	// (2^20-1)*2^13 = 2^33 - 2^13
 687  	// Adding: 2^256 - 2^33 + 2^33 - 2^13 = 2^256 - 2^13
 688  	// But p-2 = 2^256 - 2^32 - 979, so this isn't quite right.
 689  
 690  	// Let me compute more carefully.
 691  	// The exponent as sum of weighted runs of 1s:
 692  	// bits 255-33 (223 ones): (2^223 - 1) * 2^33 = 2^256 - 2^33
 693  	// bits 31-12 (20 ones): (2^20 - 1) * 2^12 = 2^32 - 2^12
 694  	// bit 9: 2^9
 695  	// bit 5: 2^5
 696  	// bit 3: 2^3
 697  	// bit 2: 2^2
 698  	// bit 0: 2^0
 699  	//
 700  	// Sum = 2^256 - 2^33 + 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
 701  	//     = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0  (since 2^33 - 2^32 = 2^32)
 702  	//
 703  	// Hmm, -2^33 + 2^32 = -2^32, so:
 704  	//     = 2^256 - 2^32 - 2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
 705  	//
 706  	// And we need p-2 = 2^256 - 2^32 - 979
 707  	// So: -979 = -2^12 + 2^9 + 2^5 + 2^3 + 2^2 + 2^0
 708  	//          = -4096 + 512 + 32 + 8 + 4 + 1
 709  	//          = -4096 + 557
 710  	//          = -3539
 711  	//
 712  	// That's not -979. Let me re-examine the bit pattern of 0xFFFFFC2D.
 713  
 714  	// 0xC2D = 0b110000101101 (12 bits)
 715  	// bit 11: 1 → 2^11
 716  	// bit 10: 1 → 2^10
 717  	// bit 9: 0
 718  	// bit 8: 0
 719  	// bit 7: 0
 720  	// bit 6: 0
 721  	// bit 5: 1 → 2^5
 722  	// bit 4: 0
 723  	// bit 3: 1 → 2^3
 724  	// bit 2: 1 → 2^2
 725  	// bit 1: 0
 726  	// bit 0: 1 → 2^0
 727  
 728  	// So bits 11-0 contribute: 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0
 729  	//                        = 2048 + 1024 + 32 + 8 + 4 + 1
 730  	//                        = 3117
 731  
 732  	// Full p-2:
 733  	// bits 255-33 (223 ones): 2^256 - 2^33
 734  	// bits 31-12 (20 ones): 2^32 - 2^12
 735  	// bits 11-0: 3117
 736  	//
 737  	// Sum = 2^256 - 2^33 + 2^32 - 2^12 + 3117
 738  	//     = 2^256 - 2^32 - 2^12 + 3117
 739  	//     = 2^256 - 2^32 - 4096 + 3117
 740  	//     = 2^256 - 2^32 - 979 ✓
 741  
 742  	// Great! So the bit pattern is correct. Now I can build the addition chain.
 743  
 744  	// Build result from MSB using sliding window:
 745  	// t1 = x223 = a^(2^223 - 1)
 746  
 747  	// Shift left by 33 (square 33 times) and multiply by x22 for the next window?
 748  	// No wait, we have a gap (bit 32 = 0) and then 20 ones.
 749  
 750  	// Let me use a cleaner approach:
 751  	// 1. t1 = x223
 752  	// 2. t1 = t1^(2^33) = a^((2^223-1)*2^33) = a^(2^256 - 2^33)
 753  	// 3. t1 = t1 * x20 = a^(2^256 - 2^33 + 2^20 - 1)  -- but we need (2^20-1)*2^12 not just 2^20-1
 754  
 755  	// Hmm, this is getting complicated. Let me try a different formulation.
 756  
 757  	// Instead of trying to be clever, just do windowed exponentiation:
 758  	// Split p-2 into windows and multiply.
 759  
 760  	// p-2 = 0xFFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2D
 761  	//
 762  	// From MSB, process in chunks matching our precomputed values:
 763  	// Window 1: bits 255-33 = 223 ones → use x223, then square 33 times
 764  	// After: a^((2^223-1) * 2^33) = a^(2^256 - 2^33)
 765  	//
 766  	// Window 2: bits 32-13 = 0 followed by 20 ones
 767  	// We already squared 33 times, so we're at bit 32.
 768  	// bit 32 = 0: just square (no multiply)
 769  	// bits 31-12 = 20 ones
 770  	// But x20 isn't precomputed. Let me compute it.
 771  
 772  	// Hmm, I already computed x20 above. Let me restructure.
 773  
 774  	// Reset and do it properly:
 775  	t1 = x223
 776  	// t1 = a^(2^223 - 1)
 777  
 778  	// Square 33 times to shift the window
 779  	for j := 0; j < 33; j++ {
 780  		t1.sqr(&t1)
 781  	}
 782  	// t1 = a^((2^223-1) * 2^33)
 783  
 784  	// Multiply by x22 to add 22 more ones? No, we need exactly 20 ones.
 785  	// Actually, I already computed x20 above. Let's use it.
 786  	// Multiply by x20 shifted appropriately.
 787  	// But x20 at its base position is a^(2^20 - 1).
 788  	// We need a^((2^20 - 1) * 2^12) to contribute bits 31-12.
 789  	// So we need to shift x20 by 12.
 790  
 791  	// Current t1 exponent: (2^223 - 1) * 2^33 = 2^256 - 2^33
 792  	// We want to add: (2^20 - 1) * 2^12 = 2^32 - 2^12 - (2^12 - 1) = wait no
 793  	// (2^20 - 1) * 2^12 = 2^32 - 2^12
 794  
 795  	// So after adding: 2^256 - 2^33 + 2^32 - 2^12 = 2^256 - 2^32 - 2^12
 796  
 797  	// Wait, 2^33 - 2^32 = 2^32, so -2^33 + 2^32 = -2^32
 798  
 799  	// Now we need to add the remaining bits 11-0 = 0xC2D = 3117
 800  
 801  	// t1_final = t1 * a^((2^20-1)*2^12) * a^3117
 802  
 803  	// For a^((2^20-1)*2^12): square x20 12 times
 804  	var temp FieldElement
 805  	temp = x20
 806  	for j := 0; j < 12; j++ {
 807  		temp.sqr(&temp)
 808  	}
 809  	// temp = a^((2^20-1)*2^12) = a^(2^32 - 2^12)
 810  
 811  	t1.mul(&t1, &temp)
 812  	// t1 = a^(2^256 - 2^33 + 2^32 - 2^12) = a^(2^256 - 2^32 - 2^12)
 813  
 814  	// Now add bits 11-0 = 0xC2D = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0 = 3117
 815  	// We need a^3117
 816  
 817  	// 3117 = 2^11 + 2^10 + 2^5 + 2^3 + 2^2 + 2^0
 818  	// = 2048 + 1024 + 32 + 8 + 4 + 1
 819  
 820  	// Using our precomputed values:
 821  	// a^(2^11 - 1) = x11 = a^2047
 822  	// We need a^2048 = a^(2^11), which is a squared 11 times.
 823  	// Actually, let's build a^3117 more directly.
 824  
 825  	// a^3117 = a^(2^11) * a^(2^10) * a^(2^5) * a^(2^3) * a^(2^2) * a^(2^0)
 826  	// = (a^2048) * (a^1024) * (a^32) * (a^8) * (a^4) * a
 827  
 828  	// But computing each power separately and multiplying is expensive.
 829  	// Better: use sliding window on the bits.
 830  
 831  	// 3117 in binary: 110000101101
 832  	// From MSB: 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1
 833  
 834  	// Or we can decompose using our precomputed values:
 835  	// 3117 = 2047 + 1024 + 32 + 8 + 4 + 2 = 2047 + 1070
 836  	// Hmm, 2047 = x11, but 1070 is awkward.
 837  
 838  	// Let's just compute a^3117 with a small addition chain.
 839  	// 3117 = 0b110000101101
 840  
 841  	// Build a^3117:
 842  	var low FieldElement
 843  	low = aNorm
 844  	low.sqr(&low)           // a^2
 845  	low.mul(&low, &aNorm)   // a^3
 846  	low.sqr(&low)           // a^6
 847  	low.sqr(&low)           // a^12
 848  	low.mul(&low, &aNorm)   // a^13
 849  	low.sqr(&low)           // a^26
 850  	low.sqr(&low)           // a^52
 851  	low.sqr(&low)           // a^104
 852  	low.sqr(&low)           // a^208
 853  	low.mul(&low, &aNorm)   // a^209
 854  	low.sqr(&low)           // a^418
 855  	low.sqr(&low)           // a^836
 856  	low.mul(&low, &aNorm)   // a^837
 857  	low.sqr(&low)           // a^1674
 858  	low.mul(&low, &aNorm)   // a^1675
 859  	low.sqr(&low)           // a^3350
 860  	// Hmm, this is getting messy and doesn't give 3117.
 861  
 862  	// Let me use a simpler approach: compute a^3117 using binary method (small number)
 863  	// This adds only ~12 squarings and ~6 multiplications
 864  	low.setInt(1)
 865  	base := aNorm
 866  	e := uint32(3117)
 867  	for e > 0 {
 868  		if e&1 == 1 {
 869  			low.mul(&low, &base)
 870  		}
 871  		e >>= 1
 872  		if e > 0 {
 873  			base.sqr(&base)
 874  		}
 875  	}
 876  	// low = a^3117
 877  
 878  	// But wait, our temp and t1 computations used a shifted a. Let me reconsider.
 879  	// Actually no, the exponent additions are correct. We multiply t1 by a^(low_bits)
 880  	// where low_bits is computed from the original a.
 881  
 882  	// Hmm, but we shifted x20 by 12 bits already, contributing exponent (2^20-1)*2^12.
 883  	// And now low = a^3117.
 884  	// But 3117 spans bits 11-0, and we need to add this to the current exponent.
 885  
 886  	// Current t1 exponent: 2^256 - 2^32 - 2^12
 887  	// We want: 2^256 - 2^32 - 979
 888  	// So we need: -979 - (-2^12) = -979 + 4096 = 3117 ✓
 889  
 890  	// So t1 * low = a^(2^256 - 2^32 - 2^12 + 3117) = a^(2^256 - 2^32 - 979) = a^(p-2)
 891  
 892  	t1.mul(&t1, &low)
 893  
 894  	*r = t1
 895  	r.magnitude = 1
 896  	r.normalized = true
 897  }
 898  
 899  // sqrt computes the square root of a field element if it exists
 900  // This follows the C secp256k1_fe_sqrt implementation exactly
 901  func (r *FieldElement) sqrt(a *FieldElement) bool {
 902  	// Given that p is congruent to 3 mod 4, we can compute the square root of
 903  	// a mod p as the (p+1)/4'th power of a.
 904  	//
 905  	// As (p+1)/4 is an even number, it will have the same result for a and for
 906  	// (-a). Only one of these two numbers actually has a square root however,
 907  	// so we test at the end by squaring and comparing to the input.
 908  	
 909  	var aNorm FieldElement
 910  	aNorm = *a
 911  	
 912  	// Normalize input if magnitude is too high
 913  	if aNorm.magnitude > 8 {
 914  		aNorm.normalizeWeak()
 915  	} else {
 916  		aNorm.normalize()
 917  	}
 918  	
 919  	// The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
 920  	// { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
 921  	// 1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
 922  	
 923  	var x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1 FieldElement
 924  	
 925  	// x2 = a^3
 926  	x2.sqr(&aNorm)
 927  	x2.mul(&x2, &aNorm)
 928  	
 929  	// x3 = a^7
 930  	x3.sqr(&x2)
 931  	x3.mul(&x3, &aNorm)
 932  	
 933  	// x6 = a^63
 934  	x6 = x3
 935  	for j := 0; j < 3; j++ {
 936  		x6.sqr(&x6)
 937  	}
 938  	x6.mul(&x6, &x3)
 939  	
 940  	// x9 = a^511
 941  	x9 = x6
 942  	for j := 0; j < 3; j++ {
 943  		x9.sqr(&x9)
 944  	}
 945  	x9.mul(&x9, &x3)
 946  	
 947  	// x11 = a^2047
 948  	x11 = x9
 949  	for j := 0; j < 2; j++ {
 950  		x11.sqr(&x11)
 951  	}
 952  	x11.mul(&x11, &x2)
 953  	
 954  	// x22 = a^4194303
 955  	x22 = x11
 956  	for j := 0; j < 11; j++ {
 957  		x22.sqr(&x22)
 958  	}
 959  	x22.mul(&x22, &x11)
 960  	
 961  	// x44 = a^17592186044415
 962  	x44 = x22
 963  	for j := 0; j < 22; j++ {
 964  		x44.sqr(&x44)
 965  	}
 966  	x44.mul(&x44, &x22)
 967  	
 968  	// x88 = a^72057594037927935
 969  	x88 = x44
 970  	for j := 0; j < 44; j++ {
 971  		x88.sqr(&x88)
 972  	}
 973  	x88.mul(&x88, &x44)
 974  	
 975  	// x176 = a^1180591620717411303423
 976  	x176 = x88
 977  	for j := 0; j < 88; j++ {
 978  		x176.sqr(&x176)
 979  	}
 980  	x176.mul(&x176, &x88)
 981  	
 982  	// x220 = a^172543658669764094685868767685
 983  	x220 = x176
 984  	for j := 0; j < 44; j++ {
 985  		x220.sqr(&x220)
 986  	}
 987  	x220.mul(&x220, &x44)
 988  	
 989  	// x223 = a^13479973333575319897333507543509815336818572211270286240551805124607
 990  	x223 = x220
 991  	for j := 0; j < 3; j++ {
 992  		x223.sqr(&x223)
 993  	}
 994  	x223.mul(&x223, &x3)
 995  	
 996  	// The final result is then assembled using a sliding window over the blocks.
 997  	t1 = x223
 998  	for j := 0; j < 23; j++ {
 999  		t1.sqr(&t1)
1000  	}
1001  	t1.mul(&t1, &x22)
1002  	for j := 0; j < 6; j++ {
1003  		t1.sqr(&t1)
1004  	}
1005  	t1.mul(&t1, &x2)
1006  	t1.sqr(&t1)
1007  	r.sqr(&t1)
1008  	
1009  	// Check that a square root was actually calculated
1010  	var check FieldElement
1011  	check.sqr(r)
1012  	check.normalize()
1013  	aNorm.normalize()
1014  	
1015  	ret := check.equal(&aNorm)
1016  	
1017  	// If sqrt(a) doesn't exist, compute sqrt(-a) instead (as per field.h comment)
1018  	if !ret {
1019  		var negA FieldElement
1020  		negA.negate(&aNorm, 1)
1021  		negA.normalize()
1022  		
1023  		t1 = x223
1024  		for j := 0; j < 23; j++ {
1025  			t1.sqr(&t1)
1026  		}
1027  		t1.mul(&t1, &x22)
1028  		for j := 0; j < 6; j++ {
1029  			t1.sqr(&t1)
1030  		}
1031  		t1.mul(&t1, &x2)
1032  		t1.sqr(&t1)
1033  		r.sqr(&t1)
1034  		
1035  		check.sqr(r)
1036  		check.normalize()
1037  		
1038  		// Return whether sqrt(-a) exists
1039  		return check.equal(&negA)
1040  	}
1041  	
1042  	return ret
1043  }
1044  
1045  // isSquare checks if a field element is a quadratic residue
1046  func (a *FieldElement) isSquare() bool {
1047  	// Use Legendre symbol: a^((p-1)/2) mod p
1048  	// If result is 1, then a is a quadratic residue
1049  
1050  	var result FieldElement
1051  	result = *a
1052  
1053  	// Compute a^((p-1)/2) - simplified implementation
1054  	for i := 0; i < 127; i++ { // Approximate (p-1)/2 bit length
1055  		result.sqr(&result)
1056  	}
1057  
1058  	result.normalize()
1059  	return result.equal(&FieldElementOne)
1060  }
1061  
1062  // half computes r = a/2 mod p
// half computes r = a/2 mod p.
//
// This follows the C secp256k1_fe_impl_half implementation exactly: if the
// value is odd, add p (which is odd) to make it even, then shift right one
// bit across the 5x52 limbs. Adding p does not change the value mod p, and
// the sum is even, so the shift is an exact division by 2.
//
// NOTE(review): the C version documents a precondition on the input
// magnitude (the conditional add must not overflow the limbs) — assumed to
// hold for all callers here; confirm against the C bound.
func (r *FieldElement) half(a *FieldElement) {
	// This follows the C secp256k1_fe_impl_half implementation exactly
	*r = *a

	t0, t1, t2, t3, t4 := r.n[0], r.n[1], r.n[2], r.n[3], r.n[4]
	one := uint64(1)
	// mask is all-ones-in-52-bits when t0 is odd, 0 when even.
	// In C: mask = -(t0 & one) >> 12. In Go the negation must go through
	// a signed type; the >> 12 trims the 64-bit all-ones down to 52 bits.
	mask := uint64(-int64(t0 & one)) >> 12

	// Conditionally add the field modulus p (in 5x52 limb form:
	// {0xFFFFEFFFFFC2F, M, M, M, M>>4}) if the value is odd.
	t0 += 0xFFFFEFFFFFC2F & mask
	t1 += mask
	t2 += mask  
	t3 += mask
	t4 += mask >> 4

	// Shift right by one bit: bit 0 of each limb moves to bit 51 of the
	// limb below (limbs are 52 bits wide).
	r.n[0] = (t0 >> 1) + ((t1 & one) << 51)
	r.n[1] = (t1 >> 1) + ((t2 & one) << 51)
	r.n[2] = (t2 >> 1) + ((t3 & one) << 51)
	r.n[3] = (t3 >> 1) + ((t4 & one) << 51)
	r.n[4] = t4 >> 1

	// Magnitude bound halves (plus one for the conditional add of p),
	// matching the C implementation's accounting.
	r.magnitude = (r.magnitude >> 1) + 1
	r.normalized = false
}
1091