uint128_amd64.s raw

   1  //go:build amd64
   2  
   3  #include "textflag.h"
   4  
   5  // func uint128Mul(a, b Uint128) [4]uint64
   6  // Multiplies two 128-bit values and returns a 256-bit result.
   7  //
   8  // Input:
   9  //   a.Lo = arg+0(FP)
  10  //   a.Hi = arg+8(FP)
  11  //   b.Lo = arg+16(FP)
  12  //   b.Hi = arg+24(FP)
  13  //
  14  // Output:
  15  //   result[0] = ret+32(FP)  (bits 0-63)
  16  //   result[1] = ret+40(FP)  (bits 64-127)
  17  //   result[2] = ret+48(FP)  (bits 128-191)
  18  //   result[3] = ret+56(FP)  (bits 192-255)
  19  //
  20  // Algorithm:
  21  //   (a.Hi*2^64 + a.Lo) * (b.Hi*2^64 + b.Lo)
  22  //   = a.Hi*b.Hi*2^128 + (a.Hi*b.Lo + a.Lo*b.Hi)*2^64 + a.Lo*b.Lo
  23  //
  24  TEXT ·uint128Mul(SB), NOSPLIT, $0-64
  25  	// Load inputs
  26  	MOVQ a_Lo+0(FP), AX      // AX = a.Lo
  27  	MOVQ a_Hi+8(FP), BX      // BX = a.Hi
  28  	MOVQ b_Lo+16(FP), CX     // CX = b.Lo
  29  	MOVQ b_Hi+24(FP), DX     // DX = b.Hi
  30  
  31  	// Save b.Hi for later (DX will be clobbered by MUL)
  32  	MOVQ DX, R11             // R11 = b.Hi
  33  
  34  	// r0:r1 = a.Lo * b.Lo
  35  	MOVQ AX, R8              // R8 = a.Lo (save for later)
  36  	MULQ CX                  // DX:AX = a.Lo * b.Lo
  37  	MOVQ AX, R9              // R9 = result[0] (low 64 bits)
  38  	MOVQ DX, R10             // R10 = carry to result[1]
  39  
  40  	// r1:r2 += a.Hi * b.Lo
  41  	MOVQ BX, AX              // AX = a.Hi
  42  	MULQ CX                  // DX:AX = a.Hi * b.Lo
  43  	ADDQ AX, R10             // R10 += low part
  44  	ADCQ $0, DX              // DX += carry
  45  	MOVQ DX, CX              // CX = carry to result[2]
  46  
  47  	// r1:r2 += a.Lo * b.Hi
  48  	MOVQ R8, AX              // AX = a.Lo
  49  	MULQ R11                 // DX:AX = a.Lo * b.Hi
  50  	ADDQ AX, R10             // R10 += low part
  51  	ADCQ DX, CX              // CX += high part + carry
  52  	MOVQ $0, R8
  53  	ADCQ $0, R8              // R8 = carry to result[3]
  54  
  55  	// r2:r3 += a.Hi * b.Hi
  56  	MOVQ BX, AX              // AX = a.Hi
  57  	MULQ R11                 // DX:AX = a.Hi * b.Hi
  58  	ADDQ AX, CX              // CX += low part
  59  	ADCQ DX, R8              // R8 += high part + carry
  60  
  61  	// Store results
  62  	MOVQ R9, ret+32(FP)      // result[0]
  63  	MOVQ R10, ret+40(FP)     // result[1]
  64  	MOVQ CX, ret+48(FP)      // result[2]
  65  	MOVQ R8, ret+56(FP)      // result[3]
  66  
  67  	RET
  68