uint128_amd64.s raw
1 //go:build amd64
2
3 #include "textflag.h"
4
// func uint128Mul(a, b Uint128) [4]uint64
//
// uint128Mul multiplies two unsigned 128-bit values and returns the full
// 256-bit product as four 64-bit limbs, least-significant limb first.
//
// Arguments (offsets from the FP pseudo-register):
//	a.Lo = a_Lo+0(FP)
//	a.Hi = a_Hi+8(FP)
//	b.Lo = b_Lo+16(FP)
//	b.Hi = b_Hi+24(FP)
//
// Results:
//	result[0] = ret_0+32(FP)  (bits 0-63)
//	result[1] = ret_1+40(FP)  (bits 64-127)
//	result[2] = ret_2+48(FP)  (bits 128-191)
//	result[3] = ret_3+56(FP)  (bits 192-255)
//
// The result slots use the per-element names (ret_0 .. ret_3) that
// `go vet` (asmdecl) expects for an array-typed result.
//
// Algorithm (schoolbook, four 64x64->128 partial products):
//	(a.Hi*2^64 + a.Lo) * (b.Hi*2^64 + b.Lo)
//	  = a.Hi*b.Hi*2^128 + (a.Hi*b.Lo + a.Lo*b.Hi)*2^64 + a.Lo*b.Lo
//
// Registers written: AX, BX, CX, DX, R8, R9, R10, R11 and the flags.
// No stack frame is used ($0) and no other function is called.
TEXT ·uint128Mul(SB), NOSPLIT, $0-64
	// Load inputs. MULQ takes AX as its implicit operand and overwrites
	// DX:AX, so every value needed after a multiply is kept elsewhere.
	MOVQ a_Lo+0(FP), AX    // AX  = a.Lo
	MOVQ a_Hi+8(FP), BX    // BX  = a.Hi
	MOVQ b_Lo+16(FP), CX   // CX  = b.Lo
	MOVQ b_Hi+24(FP), R11  // R11 = b.Hi
	MOVQ AX, R8            // R8  = a.Lo, kept for the a.Lo*b.Hi product

	// result[0] and the start of result[1]: a.Lo * b.Lo
	MULQ CX                // DX:AX = a.Lo * b.Lo
	MOVQ AX, R9            // R9  = result[0]
	MOVQ DX, R10           // R10 = running result[1]

	// Add a.Hi * b.Lo at bit 64.
	MOVQ BX, AX            // AX = a.Hi
	MULQ CX                // DX:AX = a.Hi * b.Lo
	ADDQ AX, R10           // result[1] += low half
	ADCQ $0, DX            // fold the carry into the high half; this cannot
	                       // wrap because the high half of a 64x64 product
	                       // is at most 2^64-2
	MOVQ DX, CX            // CX = running result[2] (b.Lo is no longer needed)

	// Add a.Lo * b.Hi at bit 64.
	MOVQ R8, AX            // AX = a.Lo
	XORQ R8, R8            // R8 = 0; zeroed before the carry chain below so
	                       // no flag-sensitive instruction sits inside it
	MULQ R11               // DX:AX = a.Lo * b.Hi
	ADDQ AX, R10           // result[1] += low half
	ADCQ DX, CX            // result[2] += high half + carry
	ADCQ $0, R8            // R8 = carry out of result[2], seeds result[3]

	// Add a.Hi * b.Hi at bit 128.
	MOVQ BX, AX            // AX = a.Hi
	MULQ R11               // DX:AX = a.Hi * b.Hi
	ADDQ AX, CX            // result[2] += low half
	ADCQ DX, R8            // result[3] += high half + carry

	// Store the four limbs.
	MOVQ R9, ret_0+32(FP)  // result[0]
	MOVQ R10, ret_1+40(FP) // result[1]
	MOVQ CX, ret_2+48(FP)  // result[2]
	MOVQ R8, ret_3+56(FP)  // result[3]

	RET
68