arith_amd64.s raw

   1  // Copyright 2025 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
   6  
   7  //go:build !math_big_pure_go
   8  
   9  #include "textflag.h"
  10  
   11  // func addVV(z, x, y []Word) (c Word)
       //
       // addVV stores x[i] + y[i], with the carry propagated from word to word,
       // into z[i] for every i in [0, z_len) and returns the final carry-out
       // (0 or 1) in c. With z_len == 0 both loops are skipped and c is 0.
       //
       // Only z_len is read; x_len and y_len are never consulted, so callers
       // presumably guarantee that x and y hold at least z_len words.
       //
       // Register use:
       //   BX         z_len/4, iteration count of the 4X loop
       //   R9         z_len%4, iteration count of the 1X loop (scratch in the 4X loop)
       //   SI, DI, R8 cursors into x, y and z
       //   R10        carry saved between iterations as 0 or -1 (also used as scratch)
       //
       // The carry is parked in R10 because the SUBQ that counts each loop down
       // overwrites the flags.
   12  TEXT ·addVV(SB), NOSPLIT, $0
   13  	MOVQ z_len+8(FP), BX
   14  	MOVQ x_base+24(FP), SI
   15  	MOVQ y_base+48(FP), DI
   16  	MOVQ z_base+0(FP), R8
   17  	// compute unrolled loop lengths
   18  	MOVQ BX, R9
   19  	ANDQ $3, R9
   20  	SHRQ $2, BX
   21  	MOVQ $0, R10	// clear saved carry
       	// 1X loop: handle the z_len%4 leading words one at a time.
   22  loop1:
   23  	TESTQ R9, R9; JZ loop1done
   24  loop1cont:
   25  	// unroll 1X
       	// R10 is 0 or -1; adding it to itself sets CF exactly when it is -1.
   26  	ADDQ R10, R10	// restore carry
   27  	MOVQ 0(SI), R10
   28  	ADCQ 0(DI), R10
   29  	MOVQ R10, 0(R8)
       	// R10 - R10 - CF leaves 0 (no carry) or -1 (carry).
   30  	SBBQ R10, R10	// save carry
   31  	LEAQ 8(SI), SI	// ADD $8, SI
   32  	LEAQ 8(DI), DI	// ADD $8, DI
   33  	LEAQ 8(R8), R8	// ADD $8, R8
   34  	SUBQ $1, R9; JNZ loop1cont
   35  loop1done:
       	// 4X loop: handle the remaining words four per iteration.
   36  loop4:
   37  	TESTQ BX, BX; JZ loop4done
   38  loop4cont:
   39  	// unroll 4X
   40  	ADDQ R10, R10	// restore carry
   41  	MOVQ 0(SI), R9
   42  	MOVQ 8(SI), R10
   43  	MOVQ 16(SI), R11
   44  	MOVQ 24(SI), R12
       	// The four ADCQs form one carry chain; the MOVQ stores that follow
       	// leave the flags alone, so CF is still the chain's carry-out at SBBQ.
   45  	ADCQ 0(DI), R9
   46  	ADCQ 8(DI), R10
   47  	ADCQ 16(DI), R11
   48  	ADCQ 24(DI), R12
   49  	MOVQ R9, 0(R8)
   50  	MOVQ R10, 8(R8)
   51  	MOVQ R11, 16(R8)
   52  	MOVQ R12, 24(R8)
   53  	SBBQ R10, R10	// save carry
   54  	LEAQ 32(SI), SI	// ADD $32, SI
   55  	LEAQ 32(DI), DI	// ADD $32, DI
   56  	LEAQ 32(R8), R8	// ADD $32, R8
   57  	SUBQ $1, BX; JNZ loop4cont
   58  loop4done:
       	// R10 is 0 or -1 here; negating it yields the 0 or 1 that is returned.
   59  	NEGQ R10	// convert add carry
   60  	MOVQ R10, c+72(FP)
   61  	RET
  62  
   63  // func subVV(z, x, y []Word) (c Word)
       //
       // subVV stores x[i] - y[i], with the borrow propagated from word to word,
       // into z[i] for every i in [0, z_len) and returns the final borrow-out
       // (0 or 1) in c. With z_len == 0 both loops are skipped and c is 0.
       //
       // Only z_len is read; x_len and y_len are never consulted, so callers
       // presumably guarantee that x and y hold at least z_len words.
       //
       // Register use:
       //   BX         z_len/4, iteration count of the 4X loop
       //   R9         z_len%4, iteration count of the 1X loop (scratch in the 4X loop)
       //   SI, DI, R8 cursors into x, y and z
       //   R10        borrow saved between iterations as 0 or -1 (also used as scratch)
       //
       // The borrow is parked in R10 because the SUBQ that counts each loop down
       // overwrites the flags.
   64  TEXT ·subVV(SB), NOSPLIT, $0
   65  	MOVQ z_len+8(FP), BX
   66  	MOVQ x_base+24(FP), SI
   67  	MOVQ y_base+48(FP), DI
   68  	MOVQ z_base+0(FP), R8
   69  	// compute unrolled loop lengths
   70  	MOVQ BX, R9
   71  	ANDQ $3, R9
   72  	SHRQ $2, BX
   73  	MOVQ $0, R10	// clear saved carry
       	// 1X loop: handle the z_len%4 leading words one at a time.
   74  loop1:
   75  	TESTQ R9, R9; JZ loop1done
   76  loop1cont:
   77  	// unroll 1X
       	// R10 is 0 or -1; adding it to itself sets CF exactly when it is -1,
       	// which is the borrow-in that the following SBBQ consumes.
   78  	ADDQ R10, R10	// restore carry
   79  	MOVQ 0(SI), R10
   80  	SBBQ 0(DI), R10
   81  	MOVQ R10, 0(R8)
       	// R10 - R10 - CF leaves 0 (no borrow) or -1 (borrow).
   82  	SBBQ R10, R10	// save carry
   83  	LEAQ 8(SI), SI	// ADD $8, SI
   84  	LEAQ 8(DI), DI	// ADD $8, DI
   85  	LEAQ 8(R8), R8	// ADD $8, R8
   86  	SUBQ $1, R9; JNZ loop1cont
   87  loop1done:
       	// 4X loop: handle the remaining words four per iteration.
   88  loop4:
   89  	TESTQ BX, BX; JZ loop4done
   90  loop4cont:
   91  	// unroll 4X
   92  	ADDQ R10, R10	// restore carry
   93  	MOVQ 0(SI), R9
   94  	MOVQ 8(SI), R10
   95  	MOVQ 16(SI), R11
   96  	MOVQ 24(SI), R12
       	// The four SBBQs form one borrow chain; the MOVQ stores that follow
       	// leave the flags alone, so CF is still the chain's borrow-out at the
       	// final SBBQ.
   97  	SBBQ 0(DI), R9
   98  	SBBQ 8(DI), R10
   99  	SBBQ 16(DI), R11
  100  	SBBQ 24(DI), R12
  101  	MOVQ R9, 0(R8)
  102  	MOVQ R10, 8(R8)
  103  	MOVQ R11, 16(R8)
  104  	MOVQ R12, 24(R8)
  105  	SBBQ R10, R10	// save carry
  106  	LEAQ 32(SI), SI	// ADD $32, SI
  107  	LEAQ 32(DI), DI	// ADD $32, DI
  108  	LEAQ 32(R8), R8	// ADD $32, R8
  109  	SUBQ $1, BX; JNZ loop4cont
  110  loop4done:
       	// R10 is 0 or -1 here; negating it yields the 0 or 1 that is returned.
  111  	NEGQ R10	// convert sub carry
  112  	MOVQ R10, c+72(FP)
  113  	RET
 114  
  115  // func lshVU(z, x []Word, s uint) (c Word)
       //
       // lshVU stores the z_len-word value x shifted left by s bits into z and
       // returns in c the bits shifted out of the top of the most significant
       // word. If z_len is 0 it returns c = 0 without touching x or z.
       //
       // The shift count is used as loaded; it is not range-checked here.
       // Only z_len is read; x_len is never consulted.
       //
       // Words are processed from the highest index down. The three-operand
       // form "SHLQ CX, A, B" is the double-precision shift: B is shifted left
       // by CX and its vacated low bits are filled from the top bits of A.
       //
       // Register use:
       //   CX     s
       //   SI, DI cursors into x and z, starting one word past the end
       //   R8     the source word loaded last, whose shifted result is not yet stored
       //   BX     (z_len-1)/4, iteration count of the 4X loop
       //   R9     first the returned carry, then (z_len-1)%4 for the 1X loop
  116  TEXT ·lshVU(SB), NOSPLIT, $0
  117  	MOVQ z_len+8(FP), BX
  118  	TESTQ BX, BX; JZ ret0
  119  	MOVQ s+48(FP), CX
  120  	MOVQ x_base+24(FP), SI
  121  	MOVQ z_base+0(FP), DI
  122  	// run loop backward
  123  	LEAQ (SI)(BX*8), SI
  124  	LEAQ (DI)(BX*8), DI
  125  	// shift first word into carry
       	// R9 starts at zero and receives the top bits of the highest word of x.
  126  	MOVQ -8(SI), R8
  127  	MOVQ $0, R9
  128  	SHLQ CX, R8, R9
  129  	MOVQ R9, c+56(FP)
  130  	// shift remaining words
       	// The highest word is already in R8, so z_len-1 loads remain.
  131  	SUBQ $1, BX
  132  	// compute unrolled loop lengths
  133  	MOVQ BX, R9
  134  	ANDQ $3, R9
  135  	SHRQ $2, BX
  136  loop1:
  137  	TESTQ R9, R9; JZ loop1done
  138  loop1cont:
  139  	// unroll 1X
       	// Load the next lower word, shift its top bits into the bottom of R8,
       	// store the finished word, then carry the lower word forward in R8.
  140  	MOVQ -16(SI), R10
  141  	SHLQ CX, R10, R8
  142  	MOVQ R8, -8(DI)
  143  	MOVQ R10, R8
  144  	LEAQ -8(SI), SI	// ADD $-8, SI
  145  	LEAQ -8(DI), DI	// ADD $-8, DI
  146  	SUBQ $1, R9; JNZ loop1cont
  147  loop1done:
  148  loop4:
  149  	TESTQ BX, BX; JZ loop4done
  150  loop4cont:
  151  	// unroll 4X
       	// Same step as the 1X loop for four words: each register is completed
       	// with the top bits of the next lower word, and R12 (the lowest word
       	// loaded) becomes the pending word for the next iteration.
  152  	MOVQ -16(SI), R9
  153  	MOVQ -24(SI), R10
  154  	MOVQ -32(SI), R11
  155  	MOVQ -40(SI), R12
  156  	SHLQ CX, R9, R8
  157  	SHLQ CX, R10, R9
  158  	SHLQ CX, R11, R10
  159  	SHLQ CX, R12, R11
  160  	MOVQ R8, -8(DI)
  161  	MOVQ R9, -16(DI)
  162  	MOVQ R10, -24(DI)
  163  	MOVQ R11, -32(DI)
  164  	MOVQ R12, R8
  165  	LEAQ -32(SI), SI	// ADD $-32, SI
  166  	LEAQ -32(DI), DI	// ADD $-32, DI
  167  	SUBQ $1, BX; JNZ loop4cont
  168  loop4done:
  169  	// store final shifted bits
       	// R8 holds the lowest word of x; nothing lies below it, so a plain
       	// shift fills with zeros. DI is now one word past z[0].
  170  	SHLQ CX, R8
  171  	MOVQ R8, -8(DI)
  172  	RET
       	// Reached only when z_len is 0.
  173  ret0:
  174  	MOVQ $0, c+56(FP)
  175  	RET
 176  
  177  // func rshVU(z, x []Word, s uint) (c Word)
       //
       // rshVU stores the z_len-word value x shifted right by s bits into z and
       // returns in c the bits shifted out of the bottom of the least
       // significant word, positioned at the top of c. If z_len is 0 it returns
       // c = 0 without touching x or z.
       //
       // The shift count is used as loaded; it is not range-checked here.
       // Only z_len is read; x_len is never consulted.
       //
       // Words are processed from index 0 upward. The three-operand form
       // "SHRQ CX, A, B" is the double-precision shift: B is shifted right by
       // CX and its vacated high bits are filled from the low bits of A.
       //
       // Register use:
       //   CX     s
       //   SI, DI cursors into x and z
       //   R8     the source word loaded last, whose shifted result is not yet stored
       //   BX     (z_len-1)/4, iteration count of the 4X loop
       //   R9     first the returned carry, then (z_len-1)%4 for the 1X loop
  178  TEXT ·rshVU(SB), NOSPLIT, $0
  179  	MOVQ z_len+8(FP), BX
  180  	TESTQ BX, BX; JZ ret0
  181  	MOVQ s+48(FP), CX
  182  	MOVQ x_base+24(FP), SI
  183  	MOVQ z_base+0(FP), DI
  184  	// shift first word into carry
       	// R9 starts at zero and receives the low bits of x[0] at its top end.
  185  	MOVQ 0(SI), R8
  186  	MOVQ $0, R9
  187  	SHRQ CX, R8, R9
  188  	MOVQ R9, c+56(FP)
  189  	// shift remaining words
       	// x[0] is already in R8, so z_len-1 loads remain.
  190  	SUBQ $1, BX
  191  	// compute unrolled loop lengths
  192  	MOVQ BX, R9
  193  	ANDQ $3, R9
  194  	SHRQ $2, BX
  195  loop1:
  196  	TESTQ R9, R9; JZ loop1done
  197  loop1cont:
  198  	// unroll 1X
       	// Load the next higher word, shift its low bits into the top of R8,
       	// store the finished word, then carry the higher word forward in R8.
  199  	MOVQ 8(SI), R10
  200  	SHRQ CX, R10, R8
  201  	MOVQ R8, 0(DI)
  202  	MOVQ R10, R8
  203  	LEAQ 8(SI), SI	// ADD $8, SI
  204  	LEAQ 8(DI), DI	// ADD $8, DI
  205  	SUBQ $1, R9; JNZ loop1cont
  206  loop1done:
  207  loop4:
  208  	TESTQ BX, BX; JZ loop4done
  209  loop4cont:
  210  	// unroll 4X
       	// Same step as the 1X loop for four words: each register is completed
       	// with the low bits of the next higher word, and R12 (the highest word
       	// loaded) becomes the pending word for the next iteration.
  211  	MOVQ 8(SI), R9
  212  	MOVQ 16(SI), R10
  213  	MOVQ 24(SI), R11
  214  	MOVQ 32(SI), R12
  215  	SHRQ CX, R9, R8
  216  	SHRQ CX, R10, R9
  217  	SHRQ CX, R11, R10
  218  	SHRQ CX, R12, R11
  219  	MOVQ R8, 0(DI)
  220  	MOVQ R9, 8(DI)
  221  	MOVQ R10, 16(DI)
  222  	MOVQ R11, 24(DI)
  223  	MOVQ R12, R8
  224  	LEAQ 32(SI), SI	// ADD $32, SI
  225  	LEAQ 32(DI), DI	// ADD $32, DI
  226  	SUBQ $1, BX; JNZ loop4cont
  227  loop4done:
  228  	// store final shifted bits
       	// R8 holds the highest word of x; nothing lies above it, so a plain
       	// shift fills with zeros. DI now points at the last word of z.
  229  	SHRQ CX, R8
  230  	MOVQ R8, 0(DI)
  231  	RET
       	// Reached only when z_len is 0.
  232  ret0:
  233  	MOVQ $0, c+56(FP)
  234  	RET
 235  
  236  // func mulAddVWW(z, x []Word, m, a Word) (c Word)
       //
       // mulAddVWW computes z = x*m + a over z_len words: for each i, the
       // double-word product x[i]*m plus the incoming carry word is formed, its
       // low word is stored in z[i], and its high word becomes the carry into
       // word i+1. The carry starts as a, and the carry out of the last word is
       // returned in c. With z_len == 0 both loops are skipped and c is a.
       //
       // Only z_len is read; x_len is never consulted.
       //
       // Register use:
       //   BX     m
       //   SI     running carry word, initialized to a
       //   DI     z_len/4, iteration count of the 4X loop
       //   R10    z_len%4, iteration count of the 1X loop
       //   R8, R9 cursors into x and z
       //   AX, DX multiplicand in / product out of MULQ (DX:AX = AX * BX)
  237  TEXT ·mulAddVWW(SB), NOSPLIT, $0
  238  	MOVQ m+48(FP), BX
  239  	MOVQ a+56(FP), SI
  240  	MOVQ z_len+8(FP), DI
  241  	MOVQ x_base+24(FP), R8
  242  	MOVQ z_base+0(FP), R9
  243  	// compute unrolled loop lengths
  244  	MOVQ DI, R10
  245  	ANDQ $3, R10
  246  	SHRQ $2, DI
  247  loop1:
  248  	TESTQ R10, R10; JZ loop1done
  249  loop1cont:
  250  	// unroll 1X in batches of 1
  251  	MOVQ 0(R8), AX
  252  	// multiply
       	// Add the carry-in to the low product word; the new carry is the high
       	// product word plus the carry flag produced by that addition.
  253  	MULQ BX
  254  	ADDQ SI, AX
  255  	MOVQ DX, SI
  256  	ADCQ $0, SI
  257  	MOVQ AX, 0(R9)
  258  	LEAQ 8(R8), R8	// ADD $8, R8
  259  	LEAQ 8(R9), R9	// ADD $8, R9
  260  	SUBQ $1, R10; JNZ loop1cont
  261  loop1done:
  262  loop4:
  263  	TESTQ DI, DI; JZ loop4done
  264  loop4cont:
  265  	// unroll 4X in batches of 1
       	// Four back-to-back copies of the 1X step at offsets 0, 8, 16 and 24;
       	// SI carries the high word from each step into the next.
  266  	MOVQ 0(R8), AX
  267  	// multiply
  268  	MULQ BX
  269  	ADDQ SI, AX
  270  	MOVQ DX, SI
  271  	ADCQ $0, SI
  272  	MOVQ AX, 0(R9)
  273  	MOVQ 8(R8), AX
  274  	// multiply
  275  	MULQ BX
  276  	ADDQ SI, AX
  277  	MOVQ DX, SI
  278  	ADCQ $0, SI
  279  	MOVQ AX, 8(R9)
  280  	MOVQ 16(R8), AX
  281  	// multiply
  282  	MULQ BX
  283  	ADDQ SI, AX
  284  	MOVQ DX, SI
  285  	ADCQ $0, SI
  286  	MOVQ AX, 16(R9)
  287  	MOVQ 24(R8), AX
  288  	// multiply
  289  	MULQ BX
  290  	ADDQ SI, AX
  291  	MOVQ DX, SI
  292  	ADCQ $0, SI
  293  	MOVQ AX, 24(R9)
  294  	LEAQ 32(R8), R8	// ADD $32, R8
  295  	LEAQ 32(R9), R9	// ADD $32, R9
  296  	SUBQ $1, DI; JNZ loop4cont
  297  loop4done:
       	// SI holds the carry out of the last word (or a, if z_len was 0).
  298  	MOVQ SI, c+64(FP)
  299  	RET
 300  
  301  // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
       //
       // addMulVVWW computes z = x + y*m + a over z_len words: for each i, the
       // double-word value y[i]*m + x[i] + carry is formed, its low word is
       // stored in z[i], and its high word becomes the carry into word i+1.
       // The carry starts as a, and the carry out of the last word is returned
       // in c. With z_len == 0 all loops are skipped and c is a.
       //
       // Only z_len is read; x_len and y_len are never consulted.
       //
       // Two implementations are selected at run time by the byte ·hasADX:
       //
       //   - ·hasADX == 0: MULQ with ADDQ/ADCQ, unrolled 4X.
       //       BX = m, SI = running carry, DI = z_len/4, R11 = z_len%4,
       //       R8/R9/R10 = cursors into x/y/z, DX:AX = product.
       //
       //   - ·hasADX != 0 (label altcarry): MULXQ with ADCXQ/ADOXQ, unrolled 8X.
       //       DX = m (implicit MULXQ operand), BX = running carry,
       //       SI = z_len/8, R11 = z_len%8 (scratch in the 8X loop),
       //       DI = constant 0 used to fold the carry flags into BX,
       //       R8/R9/R10 = cursors into x/y/z.
       //     ADCXQ adds through CF only and ADOXQ through OF only, so the
       //     "+ previous high word" and "+ x[i]" additions run as two
       //     independent carry chains.
       //     NOTE(review): MULXQ is a BMI2 instruction; presumably ·hasADX is
       //     only set when BMI2 is available too - confirm where it is set.
  302  TEXT ·addMulVVWW(SB), NOSPLIT, $0
  303  	CMPB ·hasADX(SB), $0; JNZ altcarry
  304  	MOVQ m+72(FP), BX
  305  	MOVQ a+80(FP), SI
  306  	MOVQ z_len+8(FP), DI
  307  	MOVQ x_base+24(FP), R8
  308  	MOVQ y_base+48(FP), R9
  309  	MOVQ z_base+0(FP), R10
  310  	// compute unrolled loop lengths
  311  	MOVQ DI, R11
  312  	ANDQ $3, R11
  313  	SHRQ $2, DI
  314  loop1:
  315  	TESTQ R11, R11; JZ loop1done
  316  loop1cont:
  317  	// unroll 1X in batches of 1
  318  	MOVQ 0(R9), AX
  319  	// multiply
       	// DX:AX = y[i]*m; add the carry-in to the low word and make the new
       	// carry the high word plus the carry flag from that addition.
  320  	MULQ BX
  321  	ADDQ SI, AX
  322  	MOVQ DX, SI
  323  	ADCQ $0, SI
  324  	// add
       	// Add x[i] to the low word; any carry out goes into the carry word.
  325  	ADDQ 0(R8), AX
  326  	ADCQ $0, SI
  327  	MOVQ AX, 0(R10)
  328  	LEAQ 8(R8), R8	// ADD $8, R8
  329  	LEAQ 8(R9), R9	// ADD $8, R9
  330  	LEAQ 8(R10), R10	// ADD $8, R10
  331  	SUBQ $1, R11; JNZ loop1cont
  332  loop1done:
  333  loop4:
  334  	TESTQ DI, DI; JZ loop4done
  335  loop4cont:
  336  	// unroll 4X in batches of 1
       	// Four back-to-back copies of the 1X step at offsets 0, 8, 16 and 24.
  337  	MOVQ 0(R9), AX
  338  	// multiply
  339  	MULQ BX
  340  	ADDQ SI, AX
  341  	MOVQ DX, SI
  342  	ADCQ $0, SI
  343  	// add
  344  	ADDQ 0(R8), AX
  345  	ADCQ $0, SI
  346  	MOVQ AX, 0(R10)
  347  	MOVQ 8(R9), AX
  348  	// multiply
  349  	MULQ BX
  350  	ADDQ SI, AX
  351  	MOVQ DX, SI
  352  	ADCQ $0, SI
  353  	// add
  354  	ADDQ 8(R8), AX
  355  	ADCQ $0, SI
  356  	MOVQ AX, 8(R10)
  357  	MOVQ 16(R9), AX
  358  	// multiply
  359  	MULQ BX
  360  	ADDQ SI, AX
  361  	MOVQ DX, SI
  362  	ADCQ $0, SI
  363  	// add
  364  	ADDQ 16(R8), AX
  365  	ADCQ $0, SI
  366  	MOVQ AX, 16(R10)
  367  	MOVQ 24(R9), AX
  368  	// multiply
  369  	MULQ BX
  370  	ADDQ SI, AX
  371  	MOVQ DX, SI
  372  	ADCQ $0, SI
  373  	// add
  374  	ADDQ 24(R8), AX
  375  	ADCQ $0, SI
  376  	MOVQ AX, 24(R10)
  377  	LEAQ 32(R8), R8	// ADD $32, R8
  378  	LEAQ 32(R9), R9	// ADD $32, R9
  379  	LEAQ 32(R10), R10	// ADD $32, R10
  380  	SUBQ $1, DI; JNZ loop4cont
  381  loop4done:
  382  	MOVQ SI, c+88(FP)
  383  	RET
       	// Alternate implementation, entered when ·hasADX is nonzero.
  384  altcarry:
  385  	MOVQ m+72(FP), DX
  386  	MOVQ a+80(FP), BX
  387  	MOVQ z_len+8(FP), SI
  388  	MOVQ $0, DI
  389  	MOVQ x_base+24(FP), R8
  390  	MOVQ y_base+48(FP), R9
  391  	MOVQ z_base+0(FP), R10
  392  	// compute unrolled loop lengths
  393  	MOVQ SI, R11
  394  	ANDQ $7, R11
  395  	SHRQ $3, SI
  396  alt1:
  397  	TESTQ R11, R11; JZ alt1done
  398  alt1cont:
  399  	// unroll 1X
  400  	// multiply and add
       	// TESTQ clears both CF and OF, so both carry chains start from zero.
  401  	TESTQ AX, AX	// clear carry
  402  	TESTQ AX, AX	// clear carry
       	// R13 = low word and R12 = high word of y[i]*m. The carry-in is added
       	// through CF and x[i] through OF.
  403  	MULXQ 0(R9), R13, R12
  404  	ADCXQ BX, R13
  405  	ADOXQ 0(R8), R13
  406  	MOVQ R13, 0(R10)
       	// New carry = high word + CF + OF (DI is zero).
  407  	MOVQ R12, BX
  408  	ADCXQ DI, BX
  409  	ADOXQ DI, BX
  410  	LEAQ 8(R8), R8	// ADD $8, R8
  411  	LEAQ 8(R9), R9	// ADD $8, R9
  412  	LEAQ 8(R10), R10	// ADD $8, R10
  413  	SUBQ $1, R11; JNZ alt1cont
  414  alt1done:
  415  alt8:
  416  	TESTQ SI, SI; JZ alt8done
  417  alt8cont:
  418  	// unroll 8X in batches of 2
  419  	// multiply and add
       	// Eight words per iteration. The high product word alternates between
       	// R11 and BX and is added into the next word's low product through CF;
       	// the x words are added through OF. Neither flag is cleared inside
       	// the batch, so both chains run across all eight words.
  420  	TESTQ AX, AX	// clear carry
  421  	TESTQ AX, AX	// clear carry
  422  	MULXQ 0(R9), R13, R11
  423  	ADCXQ BX, R13
  424  	ADOXQ 0(R8), R13
  425  	MULXQ 8(R9), R14, BX
  426  	ADCXQ R11, R14
  427  	ADOXQ 8(R8), R14
  428  	MOVQ R13, 0(R10)
  429  	MOVQ R14, 8(R10)
  430  	MULXQ 16(R9), R13, R11
  431  	ADCXQ BX, R13
  432  	ADOXQ 16(R8), R13
  433  	MULXQ 24(R9), R14, BX
  434  	ADCXQ R11, R14
  435  	ADOXQ 24(R8), R14
  436  	MOVQ R13, 16(R10)
  437  	MOVQ R14, 24(R10)
  438  	MULXQ 32(R9), R13, R11
  439  	ADCXQ BX, R13
  440  	ADOXQ 32(R8), R13
  441  	MULXQ 40(R9), R14, BX
  442  	ADCXQ R11, R14
  443  	ADOXQ 40(R8), R14
  444  	MOVQ R13, 32(R10)
  445  	MOVQ R14, 40(R10)
  446  	MULXQ 48(R9), R13, R11
  447  	ADCXQ BX, R13
  448  	ADOXQ 48(R8), R13
  449  	MULXQ 56(R9), R14, BX
  450  	ADCXQ R11, R14
  451  	ADOXQ 56(R8), R14
  452  	MOVQ R13, 48(R10)
  453  	MOVQ R14, 56(R10)
       	// Fold the two pending carry flags into the last high word (DI is
       	// zero) before SUBQ overwrites the flags.
  454  	ADCXQ DI, BX
  455  	ADOXQ DI, BX
  456  	LEAQ 64(R8), R8	// ADD $64, R8
  457  	LEAQ 64(R9), R9	// ADD $64, R9
  458  	LEAQ 64(R10), R10	// ADD $64, R10
  459  	SUBQ $1, SI; JNZ alt8cont
  460  alt8done:
  461  	MOVQ BX, c+88(FP)
  462  	RET
 463