// arith_loong64.s

   1  // Copyright 2025 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
   6  
   7  //go:build !math_big_pure_go
   8  
   9  #include "textflag.h"
  10  
// func addVV(z, x, y []Word) (c Word)
//
// Vector add with carry: z = x + y, returning the final carry (0 or 1).
// The word count is taken from z's length. A 1x loop handles the
// len%4 leftover words first, then the main loop runs 4x unrolled.
//
// Register roles:
//   R4  = count of 4x iterations (len/4)
//   R5/R6/R7 = running pointers into x, y, z
//   R8  = count of 1x iterations (len%4), reused as scratch in the 4x loop
//   R28 = running carry in {0,1}, returned as c
//   R30 = scratch carry from the first half of each add-with-carry
//
// Add-with-carry idiom (loong64 has no flags): SGTU a, b, d sets
// d = 1 if a > b (unsigned). After r = a + b (wrapping), a > r holds
// exactly when the add overflowed, which recovers the carry-out.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVV z_len+8(FP), R4
	MOVV x_base+24(FP), R5
	MOVV y_base+48(FP), R6
	MOVV z_base+0(FP), R7
	// compute unrolled loop lengths
	AND $3, R4, R8	// R8 = len % 4
	SRLV $2, R4	// R4 = len / 4
	XOR R28, R28	// clear carry
loop1:
	BEQ R8, loop1done
loop1cont:
	// unroll 1X
	MOVV 0(R5), R9	// R9 = x[i]
	MOVV 0(R6), R10	// R10 = y[i]
	ADDVU R10, R9	// ADCS R10, R9, R9 (cr=R28)
	SGTU R10, R9, R30	// ... R30 = carry-out of x[i]+y[i]
	ADDVU R28, R9	// ... add incoming carry
	SGTU R28, R9, R28	// ... R28 = carry-out of adding the carry
	ADDVU R30, R28	// ... total carry (still 0 or 1)
	MOVV R9, 0(R7)	// z[i] = sum
	ADDVU $8, R5
	ADDVU $8, R6
	ADDVU $8, R7
	SUBVU $1, R8
	BNE R8, loop1cont
loop1done:
loop4:
	BEQ R4, loop4done
loop4cont:
	// unroll 4X
	MOVV 0(R5), R8	// load x[i..i+3]
	MOVV 8(R5), R9
	MOVV 16(R5), R10
	MOVV 24(R5), R11
	MOVV 0(R6), R12	// load y[i..i+3]
	MOVV 8(R6), R13
	MOVV 16(R6), R14
	MOVV 24(R6), R15
	ADDVU R12, R8	// ADCS R12, R8, R8 (cr=R28)
	SGTU R12, R8, R30	// ...
	ADDVU R28, R8	// ...
	SGTU R28, R8, R28	// ...
	ADDVU R30, R28	// ...
	ADDVU R13, R9	// ADCS R13, R9, R9 (cr=R28)
	SGTU R13, R9, R30	// ...
	ADDVU R28, R9	// ...
	SGTU R28, R9, R28	// ...
	ADDVU R30, R28	// ...
	ADDVU R14, R10	// ADCS R14, R10, R10 (cr=R28)
	SGTU R14, R10, R30	// ...
	ADDVU R28, R10	// ...
	SGTU R28, R10, R28	// ...
	ADDVU R30, R28	// ...
	ADDVU R15, R11	// ADCS R15, R11, R11 (cr=R28)
	SGTU R15, R11, R30	// ...
	ADDVU R28, R11	// ...
	SGTU R28, R11, R28	// ...
	ADDVU R30, R28	// ...
	MOVV R8, 0(R7)	// store z[i..i+3]
	MOVV R9, 8(R7)
	MOVV R10, 16(R7)
	MOVV R11, 24(R7)
	ADDVU $32, R5
	ADDVU $32, R6
	ADDVU $32, R7
	SUBVU $1, R4
	BNE R4, loop4cont
loop4done:
	MOVV R28, c+72(FP)	// return the final carry
	RET
  83  
// func subVV(z, x, y []Word) (c Word)
//
// Vector subtract with borrow: z = x - y, returning the final borrow
// (0 or 1). Mirrors addVV's structure: 1x cleanup loop for len%4,
// then a 4x unrolled main loop.
//
// Register roles:
//   R4  = count of 4x iterations (len/4)
//   R5/R6/R7 = running pointers into x, y, z
//   R8  = count of 1x iterations (len%4), reused as scratch in the 4x loop
//   R28 = running borrow in {0,1}, returned as c
//   R30 = scratch borrow from subtracting the incoming borrow
//
// Subtract-with-borrow idiom: SGTU a, b, d sets d = 1 if a > b
// (unsigned). b > a holds exactly when a - b borrows, so each SGTU is
// issued BEFORE the matching SUBVU consumes the old value.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVV z_len+8(FP), R4
	MOVV x_base+24(FP), R5
	MOVV y_base+48(FP), R6
	MOVV z_base+0(FP), R7
	// compute unrolled loop lengths
	AND $3, R4, R8	// R8 = len % 4
	SRLV $2, R4	// R4 = len / 4
	XOR R28, R28	// clear carry
loop1:
	BEQ R8, loop1done
loop1cont:
	// unroll 1X
	MOVV 0(R5), R9	// R9 = x[i]
	MOVV 0(R6), R10	// R10 = y[i]
	SGTU R28, R9, R30	// SBCS R10, R9, R9 -- R30 = borrow of x[i]-cr
	SUBVU R28, R9	// ... subtract incoming borrow
	SGTU R10, R9, R28	// ... R28 = borrow of (x[i]-cr)-y[i]
	SUBVU R10, R9	// ... subtract y[i]
	ADDVU R30, R28	// ... total borrow (still 0 or 1)
	MOVV R9, 0(R7)	// z[i] = difference
	ADDVU $8, R5
	ADDVU $8, R6
	ADDVU $8, R7
	SUBVU $1, R8
	BNE R8, loop1cont
loop1done:
loop4:
	BEQ R4, loop4done
loop4cont:
	// unroll 4X
	MOVV 0(R5), R8	// load x[i..i+3]
	MOVV 8(R5), R9
	MOVV 16(R5), R10
	MOVV 24(R5), R11
	MOVV 0(R6), R12	// load y[i..i+3]
	MOVV 8(R6), R13
	MOVV 16(R6), R14
	MOVV 24(R6), R15
	SGTU R28, R8, R30	// SBCS R12, R8, R8
	SUBVU R28, R8	// ...
	SGTU R12, R8, R28	// ...
	SUBVU R12, R8	// ...
	ADDVU R30, R28	// ...
	SGTU R28, R9, R30	// SBCS R13, R9, R9
	SUBVU R28, R9	// ...
	SGTU R13, R9, R28	// ...
	SUBVU R13, R9	// ...
	ADDVU R30, R28	// ...
	SGTU R28, R10, R30	// SBCS R14, R10, R10
	SUBVU R28, R10	// ...
	SGTU R14, R10, R28	// ...
	SUBVU R14, R10	// ...
	ADDVU R30, R28	// ...
	SGTU R28, R11, R30	// SBCS R15, R11, R11
	SUBVU R28, R11	// ...
	SGTU R15, R11, R28	// ...
	SUBVU R15, R11	// ...
	ADDVU R30, R28	// ...
	MOVV R8, 0(R7)	// store z[i..i+3]
	MOVV R9, 8(R7)
	MOVV R10, 16(R7)
	MOVV R11, 24(R7)
	ADDVU $32, R5
	ADDVU $32, R6
	ADDVU $32, R7
	SUBVU $1, R4
	BNE R4, loop4cont
loop4done:
	MOVV R28, c+72(FP)	// return the final borrow
	RET
 156  
// func lshVU(z, x []Word, s uint) (c Word)
//
// Left-shift the word vector: z = x << s, returning the bits shifted
// out of the top word as c. The loop walks from the most significant
// word downward, so the shift is safe when z and x alias (in place).
//
// NOTE(review): assumes 0 < s < 64 so that 64-s is a usable shift
// count — TODO confirm against callers (math/big shift convention).
//
// Register roles:
//   R4 = word count, then count of 4x iterations
//   R5 = s; R9 = 64 - s (complementary shift)
//   R6/R7 = pointers one past the current word of x and z
//   R8 = pending bits: current word << s, waiting to be OR-merged
//        with the high bits of the next lower word
//   R10 = count of 1x iterations, then scratch
TEXT ·lshVU(SB), NOSPLIT, $0
	MOVV z_len+8(FP), R4
	BEQ R4, ret0	// empty vector: carry is 0
	MOVV s+48(FP), R5
	MOVV x_base+24(FP), R6
	MOVV z_base+0(FP), R7
	// run loop backward
	SLLV $3, R4, R8
	ADDVU R8, R6	// R6 = &x[len]
	SLLV $3, R4, R8
	ADDVU R8, R7	// R7 = &z[len]
	// shift first word into carry
	MOVV -8(R6), R8	// R8 = x[len-1]
	MOVV $64, R9
	SUBVU R5, R9	// R9 = 64 - s
	SRLV R9, R8, R10	// R10 = top s bits of x[len-1]
	SLLV R5, R8	// R8 = x[len-1] << s (pending)
	MOVV R10, c+56(FP)	// carry can be stored before the loop runs
	// shift remaining words
	SUBVU $1, R4
	// compute unrolled loop lengths
	AND $3, R4, R10	// R10 = (len-1) % 4
	SRLV $2, R4	// R4 = (len-1) / 4
loop1:
	BEQ R10, loop1done
loop1cont:
	// unroll 1X
	MOVV -16(R6), R11	// next lower word of x
	SRLV R9, R11, R12	// its top s bits ...
	OR R8, R12	// ... merged below the pending bits
	SLLV R5, R11, R8	// new pending bits
	MOVV R12, -8(R7)
	ADDVU $-8, R6
	ADDVU $-8, R7
	SUBVU $1, R10
	BNE R10, loop1cont
loop1done:
loop4:
	BEQ R4, loop4done
loop4cont:
	// unroll 4X
	MOVV -16(R6), R10	// load the next four lower words
	MOVV -24(R6), R11
	MOVV -32(R6), R12
	MOVV -40(R6), R13
	SRLV R9, R10, R14	// merge pending | top bits, word by word
	OR R8, R14
	SLLV R5, R10, R8
	SRLV R9, R11, R10
	OR R8, R10
	SLLV R5, R11, R8
	SRLV R9, R12, R11
	OR R8, R11
	SLLV R5, R12, R8
	SRLV R9, R13, R12
	OR R8, R12
	SLLV R5, R13, R8
	MOVV R14, -8(R7)	// store the four shifted words
	MOVV R10, -16(R7)
	MOVV R11, -24(R7)
	MOVV R12, -32(R7)
	ADDVU $-32, R6
	ADDVU $-32, R7
	SUBVU $1, R4
	BNE R4, loop4cont
loop4done:
	// store final shifted bits
	MOVV R8, -8(R7)	// z[0] = x[0] << s
	RET
ret0:
	MOVV R0, c+56(FP)	// len == 0: no bits shifted out
	RET
 230  
// func rshVU(z, x []Word, s uint) (c Word)
//
// Right-shift the word vector: z = x >> s, returning the bits shifted
// out of the bottom word (left-justified) as c. The loop walks from
// the least significant word upward, so the shift is safe when z and
// x alias (in place). Forward analog of lshVU.
//
// NOTE(review): assumes 0 < s < 64 so that 64-s is a usable shift
// count — TODO confirm against callers (math/big shift convention).
//
// Register roles:
//   R4 = word count, then count of 4x iterations
//   R5 = s; R9 = 64 - s (complementary shift)
//   R6/R7 = pointers to the current word of x and z
//   R8 = pending bits: current word >> s, waiting to be OR-merged
//        with the low bits of the next higher word
//   R10 = count of 1x iterations, then scratch
TEXT ·rshVU(SB), NOSPLIT, $0
	MOVV z_len+8(FP), R4
	BEQ R4, ret0	// empty vector: carry is 0
	MOVV s+48(FP), R5
	MOVV x_base+24(FP), R6
	MOVV z_base+0(FP), R7
	// shift first word into carry
	MOVV 0(R6), R8	// R8 = x[0]
	MOVV $64, R9
	SUBVU R5, R9	// R9 = 64 - s
	SLLV R9, R8, R10	// R10 = low s bits of x[0], left-justified
	SRLV R5, R8	// R8 = x[0] >> s (pending)
	MOVV R10, c+56(FP)	// carry can be stored before the loop runs
	// shift remaining words
	SUBVU $1, R4
	// compute unrolled loop lengths
	AND $3, R4, R10	// R10 = (len-1) % 4
	SRLV $2, R4	// R4 = (len-1) / 4
loop1:
	BEQ R10, loop1done
loop1cont:
	// unroll 1X
	MOVV 8(R6), R11	// next higher word of x
	SLLV R9, R11, R12	// its low s bits, left-justified ...
	OR R8, R12	// ... merged above the pending bits
	SRLV R5, R11, R8	// new pending bits
	MOVV R12, 0(R7)
	ADDVU $8, R6
	ADDVU $8, R7
	SUBVU $1, R10
	BNE R10, loop1cont
loop1done:
loop4:
	BEQ R4, loop4done
loop4cont:
	// unroll 4X
	MOVV 8(R6), R10	// load the next four higher words
	MOVV 16(R6), R11
	MOVV 24(R6), R12
	MOVV 32(R6), R13
	SLLV R9, R10, R14	// merge pending | low bits, word by word
	OR R8, R14
	SRLV R5, R10, R8
	SLLV R9, R11, R10
	OR R8, R10
	SRLV R5, R11, R8
	SLLV R9, R12, R11
	OR R8, R11
	SRLV R5, R12, R8
	SLLV R9, R13, R12
	OR R8, R12
	SRLV R5, R13, R8
	MOVV R14, 0(R7)	// store the four shifted words
	MOVV R10, 8(R7)
	MOVV R11, 16(R7)
	MOVV R12, 24(R7)
	ADDVU $32, R6
	ADDVU $32, R7
	SUBVU $1, R4
	BNE R4, loop4cont
loop4done:
	// store final shifted bits
	MOVV R8, 0(R7)	// z[len-1] = x[len-1] >> s
	RET
ret0:
	MOVV R0, c+56(FP)	// len == 0: no bits shifted out
	RET
 299  
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// z = x*m + a, returning the final carry word c. Each column computes
// the 128-bit product x[i]*m (MULV low half, MULHVU high half), adds
// the running carry into the low half, and chains the high half (plus
// any carry-out) as the next carry.
//
// Register roles:
//   R4 = m; R5 = running carry, seeded with a, returned as c
//   R6 = count of 4x iterations; R9 = count of 1x iterations
//   R7/R8 = running pointers into x and z
//   R11/R12 (1x) and R13/R14 (4x) = low/high product halves
//   R28 = scratch carry bit
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVV m+48(FP), R4
	MOVV a+56(FP), R5
	MOVV z_len+8(FP), R6
	MOVV x_base+24(FP), R7
	MOVV z_base+0(FP), R8
	// compute unrolled loop lengths
	AND $3, R6, R9	// R9 = len % 4
	SRLV $2, R6	// R6 = len / 4
loop1:
	BEQ R9, loop1done
loop1cont:
	// unroll 1X
	MOVV 0(R7), R10	// R10 = x[i]
	// synthetic carry, one column at a time
	MULV R4, R10, R11	// R11 = low 64 bits of x[i]*m
	MULHVU R4, R10, R12	// R12 = high 64 bits of x[i]*m
	ADDVU R5, R11, R10	// ADDS R5, R11, R10 (cr=R28)
	SGTU R5, R10, R28	// ... carry-out of low+carry
	ADDVU R28, R12, R5	// ADC $0, R12, R5 -- high+1 cannot wrap (high <= 2^64-2)
	MOVV R10, 0(R8)	// z[i] = low word
	ADDVU $8, R7
	ADDVU $8, R8
	SUBVU $1, R9
	BNE R9, loop1cont
loop1done:
loop4:
	BEQ R6, loop4done
loop4cont:
	// unroll 4X
	MOVV 0(R7), R9	// load x[i..i+3]
	MOVV 8(R7), R10
	MOVV 16(R7), R11
	MOVV 24(R7), R12
	// synthetic carry, one column at a time
	MULV R4, R9, R13
	MULHVU R4, R9, R14
	ADDVU R5, R13, R9	// ADDS R5, R13, R9 (cr=R28)
	SGTU R5, R9, R28	// ...
	ADDVU R28, R14, R5	// ADC $0, R14, R5
	MULV R4, R10, R13
	MULHVU R4, R10, R14
	ADDVU R5, R13, R10	// ADDS R5, R13, R10 (cr=R28)
	SGTU R5, R10, R28	// ...
	ADDVU R28, R14, R5	// ADC $0, R14, R5
	MULV R4, R11, R13
	MULHVU R4, R11, R14
	ADDVU R5, R13, R11	// ADDS R5, R13, R11 (cr=R28)
	SGTU R5, R11, R28	// ...
	ADDVU R28, R14, R5	// ADC $0, R14, R5
	MULV R4, R12, R13
	MULHVU R4, R12, R14
	ADDVU R5, R13, R12	// ADDS R5, R13, R12 (cr=R28)
	SGTU R5, R12, R28	// ...
	ADDVU R28, R14, R5	// ADC $0, R14, R5
	MOVV R9, 0(R8)	// store z[i..i+3]
	MOVV R10, 8(R8)
	MOVV R11, 16(R8)
	MOVV R12, 24(R8)
	ADDVU $32, R7
	ADDVU $32, R8
	SUBVU $1, R6
	BNE R6, loop4cont
loop4done:
	MOVV R5, c+64(FP)	// return the final carry word
	RET
 367  
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// z = x + y*m + a, returning the final carry word c. Each column
// computes the 128-bit product y[i]*m, folds in x[i] (carrying into
// the high half), then folds in the running carry (carrying into the
// high half again), and chains the high half as the next carry.
//
// Register roles:
//   R4 = m; R5 = running carry, seeded with a, returned as c
//   R6 = count of 4x iterations; R10 = count of 1x iterations
//   R7/R8/R9 = running pointers into x, y, z
//   R13/R14 (1x) and R18/R19 (4x) = low/high product halves
//   R28 = scratch carry bit
TEXT ·addMulVVWW(SB), NOSPLIT, $0
	MOVV m+72(FP), R4
	MOVV a+80(FP), R5
	MOVV z_len+8(FP), R6
	MOVV x_base+24(FP), R7
	MOVV y_base+48(FP), R8
	MOVV z_base+0(FP), R9
	// compute unrolled loop lengths
	AND $3, R6, R10	// R10 = len % 4
	SRLV $2, R6	// R6 = len / 4
loop1:
	BEQ R10, loop1done
loop1cont:
	// unroll 1X
	MOVV 0(R7), R11	// R11 = x[i]
	MOVV 0(R8), R12	// R12 = y[i]
	// synthetic carry, one column at a time
	MULV R4, R12, R13	// R13 = low 64 bits of y[i]*m
	MULHVU R4, R12, R14	// R14 = high 64 bits of y[i]*m
	ADDVU R11, R13	// ADDS R11, R13, R13 (cr=R28) -- fold in x[i]
	SGTU R11, R13, R28	// ...
	ADDVU R28, R14	// ADC $0, R14, R14
	ADDVU R5, R13, R12	// ADDS R5, R13, R12 (cr=R28) -- fold in carry
	SGTU R5, R12, R28	// ...
	ADDVU R28, R14, R5	// ADC $0, R14, R5 -- high half becomes next carry
	MOVV R12, 0(R9)	// z[i] = low word
	ADDVU $8, R7
	ADDVU $8, R8
	ADDVU $8, R9
	SUBVU $1, R10
	BNE R10, loop1cont
loop1done:
loop4:
	BEQ R6, loop4done
loop4cont:
	// unroll 4X
	MOVV 0(R7), R10	// load x[i..i+3]
	MOVV 8(R7), R11
	MOVV 16(R7), R12
	MOVV 24(R7), R13
	MOVV 0(R8), R14	// load y[i..i+3]
	MOVV 8(R8), R15
	MOVV 16(R8), R16
	MOVV 24(R8), R17
	// synthetic carry, one column at a time
	MULV R4, R14, R18
	MULHVU R4, R14, R19
	ADDVU R10, R18	// ADDS R10, R18, R18 (cr=R28)
	SGTU R10, R18, R28	// ...
	ADDVU R28, R19	// ADC $0, R19, R19
	ADDVU R5, R18, R14	// ADDS R5, R18, R14 (cr=R28)
	SGTU R5, R14, R28	// ...
	ADDVU R28, R19, R5	// ADC $0, R19, R5
	MULV R4, R15, R18
	MULHVU R4, R15, R19
	ADDVU R11, R18	// ADDS R11, R18, R18 (cr=R28)
	SGTU R11, R18, R28	// ...
	ADDVU R28, R19	// ADC $0, R19, R19
	ADDVU R5, R18, R15	// ADDS R5, R18, R15 (cr=R28)
	SGTU R5, R15, R28	// ...
	ADDVU R28, R19, R5	// ADC $0, R19, R5
	MULV R4, R16, R18
	MULHVU R4, R16, R19
	ADDVU R12, R18	// ADDS R12, R18, R18 (cr=R28)
	SGTU R12, R18, R28	// ...
	ADDVU R28, R19	// ADC $0, R19, R19
	ADDVU R5, R18, R16	// ADDS R5, R18, R16 (cr=R28)
	SGTU R5, R16, R28	// ...
	ADDVU R28, R19, R5	// ADC $0, R19, R5
	MULV R4, R17, R18
	MULHVU R4, R17, R19
	ADDVU R13, R18	// ADDS R13, R18, R18 (cr=R28)
	SGTU R13, R18, R28	// ...
	ADDVU R28, R19	// ADC $0, R19, R19
	ADDVU R5, R18, R17	// ADDS R5, R18, R17 (cr=R28)
	SGTU R5, R17, R28	// ...
	ADDVU R28, R19, R5	// ADC $0, R19, R5
	MOVV R14, 0(R9)	// store z[i..i+3]
	MOVV R15, 8(R9)
	MOVV R16, 16(R9)
	MOVV R17, 24(R9)
	ADDVU $32, R7
	ADDVU $32, R8
	ADDVU $32, R9
	SUBVU $1, R6
	BNE R6, loop4cont
loop4done:
	MOVV R5, c+88(FP)	// return the final carry word
	RET
 458