arith_arm64.s raw

   1  // Copyright 2025 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
   6  
   7  //go:build !math_big_pure_go
   8  
   9  #include "textflag.h"
  10  
  11  // func addVV(z, x, y []Word) (c Word)
      //
      // Adds x and y word by word with carry propagation, stores each sum word
      // into z, and returns the final carry (0 or 1) in c. The word count comes
      // from len(z) only; the lengths of x and y are never read here, so callers
      // presumably guarantee len(x), len(y) >= len(z) -- confirm at call sites.
      //
      // Register roles:
      //   R0         = number of 4-word groups (len(z) / 4)
      //   R1, R2, R3 = advancing pointers into x, y, z
      //   R4         = leftover single words (len(z) % 4), handled first
      // The carry flag is live from the ADDS below through the final ADC; the
      // loop counters use SUB and CBZ/CBNZ, which do not write the flags.
  12  TEXT ·addVV(SB), NOSPLIT, $0
  13  	MOVD z_len+8(FP), R0	// R0 = len(z)
  14  	MOVD x_base+24(FP), R1	// R1 = x base pointer
  15  	MOVD y_base+48(FP), R2	// R2 = y base pointer
  16  	MOVD z_base+0(FP), R3	// R3 = z base pointer
  17  	// compute unrolled loop lengths
  18  	AND $3, R0, R4	// R4 = len(z) % 4
  19  	LSR $2, R0	// R0 = len(z) / 4
  20  	ADDS ZR, R0	// clear carry (adding zero leaves R0 unchanged)
  21  loop1:
  22  	CBZ R4, loop1done	// no leftover words: skip the 1X loop
  23  loop1cont:
  24  	// unroll 1X
  25  	MOVD.P 8(R1), R5	// R5 = next x word; x pointer += 8
  26  	MOVD.P 8(R2), R6	// R6 = next y word; y pointer += 8
  27  	ADCS R6, R5	// R5 = R5 + R6 + carry; carry updated
  28  	MOVD.P R5, 8(R3)	// store sum word; z pointer += 8
  29  	SUB $1, R4
  30  	CBNZ R4, loop1cont
  31  loop1done:
  32  loop4:
  33  	CBZ R0, loop4done	// no full groups of 4: skip the 4X loop
  34  loop4cont:
  35  	// unroll 4X
  36  	LDP.P 32(R1), (R4, R5)	// first two x words of the group; x pointer += 32
  37  	LDP -16(R1), (R6, R7)	// last two x words (R1 has already advanced)
  38  	LDP.P 32(R2), (R8, R9)	// first two y words of the group; y pointer += 32
  39  	LDP -16(R2), (R10, R11)	// last two y words
  40  	ADCS R8, R4	// four chained add-with-carry steps, lowest word first
  41  	ADCS R9, R5
  42  	ADCS R10, R6
  43  	ADCS R11, R7
  44  	STP.P (R4, R5), 32(R3)	// store first two sum words; z pointer += 32
  45  	STP (R6, R7), -16(R3)	// store last two sum words
  46  	SUB $1, R0
  47  	CBNZ R0, loop4cont
  48  loop4done:
  49  	ADC ZR, ZR, R1	// save & convert add carry: R1 = 0 + 0 + carry
  50  	MOVD R1, c+72(FP)	// return c
  51  	RET
  52  
  53  // func subVV(z, x, y []Word) (c Word)
      //
      // Subtracts y from x word by word with borrow propagation, stores each
      // difference word into z, and returns the final borrow (0 or 1) in c. The
      // word count comes from len(z) only; the lengths of x and y are never read
      // here, so callers presumably guarantee len(x), len(y) >= len(z) -- confirm
      // at call sites.
      //
      // Register roles:
      //   R0         = number of 4-word groups (len(z) / 4)
      //   R1, R2, R3 = advancing pointers into x, y, z
      //   R4         = leftover single words (len(z) % 4), handled first
      // On arm64 the carry flag after a subtraction is inverted: C=1 means "no
      // borrow". The flag is live from the SUBS below through the final SBC; the
      // loop counters use SUB and CBZ/CBNZ, which do not write the flags.
  54  TEXT ·subVV(SB), NOSPLIT, $0
  55  	MOVD z_len+8(FP), R0	// R0 = len(z)
  56  	MOVD x_base+24(FP), R1	// R1 = x base pointer
  57  	MOVD y_base+48(FP), R2	// R2 = y base pointer
  58  	MOVD z_base+0(FP), R3	// R3 = z base pointer
  59  	// compute unrolled loop lengths
  60  	AND $3, R0, R4	// R4 = len(z) % 4
  61  	LSR $2, R0	// R0 = len(z) / 4
  62  	SUBS ZR, R0	// clear carry, i.e. start with no borrow (R0 is unchanged)
  63  loop1:
  64  	CBZ R4, loop1done	// no leftover words: skip the 1X loop
  65  loop1cont:
  66  	// unroll 1X
  67  	MOVD.P 8(R1), R5	// R5 = next x word; x pointer += 8
  68  	MOVD.P 8(R2), R6	// R6 = next y word; y pointer += 8
  69  	SBCS R6, R5	// R5 = R5 - R6 - borrow; borrow updated
  70  	MOVD.P R5, 8(R3)	// store difference word; z pointer += 8
  71  	SUB $1, R4
  72  	CBNZ R4, loop1cont
  73  loop1done:
  74  loop4:
  75  	CBZ R0, loop4done	// no full groups of 4: skip the 4X loop
  76  loop4cont:
  77  	// unroll 4X
  78  	LDP.P 32(R1), (R4, R5)	// first two x words of the group; x pointer += 32
  79  	LDP -16(R1), (R6, R7)	// last two x words (R1 has already advanced)
  80  	LDP.P 32(R2), (R8, R9)	// first two y words of the group; y pointer += 32
  81  	LDP -16(R2), (R10, R11)	// last two y words
  82  	SBCS R8, R4	// four chained subtract-with-borrow steps, lowest word first
  83  	SBCS R9, R5
  84  	SBCS R10, R6
  85  	SBCS R11, R7
  86  	STP.P (R4, R5), 32(R3)	// store first two difference words; z pointer += 32
  87  	STP (R6, R7), -16(R3)	// store last two difference words
  88  	SUB $1, R0
  89  	CBNZ R0, loop4cont
  90  loop4done:
  91  	SBC R1, R1	// save carry: R1 = R1 - R1 - borrow, i.e. 0 or -1
  92  	SUB R1, ZR, R1	// convert sub carry: R1 = 0 - R1, i.e. 0 or 1
  93  	MOVD R1, c+72(FP)	// return c
  94  	RET
  95  
  96  // func lshVU(z, x []Word, s uint) (c Word)
      //
      // Shifts the multi-word value in x left by s bits, writes the result to z,
      // and returns in c the bits shifted out of the top word (top word >> (64-s)).
      // Words are processed from the highest index down to index 0. The word
      // count comes from len(z); when it is 0 nothing is loaded or stored and c
      // is 0.
      //
      // NOTE(review): 64-s is computed and used as a register shift count, which
      // looks like it expects 0 < s < 64 -- confirm against callers.
      //
      // Register roles:
      //   R0     = words remaining, later the number of 4-word groups
      //   R1     = s, R5 = 64 - s
      //   R2, R3 = pointers into x and z, starting one past the last word and
      //            moving down
      //   R4     = previous (higher) word already shifted left by s, waiting to
      //            be ORed with the spill-over bits of the next lower word
      //   R6     = leftover single-word count for the 1X loop
  97  TEXT ·lshVU(SB), NOSPLIT, $0
  98  	MOVD z_len+8(FP), R0	// R0 = len(z)
  99  	CBZ R0, ret0	// empty input: return c = 0
 100  	MOVD s+48(FP), R1	// R1 = s
 101  	MOVD x_base+24(FP), R2	// R2 = x base pointer
 102  	MOVD z_base+0(FP), R3	// R3 = z base pointer
 103  	// run loop backward
 104  	ADD R0<<3, R2, R2	// R2 = x base + 8*len(z), one past the last word
 105  	ADD R0<<3, R3, R3	// R3 = z base + 8*len(z)
 106  	// shift first word into carry
 107  	MOVD.W -8(R2), R4	// x pointer -= 8, then R4 = top word of x
 108  	MOVD $64, R5
 109  	SUB R1, R5	// R5 = 64 - s
 110  	LSR R5, R4, R6	// R6 = bits shifted out of the top word
 111  	LSL R1, R4	// R4 = top word << s
 112  	MOVD R6, c+56(FP)	// c is stored now; R6 is reused as a counter below
 113  	// shift remaining words
 114  	SUB $1, R0	// the top word has already been loaded
 115  	// compute unrolled loop lengths
 116  	AND $3, R0, R6	// R6 = remaining words % 4
 117  	LSR $2, R0	// R0 = remaining words / 4
 118  loop1:
 119  	CBZ R6, loop1done	// no leftover words: skip the 1X loop
 120  loop1cont:
 121  	// unroll 1X
 122  	MOVD.W -8(R2), R7	// x pointer -= 8, then R7 = next lower word
 123  	LSR R5, R7, R8	// R8 = bits of R7 that spill into the word above
 124  	ORR R4, R8	// R8 = completed result word for the position above
 125  	LSL R1, R7, R4	// R4 = R7 << s, pending for the next step
 126  	MOVD.W R8, -8(R3)	// z pointer -= 8, then store R8
 127  	SUB $1, R6
 128  	CBNZ R6, loop1cont
 129  loop1done:
 130  loop4:
 131  	CBZ R0, loop4done	// no full groups of 4: skip the 4X loop
 132  loop4cont:
 133  	// unroll 4X
 134  	LDP.W -32(R2), (R9, R8)	// x pointer -= 32; R9, R8 = two lowest words of the group
 135  	LDP 16(R2), (R7, R6)	// R7, R6 = two highest words of the group
 136  	LSR R5, R6, R10	// highest word first: R10 = spill-over | pending
 137  	ORR R4, R10
 138  	LSL R1, R6, R4
 139  	LSR R5, R7, R6	// second highest word; its result goes in R6
 140  	ORR R4, R6
 141  	LSL R1, R7, R4
 142  	LSR R5, R8, R7	// third word; its result goes in R7
 143  	ORR R4, R7
 144  	LSL R1, R8, R4
 145  	LSR R5, R9, R8	// lowest word; its result goes in R8
 146  	ORR R4, R8
 147  	LSL R1, R9, R4	// R4 = lowest word << s, pending for the next group
 148  	STP.W (R8, R7), -32(R3)	// z pointer -= 32; store the two lower results
 149  	STP (R6, R10), 16(R3)	// store the two higher results
 150  	SUB $1, R0
 151  	CBNZ R0, loop4cont
 152  loop4done:
 153  	// store final shifted bits
 154  	MOVD.W R4, -8(R3)	// lowest result word = lowest x word << s
 155  	RET
 156  ret0:
 157  	MOVD ZR, c+56(FP)	// len(z) == 0: c = 0
 158  	RET
 159  
 160  // func rshVU(z, x []Word, s uint) (c Word)
      //
      // Shifts the multi-word value in x right by s bits, writes the result to z,
      // and returns in c the bits shifted out of the lowest word, positioned at
      // the top of the word (lowest word << (64-s)). Words are processed from
      // index 0 upward. The word count comes from len(z); when it is 0 nothing is
      // loaded or stored and c is 0.
      //
      // NOTE(review): 64-s is computed and used as a register shift count, which
      // looks like it expects 0 < s < 64 -- confirm against callers.
      //
      // Register roles:
      //   R0     = words remaining, later the number of 4-word groups
      //   R1     = s, R5 = 64 - s
      //   R2, R3 = advancing pointers into x and z
      //   R4     = previous (lower) word already shifted right by s, waiting to
      //            be ORed with the spill-over bits of the next higher word
      //   R6     = leftover single-word count for the 1X loop
 161  TEXT ·rshVU(SB), NOSPLIT, $0
 162  	MOVD z_len+8(FP), R0	// R0 = len(z)
 163  	CBZ R0, ret0	// empty input: return c = 0
 164  	MOVD s+48(FP), R1	// R1 = s
 165  	MOVD x_base+24(FP), R2	// R2 = x base pointer
 166  	MOVD z_base+0(FP), R3	// R3 = z base pointer
 167  	// shift first word into carry
 168  	MOVD.P 8(R2), R4	// R4 = lowest word of x; x pointer += 8
 169  	MOVD $64, R5
 170  	SUB R1, R5	// R5 = 64 - s
 171  	LSL R5, R4, R6	// R6 = bits shifted out of the lowest word
 172  	LSR R1, R4	// R4 = lowest word >> s
 173  	MOVD R6, c+56(FP)	// c is stored now; R6 is reused as a counter below
 174  	// shift remaining words
 175  	SUB $1, R0	// the lowest word has already been loaded
 176  	// compute unrolled loop lengths
 177  	AND $3, R0, R6	// R6 = remaining words % 4
 178  	LSR $2, R0	// R0 = remaining words / 4
 179  loop1:
 180  	CBZ R6, loop1done	// no leftover words: skip the 1X loop
 181  loop1cont:
 182  	// unroll 1X
 183  	MOVD.P 8(R2), R7	// R7 = next higher word; x pointer += 8
 184  	LSL R5, R7, R8	// R8 = bits of R7 that spill into the word below
 185  	ORR R4, R8	// R8 = completed result word for the position below
 186  	LSR R1, R7, R4	// R4 = R7 >> s, pending for the next step
 187  	MOVD.P R8, 8(R3)	// store R8; z pointer += 8
 188  	SUB $1, R6
 189  	CBNZ R6, loop1cont
 190  loop1done:
 191  loop4:
 192  	CBZ R0, loop4done	// no full groups of 4: skip the 4X loop
 193  loop4cont:
 194  	// unroll 4X
 195  	LDP.P 32(R2), (R6, R7)	// R6, R7 = two lowest words of the group; x pointer += 32
 196  	LDP -16(R2), (R8, R9)	// R8, R9 = two highest words (R2 has already advanced)
 197  	LSL R5, R6, R10	// lowest word first: R10 = spill-over | pending
 198  	ORR R4, R10
 199  	LSR R1, R6, R4
 200  	LSL R5, R7, R6	// second word; its result goes in R6
 201  	ORR R4, R6
 202  	LSR R1, R7, R4
 203  	LSL R5, R8, R7	// third word; its result goes in R7
 204  	ORR R4, R7
 205  	LSR R1, R8, R4
 206  	LSL R5, R9, R8	// highest word; its result goes in R8
 207  	ORR R4, R8
 208  	LSR R1, R9, R4	// R4 = highest word >> s, pending for the next group
 209  	STP.P (R10, R6), 32(R3)	// store the two lower results; z pointer += 32
 210  	STP (R7, R8), -16(R3)	// store the two higher results
 211  	SUB $1, R0
 212  	CBNZ R0, loop4cont
 213  loop4done:
 214  	// store final shifted bits
 215  	MOVD.P R4, 8(R3)	// highest result word = highest x word >> s
 216  	RET
 217  ret0:
 218  	MOVD ZR, c+56(FP)	// len(z) == 0: c = 0
 219  	RET
 220  
 221  // func mulAddVWW(z, x []Word, m, a Word) (c Word)
      //
      // Multiplies the multi-word value in x by the single word m, adds the single
      // word a, stores the low words of the result into z, and returns the final
      // carry word in c. The word count comes from len(z) only; len(x) is never
      // read here, so callers presumably guarantee len(x) >= len(z) -- confirm at
      // call sites.
      //
      // Register roles:
      //   R0     = m
      //   R1     = running carry word (starts as a, ends as c)
      //   R2     = number of 8-word groups (len(z) / 8)
      //   R3, R4 = advancing pointers into x and z
      //   R5     = leftover single words (len(z) % 8), handled first
      // UMULH and MUL produce the high and low halves of each 64x64-bit product.
      // They sit between the ADDS/ADCS instructions of the carry chain, which
      // relies on them leaving the flags untouched.
 222  TEXT ·mulAddVWW(SB), NOSPLIT, $0
 223  	MOVD m+48(FP), R0	// R0 = m
 224  	MOVD a+56(FP), R1	// R1 = a, the initial carry word
 225  	MOVD z_len+8(FP), R2	// R2 = len(z)
 226  	MOVD x_base+24(FP), R3	// R3 = x base pointer
 227  	MOVD z_base+0(FP), R4	// R4 = z base pointer
 228  	// compute unrolled loop lengths
 229  	AND $7, R2, R5	// R5 = len(z) % 8
 230  	LSR $3, R2	// R2 = len(z) / 8
 231  loop1:
 232  	CBZ R5, loop1done	// no leftover words: skip the 1X loop
 233  loop1cont:
 234  	// unroll 1X
 235  	MOVD.P 8(R3), R6	// R6 = next x word; x pointer += 8
 236  	// multiply
 237  	UMULH R0, R6, R7	// R7 = high 64 bits of R6 * m
 238  	MUL R0, R6	// R6 = low 64 bits of R6 * m
 239  	ADDS R1, R6	// add incoming carry word; sets the carry flag
 240  	ADC ZR, R7, R1	// R1 = high half + carry flag = outgoing carry word
 241  	MOVD.P R6, 8(R4)	// store result word; z pointer += 8
 242  	SUB $1, R5
 243  	CBNZ R5, loop1cont
 244  loop1done:
 245  loop8:
 246  	CBZ R2, loop8done	// no full groups of 8: skip the 8X loop
 247  loop8cont:
 248  	// unroll 8X
 249  	LDP.P 64(R3), (R5, R6)	// load eight x words into R5..R12; x pointer += 64
 250  	LDP -48(R3), (R7, R8)
 251  	LDP -32(R3), (R9, R10)
 252  	LDP -16(R3), (R11, R12)
 253  	// multiply
      	// Each word's low product half receives the previous word's high half plus
      	// the carry flag; R13 and R14 alternate as the holder of the high half.
 254  	UMULH R0, R5, R13
 255  	MUL R0, R5
 256  	ADDS R1, R5	// first word: add the incoming carry word, start the chain
 257  	UMULH R0, R6, R14
 258  	MUL R0, R6
 259  	ADCS R13, R6
 260  	UMULH R0, R7, R13
 261  	MUL R0, R7
 262  	ADCS R14, R7
 263  	UMULH R0, R8, R14
 264  	MUL R0, R8
 265  	ADCS R13, R8
 266  	UMULH R0, R9, R13
 267  	MUL R0, R9
 268  	ADCS R14, R9
 269  	UMULH R0, R10, R14
 270  	MUL R0, R10
 271  	ADCS R13, R10
 272  	UMULH R0, R11, R13
 273  	MUL R0, R11
 274  	ADCS R14, R11
 275  	UMULH R0, R12, R14
 276  	MUL R0, R12
 277  	ADCS R13, R12
 278  	ADC ZR, R14, R1	// R1 = last high half + carry flag = outgoing carry word
 279  	STP.P (R5, R6), 64(R4)	// store eight result words; z pointer += 64
 280  	STP (R7, R8), -48(R4)
 281  	STP (R9, R10), -32(R4)
 282  	STP (R11, R12), -16(R4)
 283  	SUB $1, R2
 284  	CBNZ R2, loop8cont
 285  loop8done:
 286  	MOVD R1, c+64(FP)	// return the final carry word
 287  	RET
 288  
 289  // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
      //
      // Computes x + y*m + a over multi-word values: each y word is multiplied by
      // the single word m, the running carry word (initially a) and the matching
      // x word are added, and the low word is stored into z. The final carry word
      // is returned in c. The word count comes from len(z) only; the lengths of x
      // and y are never read here, so callers presumably guarantee
      // len(x), len(y) >= len(z) -- confirm at call sites.
      //
      // Register roles:
      //   R0         = m
      //   R1         = running carry word (starts as a, ends as c)
      //   R2         = number of 8-word groups (len(z) / 8)
      //   R3, R4, R5 = advancing pointers into x, y, z
      //   R6         = leftover single words (len(z) % 8), handled first
      // The 8X loop uses R6-R17 and R19-R24 as temporaries; R18 is not used,
      // presumably because it is reserved as the platform register -- confirm.
 290  TEXT ·addMulVVWW(SB), NOSPLIT, $0
 291  	MOVD m+72(FP), R0	// R0 = m
 292  	MOVD a+80(FP), R1	// R1 = a, the initial carry word
 293  	MOVD z_len+8(FP), R2	// R2 = len(z)
 294  	MOVD x_base+24(FP), R3	// R3 = x base pointer
 295  	MOVD y_base+48(FP), R4	// R4 = y base pointer
 296  	MOVD z_base+0(FP), R5	// R5 = z base pointer
 297  	// compute unrolled loop lengths
 298  	AND $7, R2, R6	// R6 = len(z) % 8
 299  	LSR $3, R2	// R2 = len(z) / 8
 300  loop1:
 301  	CBZ R6, loop1done	// no leftover words: skip the 1X loop
 302  loop1cont:
 303  	// unroll 1X
 304  	MOVD.P 8(R3), R7	// R7 = next x word; x pointer += 8
 305  	MOVD.P 8(R4), R8	// R8 = next y word; y pointer += 8
 306  	// multiply
 307  	UMULH R0, R8, R9	// R9 = high 64 bits of R8 * m
 308  	MUL R0, R8	// R8 = low 64 bits of R8 * m
 309  	ADDS R1, R8	// add incoming carry word; sets the carry flag
 310  	ADC ZR, R9, R1	// R1 = high half + carry flag
 311  	// add
 312  	ADDS R7, R8	// add the x word; sets the carry flag
 313  	ADC ZR, R1	// fold that carry into the outgoing carry word
 314  	MOVD.P R8, 8(R5)	// store result word; z pointer += 8
 315  	SUB $1, R6
 316  	CBNZ R6, loop1cont
 317  loop1done:
 318  loop8:
 319  	CBZ R2, loop8done	// no full groups of 8: skip the 8X loop
 320  loop8cont:
 321  	// unroll 8X
 322  	LDP.P 64(R3), (R6, R7)	// load eight x words into R6..R13; x pointer += 64
 323  	LDP -48(R3), (R8, R9)
 324  	LDP -32(R3), (R10, R11)
 325  	LDP -16(R3), (R12, R13)
 326  	LDP.P 64(R4), (R14, R15)	// load eight y words into R14..R17, R19..R22; y pointer += 64
 327  	LDP -48(R4), (R16, R17)
 328  	LDP -32(R4), (R19, R20)
 329  	LDP -16(R4), (R21, R22)
 330  	// multiply
      	// First carry chain: each y word's low product half receives the previous
      	// word's high half plus the carry flag; R23 and R24 alternate as the holder
      	// of the high half.
 331  	UMULH R0, R14, R23
 332  	MUL R0, R14
 333  	ADDS R1, R14	// first word: add the incoming carry word, start the chain
 334  	UMULH R0, R15, R24
 335  	MUL R0, R15
 336  	ADCS R23, R15
 337  	UMULH R0, R16, R23
 338  	MUL R0, R16
 339  	ADCS R24, R16
 340  	UMULH R0, R17, R24
 341  	MUL R0, R17
 342  	ADCS R23, R17
 343  	UMULH R0, R19, R23
 344  	MUL R0, R19
 345  	ADCS R24, R19
 346  	UMULH R0, R20, R24
 347  	MUL R0, R20
 348  	ADCS R23, R20
 349  	UMULH R0, R21, R23
 350  	MUL R0, R21
 351  	ADCS R24, R21
 352  	UMULH R0, R22, R24
 353  	MUL R0, R22
 354  	ADCS R23, R22
 355  	ADC ZR, R24, R1	// R1 = last high half + carry flag
 356  	// add
      	// Second carry chain: add the eight x words to the eight product words.
 357  	ADDS R6, R14
 358  	ADCS R7, R15
 359  	ADCS R8, R16
 360  	ADCS R9, R17
 361  	ADCS R10, R19
 362  	ADCS R11, R20
 363  	ADCS R12, R21
 364  	ADCS R13, R22
 365  	ADC ZR, R1	// fold the second chain's carry into the outgoing carry word
 366  	STP.P (R14, R15), 64(R5)	// store eight result words; z pointer += 64
 367  	STP (R16, R17), -48(R5)
 368  	STP (R19, R20), -32(R5)
 369  	STP (R21, R22), -16(R5)
 370  	SUB $1, R2
 371  	CBNZ R2, loop8cont
 372  loop8done:
 373  	MOVD R1, c+88(FP)	// return the final carry word
 374  	RET
 375