// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

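// This file provides three implementations of xorBytes, computing
// dst[i] = a[i] ^ b[i] for n bytes: a scalar version and two vector
// versions using the 128-bit LSX and 256-bit LASX extensions. The
// caller is expected to pick one based on CPU feature detection. All
// three assume n > 0: with n == 0 the SMALL_TAIL dispatch would still
// fall into xor_1 and move one byte.
//
// Register usage in all three functions: R4 = dst, R5 = a, R6 = b,
// R7 = remaining byte count, R8 = scratch for length comparisons.

// SMALL_TAIL dispatches short inputs: SGTU sets R8 to 1 when the
// immediate is strictly greater than R7, so each instruction pair
// branches to the largest tail step that still fits (1, 2, 4 or 8
// bytes for n < 2, 4, 8 or 16 respectively).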
#define SMALL_TAIL \
	SGTU	$2, R7, R8; \
	BNE	R8, xor_1; \
	SGTU	$4, R7, R8; \
	BNE	R8, xor_2; \
	SGTU	$8, R7, R8; \
	BNE	R8, xor_4; \
	SGTU	$16, R7, R8; \
	BNE	R8, xor_8; \

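// SMALL finishes the remaining 1 to 15 bytes: at most one 8-byte, one
// 4-byte, one 2-byte and one 1-byte step, re-checking the remaining
// length before each step and exiting through end as soon as R7
// reaches zero.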
#define SMALL \
xor_8_check:; \
	SGTU	$8, R7, R8; \
	BNE	R8, xor_4_check; \
xor_8:; \
	SUBV	$8, R7; \
	MOVV	(R5), R10; \
	MOVV	(R6), R11; \
	XOR	R10, R11; \
	MOVV	R11, (R4); \
	ADDV	$8, R5; \
	ADDV	$8, R6; \
	ADDV	$8, R4; \
	BEQ	R7, R0, end; \
xor_4_check:; \
	SGTU	$4, R7, R8; \
	BNE	R8, xor_2_check; \
xor_4:; \
	SUBV	$4, R7; \
	MOVW	(R5), R10; \
	MOVW	(R6), R11; \
	XOR	R10, R11; \
	MOVW	R11, (R4); \
	ADDV	$4, R5; \
	ADDV	$4, R6; \
	ADDV	$4, R4; \
	BEQ	R7, R0, end; \
xor_2_check:; \
	SGTU	$2, R7, R8; \
	BNE	R8, xor_1; \
xor_2:; \
	SUBV	$2, R7; \
	MOVH	(R5), R10; \
	MOVH	(R6), R11; \
	XOR	R10, R11; \
	MOVH	R11, (R4); \
	ADDV	$2, R5; \
	ADDV	$2, R6; \
	ADDV	$2, R4; \
	BEQ	R7, R0, end; \
xor_1:; \
	MOVB	(R5), R10; \
	MOVB	(R6), R11; \
	XOR	R10, R11; \
	MOVB	R11, (R4); \

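// xorBytesBasic uses only general-purpose registers. The main loop
// XORs 64 bytes per iteration through R10-R17, followed by single
// 32- and 16-byte blocks, with SMALL covering the final 15 bytes or
// fewer.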
// func xorBytesBasic(dst, a, b *byte, n int)
TEXT ·xorBytesBasic(SB), NOSPLIT, $0
	MOVV	dst+0(FP), R4
	MOVV	a+8(FP), R5
	MOVV	b+16(FP), R6
	MOVV	n+24(FP), R7

	SMALL_TAIL

xor_64_check:
	SGTU	$64, R7, R8	// R8 = 1 if n < 64
	BNE	R8, xor_32_check
xor_64_loop:
	SUBV	$64, R7
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	16(R5), R12
	MOVV	24(R5), R13
	MOVV	(R6), R14
	MOVV	8(R6), R15
	MOVV	16(R6), R16
	MOVV	24(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, (R4)
	MOVV	R15, 8(R4)
	MOVV	R16, 16(R4)
	MOVV	R17, 24(R4)
	MOVV	32(R5), R10
	MOVV	40(R5), R11
	MOVV	48(R5), R12
	MOVV	56(R5), R13
	MOVV	32(R6), R14
	MOVV	40(R6), R15
	MOVV	48(R6), R16
	MOVV	56(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, 32(R4)
	MOVV	R15, 40(R4)
	MOVV	R16, 48(R4)
	MOVV	R17, 56(R4)
	SGTU	$64, R7, R8
	ADDV	$64, R5
	ADDV	$64, R6
	ADDV	$64, R4
	BEQ	R8, xor_64_loop	// continue while n >= 64
	BEQ	R7, end		// n == 0, done

xor_32_check:
	SGTU	$32, R7, R8	// R8 = 1 if n < 32
	BNE	R8, xor_16_check
xor_32:
	SUBV	$32, R7
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	16(R5), R12
	MOVV	24(R5), R13
	MOVV	(R6), R14
	MOVV	8(R6), R15
	MOVV	16(R6), R16
	MOVV	24(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, (R4)
	MOVV	R15, 8(R4)
	MOVV	R16, 16(R4)
	MOVV	R17, 24(R4)
	ADDV	$32, R5
	ADDV	$32, R6
	ADDV	$32, R4
	BEQ	R7, R0, end

xor_16_check:
	SGTU	$16, R7, R8	// R8 = 1 if n < 16
	BNE	R8, xor_8_check
xor_16:
	SUBV	$16, R7
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	(R6), R12
	MOVV	8(R6), R13
	XOR	R10, R12
	XOR	R11, R13
	MOVV	R12, (R4)
	MOVV	R13, 8(R4)
	ADDV	$16, R5
	ADDV	$16, R6
	ADDV	$16, R4
	BEQ	R7, R0, end

	SMALL
end:
	RET

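// xorBytesLSX requires the 128-bit LSX vector extension. The main loop
// XORs 128 bytes per iteration through eight 128-bit V registers,
// followed by single 64-, 32- and 16-byte vector blocks; anything
// below 16 bytes goes through the scalar SMALL path.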
// func xorBytesLSX(dst, a, b *byte, n int)
TEXT ·xorBytesLSX(SB), NOSPLIT, $0
	MOVV	dst+0(FP), R4
	MOVV	a+8(FP), R5
	MOVV	b+16(FP), R6
	MOVV	n+24(FP), R7

	SMALL_TAIL

xor_128_lsx_check:
	SGTU	$128, R7, R8	// R8 = 1 if n < 128
	BNE	R8, xor_64_lsx_check
xor_128_lsx_loop:
	SUBV	$128, R7
	VMOVQ	(R5), V0
	VMOVQ	16(R5), V1
	VMOVQ	32(R5), V2
	VMOVQ	48(R5), V3
	VMOVQ	64(R5), V4
	VMOVQ	80(R5), V5
	VMOVQ	96(R5), V6
	VMOVQ	112(R5), V7
	VMOVQ	(R6), V8
	VMOVQ	16(R6), V9
	VMOVQ	32(R6), V10
	VMOVQ	48(R6), V11
	VMOVQ	64(R6), V12
	VMOVQ	80(R6), V13
	VMOVQ	96(R6), V14
	VMOVQ	112(R6), V15
	VXORV	V0, V8, V8
	VXORV	V1, V9, V9
	VXORV	V2, V10, V10
	VXORV	V3, V11, V11
	VXORV	V4, V12, V12
	VXORV	V5, V13, V13
	VXORV	V6, V14, V14
	VXORV	V7, V15, V15
	VMOVQ	V8, (R4)
	VMOVQ	V9, 16(R4)
	VMOVQ	V10, 32(R4)
	VMOVQ	V11, 48(R4)
	VMOVQ	V12, 64(R4)
	VMOVQ	V13, 80(R4)
	VMOVQ	V14, 96(R4)
	VMOVQ	V15, 112(R4)
	SGTU	$128, R7, R8
	ADDV	$128, R5
	ADDV	$128, R6
	ADDV	$128, R4
	BEQ	R8, xor_128_lsx_loop	// continue while n >= 128
	BEQ	R7, end			// n == 0, done

xor_64_lsx_check:
	SGTU	$64, R7, R8	// R8 = 1 if n < 64
	BNE	R8, xor_32_lsx_check
xor_64_lsx:
	SUBV	$64, R7
	VMOVQ	(R5), V0
	VMOVQ	16(R5), V1
	VMOVQ	32(R5), V2
	VMOVQ	48(R5), V3
	VMOVQ	(R6), V4
	VMOVQ	16(R6), V5
	VMOVQ	32(R6), V6
	VMOVQ	48(R6), V7
	VXORV	V0, V4, V4
	VXORV	V1, V5, V5
	VXORV	V2, V6, V6
	VXORV	V3, V7, V7
	VMOVQ	V4, (R4)
	VMOVQ	V5, 16(R4)
	VMOVQ	V6, 32(R4)
	VMOVQ	V7, 48(R4)
	ADDV	$64, R5
	ADDV	$64, R6
	ADDV	$64, R4
	BEQ	R7, end

xor_32_lsx_check:
	SGTU	$32, R7, R8	// R8 = 1 if n < 32
	BNE	R8, xor_16_lsx_check
xor_32_lsx:
	SUBV	$32, R7
	VMOVQ	(R5), V0
	VMOVQ	16(R5), V1
	VMOVQ	(R6), V2
	VMOVQ	16(R6), V3
	VXORV	V0, V2, V2
	VXORV	V1, V3, V3
	VMOVQ	V2, (R4)
	VMOVQ	V3, 16(R4)
	ADDV	$32, R5
	ADDV	$32, R6
	ADDV	$32, R4
	BEQ	R7, end

xor_16_lsx_check:
	SGTU	$16, R7, R8	// R8 = 1 if n < 16
	BNE	R8, xor_8_check
xor_16_lsx:
	SUBV	$16, R7
	VMOVQ	(R5), V0
	VMOVQ	(R6), V1
	VXORV	V0, V1, V1
	VMOVQ	V1, (R4)
	ADDV	$16, R5
	ADDV	$16, R6
	ADDV	$16, R4
	BEQ	R7, end

	SMALL
end:
	RET

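// xorBytesLASX requires the 256-bit LASX vector extension. The main
// loop XORs 256 bytes per iteration through eight 256-bit X registers,
// followed by single 128-, 64- and 32-byte blocks; a final 16-byte
// chunk is handled with one 128-bit LSX vector before the scalar SMALL
// path finishes the tail.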
// func xorBytesLASX(dst, a, b *byte, n int)
TEXT ·xorBytesLASX(SB), NOSPLIT, $0
	MOVV	dst+0(FP), R4
	MOVV	a+8(FP), R5
	MOVV	b+16(FP), R6
	MOVV	n+24(FP), R7

	SMALL_TAIL

xor_256_lasx_check:
	SGTU	$256, R7, R8	// R8 = 1 if n < 256
	BNE	R8, xor_128_lasx_check
xor_256_lasx_loop:
	SUBV	$256, R7
	XVMOVQ	(R5), X0
	XVMOVQ	32(R5), X1
	XVMOVQ	64(R5), X2
	XVMOVQ	96(R5), X3
	XVMOVQ	128(R5), X4
	XVMOVQ	160(R5), X5
	XVMOVQ	192(R5), X6
	XVMOVQ	224(R5), X7
	XVMOVQ	(R6), X8
	XVMOVQ	32(R6), X9
	XVMOVQ	64(R6), X10
	XVMOVQ	96(R6), X11
	XVMOVQ	128(R6), X12
	XVMOVQ	160(R6), X13
	XVMOVQ	192(R6), X14
	XVMOVQ	224(R6), X15
	XVXORV	X0, X8, X8
	XVXORV	X1, X9, X9
	XVXORV	X2, X10, X10
	XVXORV	X3, X11, X11
	XVXORV	X4, X12, X12
	XVXORV	X5, X13, X13
	XVXORV	X6, X14, X14
	XVXORV	X7, X15, X15
	XVMOVQ	X8, (R4)
	XVMOVQ	X9, 32(R4)
	XVMOVQ	X10, 64(R4)
	XVMOVQ	X11, 96(R4)
	XVMOVQ	X12, 128(R4)
	XVMOVQ	X13, 160(R4)
	XVMOVQ	X14, 192(R4)
	XVMOVQ	X15, 224(R4)
	SGTU	$256, R7, R8
	ADDV	$256, R5
	ADDV	$256, R6
	ADDV	$256, R4
	BEQ	R8, xor_256_lasx_loop	// continue while n >= 256
	BEQ	R7, end			// n == 0, done

xor_128_lasx_check:
	SGTU	$128, R7, R8	// R8 = 1 if n < 128
	BNE	R8, xor_64_lasx_check
xor_128_lasx:
	SUBV	$128, R7
	XVMOVQ	(R5), X0
	XVMOVQ	32(R5), X1
	XVMOVQ	64(R5), X2
	XVMOVQ	96(R5), X3
	XVMOVQ	(R6), X4
	XVMOVQ	32(R6), X5
	XVMOVQ	64(R6), X6
	XVMOVQ	96(R6), X7
	XVXORV	X0, X4, X4
	XVXORV	X1, X5, X5
	XVXORV	X2, X6, X6
	XVXORV	X3, X7, X7
	XVMOVQ	X4, (R4)
	XVMOVQ	X5, 32(R4)
	XVMOVQ	X6, 64(R4)
	XVMOVQ	X7, 96(R4)
	ADDV	$128, R5
	ADDV	$128, R6
	ADDV	$128, R4
	BEQ	R7, end

xor_64_lasx_check:
	SGTU	$64, R7, R8	// R8 = 1 if n < 64
	BNE	R8, xor_32_lasx_check
xor_64_lasx:
	SUBV	$64, R7
	XVMOVQ	(R5), X0
	XVMOVQ	32(R5), X1
	XVMOVQ	(R6), X2
	XVMOVQ	32(R6), X3
	XVXORV	X0, X2, X2
	XVXORV	X1, X3, X3
	XVMOVQ	X2, (R4)
	XVMOVQ	X3, 32(R4)
	ADDV	$64, R5
	ADDV	$64, R6
	ADDV	$64, R4
	BEQ	R7, end

xor_32_lasx_check:
	SGTU	$32, R7, R8	// R8 = 1 if n < 32
	BNE	R8, xor_16_lasx_check
xor_32_lasx:
	SUBV	$32, R7
	XVMOVQ	(R5), X0
	XVMOVQ	(R6), X1
	XVXORV	X0, X1, X1
	XVMOVQ	X1, (R4)
	ADDV	$32, R5
	ADDV	$32, R6
	ADDV	$32, R4
	BEQ	R7, end

xor_16_lasx_check:
	SGTU	$16, R7, R8	// R8 = 1 if n < 16
	BNE	R8, xor_8_check
xor_16_lasx:
	// last full 16-byte chunk: use a single 128-bit LSX vector
	SUBV	$16, R7
	VMOVQ	(R5), V0
	VMOVQ	(R6), V1
	VXORV	V0, V1, V1
	VMOVQ	V1, (R4)
	ADDV	$16, R5
	ADDV	$16, R6
	ADDV	$16, R4
	BEQ	R7, end

	SMALL
end:
	RET