// xxhash_arm64.s

//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

   8  // Registers:
   9  #define digest	R1
  10  #define h	R2 // return value
  11  #define p	R3 // input pointer
  12  #define n	R4 // input length
  13  #define nblocks	R5 // n / 32
  14  #define prime1	R7
  15  #define prime2	R8
  16  #define prime3	R9
  17  #define prime4	R10
  18  #define prime5	R11
  19  #define v1	R12
  20  #define v2	R13
  21  #define v3	R14
  22  #define v4	R15
  23  #define x1	R20
  24  #define x2	R21
  25  #define x3	R22
  26  #define x4	R23
  27  
  28  #define round(acc, x) \
  29  	MADD prime2, acc, x, acc \
  30  	ROR  $64-31, acc         \
  31  	MUL  prime1, acc
  32  
  33  // round0 performs the operation x = round(0, x).
  34  #define round0(x) \
  35  	MUL prime2, x \
  36  	ROR $64-31, x \
  37  	MUL prime1, x
  38  
  39  #define mergeRound(acc, x) \
  40  	round0(x)                     \
  41  	EOR  x, acc                   \
  42  	MADD acc, prime4, prime1, acc
  43  
  44  // blockLoop processes as many 32-byte blocks as possible,
  45  // updating v1, v2, v3, and v4. It assumes that n >= 32.
  46  #define blockLoop() \
  47  	LSR     $5, n, nblocks  \
  48  	PCALIGN $16             \
  49  	loop:                   \
  50  	LDP.P   16(p), (x1, x2) \
  51  	LDP.P   16(p), (x3, x4) \
  52  	round(v1, x1)           \
  53  	round(v2, x2)           \
  54  	round(v3, x3)           \
  55  	round(v4, x4)           \
  56  	SUB     $1, nblocks     \
  57  	CBNZ    nblocks, loop
  58  
  59  // func Sum64(b []byte) uint64
  60  TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
  61  	LDP b_base+0(FP), (p, n)
  62  
  63  	LDP  ·primes+0(SB), (prime1, prime2)
  64  	LDP  ·primes+16(SB), (prime3, prime4)
  65  	MOVD ·primes+32(SB), prime5
  66  
  67  	CMP  $32, n
  68  	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
  69  	BLT  afterLoop
  70  
  71  	ADD  prime1, prime2, v1
  72  	MOVD prime2, v2
  73  	MOVD $0, v3
  74  	NEG  prime1, v4
  75  
  76  	blockLoop()
  77  
  78  	ROR $64-1, v1, x1
  79  	ROR $64-7, v2, x2
  80  	ADD x1, x2
  81  	ROR $64-12, v3, x3
  82  	ROR $64-18, v4, x4
  83  	ADD x3, x4
  84  	ADD x2, x4, h
  85  
  86  	mergeRound(h, v1)
  87  	mergeRound(h, v2)
  88  	mergeRound(h, v3)
  89  	mergeRound(h, v4)
  90  
  91  afterLoop:
  92  	ADD n, h
  93  
  94  	TBZ   $4, n, try8
  95  	LDP.P 16(p), (x1, x2)
  96  
  97  	round0(x1)
  98  
  99  	// NOTE: here and below, sequencing the EOR after the ROR (using a
 100  	// rotated register) is worth a small but measurable speedup for small
 101  	// inputs.
 102  	ROR  $64-27, h
 103  	EOR  x1 @> 64-27, h, h
 104  	MADD h, prime4, prime1, h
 105  
 106  	round0(x2)
 107  	ROR  $64-27, h
 108  	EOR  x2 @> 64-27, h, h
 109  	MADD h, prime4, prime1, h
 110  
 111  try8:
 112  	TBZ    $3, n, try4
 113  	MOVD.P 8(p), x1
 114  
 115  	round0(x1)
 116  	ROR  $64-27, h
 117  	EOR  x1 @> 64-27, h, h
 118  	MADD h, prime4, prime1, h
 119  
 120  try4:
 121  	TBZ     $2, n, try2
 122  	MOVWU.P 4(p), x2
 123  
 124  	MUL  prime1, x2
 125  	ROR  $64-23, h
 126  	EOR  x2 @> 64-23, h, h
 127  	MADD h, prime3, prime2, h
 128  
 129  try2:
 130  	TBZ     $1, n, try1
 131  	MOVHU.P 2(p), x3
 132  	AND     $255, x3, x1
 133  	LSR     $8, x3, x2
 134  
 135  	MUL prime5, x1
 136  	ROR $64-11, h
 137  	EOR x1 @> 64-11, h, h
 138  	MUL prime1, h
 139  
 140  	MUL prime5, x2
 141  	ROR $64-11, h
 142  	EOR x2 @> 64-11, h, h
 143  	MUL prime1, h
 144  
 145  try1:
 146  	TBZ   $0, n, finalize
 147  	MOVBU (p), x4
 148  
 149  	MUL prime5, x4
 150  	ROR $64-11, h
 151  	EOR x4 @> 64-11, h, h
 152  	MUL prime1, h
 153  
 154  finalize:
 155  	EOR h >> 33, h
 156  	MUL prime2, h
 157  	EOR h >> 29, h
 158  	MUL prime3, h
 159  	EOR h >> 32, h
 160  
 161  	MOVD h, ret+24(FP)
 162  	RET
 163  
 164  // func writeBlocks(d *Digest, b []byte) int
 165  TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 166  	LDP ·primes+0(SB), (prime1, prime2)
 167  
 168  	// Load state. Assume v[1-4] are stored contiguously.
 169  	MOVD d+0(FP), digest
 170  	LDP  0(digest), (v1, v2)
 171  	LDP  16(digest), (v3, v4)
 172  
 173  	LDP b_base+8(FP), (p, n)
 174  
 175  	blockLoop()
 176  
 177  	// Store updated state.
 178  	STP (v1, v2), 0(digest)
 179  	STP (v3, v4), 16(digest)
 180  
 181  	BIC  $31, n
 182  	MOVD n, ret+32(FP)
 183  	RET
 184