decode_arm64.s raw

   1  // Copyright 2020 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // +build !appengine
   6  // +build gc
   7  // +build !noasm
   8  
   9  #include "textflag.h"
  10  
  11  #define R_TMP0 R2
  12  #define R_TMP1 R3
  13  #define R_LEN R4
  14  #define R_OFF R5
  15  #define R_SRC R6
  16  #define R_DST R7
  17  #define R_DBASE R8
  18  #define R_DLEN R9
  19  #define R_DEND R10
  20  #define R_SBASE R11
  21  #define R_SLEN R12
  22  #define R_SEND R13
  23  #define R_TMP2 R14
  24  #define R_TMP3 R15
  25  
  26  // TEST_SRC will check if R_SRC is <= SRC_END
  27  #define TEST_SRC() \
  28  	CMP R_SEND, R_SRC \
  29  	BGT errCorrupt
  30  
  31  // MOVD R_SRC, R_TMP1
  32  // SUB  R_SBASE, R_TMP1, R_TMP1
  33  // CMP  R_SLEN, R_TMP1
  34  // BGT  errCorrupt
  35  
  36  // The asm code generally follows the pure Go code in decode_other.go, except
  37  // where marked with a "!!!".
  38  
  39  // func decode(dst, src []byte) int
  40  //
  41  // All local variables fit into registers. The non-zero stack size is only to
  42  // spill registers and push args when issuing a CALL. The register allocation:
  43  //	- R_TMP0	scratch
  44  //	- R_TMP1	scratch
  45  //	- R_LEN	length or x
  46  //	- R_OFF	offset
  47  //	- R_SRC	&src[s]
  48  //	- R_DST	&dst[d]
  49  //	+ R_DBASE	dst_base
  50  //	+ R_DLEN	dst_len
  51  //	+ R_DEND	dst_base + dst_len
  52  //	+ R_SBASE	src_base
  53  //	+ R_SLEN	src_len
  54  //	+ R_SEND	src_base + src_len
  55  //	- R_TMP2	used by doCopy
  56  //	- R_TMP3	used by doCopy
  57  //
  58  // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
  59  // function, and after a CALL returns, and are not otherwise modified.
  60  //
  61  // The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
  62  // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
  63  TEXT ·s2Decode(SB), NOSPLIT, $56-56
  64  	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
  65  	MOVD dst_base+0(FP), R_DBASE
  66  	MOVD dst_len+8(FP), R_DLEN
  67  	MOVD R_DBASE, R_DST
  68  	MOVD R_DBASE, R_DEND
  69  	ADD  R_DLEN, R_DEND, R_DEND
  70  	MOVD src_base+24(FP), R_SBASE
  71  	MOVD src_len+32(FP), R_SLEN
  72  	MOVD R_SBASE, R_SRC
  73  	MOVD R_SBASE, R_SEND
  74  	ADD  R_SLEN, R_SEND, R_SEND
  75  	MOVD $0, R_OFF
  76  
  77  loop:
  78  	// for s < len(src)
  79  	CMP R_SEND, R_SRC
  80  	BEQ end
  81  
  82  	// R_LEN = uint32(src[s])
  83  	//
  84  	// switch src[s] & 0x03
  85  	MOVBU (R_SRC), R_LEN
  86  	MOVW  R_LEN, R_TMP1
  87  	ANDW  $3, R_TMP1
  88  	MOVW  $1, R1
  89  	CMPW  R1, R_TMP1
  90  	BGE   tagCopy
  91  
  92  	// ----------------------------------------
  93  	// The code below handles literal tags.
  94  
  95  	// case tagLiteral:
  96  	// x := uint32(src[s] >> 2)
  97  	// switch
  98  	MOVW $60, R1
  99  	LSRW $2, R_LEN, R_LEN
 100  	CMPW R_LEN, R1
 101  	BLS  tagLit60Plus
 102  
 103  	// case x < 60:
 104  	// s++
 105  	ADD $1, R_SRC, R_SRC
 106  
 107  doLit:
 108  	// This is the end of the inner "switch", when we have a literal tag.
 109  	//
 110  	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
 111  	// used in the pure Go decode_other.go code.
 112  
 113  	// length = int(x) + 1
 114  	//
 115  	// Unlike the pure Go code, we don't need to check if length <= 0 because
 116  	// R_LEN can hold 64 bits, so the increment cannot overflow.
 117  	ADD $1, R_LEN, R_LEN
 118  
 119  	// Prepare to check if copying length bytes will run past the end of dst or
 120  	// src.
 121  	//
 122  	// R_TMP0 = len(dst) - d
 123  	// R_TMP1 = len(src) - s
 124  	MOVD R_DEND, R_TMP0
 125  	SUB  R_DST, R_TMP0, R_TMP0
 126  	MOVD R_SEND, R_TMP1
 127  	SUB  R_SRC, R_TMP1, R_TMP1
 128  
 129  	// !!! Try a faster technique for short (16 or fewer bytes) copies.
 130  	//
 131  	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
 132  	//   goto callMemmove // Fall back on calling runtime·memmove.
 133  	// }
 134  	//
 135  	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
 136  	// against 21 instead of 16, because it cannot assume that all of its input
 137  	// is contiguous in memory and so it needs to leave enough source bytes to
 138  	// read the next tag without refilling buffers, but Go's Decode assumes
 139  	// contiguousness (the src argument is a []byte).
 140  	CMP $16, R_LEN
 141  	BGT callMemmove
 142  	CMP $16, R_TMP0
 143  	BLT callMemmove
 144  	CMP $16, R_TMP1
 145  	BLT callMemmove
 146  
 147  	// !!! Implement the copy from src to dst as a 16-byte load and store.
 148  	// (Decode's documentation says that dst and src must not overlap.)
 149  	//
 150  	// This always copies 16 bytes, instead of only length bytes, but that's
 151  	// OK. If the input is a valid Snappy encoding then subsequent iterations
 152  	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 153  	// non-nil error), so the overrun will be ignored.
 154  	//
 155  	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 156  	// 16-byte loads and stores. This technique probably wouldn't be as
 157  	// effective on architectures that are fussier about alignment.
 158  	LDP 0(R_SRC), (R_TMP2, R_TMP3)
 159  	STP (R_TMP2, R_TMP3), 0(R_DST)
 160  
 161  	// d += length
 162  	// s += length
 163  	ADD R_LEN, R_DST, R_DST
 164  	ADD R_LEN, R_SRC, R_SRC
 165  	B   loop
 166  
 167  callMemmove:
 168  	// if length > len(dst)-d || length > len(src)-s { etc }
 169  	CMP R_TMP0, R_LEN
 170  	BGT errCorrupt
 171  	CMP R_TMP1, R_LEN
 172  	BGT errCorrupt
 173  
 174  	// copy(dst[d:], src[s:s+length])
 175  	//
 176  	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
 177  	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
 178  	// three registers to the stack, to save local variables across the CALL.
 179  	MOVD R_DST, 8(RSP)
 180  	MOVD R_SRC, 16(RSP)
 181  	MOVD R_LEN, 24(RSP)
 182  	MOVD R_DST, 32(RSP)
 183  	MOVD R_SRC, 40(RSP)
 184  	MOVD R_LEN, 48(RSP)
 185  	MOVD R_OFF, 56(RSP)
 186  	CALL runtime·memmove(SB)
 187  
 188  	// Restore local variables: unspill registers from the stack and
 189  	// re-calculate R_DBASE-R_SEND.
 190  	MOVD 32(RSP), R_DST
 191  	MOVD 40(RSP), R_SRC
 192  	MOVD 48(RSP), R_LEN
 193  	MOVD 56(RSP), R_OFF
 194  	MOVD dst_base+0(FP), R_DBASE
 195  	MOVD dst_len+8(FP), R_DLEN
 196  	MOVD R_DBASE, R_DEND
 197  	ADD  R_DLEN, R_DEND, R_DEND
 198  	MOVD src_base+24(FP), R_SBASE
 199  	MOVD src_len+32(FP), R_SLEN
 200  	MOVD R_SBASE, R_SEND
 201  	ADD  R_SLEN, R_SEND, R_SEND
 202  
 203  	// d += length
 204  	// s += length
 205  	ADD R_LEN, R_DST, R_DST
 206  	ADD R_LEN, R_SRC, R_SRC
 207  	B   loop
 208  
 209  tagLit60Plus:
 210  	// !!! This fragment does the
 211  	//
 212  	// s += x - 58; if uint(s) > uint(len(src)) { etc }
 213  	//
 214  	// checks. In the asm version, we code it once instead of once per switch case.
 215  	ADD R_LEN, R_SRC, R_SRC
 216  	SUB $58, R_SRC, R_SRC
 217  	TEST_SRC()
 218  
 219  	// case x == 60:
 220  	MOVW $61, R1
 221  	CMPW R1, R_LEN
 222  	BEQ  tagLit61
 223  	BGT  tagLit62Plus
 224  
 225  	// x = uint32(src[s-1])
 226  	MOVBU -1(R_SRC), R_LEN
 227  	B     doLit
 228  
 229  tagLit61:
 230  	// case x == 61:
 231  	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
 232  	MOVHU -2(R_SRC), R_LEN
 233  	B     doLit
 234  
 235  tagLit62Plus:
 236  	CMPW $62, R_LEN
 237  	BHI  tagLit63
 238  
 239  	// case x == 62:
 240  	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
 241  	MOVHU -3(R_SRC), R_LEN
 242  	MOVBU -1(R_SRC), R_TMP1
 243  	ORR   R_TMP1<<16, R_LEN
 244  	B     doLit
 245  
 246  tagLit63:
 247  	// case x == 63:
 248  	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
 249  	MOVWU -4(R_SRC), R_LEN
 250  	B     doLit
 251  
 252  	// The code above handles literal tags.
 253  	// ----------------------------------------
 254  	// The code below handles copy tags.
 255  
 256  tagCopy4:
 257  	// case tagCopy4:
 258  	// s += 5
 259  	ADD $5, R_SRC, R_SRC
 260  
 261  	// if uint(s) > uint(len(src)) { etc }
 262  	MOVD R_SRC, R_TMP1
 263  	SUB  R_SBASE, R_TMP1, R_TMP1
 264  	CMP  R_SLEN, R_TMP1
 265  	BGT  errCorrupt
 266  
 267  	// length = 1 + int(src[s-5])>>2
 268  	MOVD $1, R1
 269  	ADD  R_LEN>>2, R1, R_LEN
 270  
 271  	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 272  	MOVWU -4(R_SRC), R_OFF
 273  	B     doCopy
 274  
 275  tagCopy2:
 276  	// case tagCopy2:
 277  	// s += 3
 278  	ADD $3, R_SRC, R_SRC
 279  
 280  	// if uint(s) > uint(len(src)) { etc }
 281  	TEST_SRC()
 282  
 283  	// length = 1 + int(src[s-3])>>2
 284  	MOVD $1, R1
 285  	ADD  R_LEN>>2, R1, R_LEN
 286  
 287  	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 288  	MOVHU -2(R_SRC), R_OFF
 289  	B     doCopy
 290  
 291  tagCopy:
 292  	// We have a copy tag. We assume that:
 293  	//	- R_TMP1 == src[s] & 0x03
 294  	//	- R_LEN == src[s]
 295  	CMP $2, R_TMP1
 296  	BEQ tagCopy2
 297  	BGT tagCopy4
 298  
 299  	// case tagCopy1:
 300  	// s += 2
 301  	ADD $2, R_SRC, R_SRC
 302  
 303  	// if uint(s) > uint(len(src)) { etc }
 304  	TEST_SRC()
 305  
 306  	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 307  	// Calculate offset in R_TMP0 in case it is a repeat.
 308  	MOVD  R_LEN, R_TMP0
 309  	AND   $0xe0, R_TMP0
 310  	MOVBU -1(R_SRC), R_TMP1
 311  	ORR   R_TMP0<<3, R_TMP1, R_TMP0
 312  
 313  	// length = 4 + int(src[s-2])>>2&0x7
 314  	MOVD $7, R1
 315  	AND  R_LEN>>2, R1, R_LEN
 316  	ADD  $4, R_LEN, R_LEN
 317  
 318  	// check if repeat code with offset 0.
 319  	CMP $0, R_TMP0
 320  	BEQ repeatCode
 321  
 322  	// This is a regular copy, transfer our temporary value to R_OFF (offset)
 323  	MOVD R_TMP0, R_OFF
 324  	B    doCopy
 325  
 326  	// This is a repeat code.
 327  repeatCode:
 328  	// If length < 9, reuse last offset, with the length already calculated.
 329  	CMP $9, R_LEN
 330  	BLT doCopyRepeat
 331  	BEQ repeatLen1
 332  	CMP $10, R_LEN
 333  	BEQ repeatLen2
 334  
 335  repeatLen3:
 336  	// s +=3
 337  	ADD $3, R_SRC, R_SRC
 338  
 339  	// if uint(s) > uint(len(src)) { etc }
 340  	TEST_SRC()
 341  
 342  	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
 343  	MOVBU -1(R_SRC), R_TMP0
 344  	MOVHU -3(R_SRC), R_LEN
 345  	ORR   R_TMP0<<16, R_LEN, R_LEN
 346  	ADD   $65540, R_LEN, R_LEN
 347  	B     doCopyRepeat
 348  
 349  repeatLen2:
 350  	// s +=2
 351  	ADD $2, R_SRC, R_SRC
 352  
 353  	// if uint(s) > uint(len(src)) { etc }
 354  	TEST_SRC()
 355  
 356  	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
 357  	MOVHU -2(R_SRC), R_LEN
 358  	ADD   $260, R_LEN, R_LEN
 359  	B     doCopyRepeat
 360  
 361  repeatLen1:
 362  	// s +=1
 363  	ADD $1, R_SRC, R_SRC
 364  
 365  	// if uint(s) > uint(len(src)) { etc }
 366  	TEST_SRC()
 367  
 368  	// length = src[s-1] + 8
 369  	MOVBU -1(R_SRC), R_LEN
 370  	ADD   $8, R_LEN, R_LEN
 371  	B     doCopyRepeat
 372  
 373  doCopy:
 374  	// This is the end of the outer "switch", when we have a copy tag.
 375  	//
 376  	// We assume that:
 377  	//	- R_LEN == length && R_LEN > 0
 378  	//	- R_OFF == offset
 379  
 380  	// if d < offset { etc }
 381  	MOVD R_DST, R_TMP1
 382  	SUB  R_DBASE, R_TMP1, R_TMP1
 383  	CMP  R_OFF, R_TMP1
 384  	BLT  errCorrupt
 385  
 386  	// Repeat values can skip the test above, since any offset > 0 will be in dst.
 387  doCopyRepeat:
 388  
 389  	// if offset <= 0 { etc }
 390  	CMP $0, R_OFF
 391  	BLE errCorrupt
 392  
 393  	// if length > len(dst)-d { etc }
 394  	MOVD R_DEND, R_TMP1
 395  	SUB  R_DST, R_TMP1, R_TMP1
 396  	CMP  R_TMP1, R_LEN
 397  	BGT  errCorrupt
 398  
 399  	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 400  	//
 401  	// Set:
 402  	//	- R_TMP2 = len(dst)-d
 403  	//	- R_TMP3 = &dst[d-offset]
 404  	MOVD R_DEND, R_TMP2
 405  	SUB  R_DST, R_TMP2, R_TMP2
 406  	MOVD R_DST, R_TMP3
 407  	SUB  R_OFF, R_TMP3, R_TMP3
 408  
 409  	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
 410  	//
 411  	// First, try using two 8-byte load/stores, similar to the doLit technique
 412  	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
 413  	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
 414  	// and not one 16-byte load/store, and the first store has to be before the
 415  	// second load, due to the overlap if offset is in the range [8, 16).
 416  	//
 417  	// if length > 16 || offset < 8 || len(dst)-d < 16 {
 418  	//   goto slowForwardCopy
 419  	// }
 420  	// copy 16 bytes
 421  	// d += length
 422  	CMP  $16, R_LEN
 423  	BGT  slowForwardCopy
 424  	CMP  $8, R_OFF
 425  	BLT  slowForwardCopy
 426  	CMP  $16, R_TMP2
 427  	BLT  slowForwardCopy
 428  	MOVD 0(R_TMP3), R_TMP0
 429  	MOVD R_TMP0, 0(R_DST)
 430  	MOVD 8(R_TMP3), R_TMP1
 431  	MOVD R_TMP1, 8(R_DST)
 432  	ADD  R_LEN, R_DST, R_DST
 433  	B    loop
 434  
 435  slowForwardCopy:
 436  	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
 437  	// can still try 8-byte load stores, provided we can overrun up to 10 extra
 438  	// bytes. As above, the overrun will be fixed up by subsequent iterations
 439  	// of the outermost loop.
 440  	//
 441  	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
 442  	// commentary says:
 443  	//
 444  	// ----
 445  	//
 446  	// The main part of this loop is a simple copy of eight bytes at a time
 447  	// until we've copied (at least) the requested amount of bytes.  However,
 448  	// if d and d-offset are less than eight bytes apart (indicating a
 449  	// repeating pattern of length < 8), we first need to expand the pattern in
 450  	// order to get the correct results. For instance, if the buffer looks like
 451  	// this, with the eight-byte <d-offset> and <d> patterns marked as
 452  	// intervals:
 453  	//
 454  	//    abxxxxxxxxxxxx
 455  	//    [------]           d-offset
 456  	//      [------]         d
 457  	//
 458  	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
 459  	// once, after which we can move <d> two bytes without moving <d-offset>:
 460  	//
 461  	//    ababxxxxxxxxxx
 462  	//    [------]           d-offset
 463  	//        [------]       d
 464  	//
 465  	// and repeat the exercise until the two no longer overlap.
 466  	//
 467  	// This allows us to do very well in the special case of one single byte
 468  	// repeated many times, without taking a big hit for more general cases.
 469  	//
 470  	// The worst case of extra writing past the end of the match occurs when
 471  	// offset == 1 and length == 1; the last copy will read from byte positions
 472  	// [0..7] and write to [4..11], whereas it was only supposed to write to
 473  	// position 1. Thus, ten excess bytes.
 474  	//
 475  	// ----
 476  	//
 477  	// That "10 byte overrun" worst case is confirmed by Go's
 478  	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
 479  	// and finishSlowForwardCopy algorithm.
 480  	//
 481  	// if length > len(dst)-d-10 {
 482  	//   goto verySlowForwardCopy
 483  	// }
 484  	SUB $10, R_TMP2, R_TMP2
 485  	CMP R_TMP2, R_LEN
 486  	BGT verySlowForwardCopy
 487  
 488  	// We want to keep the offset, so we use R_TMP2 from here.
 489  	MOVD R_OFF, R_TMP2
 490  
 491  makeOffsetAtLeast8:
 492  	// !!! As above, expand the pattern so that offset >= 8 and we can use
 493  	// 8-byte load/stores.
 494  	//
 495  	// for offset < 8 {
 496  	//   copy 8 bytes from dst[d-offset:] to dst[d:]
 497  	//   length -= offset
 498  	//   d      += offset
 499  	//   offset += offset
 500  	//   // The two previous lines together means that d-offset, and therefore
 501  	//   // R_TMP3, is unchanged.
 502  	// }
 503  	CMP  $8, R_TMP2
 504  	BGE  fixUpSlowForwardCopy
 505  	MOVD (R_TMP3), R_TMP1
 506  	MOVD R_TMP1, (R_DST)
 507  	SUB  R_TMP2, R_LEN, R_LEN
 508  	ADD  R_TMP2, R_DST, R_DST
 509  	ADD  R_TMP2, R_TMP2, R_TMP2
 510  	B    makeOffsetAtLeast8
 511  
 512  fixUpSlowForwardCopy:
 513  	// !!! Add length (which might be negative now) to d (implied by R_DST being
 514  	// &dst[d]) so that d ends up at the right place when we jump back to the
 515  	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
 516  	// length is positive, copying the remaining length bytes will write to the
 517  	// right place.
 518  	MOVD R_DST, R_TMP0
 519  	ADD  R_LEN, R_DST, R_DST
 520  
 521  finishSlowForwardCopy:
 522  	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
 523  	// length means that we overrun, but as above, that will be fixed up by
 524  	// subsequent iterations of the outermost loop.
 525  	MOVD $0, R1
 526  	CMP  R1, R_LEN
 527  	BLE  loop
 528  	MOVD (R_TMP3), R_TMP1
 529  	MOVD R_TMP1, (R_TMP0)
 530  	ADD  $8, R_TMP3, R_TMP3
 531  	ADD  $8, R_TMP0, R_TMP0
 532  	SUB  $8, R_LEN, R_LEN
 533  	B    finishSlowForwardCopy
 534  
 535  verySlowForwardCopy:
 536  	// verySlowForwardCopy is a simple implementation of forward copy. In C
 537  	// parlance, this is a do/while loop instead of a while loop, since we know
 538  	// that length > 0. In Go syntax:
 539  	//
 540  	// for {
 541  	//   dst[d] = dst[d - offset]
 542  	//   d++
 543  	//   length--
 544  	//   if length == 0 {
 545  	//     break
 546  	//   }
 547  	// }
 548  	MOVB (R_TMP3), R_TMP1
 549  	MOVB R_TMP1, (R_DST)
 550  	ADD  $1, R_TMP3, R_TMP3
 551  	ADD  $1, R_DST, R_DST
 552  	SUB  $1, R_LEN, R_LEN
 553  	CBNZ R_LEN, verySlowForwardCopy
 554  	B    loop
 555  
 556  	// The code above handles copy tags.
 557  	// ----------------------------------------
 558  
 559  end:
 560  	// This is the end of the "for s < len(src)".
 561  	//
 562  	// if d != len(dst) { etc }
 563  	CMP R_DEND, R_DST
 564  	BNE errCorrupt
 565  
 566  	// return 0
 567  	MOVD $0, ret+48(FP)
 568  	RET
 569  
 570  errCorrupt:
 571  	// return decodeErrCodeCorrupt
 572  	MOVD $1, R_TMP0
 573  	MOVD R_TMP0, ret+48(FP)
 574  	RET
 575