decode_amd64.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Copyright (c) 2019 Klaus Post. All rights reserved.
   3  // Use of this source code is governed by a BSD-style
   4  // license that can be found in the LICENSE file.
   5  
   6  // +build !appengine
   7  // +build gc
   8  // +build !noasm
   9  
  10  #include "textflag.h"
  11  
  12  #define R_TMP0 AX
  13  #define R_TMP1 BX
  14  #define R_LEN CX
  15  #define R_OFF DX
  16  #define R_SRC SI
  17  #define R_DST DI
  18  #define R_DBASE R8
  19  #define R_DLEN R9
  20  #define R_DEND R10
  21  #define R_SBASE R11
  22  #define R_SLEN R12
  23  #define R_SEND R13
  24  #define R_TMP2 R14
  25  #define R_TMP3 R15
  26  
  27  // The asm code generally follows the pure Go code in decode_other.go, except
  28  // where marked with a "!!!".
  29  
  30  // func decode(dst, src []byte) int
  31  //
  32  // All local variables fit into registers. The non-zero stack size is only to
  33  // spill registers and push args when issuing a CALL. The register allocation:
  34  //	- R_TMP0	scratch
  35  //	- R_TMP1	scratch
  36  //	- R_LEN	    length or x (shared)
  37  //	- R_OFF	    offset
  38  //	- R_SRC	    &src[s]
  39  //	- R_DST	    &dst[d]
  40  //	+ R_DBASE	dst_base
  41  //	+ R_DLEN	dst_len
  42  //	+ R_DEND	dst_base + dst_len
  43  //	+ R_SBASE	src_base
  44  //	+ R_SLEN	src_len
  45  //	+ R_SEND	src_base + src_len
  46  //	- R_TMP2	used by doCopy
  47  //	- R_TMP3	used by doCopy
  48  //
  49  // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
  50  // function, and after a CALL returns, and are not otherwise modified.
  51  //
  52  // The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
  53  // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
  54  TEXT ·s2Decode(SB), NOSPLIT, $48-56
  55  	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
  56  	MOVQ dst_base+0(FP), R_DBASE
  57  	MOVQ dst_len+8(FP), R_DLEN
  58  	MOVQ R_DBASE, R_DST
  59  	MOVQ R_DBASE, R_DEND
  60  	ADDQ R_DLEN, R_DEND
  61  	MOVQ src_base+24(FP), R_SBASE
  62  	MOVQ src_len+32(FP), R_SLEN
  63  	MOVQ R_SBASE, R_SRC
  64  	MOVQ R_SBASE, R_SEND
  65  	ADDQ R_SLEN, R_SEND
  66  	XORQ R_OFF, R_OFF
  67  
  68  loop:
  69  	// for s < len(src)
  70  	CMPQ R_SRC, R_SEND
  71  	JEQ  end
  72  
  73  	// R_LEN = uint32(src[s])
  74  	//
  75  	// switch src[s] & 0x03
  76  	MOVBLZX (R_SRC), R_LEN
  77  	MOVL    R_LEN, R_TMP1
  78  	ANDL    $3, R_TMP1
  79  	CMPL    R_TMP1, $1
  80  	JAE     tagCopy
  81  
  82  	// ----------------------------------------
  83  	// The code below handles literal tags.
  84  
  85  	// case tagLiteral:
  86  	// x := uint32(src[s] >> 2)
  87  	// switch
  88  	SHRL $2, R_LEN
  89  	CMPL R_LEN, $60
  90  	JAE  tagLit60Plus
  91  
  92  	// case x < 60:
  93  	// s++
  94  	INCQ R_SRC
  95  
  96  doLit:
  97  	// This is the end of the inner "switch", when we have a literal tag.
  98  	//
  99  	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
 100  	// used in the pure Go decode_other.go code.
 101  
 102  	// length = int(x) + 1
 103  	//
 104  	// Unlike the pure Go code, we don't need to check if length <= 0 because
 105  	// R_LEN can hold 64 bits, so the increment cannot overflow.
 106  	INCQ R_LEN
 107  
 108  	// Prepare to check if copying length bytes will run past the end of dst or
 109  	// src.
 110  	//
 111  	// R_TMP0 = len(dst) - d
 112  	// R_TMP1 = len(src) - s
 113  	MOVQ R_DEND, R_TMP0
 114  	SUBQ R_DST, R_TMP0
 115  	MOVQ R_SEND, R_TMP1
 116  	SUBQ R_SRC, R_TMP1
 117  
 118  	// !!! Try a faster technique for short (16 or fewer bytes) copies.
 119  	//
 120  	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
 121  	//   goto callMemmove // Fall back on calling runtime·memmove.
 122  	// }
 123  	//
 124  	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
 125  	// against 21 instead of 16, because it cannot assume that all of its input
 126  	// is contiguous in memory and so it needs to leave enough source bytes to
 127  	// read the next tag without refilling buffers, but Go's Decode assumes
 128  	// contiguousness (the src argument is a []byte).
 129  	CMPQ R_LEN, $16
 130  	JGT  callMemmove
 131  	CMPQ R_TMP0, $16
 132  	JLT  callMemmove
 133  	CMPQ R_TMP1, $16
 134  	JLT  callMemmove
 135  
 136  	// !!! Implement the copy from src to dst as a 16-byte load and store.
 137  	// (Decode's documentation says that dst and src must not overlap.)
 138  	//
 139  	// This always copies 16 bytes, instead of only length bytes, but that's
 140  	// OK. If the input is a valid Snappy encoding then subsequent iterations
 141  	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 142  	// non-nil error), so the overrun will be ignored.
 143  	//
 144  	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
 145  	// 16-byte loads and stores. This technique probably wouldn't be as
 146  	// effective on architectures that are fussier about alignment.
 147  	MOVOU 0(R_SRC), X0
 148  	MOVOU X0, 0(R_DST)
 149  
 150  	// d += length
 151  	// s += length
 152  	ADDQ R_LEN, R_DST
 153  	ADDQ R_LEN, R_SRC
 154  	JMP  loop
 155  
 156  callMemmove:
 157  	// if length > len(dst)-d || length > len(src)-s { etc }
 158  	CMPQ R_LEN, R_TMP0
 159  	JGT  errCorrupt
 160  	CMPQ R_LEN, R_TMP1
 161  	JGT  errCorrupt
 162  
 163  	// copy(dst[d:], src[s:s+length])
 164  	//
 165  	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
 166  	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
 167  	// three registers to the stack, to save local variables across the CALL.
 168  	MOVQ R_DST, 0(SP)
 169  	MOVQ R_SRC, 8(SP)
 170  	MOVQ R_LEN, 16(SP)
 171  	MOVQ R_DST, 24(SP)
 172  	MOVQ R_SRC, 32(SP)
 173  	MOVQ R_LEN, 40(SP)
 174  	MOVQ R_OFF, 48(SP)
 175  	CALL runtime·memmove(SB)
 176  
 177  	// Restore local variables: unspill registers from the stack and
 178  	// re-calculate R_DBASE-R_SEND.
 179  	MOVQ 24(SP), R_DST
 180  	MOVQ 32(SP), R_SRC
 181  	MOVQ 40(SP), R_LEN
 182  	MOVQ 48(SP), R_OFF
 183  	MOVQ dst_base+0(FP), R_DBASE
 184  	MOVQ dst_len+8(FP), R_DLEN
 185  	MOVQ R_DBASE, R_DEND
 186  	ADDQ R_DLEN, R_DEND
 187  	MOVQ src_base+24(FP), R_SBASE
 188  	MOVQ src_len+32(FP), R_SLEN
 189  	MOVQ R_SBASE, R_SEND
 190  	ADDQ R_SLEN, R_SEND
 191  
 192  	// d += length
 193  	// s += length
 194  	ADDQ R_LEN, R_DST
 195  	ADDQ R_LEN, R_SRC
 196  	JMP  loop
 197  
 198  tagLit60Plus:
 199  	// !!! This fragment does the
 200  	//
 201  	// s += x - 58; if uint(s) > uint(len(src)) { etc }
 202  	//
 203  	// checks. In the asm version, we code it once instead of once per switch case.
 204  	ADDQ R_LEN, R_SRC
 205  	SUBQ $58, R_SRC
 206  	CMPQ R_SRC, R_SEND
 207  	JA   errCorrupt
 208  
 209  	// case x == 60:
 210  	CMPL R_LEN, $61
 211  	JEQ  tagLit61
 212  	JA   tagLit62Plus
 213  
 214  	// x = uint32(src[s-1])
 215  	MOVBLZX -1(R_SRC), R_LEN
 216  	JMP     doLit
 217  
 218  tagLit61:
 219  	// case x == 61:
 220  	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
 221  	MOVWLZX -2(R_SRC), R_LEN
 222  	JMP     doLit
 223  
 224  tagLit62Plus:
 225  	CMPL R_LEN, $62
 226  	JA   tagLit63
 227  
 228  	// case x == 62:
 229  	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
 230  	// We read one byte, safe to read one back, since we are just reading tag.
 231  	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
 232  	MOVL -4(R_SRC), R_LEN
 233  	SHRL $8, R_LEN
 234  	JMP  doLit
 235  
 236  tagLit63:
 237  	// case x == 63:
 238  	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
 239  	MOVL -4(R_SRC), R_LEN
 240  	JMP  doLit
 241  
 242  // The code above handles literal tags.
 243  // ----------------------------------------
 244  // The code below handles copy tags.
 245  
 246  tagCopy4:
 247  	// case tagCopy4:
 248  	// s += 5
 249  	ADDQ $5, R_SRC
 250  
 251  	// if uint(s) > uint(len(src)) { etc }
 252  	CMPQ R_SRC, R_SEND
 253  	JA   errCorrupt
 254  
 255  	// length = 1 + int(src[s-5])>>2
 256  	SHRQ $2, R_LEN
 257  	INCQ R_LEN
 258  
 259  	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 260  	MOVLQZX -4(R_SRC), R_OFF
 261  	JMP     doCopy
 262  
 263  tagCopy2:
 264  	// case tagCopy2:
 265  	// s += 3
 266  	ADDQ $3, R_SRC
 267  
 268  	// if uint(s) > uint(len(src)) { etc }
 269  	CMPQ R_SRC, R_SEND
 270  	JA   errCorrupt
 271  
 272  	// length = 1 + int(src[s-3])>>2
 273  	SHRQ $2, R_LEN
 274  	INCQ R_LEN
 275  
 276  	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 277  	MOVWQZX -2(R_SRC), R_OFF
 278  	JMP     doCopy
 279  
 280  tagCopy:
 281  	// We have a copy tag. We assume that:
 282  	//	- R_TMP1 == src[s] & 0x03
 283  	//	- R_LEN == src[s]
 284  	CMPQ R_TMP1, $2
 285  	JEQ  tagCopy2
 286  	JA   tagCopy4
 287  
 288  	// case tagCopy1:
 289  	// s += 2
 290  	ADDQ $2, R_SRC
 291  
 292  	// if uint(s) > uint(len(src)) { etc }
 293  	CMPQ R_SRC, R_SEND
 294  	JA   errCorrupt
 295  
 296  	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 297  	// length = 4 + int(src[s-2])>>2&0x7
 298  	MOVBQZX -1(R_SRC), R_TMP1
 299  	MOVQ    R_LEN, R_TMP0
 300  	SHRQ    $2, R_LEN
 301  	ANDQ    $0xe0, R_TMP0
 302  	ANDQ    $7, R_LEN
 303  	SHLQ    $3, R_TMP0
 304  	ADDQ    $4, R_LEN
 305  	ORQ     R_TMP1, R_TMP0
 306  
 307  	// check if repeat code, ZF set by ORQ.
 308  	JZ repeatCode
 309  
 310  	// This is a regular copy, transfer our temporary value to R_OFF (length)
 311  	MOVQ R_TMP0, R_OFF
 312  	JMP  doCopy
 313  
 314  // This is a repeat code.
 315  repeatCode:
 316  	// If length < 9, reuse last offset, with the length already calculated.
 317  	CMPQ R_LEN, $9
 318  	JL   doCopyRepeat
 319  
 320  	// Read additional bytes for length.
 321  	JE repeatLen1
 322  
 323  	// Rare, so the extra branch shouldn't hurt too much.
 324  	CMPQ R_LEN, $10
 325  	JE   repeatLen2
 326  	JMP  repeatLen3
 327  
 328  // Read repeat lengths.
 329  repeatLen1:
 330  	// s ++
 331  	ADDQ $1, R_SRC
 332  
 333  	// if uint(s) > uint(len(src)) { etc }
 334  	CMPQ R_SRC, R_SEND
 335  	JA   errCorrupt
 336  
 337  	// length = src[s-1] + 8
 338  	MOVBQZX -1(R_SRC), R_LEN
 339  	ADDL    $8, R_LEN
 340  	JMP     doCopyRepeat
 341  
 342  repeatLen2:
 343  	// s +=2
 344  	ADDQ $2, R_SRC
 345  
 346  	// if uint(s) > uint(len(src)) { etc }
 347  	CMPQ R_SRC, R_SEND
 348  	JA   errCorrupt
 349  
 350  	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
 351  	MOVWQZX -2(R_SRC), R_LEN
 352  	ADDL    $260, R_LEN
 353  	JMP     doCopyRepeat
 354  
 355  repeatLen3:
 356  	// s +=3
 357  	ADDQ $3, R_SRC
 358  
 359  	// if uint(s) > uint(len(src)) { etc }
 360  	CMPQ R_SRC, R_SEND
 361  	JA   errCorrupt
 362  
 363  	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
 364  	// Read one byte further back (just part of the tag, shifted out)
 365  	MOVL -4(R_SRC), R_LEN
 366  	SHRL $8, R_LEN
 367  	ADDL $65540, R_LEN
 368  	JMP  doCopyRepeat
 369  
 370  doCopy:
 371  	// This is the end of the outer "switch", when we have a copy tag.
 372  	//
 373  	// We assume that:
 374  	//	- R_LEN == length && R_LEN > 0
 375  	//	- R_OFF == offset
 376  
 377  	// if d < offset { etc }
 378  	MOVQ R_DST, R_TMP1
 379  	SUBQ R_DBASE, R_TMP1
 380  	CMPQ R_TMP1, R_OFF
 381  	JLT  errCorrupt
 382  
 383  	// Repeat values can skip the test above, since any offset > 0 will be in dst.
 384  doCopyRepeat:
 385  	// if offset <= 0 { etc }
 386  	CMPQ R_OFF, $0
 387  	JLE  errCorrupt
 388  
 389  	// if length > len(dst)-d { etc }
 390  	MOVQ R_DEND, R_TMP1
 391  	SUBQ R_DST, R_TMP1
 392  	CMPQ R_LEN, R_TMP1
 393  	JGT  errCorrupt
 394  
 395  	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 396  	//
 397  	// Set:
 398  	//	- R_TMP2 = len(dst)-d
 399  	//	- R_TMP3 = &dst[d-offset]
 400  	MOVQ R_DEND, R_TMP2
 401  	SUBQ R_DST, R_TMP2
 402  	MOVQ R_DST, R_TMP3
 403  	SUBQ R_OFF, R_TMP3
 404  
 405  	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
 406  	//
 407  	// First, try using two 8-byte load/stores, similar to the doLit technique
 408  	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
 409  	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
 410  	// and not one 16-byte load/store, and the first store has to be before the
 411  	// second load, due to the overlap if offset is in the range [8, 16).
 412  	//
 413  	// if length > 16 || offset < 8 || len(dst)-d < 16 {
 414  	//   goto slowForwardCopy
 415  	// }
 416  	// copy 16 bytes
 417  	// d += length
 418  	CMPQ R_LEN, $16
 419  	JGT  slowForwardCopy
 420  	CMPQ R_OFF, $8
 421  	JLT  slowForwardCopy
 422  	CMPQ R_TMP2, $16
 423  	JLT  slowForwardCopy
 424  	MOVQ 0(R_TMP3), R_TMP0
 425  	MOVQ R_TMP0, 0(R_DST)
 426  	MOVQ 8(R_TMP3), R_TMP1
 427  	MOVQ R_TMP1, 8(R_DST)
 428  	ADDQ R_LEN, R_DST
 429  	JMP  loop
 430  
 431  slowForwardCopy:
 432  	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
 433  	// can still try 8-byte load stores, provided we can overrun up to 10 extra
 434  	// bytes. As above, the overrun will be fixed up by subsequent iterations
 435  	// of the outermost loop.
 436  	//
 437  	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
 438  	// commentary says:
 439  	//
 440  	// ----
 441  	//
 442  	// The main part of this loop is a simple copy of eight bytes at a time
 443  	// until we've copied (at least) the requested amount of bytes.  However,
 444  	// if d and d-offset are less than eight bytes apart (indicating a
 445  	// repeating pattern of length < 8), we first need to expand the pattern in
 446  	// order to get the correct results. For instance, if the buffer looks like
 447  	// this, with the eight-byte <d-offset> and <d> patterns marked as
 448  	// intervals:
 449  	//
 450  	//    abxxxxxxxxxxxx
 451  	//    [------]           d-offset
 452  	//      [------]         d
 453  	//
 454  	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
 455  	// once, after which we can move <d> two bytes without moving <d-offset>:
 456  	//
 457  	//    ababxxxxxxxxxx
 458  	//    [------]           d-offset
 459  	//        [------]       d
 460  	//
 461  	// and repeat the exercise until the two no longer overlap.
 462  	//
 463  	// This allows us to do very well in the special case of one single byte
 464  	// repeated many times, without taking a big hit for more general cases.
 465  	//
 466  	// The worst case of extra writing past the end of the match occurs when
 467  	// offset == 1 and length == 1; the last copy will read from byte positions
 468  	// [0..7] and write to [4..11], whereas it was only supposed to write to
 469  	// position 1. Thus, ten excess bytes.
 470  	//
 471  	// ----
 472  	//
 473  	// That "10 byte overrun" worst case is confirmed by Go's
 474  	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
 475  	// and finishSlowForwardCopy algorithm.
 476  	//
 477  	// if length > len(dst)-d-10 {
 478  	//   goto verySlowForwardCopy
 479  	// }
 480  	SUBQ $10, R_TMP2
 481  	CMPQ R_LEN, R_TMP2
 482  	JGT  verySlowForwardCopy
 483  
 484  	// We want to keep the offset, so we use R_TMP2 from here.
 485  	MOVQ R_OFF, R_TMP2
 486  
 487  makeOffsetAtLeast8:
 488  	// !!! As above, expand the pattern so that offset >= 8 and we can use
 489  	// 8-byte load/stores.
 490  	//
 491  	// for offset < 8 {
 492  	//   copy 8 bytes from dst[d-offset:] to dst[d:]
 493  	//   length -= offset
 494  	//   d      += offset
 495  	//   offset += offset
 496  	//   // The two previous lines together means that d-offset, and therefore
 497  	//   // R_TMP3, is unchanged.
 498  	// }
 499  	CMPQ R_TMP2, $8
 500  	JGE  fixUpSlowForwardCopy
 501  	MOVQ (R_TMP3), R_TMP1
 502  	MOVQ R_TMP1, (R_DST)
 503  	SUBQ R_TMP2, R_LEN
 504  	ADDQ R_TMP2, R_DST
 505  	ADDQ R_TMP2, R_TMP2
 506  	JMP  makeOffsetAtLeast8
 507  
 508  fixUpSlowForwardCopy:
 509  	// !!! Add length (which might be negative now) to d (implied by R_DST being
 510  	// &dst[d]) so that d ends up at the right place when we jump back to the
 511  	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
 512  	// length is positive, copying the remaining length bytes will write to the
 513  	// right place.
 514  	MOVQ R_DST, R_TMP0
 515  	ADDQ R_LEN, R_DST
 516  
 517  finishSlowForwardCopy:
 518  	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
 519  	// length means that we overrun, but as above, that will be fixed up by
 520  	// subsequent iterations of the outermost loop.
 521  	CMPQ R_LEN, $0
 522  	JLE  loop
 523  	MOVQ (R_TMP3), R_TMP1
 524  	MOVQ R_TMP1, (R_TMP0)
 525  	ADDQ $8, R_TMP3
 526  	ADDQ $8, R_TMP0
 527  	SUBQ $8, R_LEN
 528  	JMP  finishSlowForwardCopy
 529  
 530  verySlowForwardCopy:
 531  	// verySlowForwardCopy is a simple implementation of forward copy. In C
 532  	// parlance, this is a do/while loop instead of a while loop, since we know
 533  	// that length > 0. In Go syntax:
 534  	//
 535  	// for {
 536  	//   dst[d] = dst[d - offset]
 537  	//   d++
 538  	//   length--
 539  	//   if length == 0 {
 540  	//     break
 541  	//   }
 542  	// }
 543  	MOVB (R_TMP3), R_TMP1
 544  	MOVB R_TMP1, (R_DST)
 545  	INCQ R_TMP3
 546  	INCQ R_DST
 547  	DECQ R_LEN
 548  	JNZ  verySlowForwardCopy
 549  	JMP  loop
 550  
 551  // The code above handles copy tags.
 552  // ----------------------------------------
 553  
 554  end:
 555  	// This is the end of the "for s < len(src)".
 556  	//
 557  	// if d != len(dst) { etc }
 558  	CMPQ R_DST, R_DEND
 559  	JNE  errCorrupt
 560  
 561  	// return 0
 562  	MOVQ $0, ret+48(FP)
 563  	RET
 564  
 565  errCorrupt:
 566  	// return decodeErrCodeCorrupt
 567  	MOVQ $1, ret+48(FP)
 568  	RET
 569