seqdec_amd64.s raw

   1  // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
   2  
   3  //go:build !appengine && !noasm && gc && !noasm
   4  
   5  // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   6  // Requires: CMOV
      //
      // Decodes sequences in a loop. Every iteration writes one 24-byte record at R10
      // (literal length at +0, match length at +8, offset at +16), then advances R10 by 24.
      // The loop repeats until the counter at 96(ctx) becomes negative after its decrement.
      //
      // Return value (ret+24(FP)):
      //   0 = success
      //   1 = offset is zero while match length is non-zero
      //   2 = match length above 0x20002
      //   4 = the counter at 128(ctx) went negative after subtracting the literal length
      //   6 = bit reader overread (consumed-bit count above 64 with no input bytes left)
      // The error returns do not write the register state back to s or br.
      //
      // Register roles:
      //   DX            bit buffer, loaded from 24(br); bits are consumed from the top
      //   BX            number of bits already consumed from DX (byte at 40(br))
      //   SI            remaining-byte counter from 32(br); decremented as the read pointer moves down
      //   (SP) / R14    read pointer, initialised to (br) + 32(br); R14 is its per-iteration copy
      //   DI, R8, R9    literal-length, match-length and offset state entries (72/80/88(ctx))
      //   R10           output record pointer (104(ctx))
      //   R11, R12, R13 three values from 144/152/160(s), rotated by the "Adjust offset" step
      //                 (presumably the previous/repeat offsets - confirm against the Go struct)
      //
      // State entry layout as used by this code: bits 0-7 = bit count for the state update,
      // bits 8-15 = bit count of the extra bits, bits 16-31 = base added to form the next
      // table index, bits 32-63 = base value of the decoded field.
   7  TEXT ·sequenceDecs_decode_amd64(SB), $8-32
   8  	MOVQ    br+8(FP), CX
   9  	MOVQ    24(CX), DX // DX = bit buffer
  10  	MOVBQZX 40(CX), BX // BX = consumed-bit count (zero-extended byte)
  11  	MOVQ    (CX), AX
  12  	MOVQ    32(CX), SI // SI = remaining-byte counter
  13  	ADDQ    SI, AX // AX = (br) + 32(br): current read pointer
  14  	MOVQ    AX, (SP) // keep the read pointer in the 8-byte local frame
  15  	MOVQ    ctx+16(FP), AX
  16  	MOVQ    72(AX), DI // literal-length state entry
  17  	MOVQ    80(AX), R8 // match-length state entry
  18  	MOVQ    88(AX), R9 // offset state entry
  19  	MOVQ    104(AX), R10 // output record pointer
  20  	MOVQ    s+0(FP), AX
  21  	MOVQ    144(AX), R11
  22  	MOVQ    152(AX), R12
  23  	MOVQ    160(AX), R13
  24  
  25  sequenceDecs_decode_amd64_main_loop:
  26  	MOVQ (SP), R14 // reload the read pointer
  27  
  28  	// Fill bitreader to have enough for the offset and match length.
  29  	CMPQ SI, $0x08
  30  	JL   sequenceDecs_decode_amd64_fill_byte_by_byte // fewer than 8 bytes left: slow path (signed compare)
  31  	MOVQ BX, AX
  32  	SHRQ $0x03, AX // AX = whole bytes already consumed
  33  	SUBQ AX, R14 // move the read pointer down by that many bytes
  34  	MOVQ (R14), DX // reload 8 bytes into the bit buffer
  35  	SUBQ AX, SI
  36  	ANDQ $0x07, BX // keep only the sub-byte part of the consumed count
  37  	JMP  sequenceDecs_decode_amd64_fill_end
  38  
  39  sequenceDecs_decode_amd64_fill_byte_by_byte:
  40  	CMPQ    SI, $0x00
  41  	JLE     sequenceDecs_decode_amd64_fill_check_overread // no input bytes left
  42  	CMPQ    BX, $0x07
  43  	JLE     sequenceDecs_decode_amd64_fill_end // less than one whole byte consumed: nothing to refill
  44  	SHLQ    $0x08, DX // make room for one byte at the bottom of the buffer
  45  	SUBQ    $0x01, R14
  46  	SUBQ    $0x01, SI
  47  	SUBQ    $0x08, BX
  48  	MOVBQZX (R14), AX
  49  	ORQ     AX, DX
  50  	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
  51  
  52  sequenceDecs_decode_amd64_fill_check_overread:
  53  	CMPQ BX, $0x40
  54  	JA   error_overread // more than 64 bits consumed and nothing left to refill
  55  
  56  sequenceDecs_decode_amd64_fill_end:
  57  	// Update offset
  58  	MOVQ  R9, AX
  59  	MOVQ  BX, CX
  60  	MOVQ  DX, R15
  61  	SHLQ  CL, R15 // drop the bits that were already consumed
  62  	MOVB  AH, CL // CL = bits 8-15 of the state entry: number of extra bits (n)
  63  	SHRQ  $0x20, AX // AX = base value (upper 32 bits of the state entry)
  64  	TESTQ CX, CX
  65  	JZ    sequenceDecs_decode_amd64_of_update_zero // n == 0: the value is the base alone
  66  	ADDQ  CX, BX // account for the n bits being consumed
  67  	CMPQ  BX, $0x40
  68  	JA    sequenceDecs_decode_amd64_of_update_zero // would read past the 64-bit buffer: add nothing
  69  	CMPQ  CX, $0x40
  70  	JAE   sequenceDecs_decode_amd64_of_update_zero // n >= 64: add nothing
  71  	NEGQ  CX
  72  	SHRQ  CL, R15 // shift count is masked to 6 bits, so this shifts by 64-n and leaves the top n bits
  73  	ADDQ  R15, AX
  74  
  75  sequenceDecs_decode_amd64_of_update_zero:
  76  	MOVQ AX, 16(R10) // record.offset (raw; adjusted below)
  77  
  78  	// Update match length
  79  	MOVQ  R8, AX
  80  	MOVQ  BX, CX
  81  	MOVQ  DX, R15
  82  	SHLQ  CL, R15 // drop the bits that were already consumed
  83  	MOVB  AH, CL // CL = number of extra bits (n)
  84  	SHRQ  $0x20, AX // AX = base value
  85  	TESTQ CX, CX
  86  	JZ    sequenceDecs_decode_amd64_ml_update_zero
  87  	ADDQ  CX, BX
  88  	CMPQ  BX, $0x40
  89  	JA    sequenceDecs_decode_amd64_ml_update_zero
  90  	CMPQ  CX, $0x40
  91  	JAE   sequenceDecs_decode_amd64_ml_update_zero
  92  	NEGQ  CX
  93  	SHRQ  CL, R15 // effective shift is 64-n
  94  	ADDQ  R15, AX
  95  
  96  sequenceDecs_decode_amd64_ml_update_zero:
  97  	MOVQ AX, 8(R10) // record.matchLength
  98  
  99  	// Fill bitreader to have enough for the remaining
 100  	CMPQ SI, $0x08
 101  	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
 102  	MOVQ BX, AX
 103  	SHRQ $0x03, AX // whole bytes consumed
 104  	SUBQ AX, R14
 105  	MOVQ (R14), DX
 106  	SUBQ AX, SI
 107  	ANDQ $0x07, BX
 108  	JMP  sequenceDecs_decode_amd64_fill_2_end
 109  
 110  sequenceDecs_decode_amd64_fill_2_byte_by_byte:
 111  	CMPQ    SI, $0x00
 112  	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
 113  	CMPQ    BX, $0x07
 114  	JLE     sequenceDecs_decode_amd64_fill_2_end
 115  	SHLQ    $0x08, DX
 116  	SUBQ    $0x01, R14
 117  	SUBQ    $0x01, SI
 118  	SUBQ    $0x08, BX
 119  	MOVBQZX (R14), AX
 120  	ORQ     AX, DX
 121  	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
 122  
 123  sequenceDecs_decode_amd64_fill_2_check_overread:
 124  	CMPQ BX, $0x40
 125  	JA   error_overread
 126  
 127  sequenceDecs_decode_amd64_fill_2_end:
 128  	// Update literal length
 129  	MOVQ  DI, AX
 130  	MOVQ  BX, CX
 131  	MOVQ  DX, R15
 132  	SHLQ  CL, R15 // drop the bits that were already consumed
 133  	MOVB  AH, CL // CL = number of extra bits (n)
 134  	SHRQ  $0x20, AX // AX = base value
 135  	TESTQ CX, CX
 136  	JZ    sequenceDecs_decode_amd64_ll_update_zero
 137  	ADDQ  CX, BX
 138  	CMPQ  BX, $0x40
 139  	JA    sequenceDecs_decode_amd64_ll_update_zero
 140  	CMPQ  CX, $0x40
 141  	JAE   sequenceDecs_decode_amd64_ll_update_zero
 142  	NEGQ  CX
 143  	SHRQ  CL, R15 // effective shift is 64-n
 144  	ADDQ  R15, AX
 145  
 146  sequenceDecs_decode_amd64_ll_update_zero:
 147  	MOVQ AX, (R10) // record.literalLength
 148  
 149  	// Save the read pointer and fetch the offset's extra-bit count; no refill happens here.
 150  	MOVQ    R14, (SP)
 151  	MOVQ    R9, AX
 152  	SHRQ    $0x08, AX
 153  	MOVBQZX AL, AX // AX = bits 8-15 of the offset state entry, consumed by "Adjust offset"
 154  	MOVQ    ctx+16(FP), CX
 155  	CMPQ    96(CX), $0x00
 156  	JZ      sequenceDecs_decode_amd64_skip_update // counter is 0: last iteration, states are not advanced
 157  
 158  	// Update Literal Length State
 159  	MOVBQZX DI, R14 // R14 = bit count for this state update (low byte of the entry)
 160  	SHRL    $0x10, DI // DI = bits 16-31 of the entry (32-bit shift clears the upper half)
 161  	LEAQ    (BX)(R14*1), CX
 162  	MOVQ    DX, R15
 163  	MOVQ    CX, BX // consumed-bit count now includes these bits
 164  	ROLQ    CL, R15 // rotate so the bits just consumed land in the low positions
 165  	MOVL    $0x00000001, BP
 166  	MOVB    R14, CL
 167  	SHLL    CL, BP
 168  	DECL    BP // BP = (1 << bitcount) - 1
 169  	ANDQ    BP, R15
 170  	ADDQ    R15, DI // DI = index of the next table entry
 171  
 172  	// Load ctx.llTable
 173  	MOVQ ctx+16(FP), CX
 174  	MOVQ (CX), CX
 175  	MOVQ (CX)(DI*8), DI // 8-byte entries
 176  
 177  	// Update Match Length State
 178  	MOVBQZX R8, R14 // bit count for this state update
 179  	SHRL    $0x10, R8
 180  	LEAQ    (BX)(R14*1), CX
 181  	MOVQ    DX, R15
 182  	MOVQ    CX, BX
 183  	ROLQ    CL, R15
 184  	MOVL    $0x00000001, BP
 185  	MOVB    R14, CL
 186  	SHLL    CL, BP
 187  	DECL    BP // BP = (1 << bitcount) - 1
 188  	ANDQ    BP, R15
 189  	ADDQ    R15, R8
 190  
 191  	// Load ctx.mlTable
 192  	MOVQ ctx+16(FP), CX
 193  	MOVQ 24(CX), CX
 194  	MOVQ (CX)(R8*8), R8
 195  
 196  	// Update Offset State
 197  	MOVBQZX R9, R14 // bit count for this state update
 198  	SHRL    $0x10, R9
 199  	LEAQ    (BX)(R14*1), CX
 200  	MOVQ    DX, R15
 201  	MOVQ    CX, BX
 202  	ROLQ    CL, R15
 203  	MOVL    $0x00000001, BP
 204  	MOVB    R14, CL
 205  	SHLL    CL, BP
 206  	DECL    BP // BP = (1 << bitcount) - 1
 207  	ANDQ    BP, R15
 208  	ADDQ    R15, R9
 209  
 210  	// Load ctx.ofTable
 211  	MOVQ ctx+16(FP), CX
 212  	MOVQ 48(CX), CX
 213  	MOVQ (CX)(R9*8), R9
 214  
 215  sequenceDecs_decode_amd64_skip_update:
 216  	// Adjust offset
 217  	MOVQ 16(R10), CX // CX = raw offset decoded above
 218  	CMPQ AX, $0x01
 219  	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 // extra-bit count <= 1: offset is an index into R11/R12/R13
 220  	MOVQ R12, R13 // otherwise push the new offset: R13 <- R12 <- R11 <- CX
 221  	MOVQ R11, R12
 222  	MOVQ CX, R11
 223  	JMP  sequenceDecs_decode_amd64_after_adjust
 224  
 225  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
 226  	CMPQ (R10), $0x00000000
 227  	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
 228  	INCQ CX // literal length is zero: the index is shifted by one
 229  	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
 230  
 231  sequenceDecs_decode_amd64_adjust_offset_maybezero:
 232  	TESTQ CX, CX
 233  	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
 234  	MOVQ  R11, CX // index 0 with literals: reuse R11, history unchanged
 235  	JMP   sequenceDecs_decode_amd64_after_adjust
 236  
 237  sequenceDecs_decode_amd64_adjust_offset_nonzero:
 238  	CMPQ CX, $0x01
 239  	JB   sequenceDecs_decode_amd64_adjust_zero // CX == 0
 240  	JEQ  sequenceDecs_decode_amd64_adjust_one // CX == 1
 241  	CMPQ CX, $0x02
 242  	JA   sequenceDecs_decode_amd64_adjust_three // CX > 2
 243  	JMP  sequenceDecs_decode_amd64_adjust_two // CX == 2
 244  
 245  sequenceDecs_decode_amd64_adjust_zero:
 246  	MOVQ R11, AX
 247  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
 248  
 249  sequenceDecs_decode_amd64_adjust_one:
 250  	MOVQ R12, AX
 251  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
 252  
 253  sequenceDecs_decode_amd64_adjust_two:
 254  	MOVQ R13, AX
 255  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
 256  
 257  sequenceDecs_decode_amd64_adjust_three:
 258  	LEAQ -1(R11), AX // AX = R11 - 1
 259  
 260  sequenceDecs_decode_amd64_adjust_test_temp_valid:
 261  	TESTQ AX, AX
 262  	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
 263  	MOVQ  $0x00000001, AX // a selected offset of 0 is replaced by 1
 264  
 265  sequenceDecs_decode_amd64_adjust_temp_valid:
 266  	CMPQ    CX, $0x01
 267  	CMOVQNE R12, R13 // R13 <- R12 unless the index was 1
 268  	MOVQ    R11, R12
 269  	MOVQ    AX, R11 // selected offset becomes the most recent one
 270  	MOVQ    AX, CX
 271  
 272  sequenceDecs_decode_amd64_after_adjust:
 273  	MOVQ CX, 16(R10) // record.offset = final offset
 274  
 275  	// Check values
 276  	MOVQ  8(R10), AX // AX = match length
 277  	MOVQ  (R10), R14 // R14 = literal length
 278  	LEAQ  (AX)(R14*1), R15
 279  	MOVQ  s+0(FP), BP
 280  	ADDQ  R15, 256(BP) // 256(s) += literal length + match length
 281  	MOVQ  ctx+16(FP), R15
 282  	SUBQ  R14, 128(R15) // 128(ctx) -= literal length; both memory updates persist even on error
 283  	JS    error_not_enough_literals // result went negative
 284  	CMPQ  AX, $0x00020002
 285  	JA    sequenceDecs_decode_amd64_error_match_len_too_big
 286  	TESTQ CX, CX
 287  	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
 288  	TESTQ AX, AX
 289  	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // offset 0 is only valid with match length 0
 290  
 291  sequenceDecs_decode_amd64_match_len_ofs_ok:
 292  	ADDQ $0x18, R10 // advance to the next 24-byte record
 293  	MOVQ ctx+16(FP), AX
 294  	DECQ 96(AX)
 295  	JNS  sequenceDecs_decode_amd64_main_loop // loop while the counter is still >= 0
 296  	MOVQ s+0(FP), AX // done: write the register state back
 297  	MOVQ R11, 144(AX)
 298  	MOVQ R12, 152(AX)
 299  	MOVQ R13, 160(AX)
 300  	MOVQ br+8(FP), AX
 301  	MOVQ DX, 24(AX) // bit buffer
 302  	MOVB BL, 40(AX) // consumed-bit count (low byte only)
 303  	MOVQ SI, 32(AX) // remaining-byte counter
 304  
 305  	// Return success
 306  	MOVQ $0x00000000, ret+24(FP)
 307  	RET
 308  
 309  	// Return with match length error
 310  sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
 311  	MOVQ $0x00000001, ret+24(FP)
 312  	RET
 313  
 314  	// Return with match too long error
 315  sequenceDecs_decode_amd64_error_match_len_too_big:
 316  	MOVQ $0x00000002, ret+24(FP)
 317  	RET
 318  
 319  	// Return with match offset too long error
      	// NOTE: no label precedes this sequence and it follows a RET, so nothing in this function reaches it.
 320  	MOVQ $0x00000003, ret+24(FP)
 321  	RET
 322  
 323  	// Return with not enough literals error
 324  error_not_enough_literals:
 325  	MOVQ $0x00000004, ret+24(FP)
 326  	RET
 327  
 328  	// Return with overread error
 329  error_overread:
 330  	MOVQ $0x00000006, ret+24(FP)
 331  	RET
 332  
 333  // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 334  // Requires: CMOV
      //
      // Same decode loop as sequenceDecs_decode_amd64, with one visible difference: the bit
      // buffer is refilled only once per iteration, so offset, match length and literal length
      // are all read from the same fill.
      // NOTE(review): the "56" presumably means the caller guarantees that the three reads
      // need at most 56 bits in total - confirm against the Go caller.
      //
      // Return value (ret+24(FP)):
      //   0 = success
      //   1 = offset is zero while match length is non-zero
      //   2 = match length above 0x20002
      //   4 = the counter at 128(ctx) went negative after subtracting the literal length
      //   6 = bit reader overread (consumed-bit count above 64 with no input bytes left)
      // The error returns do not write the register state back to s or br.
      //
      // Register roles:
      //   DX            bit buffer, loaded from 24(br); bits are consumed from the top
      //   BX            number of bits already consumed from DX (byte at 40(br))
      //   SI            remaining-byte counter from 32(br)
      //   (SP) / R14    read pointer, initialised to (br) + 32(br)
      //   DI, R8, R9    literal-length, match-length and offset state entries (72/80/88(ctx))
      //   R10           output record pointer (104(ctx)); 24-byte records
      //   R11, R12, R13 three values from 144/152/160(s), rotated by the "Adjust offset" step
 335  TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
 336  	MOVQ    br+8(FP), CX
 337  	MOVQ    24(CX), DX // DX = bit buffer
 338  	MOVBQZX 40(CX), BX // BX = consumed-bit count
 339  	MOVQ    (CX), AX
 340  	MOVQ    32(CX), SI // SI = remaining-byte counter
 341  	ADDQ    SI, AX // AX = (br) + 32(br): current read pointer
 342  	MOVQ    AX, (SP)
 343  	MOVQ    ctx+16(FP), AX
 344  	MOVQ    72(AX), DI // literal-length state entry
 345  	MOVQ    80(AX), R8 // match-length state entry
 346  	MOVQ    88(AX), R9 // offset state entry
 347  	MOVQ    104(AX), R10 // output record pointer
 348  	MOVQ    s+0(FP), AX
 349  	MOVQ    144(AX), R11
 350  	MOVQ    152(AX), R12
 351  	MOVQ    160(AX), R13
 352  
 353  sequenceDecs_decode_56_amd64_main_loop:
 354  	MOVQ (SP), R14 // reload the read pointer
 355  
 356  	// Fill bitreader to have enough for the offset and match length.
      	// (In this variant the same fill also serves the literal-length read below.)
 357  	CMPQ SI, $0x08
 358  	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte // fewer than 8 bytes left: slow path
 359  	MOVQ BX, AX
 360  	SHRQ $0x03, AX // AX = whole bytes already consumed
 361  	SUBQ AX, R14
 362  	MOVQ (R14), DX // reload 8 bytes into the bit buffer
 363  	SUBQ AX, SI
 364  	ANDQ $0x07, BX
 365  	JMP  sequenceDecs_decode_56_amd64_fill_end
 366  
 367  sequenceDecs_decode_56_amd64_fill_byte_by_byte:
 368  	CMPQ    SI, $0x00
 369  	JLE     sequenceDecs_decode_56_amd64_fill_check_overread // no input bytes left
 370  	CMPQ    BX, $0x07
 371  	JLE     sequenceDecs_decode_56_amd64_fill_end // less than one whole byte consumed
 372  	SHLQ    $0x08, DX
 373  	SUBQ    $0x01, R14
 374  	SUBQ    $0x01, SI
 375  	SUBQ    $0x08, BX
 376  	MOVBQZX (R14), AX
 377  	ORQ     AX, DX
 378  	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
 379  
 380  sequenceDecs_decode_56_amd64_fill_check_overread:
 381  	CMPQ BX, $0x40
 382  	JA   error_overread // more than 64 bits consumed and nothing left to refill
 383  
 384  sequenceDecs_decode_56_amd64_fill_end:
 385  	// Update offset
 386  	MOVQ  R9, AX
 387  	MOVQ  BX, CX
 388  	MOVQ  DX, R15
 389  	SHLQ  CL, R15 // drop the bits that were already consumed
 390  	MOVB  AH, CL // CL = bits 8-15 of the state entry: number of extra bits (n)
 391  	SHRQ  $0x20, AX // AX = base value (upper 32 bits of the state entry)
 392  	TESTQ CX, CX
 393  	JZ    sequenceDecs_decode_56_amd64_of_update_zero // n == 0: the value is the base alone
 394  	ADDQ  CX, BX
 395  	CMPQ  BX, $0x40
 396  	JA    sequenceDecs_decode_56_amd64_of_update_zero // would read past the 64-bit buffer
 397  	CMPQ  CX, $0x40
 398  	JAE   sequenceDecs_decode_56_amd64_of_update_zero
 399  	NEGQ  CX
 400  	SHRQ  CL, R15 // shift count is masked to 6 bits, so this shifts by 64-n and leaves the top n bits
 401  	ADDQ  R15, AX
 402  
 403  sequenceDecs_decode_56_amd64_of_update_zero:
 404  	MOVQ AX, 16(R10) // record.offset (raw; adjusted below)
 405  
 406  	// Update match length
 407  	MOVQ  R8, AX
 408  	MOVQ  BX, CX
 409  	MOVQ  DX, R15
 410  	SHLQ  CL, R15
 411  	MOVB  AH, CL // CL = number of extra bits (n)
 412  	SHRQ  $0x20, AX // AX = base value
 413  	TESTQ CX, CX
 414  	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
 415  	ADDQ  CX, BX
 416  	CMPQ  BX, $0x40
 417  	JA    sequenceDecs_decode_56_amd64_ml_update_zero
 418  	CMPQ  CX, $0x40
 419  	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
 420  	NEGQ  CX
 421  	SHRQ  CL, R15 // effective shift is 64-n
 422  	ADDQ  R15, AX
 423  
 424  sequenceDecs_decode_56_amd64_ml_update_zero:
 425  	MOVQ AX, 8(R10) // record.matchLength
 426  
 427  	// Update literal length
 428  	MOVQ  DI, AX
 429  	MOVQ  BX, CX
 430  	MOVQ  DX, R15
 431  	SHLQ  CL, R15
 432  	MOVB  AH, CL // CL = number of extra bits (n)
 433  	SHRQ  $0x20, AX // AX = base value
 434  	TESTQ CX, CX
 435  	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
 436  	ADDQ  CX, BX
 437  	CMPQ  BX, $0x40
 438  	JA    sequenceDecs_decode_56_amd64_ll_update_zero
 439  	CMPQ  CX, $0x40
 440  	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
 441  	NEGQ  CX
 442  	SHRQ  CL, R15 // effective shift is 64-n
 443  	ADDQ  R15, AX
 444  
 445  sequenceDecs_decode_56_amd64_ll_update_zero:
 446  	MOVQ AX, (R10) // record.literalLength
 447  
 448  	// Save the read pointer and fetch the offset's extra-bit count; no refill happens here.
 449  	MOVQ    R14, (SP)
 450  	MOVQ    R9, AX
 451  	SHRQ    $0x08, AX
 452  	MOVBQZX AL, AX // AX = bits 8-15 of the offset state entry, consumed by "Adjust offset"
 453  	MOVQ    ctx+16(FP), CX
 454  	CMPQ    96(CX), $0x00
 455  	JZ      sequenceDecs_decode_56_amd64_skip_update // counter is 0: last iteration, states are not advanced
 456  
 457  	// Update Literal Length State
 458  	MOVBQZX DI, R14 // R14 = bit count for this state update (low byte of the entry)
 459  	SHRL    $0x10, DI // DI = bits 16-31 of the entry
 460  	LEAQ    (BX)(R14*1), CX
 461  	MOVQ    DX, R15
 462  	MOVQ    CX, BX // consumed-bit count now includes these bits
 463  	ROLQ    CL, R15 // rotate so the bits just consumed land in the low positions
 464  	MOVL    $0x00000001, BP
 465  	MOVB    R14, CL
 466  	SHLL    CL, BP
 467  	DECL    BP // BP = (1 << bitcount) - 1
 468  	ANDQ    BP, R15
 469  	ADDQ    R15, DI // DI = index of the next table entry
 470  
 471  	// Load ctx.llTable
 472  	MOVQ ctx+16(FP), CX
 473  	MOVQ (CX), CX
 474  	MOVQ (CX)(DI*8), DI // 8-byte entries
 475  
 476  	// Update Match Length State
 477  	MOVBQZX R8, R14
 478  	SHRL    $0x10, R8
 479  	LEAQ    (BX)(R14*1), CX
 480  	MOVQ    DX, R15
 481  	MOVQ    CX, BX
 482  	ROLQ    CL, R15
 483  	MOVL    $0x00000001, BP
 484  	MOVB    R14, CL
 485  	SHLL    CL, BP
 486  	DECL    BP // BP = (1 << bitcount) - 1
 487  	ANDQ    BP, R15
 488  	ADDQ    R15, R8
 489  
 490  	// Load ctx.mlTable
 491  	MOVQ ctx+16(FP), CX
 492  	MOVQ 24(CX), CX
 493  	MOVQ (CX)(R8*8), R8
 494  
 495  	// Update Offset State
 496  	MOVBQZX R9, R14
 497  	SHRL    $0x10, R9
 498  	LEAQ    (BX)(R14*1), CX
 499  	MOVQ    DX, R15
 500  	MOVQ    CX, BX
 501  	ROLQ    CL, R15
 502  	MOVL    $0x00000001, BP
 503  	MOVB    R14, CL
 504  	SHLL    CL, BP
 505  	DECL    BP // BP = (1 << bitcount) - 1
 506  	ANDQ    BP, R15
 507  	ADDQ    R15, R9
 508  
 509  	// Load ctx.ofTable
 510  	MOVQ ctx+16(FP), CX
 511  	MOVQ 48(CX), CX
 512  	MOVQ (CX)(R9*8), R9
 513  
 514  sequenceDecs_decode_56_amd64_skip_update:
 515  	// Adjust offset
 516  	MOVQ 16(R10), CX // CX = raw offset decoded above
 517  	CMPQ AX, $0x01
 518  	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // extra-bit count <= 1: offset is an index into R11/R12/R13
 519  	MOVQ R12, R13 // otherwise push the new offset: R13 <- R12 <- R11 <- CX
 520  	MOVQ R11, R12
 521  	MOVQ CX, R11
 522  	JMP  sequenceDecs_decode_56_amd64_after_adjust
 523  
 524  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
 525  	CMPQ (R10), $0x00000000
 526  	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
 527  	INCQ CX // literal length is zero: the index is shifted by one
 528  	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
 529  
 530  sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
 531  	TESTQ CX, CX
 532  	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
 533  	MOVQ  R11, CX // index 0 with literals: reuse R11, history unchanged
 534  	JMP   sequenceDecs_decode_56_amd64_after_adjust
 535  
 536  sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
 537  	CMPQ CX, $0x01
 538  	JB   sequenceDecs_decode_56_amd64_adjust_zero // CX == 0
 539  	JEQ  sequenceDecs_decode_56_amd64_adjust_one // CX == 1
 540  	CMPQ CX, $0x02
 541  	JA   sequenceDecs_decode_56_amd64_adjust_three // CX > 2
 542  	JMP  sequenceDecs_decode_56_amd64_adjust_two // CX == 2
 543  
 544  sequenceDecs_decode_56_amd64_adjust_zero:
 545  	MOVQ R11, AX
 546  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
 547  
 548  sequenceDecs_decode_56_amd64_adjust_one:
 549  	MOVQ R12, AX
 550  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
 551  
 552  sequenceDecs_decode_56_amd64_adjust_two:
 553  	MOVQ R13, AX
 554  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
 555  
 556  sequenceDecs_decode_56_amd64_adjust_three:
 557  	LEAQ -1(R11), AX // AX = R11 - 1
 558  
 559  sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
 560  	TESTQ AX, AX
 561  	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
 562  	MOVQ  $0x00000001, AX // a selected offset of 0 is replaced by 1
 563  
 564  sequenceDecs_decode_56_amd64_adjust_temp_valid:
 565  	CMPQ    CX, $0x01
 566  	CMOVQNE R12, R13 // R13 <- R12 unless the index was 1
 567  	MOVQ    R11, R12
 568  	MOVQ    AX, R11 // selected offset becomes the most recent one
 569  	MOVQ    AX, CX
 570  
 571  sequenceDecs_decode_56_amd64_after_adjust:
 572  	MOVQ CX, 16(R10) // record.offset = final offset
 573  
 574  	// Check values
 575  	MOVQ  8(R10), AX // AX = match length
 576  	MOVQ  (R10), R14 // R14 = literal length
 577  	LEAQ  (AX)(R14*1), R15
 578  	MOVQ  s+0(FP), BP
 579  	ADDQ  R15, 256(BP) // 256(s) += literal length + match length
 580  	MOVQ  ctx+16(FP), R15
 581  	SUBQ  R14, 128(R15) // 128(ctx) -= literal length; both memory updates persist even on error
 582  	JS    error_not_enough_literals // result went negative
 583  	CMPQ  AX, $0x00020002
 584  	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
 585  	TESTQ CX, CX
 586  	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
 587  	TESTQ AX, AX
 588  	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // offset 0 is only valid with match length 0
 589  
 590  sequenceDecs_decode_56_amd64_match_len_ofs_ok:
 591  	ADDQ $0x18, R10 // advance to the next 24-byte record
 592  	MOVQ ctx+16(FP), AX
 593  	DECQ 96(AX)
 594  	JNS  sequenceDecs_decode_56_amd64_main_loop // loop while the counter is still >= 0
 595  	MOVQ s+0(FP), AX // done: write the register state back
 596  	MOVQ R11, 144(AX)
 597  	MOVQ R12, 152(AX)
 598  	MOVQ R13, 160(AX)
 599  	MOVQ br+8(FP), AX
 600  	MOVQ DX, 24(AX) // bit buffer
 601  	MOVB BL, 40(AX) // consumed-bit count (low byte only)
 602  	MOVQ SI, 32(AX) // remaining-byte counter
 603  
 604  	// Return success
 605  	MOVQ $0x00000000, ret+24(FP)
 606  	RET
 607  
 608  	// Return with match length error
 609  sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
 610  	MOVQ $0x00000001, ret+24(FP)
 611  	RET
 612  
 613  	// Return with match too long error
 614  sequenceDecs_decode_56_amd64_error_match_len_too_big:
 615  	MOVQ $0x00000002, ret+24(FP)
 616  	RET
 617  
 618  	// Return with match offset too long error
      	// NOTE: no label precedes this sequence and it follows a RET, so nothing in this function reaches it.
 619  	MOVQ $0x00000003, ret+24(FP)
 620  	RET
 621  
 622  	// Return with not enough literals error
 623  error_not_enough_literals:
 624  	MOVQ $0x00000004, ret+24(FP)
 625  	RET
 626  
 627  	// Return with overread error
 628  error_overread:
 629  	MOVQ $0x00000006, ret+24(FP)
 630  	RET
 631  
 632  // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 633  // Requires: BMI, BMI2, CMOV
      //
      // BMI/BMI2 version of the sequence decode loop: bit-field extraction uses BEXTR, BZHI
      // and SHRX instead of shift/mask sequences. Every iteration writes one 24-byte record at
      // R9 (literal length at +0, match length at +8, offset at +16), then advances R9 by 24.
      // The loop repeats until the counter at 96(ctx) becomes negative after its decrement.
      //
      // Return value (ret+24(FP)):
      //   0 = success
      //   1 = offset is zero while match length is non-zero
      //   2 = match length above 0x20002
      //   4 = the counter at 128(ctx) went negative after subtracting the literal length
      //   6 = bit reader overread (consumed-bit count above 64 with no input bytes left)
      // The error returns do not write the register state back to s or br.
      //
      // Register roles:
      //   AX            bit buffer, loaded from 24(br)
      //   DX            number of bits already consumed from AX (byte at 40(br))
      //   BX            remaining-byte counter from 32(br)
      //   (SP) / R13    read pointer, initialised to (br) + 32(br); R13 is reused as a scratch
      //                 register once the pointer has been saved back to (SP)
      //   SI, DI, R8    literal-length, match-length and offset state entries (72/80/88(ctx))
      //   R9            output record pointer (104(ctx))
      //   R10, R11, R12 three values from 144/152/160(s), rotated by the "Adjust offset" step
      //
      // State entry layout as used by this code: bits 0-7 = bit count for the state update,
      // bits 8-15 = bit count of the extra bits, bits 16-31 = base added to form the next
      // table index, bits 32-63 = base value of the decoded field.
 634  TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
 635  	MOVQ    br+8(FP), BX
 636  	MOVQ    24(BX), AX // AX = bit buffer
 637  	MOVBQZX 40(BX), DX // DX = consumed-bit count
 638  	MOVQ    (BX), CX
 639  	MOVQ    32(BX), BX // BX = remaining-byte counter (br pointer no longer needed in BX)
 640  	ADDQ    BX, CX // CX = (br) + 32(br): current read pointer
 641  	MOVQ    CX, (SP)
 642  	MOVQ    ctx+16(FP), CX
 643  	MOVQ    72(CX), SI // literal-length state entry
 644  	MOVQ    80(CX), DI // match-length state entry
 645  	MOVQ    88(CX), R8 // offset state entry
 646  	MOVQ    104(CX), R9 // output record pointer
 647  	MOVQ    s+0(FP), CX
 648  	MOVQ    144(CX), R10
 649  	MOVQ    152(CX), R11
 650  	MOVQ    160(CX), R12
 651  
 652  sequenceDecs_decode_bmi2_main_loop:
 653  	MOVQ (SP), R13 // reload the read pointer
 654  
 655  	// Fill bitreader to have enough for the offset and match length.
 656  	CMPQ BX, $0x08
 657  	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte // fewer than 8 bytes left: slow path
 658  	MOVQ DX, CX
 659  	SHRQ $0x03, CX // CX = whole bytes already consumed
 660  	SUBQ CX, R13
 661  	MOVQ (R13), AX // reload 8 bytes into the bit buffer
 662  	SUBQ CX, BX
 663  	ANDQ $0x07, DX // keep only the sub-byte part of the consumed count
 664  	JMP  sequenceDecs_decode_bmi2_fill_end
 665  
 666  sequenceDecs_decode_bmi2_fill_byte_by_byte:
 667  	CMPQ    BX, $0x00
 668  	JLE     sequenceDecs_decode_bmi2_fill_check_overread // no input bytes left
 669  	CMPQ    DX, $0x07
 670  	JLE     sequenceDecs_decode_bmi2_fill_end // less than one whole byte consumed
 671  	SHLQ    $0x08, AX
 672  	SUBQ    $0x01, R13
 673  	SUBQ    $0x01, BX
 674  	SUBQ    $0x08, DX
 675  	MOVBQZX (R13), CX
 676  	ORQ     CX, AX
 677  	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
 678  
 679  sequenceDecs_decode_bmi2_fill_check_overread:
 680  	CMPQ DX, $0x40
 681  	JA   error_overread // more than 64 bits consumed and nothing left to refill
 682  
 683  sequenceDecs_decode_bmi2_fill_end:
 684  	// Update offset
 685  	MOVQ   $0x00000808, CX // BEXTR control: start bit 8, length 8
 686  	BEXTRQ CX, R8, R14 // R14 = bits 8-15 of the state entry: number of extra bits (n)
 687  	MOVQ   AX, R15
 688  	LEAQ   (DX)(R14*1), CX
 689  	ROLQ   CL, R15 // rotate so the n bits being consumed land in the low positions
 690  	BZHIQ  R14, R15, R15 // keep only the low n bits
 691  	MOVQ   CX, DX // consumed-bit count += n
 692  	MOVQ   R8, CX
 693  	SHRQ   $0x20, CX // base value (upper 32 bits of the state entry)
 694  	ADDQ   R15, CX
 695  	MOVQ   CX, 16(R9) // record.offset (raw; adjusted below)
 696  
 697  	// Update match length
 698  	MOVQ   $0x00000808, CX
 699  	BEXTRQ CX, DI, R14 // R14 = number of extra bits (n)
 700  	MOVQ   AX, R15
 701  	LEAQ   (DX)(R14*1), CX
 702  	ROLQ   CL, R15
 703  	BZHIQ  R14, R15, R15 // keep only the low n bits
 704  	MOVQ   CX, DX
 705  	MOVQ   DI, CX
 706  	SHRQ   $0x20, CX // base value
 707  	ADDQ   R15, CX
 708  	MOVQ   CX, 8(R9) // record.matchLength
 709  
 710  	// Fill bitreader to have enough for the remaining
 711  	CMPQ BX, $0x08
 712  	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
 713  	MOVQ DX, CX
 714  	SHRQ $0x03, CX // whole bytes consumed
 715  	SUBQ CX, R13
 716  	MOVQ (R13), AX
 717  	SUBQ CX, BX
 718  	ANDQ $0x07, DX
 719  	JMP  sequenceDecs_decode_bmi2_fill_2_end
 720  
 721  sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
 722  	CMPQ    BX, $0x00
 723  	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
 724  	CMPQ    DX, $0x07
 725  	JLE     sequenceDecs_decode_bmi2_fill_2_end
 726  	SHLQ    $0x08, AX
 727  	SUBQ    $0x01, R13
 728  	SUBQ    $0x01, BX
 729  	SUBQ    $0x08, DX
 730  	MOVBQZX (R13), CX
 731  	ORQ     CX, AX
 732  	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
 733  
 734  sequenceDecs_decode_bmi2_fill_2_check_overread:
 735  	CMPQ DX, $0x40
 736  	JA   error_overread
 737  
 738  sequenceDecs_decode_bmi2_fill_2_end:
 739  	// Update literal length
 740  	MOVQ   $0x00000808, CX
 741  	BEXTRQ CX, SI, R14 // R14 = number of extra bits (n)
 742  	MOVQ   AX, R15
 743  	LEAQ   (DX)(R14*1), CX
 744  	ROLQ   CL, R15
 745  	BZHIQ  R14, R15, R15 // keep only the low n bits
 746  	MOVQ   CX, DX
 747  	MOVQ   SI, CX
 748  	SHRQ   $0x20, CX // base value
 749  	ADDQ   R15, CX
 750  	MOVQ   CX, (R9) // record.literalLength
 751  
 752  	// Save the read pointer, fetch the offset's extra-bit count, then read the bits for
      	// all three state updates at once; no refill happens here.
 753  	MOVQ    R13, (SP)
 754  	MOVQ    $0x00000808, CX
 755  	BEXTRQ  CX, R8, R13 // R13 = bits 8-15 of the offset state entry, consumed by "Adjust offset"
 756  	MOVQ    ctx+16(FP), CX
 757  	CMPQ    96(CX), $0x00
 758  	JZ      sequenceDecs_decode_bmi2_skip_update // counter is 0: last iteration, states are not advanced
 759  	LEAQ    (SI)(DI*1), R14
 760  	ADDQ    R8, R14
 761  	MOVBQZX R14, R14 // R14 = low byte of SI+DI+R8: the three state-update bit counts added together
 762  	LEAQ    (DX)(R14*1), CX
 763  	MOVQ    AX, R15
 764  	MOVQ    CX, DX // consumed-bit count += combined bit count
 765  	ROLQ    CL, R15
 766  	BZHIQ   R14, R15, R15 // R15 = the combined bits, split up below (offset lowest, then ML, then LL)
 767  
 768  	// Update Offset State
 769  	BZHIQ R8, R15, CX // CX = low bits of R15; the index comes from the low byte of R8
 770  	SHRXQ R8, R15, R15 // drop those bits; the count comes from the low bits of R8
 771  	SHRL  $0x10, R8 // R8 = bits 16-31 of the entry (32-bit shift clears the upper half)
 772  	ADDQ  CX, R8 // R8 = index of the next table entry
 773  
 774  	// Load ctx.ofTable
 775  	MOVQ ctx+16(FP), CX
 776  	MOVQ 48(CX), CX
 777  	MOVQ (CX)(R8*8), R8 // 8-byte entries
 778  
 779  	// Update Match Length State
 780  	BZHIQ DI, R15, CX
 781  	SHRXQ DI, R15, R15
 782  	SHRL  $0x10, DI
 783  	ADDQ  CX, DI
 784  
 785  	// Load ctx.mlTable
 786  	MOVQ ctx+16(FP), CX
 787  	MOVQ 24(CX), CX
 788  	MOVQ (CX)(DI*8), DI
 789  
 790  	// Update Literal Length State
 791  	BZHIQ SI, R15, CX
 792  	SHRL  $0x10, SI
 793  	ADDQ  CX, SI
 794  
 795  	// Load ctx.llTable
 796  	MOVQ ctx+16(FP), CX
 797  	MOVQ (CX), CX
 798  	MOVQ (CX)(SI*8), SI
 799  
 800  sequenceDecs_decode_bmi2_skip_update:
 801  	// Adjust offset
 802  	MOVQ 16(R9), CX // CX = raw offset decoded above
 803  	CMPQ R13, $0x01
 804  	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // extra-bit count <= 1: offset is an index into R10/R11/R12
 805  	MOVQ R11, R12 // otherwise push the new offset: R12 <- R11 <- R10 <- CX
 806  	MOVQ R10, R11
 807  	MOVQ CX, R10
 808  	JMP  sequenceDecs_decode_bmi2_after_adjust
 809  
 810  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
 811  	CMPQ (R9), $0x00000000
 812  	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
 813  	INCQ CX // literal length is zero: the index is shifted by one
 814  	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
 815  
 816  sequenceDecs_decode_bmi2_adjust_offset_maybezero:
 817  	TESTQ CX, CX
 818  	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
 819  	MOVQ  R10, CX // index 0 with literals: reuse R10, history unchanged
 820  	JMP   sequenceDecs_decode_bmi2_after_adjust
 821  
 822  sequenceDecs_decode_bmi2_adjust_offset_nonzero:
 823  	CMPQ CX, $0x01
 824  	JB   sequenceDecs_decode_bmi2_adjust_zero // CX == 0
 825  	JEQ  sequenceDecs_decode_bmi2_adjust_one // CX == 1
 826  	CMPQ CX, $0x02
 827  	JA   sequenceDecs_decode_bmi2_adjust_three // CX > 2
 828  	JMP  sequenceDecs_decode_bmi2_adjust_two // CX == 2
 829  
 830  sequenceDecs_decode_bmi2_adjust_zero:
 831  	MOVQ R10, R13
 832  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
 833  
 834  sequenceDecs_decode_bmi2_adjust_one:
 835  	MOVQ R11, R13
 836  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
 837  
 838  sequenceDecs_decode_bmi2_adjust_two:
 839  	MOVQ R12, R13
 840  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
 841  
 842  sequenceDecs_decode_bmi2_adjust_three:
 843  	LEAQ -1(R10), R13 // R13 = R10 - 1
 844  
 845  sequenceDecs_decode_bmi2_adjust_test_temp_valid:
 846  	TESTQ R13, R13
 847  	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
 848  	MOVQ  $0x00000001, R13 // a selected offset of 0 is replaced by 1
 849  
 850  sequenceDecs_decode_bmi2_adjust_temp_valid:
 851  	CMPQ    CX, $0x01
 852  	CMOVQNE R11, R12 // R12 <- R11 unless the index was 1
 853  	MOVQ    R10, R11
 854  	MOVQ    R13, R10 // selected offset becomes the most recent one
 855  	MOVQ    R13, CX
 856  
 857  sequenceDecs_decode_bmi2_after_adjust:
 858  	MOVQ CX, 16(R9) // record.offset = final offset
 859  
 860  	// Check values
 861  	MOVQ  8(R9), R13 // R13 = match length
 862  	MOVQ  (R9), R14 // R14 = literal length
 863  	LEAQ  (R13)(R14*1), R15
 864  	MOVQ  s+0(FP), BP
 865  	ADDQ  R15, 256(BP) // 256(s) += literal length + match length
 866  	MOVQ  ctx+16(FP), R15
 867  	SUBQ  R14, 128(R15) // 128(ctx) -= literal length; both memory updates persist even on error
 868  	JS    error_not_enough_literals // result went negative
 869  	CMPQ  R13, $0x00020002
 870  	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
 871  	TESTQ CX, CX
 872  	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
 873  	TESTQ R13, R13
 874  	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // offset 0 is only valid with match length 0
 875  
 876  sequenceDecs_decode_bmi2_match_len_ofs_ok:
 877  	ADDQ $0x18, R9 // advance to the next 24-byte record
 878  	MOVQ ctx+16(FP), CX
 879  	DECQ 96(CX)
 880  	JNS  sequenceDecs_decode_bmi2_main_loop // loop while the counter is still >= 0
 881  	MOVQ s+0(FP), CX // done: write the register state back
 882  	MOVQ R10, 144(CX)
 883  	MOVQ R11, 152(CX)
 884  	MOVQ R12, 160(CX)
 885  	MOVQ br+8(FP), CX
 886  	MOVQ AX, 24(CX) // bit buffer
 887  	MOVB DL, 40(CX) // consumed-bit count (low byte only)
 888  	MOVQ BX, 32(CX) // remaining-byte counter
 889  
 890  	// Return success
 891  	MOVQ $0x00000000, ret+24(FP)
 892  	RET
 893  
 894  	// Return with match length error
 895  sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
 896  	MOVQ $0x00000001, ret+24(FP)
 897  	RET
 898  
 899  	// Return with match too long error
 900  sequenceDecs_decode_bmi2_error_match_len_too_big:
 901  	MOVQ $0x00000002, ret+24(FP)
 902  	RET
 903  
 904  	// Return with match offset too long error
      	// NOTE: no label precedes this sequence and it follows a RET, so nothing in this function reaches it.
 905  	MOVQ $0x00000003, ret+24(FP)
 906  	RET
 907  
 908  	// Return with not enough literals error
 909  error_not_enough_literals:
 910  	MOVQ $0x00000004, ret+24(FP)
 911  	RET
 912  
 913  	// Return with overread error
 914  error_overread:
 915  	MOVQ $0x00000006, ret+24(FP)
 916  	RET
 917  
 918  // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 919  // Requires: BMI, BMI2, CMOV
      //
      // Same decode loop as sequenceDecs_decode_bmi2, with one visible difference: the bit
      // buffer is refilled only once per iteration, so offset, match length and literal length
      // are all read from the same fill.
      // NOTE(review): the "56" presumably means the caller guarantees that the three reads
      // need at most 56 bits in total - confirm against the Go caller.
      //
      // Return value (ret+24(FP)):
      //   0 = success
      //   1 = offset is zero while match length is non-zero
      //   2 = match length above 0x20002
      //   4 = the counter at 128(ctx) went negative after subtracting the literal length
      //   6 = bit reader overread (consumed-bit count above 64 with no input bytes left)
      // The error returns do not write the register state back to s or br.
      //
      // Register roles:
      //   AX            bit buffer, loaded from 24(br)
      //   DX            number of bits already consumed from AX (byte at 40(br))
      //   BX            remaining-byte counter from 32(br)
      //   (SP) / R13    read pointer, initialised to (br) + 32(br); R13 is reused as a scratch
      //                 register once the pointer has been saved back to (SP)
      //   SI, DI, R8    literal-length, match-length and offset state entries (72/80/88(ctx))
      //   R9            output record pointer (104(ctx)); 24-byte records
      //   R10, R11, R12 three values from 144/152/160(s), rotated by the "Adjust offset" step
 920  TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
 921  	MOVQ    br+8(FP), BX
 922  	MOVQ    24(BX), AX // AX = bit buffer
 923  	MOVBQZX 40(BX), DX // DX = consumed-bit count
 924  	MOVQ    (BX), CX
 925  	MOVQ    32(BX), BX // BX = remaining-byte counter
 926  	ADDQ    BX, CX // CX = (br) + 32(br): current read pointer
 927  	MOVQ    CX, (SP)
 928  	MOVQ    ctx+16(FP), CX
 929  	MOVQ    72(CX), SI // literal-length state entry
 930  	MOVQ    80(CX), DI // match-length state entry
 931  	MOVQ    88(CX), R8 // offset state entry
 932  	MOVQ    104(CX), R9 // output record pointer
 933  	MOVQ    s+0(FP), CX
 934  	MOVQ    144(CX), R10
 935  	MOVQ    152(CX), R11
 936  	MOVQ    160(CX), R12
 937  
 938  sequenceDecs_decode_56_bmi2_main_loop:
 939  	MOVQ (SP), R13 // reload the read pointer
 940  
 941  	// Fill bitreader to have enough for the offset and match length.
      	// (In this variant the same fill also serves the literal-length read below.)
 942  	CMPQ BX, $0x08
 943  	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 bytes left: slow path
 944  	MOVQ DX, CX
 945  	SHRQ $0x03, CX // CX = whole bytes already consumed
 946  	SUBQ CX, R13
 947  	MOVQ (R13), AX // reload 8 bytes into the bit buffer
 948  	SUBQ CX, BX
 949  	ANDQ $0x07, DX
 950  	JMP  sequenceDecs_decode_56_bmi2_fill_end
 951  
 952  sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
 953  	CMPQ    BX, $0x00
 954  	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread // no input bytes left
 955  	CMPQ    DX, $0x07
 956  	JLE     sequenceDecs_decode_56_bmi2_fill_end // less than one whole byte consumed
 957  	SHLQ    $0x08, AX
 958  	SUBQ    $0x01, R13
 959  	SUBQ    $0x01, BX
 960  	SUBQ    $0x08, DX
 961  	MOVBQZX (R13), CX
 962  	ORQ     CX, AX
 963  	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
 964  
 965  sequenceDecs_decode_56_bmi2_fill_check_overread:
 966  	CMPQ DX, $0x40
 967  	JA   error_overread // more than 64 bits consumed and nothing left to refill
 968  
 969  sequenceDecs_decode_56_bmi2_fill_end:
 970  	// Update offset
 971  	MOVQ   $0x00000808, CX // BEXTR control: start bit 8, length 8
 972  	BEXTRQ CX, R8, R14 // R14 = bits 8-15 of the state entry: number of extra bits (n)
 973  	MOVQ   AX, R15
 974  	LEAQ   (DX)(R14*1), CX
 975  	ROLQ   CL, R15 // rotate so the n bits being consumed land in the low positions
 976  	BZHIQ  R14, R15, R15 // keep only the low n bits
 977  	MOVQ   CX, DX // consumed-bit count += n
 978  	MOVQ   R8, CX
 979  	SHRQ   $0x20, CX // base value (upper 32 bits of the state entry)
 980  	ADDQ   R15, CX
 981  	MOVQ   CX, 16(R9) // record.offset (raw; adjusted below)
 982  
 983  	// Update match length
 984  	MOVQ   $0x00000808, CX
 985  	BEXTRQ CX, DI, R14 // R14 = number of extra bits (n)
 986  	MOVQ   AX, R15
 987  	LEAQ   (DX)(R14*1), CX
 988  	ROLQ   CL, R15
 989  	BZHIQ  R14, R15, R15 // keep only the low n bits
 990  	MOVQ   CX, DX
 991  	MOVQ   DI, CX
 992  	SHRQ   $0x20, CX // base value
 993  	ADDQ   R15, CX
 994  	MOVQ   CX, 8(R9) // record.matchLength
 995  
 996  	// Update literal length
 997  	MOVQ   $0x00000808, CX
 998  	BEXTRQ CX, SI, R14 // R14 = number of extra bits (n)
 999  	MOVQ   AX, R15
1000  	LEAQ   (DX)(R14*1), CX
1001  	ROLQ   CL, R15
1002  	BZHIQ  R14, R15, R15 // keep only the low n bits
1003  	MOVQ   CX, DX
1004  	MOVQ   SI, CX
1005  	SHRQ   $0x20, CX // base value
1006  	ADDQ   R15, CX
1007  	MOVQ   CX, (R9) // record.literalLength
1008  
1009  	// Save the read pointer, fetch the offset's extra-bit count, then read the bits for
      	// all three state updates at once; no refill happens here.
1010  	MOVQ    R13, (SP)
1011  	MOVQ    $0x00000808, CX
1012  	BEXTRQ  CX, R8, R13 // R13 = bits 8-15 of the offset state entry, consumed by "Adjust offset"
1013  	MOVQ    ctx+16(FP), CX
1014  	CMPQ    96(CX), $0x00
1015  	JZ      sequenceDecs_decode_56_bmi2_skip_update // counter is 0: last iteration, states are not advanced
1016  	LEAQ    (SI)(DI*1), R14
1017  	ADDQ    R8, R14
1018  	MOVBQZX R14, R14 // R14 = low byte of SI+DI+R8: the three state-update bit counts added together
1019  	LEAQ    (DX)(R14*1), CX
1020  	MOVQ    AX, R15
1021  	MOVQ    CX, DX // consumed-bit count += combined bit count
1022  	ROLQ    CL, R15
1023  	BZHIQ   R14, R15, R15 // R15 = the combined bits, split up below (offset lowest, then ML, then LL)
1024  
1025  	// Update Offset State
1026  	BZHIQ R8, R15, CX // CX = low bits of R15; the index comes from the low byte of R8
1027  	SHRXQ R8, R15, R15 // drop those bits; the count comes from the low bits of R8
1028  	SHRL  $0x10, R8 // R8 = bits 16-31 of the entry (32-bit shift clears the upper half)
1029  	ADDQ  CX, R8 // R8 = index of the next table entry
1030  
1031  	// Load ctx.ofTable
1032  	MOVQ ctx+16(FP), CX
1033  	MOVQ 48(CX), CX
1034  	MOVQ (CX)(R8*8), R8 // 8-byte entries
1035  
1036  	// Update Match Length State
1037  	BZHIQ DI, R15, CX
1038  	SHRXQ DI, R15, R15
1039  	SHRL  $0x10, DI
1040  	ADDQ  CX, DI
1041  
1042  	// Load ctx.mlTable
1043  	MOVQ ctx+16(FP), CX
1044  	MOVQ 24(CX), CX
1045  	MOVQ (CX)(DI*8), DI
1046  
1047  	// Update Literal Length State
1048  	BZHIQ SI, R15, CX
1049  	SHRL  $0x10, SI
1050  	ADDQ  CX, SI
1051  
1052  	// Load ctx.llTable
1053  	MOVQ ctx+16(FP), CX
1054  	MOVQ (CX), CX
1055  	MOVQ (CX)(SI*8), SI
1056  
1057  sequenceDecs_decode_56_bmi2_skip_update:
1058  	// Adjust offset
1059  	MOVQ 16(R9), CX // CX = raw offset decoded above
1060  	CMPQ R13, $0x01
1061  	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // extra-bit count <= 1: offset is an index into R10/R11/R12
1062  	MOVQ R11, R12 // otherwise push the new offset: R12 <- R11 <- R10 <- CX
1063  	MOVQ R10, R11
1064  	MOVQ CX, R10
1065  	JMP  sequenceDecs_decode_56_bmi2_after_adjust
1066  
1067  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
1068  	CMPQ (R9), $0x00000000
1069  	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
1070  	INCQ CX // literal length is zero: the index is shifted by one
1071  	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1072  
1073  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
1074  	TESTQ CX, CX
1075  	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1076  	MOVQ  R10, CX // index 0 with literals: reuse R10, history unchanged
1077  	JMP   sequenceDecs_decode_56_bmi2_after_adjust
1078  
1079  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
1080  	CMPQ CX, $0x01
1081  	JB   sequenceDecs_decode_56_bmi2_adjust_zero // CX == 0
1082  	JEQ  sequenceDecs_decode_56_bmi2_adjust_one // CX == 1
1083  	CMPQ CX, $0x02
1084  	JA   sequenceDecs_decode_56_bmi2_adjust_three // CX > 2
1085  	JMP  sequenceDecs_decode_56_bmi2_adjust_two // CX == 2
1086  
1087  sequenceDecs_decode_56_bmi2_adjust_zero:
1088  	MOVQ R10, R13
1089  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1090  
1091  sequenceDecs_decode_56_bmi2_adjust_one:
1092  	MOVQ R11, R13
1093  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1094  
1095  sequenceDecs_decode_56_bmi2_adjust_two:
1096  	MOVQ R12, R13
1097  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1098  
1099  sequenceDecs_decode_56_bmi2_adjust_three:
1100  	LEAQ -1(R10), R13 // R13 = R10 - 1
1101  
1102  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
1103  	TESTQ R13, R13
1104  	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
1105  	MOVQ  $0x00000001, R13 // a selected offset of 0 is replaced by 1
1106  
1107  sequenceDecs_decode_56_bmi2_adjust_temp_valid:
1108  	CMPQ    CX, $0x01
1109  	CMOVQNE R11, R12 // R12 <- R11 unless the index was 1
1110  	MOVQ    R10, R11
1111  	MOVQ    R13, R10 // selected offset becomes the most recent one
1112  	MOVQ    R13, CX
1113  
1114  sequenceDecs_decode_56_bmi2_after_adjust:
1115  	MOVQ CX, 16(R9) // record.offset = final offset
1116  
1117  	// Check values
1118  	MOVQ  8(R9), R13 // R13 = match length
1119  	MOVQ  (R9), R14 // R14 = literal length
1120  	LEAQ  (R13)(R14*1), R15
1121  	MOVQ  s+0(FP), BP
1122  	ADDQ  R15, 256(BP) // 256(s) += literal length + match length
1123  	MOVQ  ctx+16(FP), R15
1124  	SUBQ  R14, 128(R15) // 128(ctx) -= literal length; both memory updates persist even on error
1125  	JS    error_not_enough_literals // result went negative
1126  	CMPQ  R13, $0x00020002
1127  	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
1128  	TESTQ CX, CX
1129  	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
1130  	TESTQ R13, R13
1131  	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // offset 0 is only valid with match length 0
1132  
1133  sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
1134  	ADDQ $0x18, R9 // advance to the next 24-byte record
1135  	MOVQ ctx+16(FP), CX
1136  	DECQ 96(CX)
1137  	JNS  sequenceDecs_decode_56_bmi2_main_loop // loop while the counter is still >= 0
1138  	MOVQ s+0(FP), CX // done: write the register state back
1139  	MOVQ R10, 144(CX)
1140  	MOVQ R11, 152(CX)
1141  	MOVQ R12, 160(CX)
1142  	MOVQ br+8(FP), CX
1143  	MOVQ AX, 24(CX) // bit buffer
1144  	MOVB DL, 40(CX) // consumed-bit count (low byte only)
1145  	MOVQ BX, 32(CX) // remaining-byte counter
1146  
1147  	// Return success
1148  	MOVQ $0x00000000, ret+24(FP)
1149  	RET
1150  
1151  	// Return with match length error
1152  sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
1153  	MOVQ $0x00000001, ret+24(FP)
1154  	RET
1155  
1156  	// Return with match too long error
1157  sequenceDecs_decode_56_bmi2_error_match_len_too_big:
1158  	MOVQ $0x00000002, ret+24(FP)
1159  	RET
1160  
1161  	// Return with match offset too long error
      	// NOTE: no label precedes this sequence and it follows a RET, so nothing in this function reaches it.
1162  	MOVQ $0x00000003, ret+24(FP)
1163  	RET
1164  
1165  	// Return with not enough literals error
1166  error_not_enough_literals:
1167  	MOVQ $0x00000004, ret+24(FP)
1168  	RET
1169  
1170  	// Return with overread error
1171  error_overread:
1172  	MOVQ $0x00000006, ret+24(FP)
1173  	RET
1174  
1175  // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
1176  // Requires: SSE
      //
      // Executes sequence records that have already been decoded: for every 24-byte
      // record it copies the literal run and then the match into the output buffer.
      //
      // Returns 1 (true) when the loop completes or when 8(ctx) is zero, and 0
      // (false) when a match offset fails the checks at check_offset. On both the
      // success and the error exit the sequence index, the output position and the
      // number of literal bytes consumed are written back to 24(ctx), 104(ctx) and
      // 112(ctx). The empty_seqs exit writes nothing back.
      //
      // Register roles, as established by the loads below:
      //   AX  = pointer to the current sequence record (seqsBase + 24*seqIndex)
      //   CX  = 8(ctx), loop bound for the sequence index
      //   DX  = sequence index
      //   BX  = output write pointer (outBase + outPosition)
      //   SI  = literals read pointer
      //   DI  = output position
      //   R8  = 120(ctx), upper bound for a match offset (s.windowSize per the
      //         check_offset comment)
      //   R9  = 56(ctx) + 64(ctx), history bytes are addressed backwards from here
      //   R10 = 64(ctx) (len(hist) per the check_offset comment)
      //   R11 = literal length, R12 = match offset, R13 = match length
      //
      // NOTE(review): copy_1 and copy_2 move whole 16-byte chunks, so they may read
      // and write up to 15 bytes beyond the requested length. Presumably the caller
      // guarantees that much slack in the buffers - confirm against the Go caller.
1177  TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
1178  	MOVQ  ctx+0(FP), R10
1179  	MOVQ  8(R10), CX
      	// Nothing to execute when the loop bound is zero.
1180  	TESTQ CX, CX
1181  	JZ    empty_seqs
1182  	MOVQ  (R10), AX
1183  	MOVQ  24(R10), DX
1184  	MOVQ  32(R10), BX
1185  	MOVQ  80(R10), SI
1186  	MOVQ  104(R10), DI
1187  	MOVQ  120(R10), R8
1188  	MOVQ  56(R10), R9
      	// R10 stops being the ctx pointer here: it now holds 64(ctx), and R9
      	// becomes 56(ctx) + 64(ctx).
1189  	MOVQ  64(R10), R10
1190  	ADDQ  R10, R9
1191  
1192  	// seqsBase += 24 * seqIndex
1193  	LEAQ (DX)(DX*2), R11
1194  	SHLQ $0x03, R11
1195  	ADDQ R11, AX
1196  
1197  	// outBase += outPosition
1198  	ADDQ DI, BX
1199  
1200  main_loop:
      	// Load the three 8-byte fields of the record: (AX) literal length,
      	// 16(AX) match offset, 8(AX) match length.
1201  	MOVQ (AX), R11
1202  	MOVQ 16(AX), R12
1203  	MOVQ 8(AX), R13
1204  
1205  	// Copy literals
1206  	TESTQ R11, R11
1207  	JZ    check_offset
1208  	XORQ  R14, R14
1209  
1210  copy_1:
      	// 16 bytes per iteration until at least R11 bytes were moved. The last
      	// chunk may extend past the literal length.
1211  	MOVUPS (SI)(R14*1), X0
1212  	MOVUPS X0, (BX)(R14*1)
1213  	ADDQ   $0x10, R14
1214  	CMPQ   R14, R11
1215  	JB     copy_1
      	// Advance the pointers and the output position by the exact literal length.
1216  	ADDQ   R11, SI
1217  	ADDQ   R11, BX
1218  	ADDQ   R11, DI
1219  
1220  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
1221  check_offset:
      	// Both comparisons are signed (JG).
1222  	LEAQ (DI)(R10*1), R11
1223  	CMPQ R12, R11
1224  	JG   error_match_off_too_big
1225  	CMPQ R12, R8
1226  	JG   error_match_off_too_big
1227  
1228  	// Copy match from history
      	// R11 = offset - output position. When that is <= 0 as an unsigned
      	// subtraction (JLS) the match lies entirely in the current output.
      	// Otherwise R11 bytes precede the output and are read starting at R9 - R11.
1229  	MOVQ R12, R11
1230  	SUBQ DI, R11
1231  	JLS  copy_match
1232  	MOVQ R9, R14
1233  	SUBQ R11, R14
      	// A match longer than the history part is split: history first, then the
      	// rest from the current output (copy_all_from_history).
1234  	CMPQ R13, R11
1235  	JG   copy_all_from_history
      	// The whole match comes from history. R11 = length - 16, a borrow means
      	// the length is below 16.
1236  	MOVQ R13, R11
1237  	SUBQ $0x10, R11
1238  	JB   copy_4_small
1239  
1240  copy_4_loop:
1241  	MOVUPS (R14), X0
1242  	MOVUPS X0, (BX)
1243  	ADDQ   $0x10, R14
1244  	ADDQ   $0x10, BX
1245  	SUBQ   $0x10, R11
1246  	JAE    copy_4_loop
      	// R11 is now (remaining - 16). Move both pointers to the exact end of the
      	// range and copy the final 16 bytes ending there, overlapping the
      	// previous chunk, so nothing past the end is touched.
1247  	LEAQ   16(R14)(R11*1), R14
1248  	LEAQ   16(BX)(R11*1), BX
1249  	MOVUPS -16(R14), X0
1250  	MOVUPS X0, -16(BX)
1251  	JMP    copy_4_end
1252  
1253  copy_4_small:
      	// Lengths below 16. There is no 1- or 2-byte case on this path.
      	// NOTE(review): presumably match lengths below 3 never reach here - confirm.
1254  	CMPQ R13, $0x03
1255  	JE   copy_4_move_3
1256  	CMPQ R13, $0x08
1257  	JB   copy_4_move_4through7
1258  	JMP  copy_4_move_8through16
1259  
1260  copy_4_move_3:
1261  	MOVW (R14), R11
1262  	MOVB 2(R14), R12
1263  	MOVW R11, (BX)
1264  	MOVB R12, 2(BX)
1265  	ADDQ R13, R14
1266  	ADDQ R13, BX
1267  	JMP  copy_4_end
1268  
1269  copy_4_move_4through7:
      	// Two 4-byte moves, one at the start and one ending at the last byte.
1270  	MOVL (R14), R11
1271  	MOVL -4(R14)(R13*1), R12
1272  	MOVL R11, (BX)
1273  	MOVL R12, -4(BX)(R13*1)
1274  	ADDQ R13, R14
1275  	ADDQ R13, BX
1276  	JMP  copy_4_end
1277  
1278  copy_4_move_8through16:
      	// Two 8-byte moves, one at the start and one ending at the last byte.
1279  	MOVQ (R14), R11
1280  	MOVQ -8(R14)(R13*1), R12
1281  	MOVQ R11, (BX)
1282  	MOVQ R12, -8(BX)(R13*1)
1283  	ADDQ R13, R14
1284  	ADDQ R13, BX
1285  
1286  copy_4_end:
      	// Match fully served from history: account for it and step to the next
      	// record (same bookkeeping as handle_loop).
1287  	ADDQ R13, DI
1288  	ADDQ $0x18, AX
1289  	INCQ DX
1290  	CMPQ DX, CX
1291  	JB   main_loop
1292  	JMP  loop_finished
1293  
1294  copy_all_from_history:
      	// Copy exactly R11 bytes (the part of the match that lies in history).
1295  	MOVQ R11, R15
1296  	SUBQ $0x10, R15
1297  	JB   copy_5_small
1298  
1299  copy_5_loop:
1300  	MOVUPS (R14), X0
1301  	MOVUPS X0, (BX)
1302  	ADDQ   $0x10, R14
1303  	ADDQ   $0x10, BX
1304  	SUBQ   $0x10, R15
1305  	JAE    copy_5_loop
      	// Exact tail: final 16 bytes ending at the last byte of the range.
1306  	LEAQ   16(R14)(R15*1), R14
1307  	LEAQ   16(BX)(R15*1), BX
1308  	MOVUPS -16(R14), X0
1309  	MOVUPS X0, -16(BX)
1310  	JMP    copy_5_end
1311  
1312  copy_5_small:
      	// The flags of the first CMPQ serve both JE (length 3) and JB (length 1 or 2).
1313  	CMPQ R11, $0x03
1314  	JE   copy_5_move_3
1315  	JB   copy_5_move_1or2
1316  	CMPQ R11, $0x08
1317  	JB   copy_5_move_4through7
1318  	JMP  copy_5_move_8through16
1319  
1320  copy_5_move_1or2:
      	// First and last byte (the same byte when the length is 1).
1321  	MOVB (R14), R15
1322  	MOVB -1(R14)(R11*1), BP
1323  	MOVB R15, (BX)
1324  	MOVB BP, -1(BX)(R11*1)
1325  	ADDQ R11, R14
1326  	ADDQ R11, BX
1327  	JMP  copy_5_end
1328  
1329  copy_5_move_3:
1330  	MOVW (R14), R15
1331  	MOVB 2(R14), BP
1332  	MOVW R15, (BX)
1333  	MOVB BP, 2(BX)
1334  	ADDQ R11, R14
1335  	ADDQ R11, BX
1336  	JMP  copy_5_end
1337  
1338  copy_5_move_4through7:
1339  	MOVL (R14), R15
1340  	MOVL -4(R14)(R11*1), BP
1341  	MOVL R15, (BX)
1342  	MOVL BP, -4(BX)(R11*1)
1343  	ADDQ R11, R14
1344  	ADDQ R11, BX
1345  	JMP  copy_5_end
1346  
1347  copy_5_move_8through16:
1348  	MOVQ (R14), R15
1349  	MOVQ -8(R14)(R11*1), BP
1350  	MOVQ R15, (BX)
1351  	MOVQ BP, -8(BX)(R11*1)
1352  	ADDQ R11, R14
1353  	ADDQ R11, BX
1354  
1355  copy_5_end:
      	// History part done: advance the output position and reduce the match
      	// length by the bytes already copied, then fall through for the rest.
1356  	ADDQ R11, DI
1357  	SUBQ R11, R13
1358  
1359  	// Copy match from the current buffer
1360  copy_match:
      	// R11 = source pointer = write pointer - match offset.
1361  	MOVQ BX, R11
1362  	SUBQ R12, R11
1363  
1364  	// ml <= mo
1365  	CMPQ R13, R12
1366  	JA   copy_overlapping_match
1367  
1368  	// Copy non-overlapping match
      	// BX is advanced by the exact length up front, R12 is reused as the
      	// running destination for the chunked copy.
1369  	ADDQ R13, DI
1370  	MOVQ BX, R12
1371  	ADDQ R13, BX
1372  
1373  copy_2:
      	// 16 bytes per iteration while the remaining count stays above zero
      	// (JHI: no borrow and not zero). The last chunk may extend past the match.
1374  	MOVUPS (R11), X0
1375  	MOVUPS X0, (R12)
1376  	ADDQ   $0x10, R11
1377  	ADDQ   $0x10, R12
1378  	SUBQ   $0x10, R13
1379  	JHI    copy_2
1380  	JMP    handle_loop
1381  
1382  	// Copy overlapping match
1383  copy_overlapping_match:
1384  	ADDQ R13, DI
1385  
1386  copy_slow_3:
      	// Byte-by-byte so that bytes written earlier in this loop can be read
      	// again as source when the match length exceeds the offset.
1387  	MOVB (R11), R12
1388  	MOVB R12, (BX)
1389  	INCQ R11
1390  	INCQ BX
1391  	DECQ R13
1392  	JNZ  copy_slow_3
1393  
1394  handle_loop:
      	// Next 24-byte record. Continue while seqIndex < CX (unsigned).
1395  	ADDQ $0x18, AX
1396  	INCQ DX
1397  	CMPQ DX, CX
1398  	JB   main_loop
1399  
1400  loop_finished:
1401  	// Return value
1402  	MOVB $0x01, ret+8(FP)
1403  
1404  	// Update the context
      	// 24(ctx) = sequence index, 104(ctx) = output position,
      	// 112(ctx) = literals pointer minus its starting value 80(ctx).
1405  	MOVQ ctx+0(FP), AX
1406  	MOVQ DX, 24(AX)
1407  	MOVQ DI, 104(AX)
1408  	SUBQ 80(AX), SI
1409  	MOVQ SI, 112(AX)
1410  	RET
1411  
1412  error_match_off_too_big:
1413  	// Return value
1414  	MOVB $0x00, ret+8(FP)
1415  
1416  	// Update the context
      	// Same fields as on success. DX still indexes the record that failed.
1417  	MOVQ ctx+0(FP), AX
1418  	MOVQ DX, 24(AX)
1419  	MOVQ DI, 104(AX)
1420  	SUBQ 80(AX), SI
1421  	MOVQ SI, 112(AX)
1422  	RET
1423  
1424  empty_seqs:
1425  	// Return value
1426  	MOVB $0x01, ret+8(FP)
1427  	RET
1428  
1429  // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
1430  // Requires: SSE
      //
      // Executes sequence records that have already been decoded: for every 24-byte
      // record it copies the literal run and then the match into the output buffer.
      // The literal copy (copy_1), the history copies (copy_4, copy_5) and the
      // non-overlapping match copy (copy_2) all use the same exact-length scheme:
      // 16-byte chunks followed by one overlapping 16-byte move that ends on the
      // last byte, or a size-specific pair of moves for lengths below 16. See the
      // caveat at copy_4_small for lengths 1 and 2.
      //
      // Returns 1 (true) when the loop completes or when 8(ctx) is zero, and 0
      // (false) when a match offset fails the checks at check_offset. On both the
      // success and the error exit the sequence index, the output position and the
      // number of literal bytes consumed are written back to 24(ctx), 104(ctx) and
      // 112(ctx). The empty_seqs exit writes nothing back.
      //
      // Register roles, as established by the loads below:
      //   AX  = pointer to the current sequence record (seqsBase + 24*seqIndex)
      //   CX  = 8(ctx), loop bound for the sequence index
      //   DX  = sequence index
      //   BX  = output write pointer (outBase + outPosition)
      //   SI  = literals read pointer
      //   DI  = output position
      //   R8  = 120(ctx), upper bound for a match offset (s.windowSize per the
      //         check_offset comment)
      //   R9  = 56(ctx) + 64(ctx), history bytes are addressed backwards from here
      //   R10 = 64(ctx) (len(hist) per the check_offset comment)
      //   R11 = literal length, R12 = match offset, R13 = match length
1431  TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
1432  	MOVQ  ctx+0(FP), R10
1433  	MOVQ  8(R10), CX
      	// Nothing to execute when the loop bound is zero.
1434  	TESTQ CX, CX
1435  	JZ    empty_seqs
1436  	MOVQ  (R10), AX
1437  	MOVQ  24(R10), DX
1438  	MOVQ  32(R10), BX
1439  	MOVQ  80(R10), SI
1440  	MOVQ  104(R10), DI
1441  	MOVQ  120(R10), R8
1442  	MOVQ  56(R10), R9
      	// R10 stops being the ctx pointer here: it now holds 64(ctx), and R9
      	// becomes 56(ctx) + 64(ctx).
1443  	MOVQ  64(R10), R10
1444  	ADDQ  R10, R9
1445  
1446  	// seqsBase += 24 * seqIndex
1447  	LEAQ (DX)(DX*2), R11
1448  	SHLQ $0x03, R11
1449  	ADDQ R11, AX
1450  
1451  	// outBase += outPosition
1452  	ADDQ DI, BX
1453  
1454  main_loop:
      	// Load the three 8-byte fields of the record: (AX) literal length,
      	// 16(AX) match offset, 8(AX) match length.
1455  	MOVQ (AX), R11
1456  	MOVQ 16(AX), R12
1457  	MOVQ 8(AX), R13
1458  
1459  	// Copy literals
1460  	TESTQ R11, R11
1461  	JZ    check_offset
      	// R14 = literal length - 16, a borrow means fewer than 16 bytes.
1462  	MOVQ  R11, R14
1463  	SUBQ  $0x10, R14
1464  	JB    copy_1_small
1465  
1466  copy_1_loop:
1467  	MOVUPS (SI), X0
1468  	MOVUPS X0, (BX)
1469  	ADDQ   $0x10, SI
1470  	ADDQ   $0x10, BX
1471  	SUBQ   $0x10, R14
1472  	JAE    copy_1_loop
      	// R14 is now (remaining - 16). Move both pointers to the exact end of the
      	// literal run and copy the final 16 bytes ending there, overlapping the
      	// previous chunk, so nothing past the end is touched.
1473  	LEAQ   16(SI)(R14*1), SI
1474  	LEAQ   16(BX)(R14*1), BX
1475  	MOVUPS -16(SI), X0
1476  	MOVUPS X0, -16(BX)
1477  	JMP    copy_1_end
1478  
1479  copy_1_small:
      	// The flags of the first CMPQ serve both JE (length 3) and JB (length 1 or 2).
1480  	CMPQ R11, $0x03
1481  	JE   copy_1_move_3
1482  	JB   copy_1_move_1or2
1483  	CMPQ R11, $0x08
1484  	JB   copy_1_move_4through7
1485  	JMP  copy_1_move_8through16
1486  
1487  copy_1_move_1or2:
      	// First and last byte (the same byte when the length is 1).
1488  	MOVB (SI), R14
1489  	MOVB -1(SI)(R11*1), R15
1490  	MOVB R14, (BX)
1491  	MOVB R15, -1(BX)(R11*1)
1492  	ADDQ R11, SI
1493  	ADDQ R11, BX
1494  	JMP  copy_1_end
1495  
1496  copy_1_move_3:
1497  	MOVW (SI), R14
1498  	MOVB 2(SI), R15
1499  	MOVW R14, (BX)
1500  	MOVB R15, 2(BX)
1501  	ADDQ R11, SI
1502  	ADDQ R11, BX
1503  	JMP  copy_1_end
1504  
1505  copy_1_move_4through7:
      	// Two 4-byte moves, one at the start and one ending at the last byte.
1506  	MOVL (SI), R14
1507  	MOVL -4(SI)(R11*1), R15
1508  	MOVL R14, (BX)
1509  	MOVL R15, -4(BX)(R11*1)
1510  	ADDQ R11, SI
1511  	ADDQ R11, BX
1512  	JMP  copy_1_end
1513  
1514  copy_1_move_8through16:
      	// Two 8-byte moves, one at the start and one ending at the last byte.
1515  	MOVQ (SI), R14
1516  	MOVQ -8(SI)(R11*1), R15
1517  	MOVQ R14, (BX)
1518  	MOVQ R15, -8(BX)(R11*1)
1519  	ADDQ R11, SI
1520  	ADDQ R11, BX
1521  
1522  copy_1_end:
      	// SI and BX were already advanced by the copy, only the position is left.
1523  	ADDQ R11, DI
1524  
1525  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
1526  check_offset:
      	// Both comparisons are signed (JG).
1527  	LEAQ (DI)(R10*1), R11
1528  	CMPQ R12, R11
1529  	JG   error_match_off_too_big
1530  	CMPQ R12, R8
1531  	JG   error_match_off_too_big
1532  
1533  	// Copy match from history
      	// R11 = offset - output position. When that is <= 0 as an unsigned
      	// subtraction (JLS) the match lies entirely in the current output.
      	// Otherwise R11 bytes precede the output and are read starting at R9 - R11.
1534  	MOVQ R12, R11
1535  	SUBQ DI, R11
1536  	JLS  copy_match
1537  	MOVQ R9, R14
1538  	SUBQ R11, R14
      	// A match longer than the history part is split: history first, then the
      	// rest from the current output (copy_all_from_history).
1539  	CMPQ R13, R11
1540  	JG   copy_all_from_history
      	// The whole match comes from history. R11 = length - 16, a borrow means
      	// the length is below 16.
1541  	MOVQ R13, R11
1542  	SUBQ $0x10, R11
1543  	JB   copy_4_small
1544  
1545  copy_4_loop:
1546  	MOVUPS (R14), X0
1547  	MOVUPS X0, (BX)
1548  	ADDQ   $0x10, R14
1549  	ADDQ   $0x10, BX
1550  	SUBQ   $0x10, R11
1551  	JAE    copy_4_loop
      	// Exact tail: final 16 bytes ending at the last byte of the range.
1552  	LEAQ   16(R14)(R11*1), R14
1553  	LEAQ   16(BX)(R11*1), BX
1554  	MOVUPS -16(R14), X0
1555  	MOVUPS X0, -16(BX)
1556  	JMP    copy_4_end
1557  
1558  copy_4_small:
      	// Lengths below 16. Unlike copy_1_small, copy_2_small and copy_5_small
      	// there is no 1- or 2-byte case on this path.
      	// NOTE(review): presumably match lengths below 3 never reach here - confirm.
1559  	CMPQ R13, $0x03
1560  	JE   copy_4_move_3
1561  	CMPQ R13, $0x08
1562  	JB   copy_4_move_4through7
1563  	JMP  copy_4_move_8through16
1564  
1565  copy_4_move_3:
1566  	MOVW (R14), R11
1567  	MOVB 2(R14), R12
1568  	MOVW R11, (BX)
1569  	MOVB R12, 2(BX)
1570  	ADDQ R13, R14
1571  	ADDQ R13, BX
1572  	JMP  copy_4_end
1573  
1574  copy_4_move_4through7:
1575  	MOVL (R14), R11
1576  	MOVL -4(R14)(R13*1), R12
1577  	MOVL R11, (BX)
1578  	MOVL R12, -4(BX)(R13*1)
1579  	ADDQ R13, R14
1580  	ADDQ R13, BX
1581  	JMP  copy_4_end
1582  
1583  copy_4_move_8through16:
1584  	MOVQ (R14), R11
1585  	MOVQ -8(R14)(R13*1), R12
1586  	MOVQ R11, (BX)
1587  	MOVQ R12, -8(BX)(R13*1)
1588  	ADDQ R13, R14
1589  	ADDQ R13, BX
1590  
1591  copy_4_end:
      	// Match fully served from history: account for it and step to the next
      	// record (same bookkeeping as handle_loop).
1592  	ADDQ R13, DI
1593  	ADDQ $0x18, AX
1594  	INCQ DX
1595  	CMPQ DX, CX
1596  	JB   main_loop
1597  	JMP  loop_finished
1598  
1599  copy_all_from_history:
      	// Copy exactly R11 bytes (the part of the match that lies in history).
1600  	MOVQ R11, R15
1601  	SUBQ $0x10, R15
1602  	JB   copy_5_small
1603  
1604  copy_5_loop:
1605  	MOVUPS (R14), X0
1606  	MOVUPS X0, (BX)
1607  	ADDQ   $0x10, R14
1608  	ADDQ   $0x10, BX
1609  	SUBQ   $0x10, R15
1610  	JAE    copy_5_loop
      	// Exact tail: final 16 bytes ending at the last byte of the range.
1611  	LEAQ   16(R14)(R15*1), R14
1612  	LEAQ   16(BX)(R15*1), BX
1613  	MOVUPS -16(R14), X0
1614  	MOVUPS X0, -16(BX)
1615  	JMP    copy_5_end
1616  
1617  copy_5_small:
1618  	CMPQ R11, $0x03
1619  	JE   copy_5_move_3
1620  	JB   copy_5_move_1or2
1621  	CMPQ R11, $0x08
1622  	JB   copy_5_move_4through7
1623  	JMP  copy_5_move_8through16
1624  
1625  copy_5_move_1or2:
1626  	MOVB (R14), R15
1627  	MOVB -1(R14)(R11*1), BP
1628  	MOVB R15, (BX)
1629  	MOVB BP, -1(BX)(R11*1)
1630  	ADDQ R11, R14
1631  	ADDQ R11, BX
1632  	JMP  copy_5_end
1633  
1634  copy_5_move_3:
1635  	MOVW (R14), R15
1636  	MOVB 2(R14), BP
1637  	MOVW R15, (BX)
1638  	MOVB BP, 2(BX)
1639  	ADDQ R11, R14
1640  	ADDQ R11, BX
1641  	JMP  copy_5_end
1642  
1643  copy_5_move_4through7:
1644  	MOVL (R14), R15
1645  	MOVL -4(R14)(R11*1), BP
1646  	MOVL R15, (BX)
1647  	MOVL BP, -4(BX)(R11*1)
1648  	ADDQ R11, R14
1649  	ADDQ R11, BX
1650  	JMP  copy_5_end
1651  
1652  copy_5_move_8through16:
1653  	MOVQ (R14), R15
1654  	MOVQ -8(R14)(R11*1), BP
1655  	MOVQ R15, (BX)
1656  	MOVQ BP, -8(BX)(R11*1)
1657  	ADDQ R11, R14
1658  	ADDQ R11, BX
1659  
1660  copy_5_end:
      	// History part done: advance the output position and reduce the match
      	// length by the bytes already copied, then fall through for the rest.
1661  	ADDQ R11, DI
1662  	SUBQ R11, R13
1663  
1664  	// Copy match from the current buffer
1665  copy_match:
      	// R11 = source pointer = write pointer - match offset.
1666  	MOVQ BX, R11
1667  	SUBQ R12, R11
1668  
1669  	// ml <= mo
1670  	CMPQ R13, R12
1671  	JA   copy_overlapping_match
1672  
1673  	// Copy non-overlapping match
      	// R12 (the offset) is no longer needed and becomes the chunk counter.
1674  	ADDQ R13, DI
1675  	MOVQ R13, R12
1676  	SUBQ $0x10, R12
1677  	JB   copy_2_small
1678  
1679  copy_2_loop:
1680  	MOVUPS (R11), X0
1681  	MOVUPS X0, (BX)
1682  	ADDQ   $0x10, R11
1683  	ADDQ   $0x10, BX
1684  	SUBQ   $0x10, R12
1685  	JAE    copy_2_loop
      	// Exact tail: final 16 bytes ending at the last byte of the match.
1686  	LEAQ   16(R11)(R12*1), R11
1687  	LEAQ   16(BX)(R12*1), BX
1688  	MOVUPS -16(R11), X0
1689  	MOVUPS X0, -16(BX)
1690  	JMP    copy_2_end
1691  
1692  copy_2_small:
1693  	CMPQ R13, $0x03
1694  	JE   copy_2_move_3
1695  	JB   copy_2_move_1or2
1696  	CMPQ R13, $0x08
1697  	JB   copy_2_move_4through7
1698  	JMP  copy_2_move_8through16
1699  
1700  copy_2_move_1or2:
1701  	MOVB (R11), R12
1702  	MOVB -1(R11)(R13*1), R14
1703  	MOVB R12, (BX)
1704  	MOVB R14, -1(BX)(R13*1)
1705  	ADDQ R13, R11
1706  	ADDQ R13, BX
1707  	JMP  copy_2_end
1708  
1709  copy_2_move_3:
1710  	MOVW (R11), R12
1711  	MOVB 2(R11), R14
1712  	MOVW R12, (BX)
1713  	MOVB R14, 2(BX)
1714  	ADDQ R13, R11
1715  	ADDQ R13, BX
1716  	JMP  copy_2_end
1717  
1718  copy_2_move_4through7:
1719  	MOVL (R11), R12
1720  	MOVL -4(R11)(R13*1), R14
1721  	MOVL R12, (BX)
1722  	MOVL R14, -4(BX)(R13*1)
1723  	ADDQ R13, R11
1724  	ADDQ R13, BX
1725  	JMP  copy_2_end
1726  
1727  copy_2_move_8through16:
1728  	MOVQ (R11), R12
1729  	MOVQ -8(R11)(R13*1), R14
1730  	MOVQ R12, (BX)
1731  	MOVQ R14, -8(BX)(R13*1)
1732  	ADDQ R13, R11
1733  	ADDQ R13, BX
1734  
1735  copy_2_end:
1736  	JMP handle_loop
1737  
1738  	// Copy overlapping match
1739  copy_overlapping_match:
1740  	ADDQ R13, DI
1741  
1742  copy_slow_3:
      	// Byte-by-byte so that bytes written earlier in this loop can be read
      	// again as source when the match length exceeds the offset.
1743  	MOVB (R11), R12
1744  	MOVB R12, (BX)
1745  	INCQ R11
1746  	INCQ BX
1747  	DECQ R13
1748  	JNZ  copy_slow_3
1749  
1750  handle_loop:
      	// Next 24-byte record. Continue while seqIndex < CX (unsigned).
1751  	ADDQ $0x18, AX
1752  	INCQ DX
1753  	CMPQ DX, CX
1754  	JB   main_loop
1755  
1756  loop_finished:
1757  	// Return value
1758  	MOVB $0x01, ret+8(FP)
1759  
1760  	// Update the context
      	// 24(ctx) = sequence index, 104(ctx) = output position,
      	// 112(ctx) = literals pointer minus its starting value 80(ctx).
1761  	MOVQ ctx+0(FP), AX
1762  	MOVQ DX, 24(AX)
1763  	MOVQ DI, 104(AX)
1764  	SUBQ 80(AX), SI
1765  	MOVQ SI, 112(AX)
1766  	RET
1767  
1768  error_match_off_too_big:
1769  	// Return value
1770  	MOVB $0x00, ret+8(FP)
1771  
1772  	// Update the context
      	// Same fields as on success. DX still indexes the record that failed.
1773  	MOVQ ctx+0(FP), AX
1774  	MOVQ DX, 24(AX)
1775  	MOVQ DI, 104(AX)
1776  	SUBQ 80(AX), SI
1777  	MOVQ SI, 112(AX)
1778  	RET
1779  
1780  empty_seqs:
1781  	// Return value
1782  	MOVB $0x01, ret+8(FP)
1783  	RET
1784  
1785  // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
1786  // Requires: CMOV, SSE
      //
      // Decodes sequences from the bit reader and executes each one immediately:
      // every iteration reads offset, match length and literal length, advances the
      // three decoder states, resolves the offset against the recent offsets kept at
      // 144/152/160(s), validates the values and then copies literals and match
      // into the output. 96(ctx) is decremented once per iteration and the loop
      // continues while it stays non-negative.
      //
      // Return codes written to ret+24(FP):
      //   0 = success
      //   1 = match length / offset mismatch (offset is zero but length is not)
      //   2 = match length above 0x20002
      //   3 = match offset too big
      //   4 = not enough literals
      //   5 = not enough space in the output
      //   6 = bit reader overread
      //
      // Registers:
      //   DX  = bit reader value, BX = bits already consumed from it,
      //   SI  = 32(br), the remaining byte count used by the fill code
      //   DI, R8, R9 = literal length, match length and offset state words
      //   R10 = output write pointer, R11 = literals read pointer,
      //   R12 = output position
      // Stack slots:
      //   (SP)   = bit reader read pointer, (br) + 32(br)
      //   8(SP)  = match offset, 16(SP) = match length, 24(SP) = literal length
      //   32(SP) = past-end pointer of the output, 112(ctx) + 128(ctx)
      //   40(SP) = 184(ctx) (len(hist) per the check_offset comment)
      //   48(SP) = 176(ctx) + 184(ctx), history is addressed backwards from here
      //   56(SP) = 200(ctx), upper bound for a match offset
      //
      // NOTE(review): copy_1 and copy_2 move whole 16-byte chunks and may touch up
      // to 15 bytes beyond the requested length, while the space check only covers
      // the exact lengths. Presumably the caller provides the slack - confirm.
1787  TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
1788  	MOVQ    br+8(FP), CX
1789  	MOVQ    24(CX), DX
1790  	MOVBQZX 40(CX), BX
1791  	MOVQ    (CX), AX
1792  	MOVQ    32(CX), SI
1793  	ADDQ    SI, AX
1794  	MOVQ    AX, (SP)
1795  	MOVQ    ctx+16(FP), AX
1796  	MOVQ    72(AX), DI
1797  	MOVQ    80(AX), R8
1798  	MOVQ    88(AX), R9
      	// Zero the three per-sequence result slots.
1799  	XORQ    CX, CX
1800  	MOVQ    CX, 8(SP)
1801  	MOVQ    CX, 16(SP)
1802  	MOVQ    CX, 24(SP)
1803  	MOVQ    112(AX), R10
1804  	MOVQ    128(AX), CX
1805  	MOVQ    CX, 32(SP)
1806  	MOVQ    144(AX), R11
1807  	MOVQ    136(AX), R12
1808  	MOVQ    200(AX), CX
1809  	MOVQ    CX, 56(SP)
1810  	MOVQ    176(AX), CX
1811  	MOVQ    CX, 48(SP)
1812  	MOVQ    184(AX), AX
1813  	MOVQ    AX, 40(SP)
      	// 48(SP) = 176(ctx) + 184(ctx).
1814  	MOVQ    40(SP), AX
1815  	ADDQ    AX, 48(SP)
1816  
1817  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
1818  	ADDQ R10, 32(SP)
1819  
1820  	// outBase += outPosition
1821  	ADDQ R12, R10
1822  
1823  sequenceDecs_decodeSync_amd64_main_loop:
      	// R13 = current bit reader read pointer for this iteration.
1824  	MOVQ (SP), R13
1825  
1826  	// Fill bitreader to have enough for the offset and match length.
      	// Fast path (at least 8 bytes left): step the pointer back by the whole
      	// bytes already consumed (BX / 8), reload 8 bytes and keep BX mod 8.
1827  	CMPQ SI, $0x08
1828  	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1829  	MOVQ BX, AX
1830  	SHRQ $0x03, AX
1831  	SUBQ AX, R13
1832  	MOVQ (R13), DX
1833  	SUBQ AX, SI
1834  	ANDQ $0x07, BX
1835  	JMP  sequenceDecs_decodeSync_amd64_fill_end
1836  
1837  sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
      	// Slow path: while bytes remain and more than 7 bits are consumed, shift
      	// one more byte into the low end of DX.
1838  	CMPQ    SI, $0x00
1839  	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
1840  	CMPQ    BX, $0x07
1841  	JLE     sequenceDecs_decodeSync_amd64_fill_end
1842  	SHLQ    $0x08, DX
1843  	SUBQ    $0x01, R13
1844  	SUBQ    $0x01, SI
1845  	SUBQ    $0x08, BX
1846  	MOVBQZX (R13), AX
1847  	ORQ     AX, DX
1848  	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1849  
1850  sequenceDecs_decodeSync_amd64_fill_check_overread:
      	// No input left: more than 64 consumed bits means the stream was overread.
1851  	CMPQ BX, $0x40
1852  	JA   error_overread
1853  
1854  sequenceDecs_decodeSync_amd64_fill_end:
1855  	// Update offset
      	// CL = byte 1 of the state word (extra bit count), AX = upper 32 bits of
      	// the state word (base value). R14 = DX shifted left by the consumed
      	// bits, so the next unread bit is at the top. When the bit count is zero,
      	// or the read would exceed 64 consumed bits, only the base value is kept.
      	// Otherwise the top CL bits of R14 are added to the base.
1856  	MOVQ  R9, AX
1857  	MOVQ  BX, CX
1858  	MOVQ  DX, R14
1859  	SHLQ  CL, R14
1860  	MOVB  AH, CL
1861  	SHRQ  $0x20, AX
1862  	TESTQ CX, CX
1863  	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
1864  	ADDQ  CX, BX
1865  	CMPQ  BX, $0x40
1866  	JA    sequenceDecs_decodeSync_amd64_of_update_zero
1867  	CMPQ  CX, $0x40
1868  	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
      	// Shift right by (64 - CL): the shift count is taken modulo 64.
1869  	NEGQ  CX
1870  	SHRQ  CL, R14
1871  	ADDQ  R14, AX
1872  
1873  sequenceDecs_decodeSync_amd64_of_update_zero:
1874  	MOVQ AX, 8(SP)
1875  
1876  	// Update match length
      	// Same extraction as for the offset, using the match length state in R8.
1877  	MOVQ  R8, AX
1878  	MOVQ  BX, CX
1879  	MOVQ  DX, R14
1880  	SHLQ  CL, R14
1881  	MOVB  AH, CL
1882  	SHRQ  $0x20, AX
1883  	TESTQ CX, CX
1884  	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
1885  	ADDQ  CX, BX
1886  	CMPQ  BX, $0x40
1887  	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
1888  	CMPQ  CX, $0x40
1889  	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
1890  	NEGQ  CX
1891  	SHRQ  CL, R14
1892  	ADDQ  R14, AX
1893  
1894  sequenceDecs_decodeSync_amd64_ml_update_zero:
1895  	MOVQ AX, 16(SP)
1896  
1897  	// Fill bitreader to have enough for the remaining
      	// Same refill logic as at the top of the loop.
1898  	CMPQ SI, $0x08
1899  	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1900  	MOVQ BX, AX
1901  	SHRQ $0x03, AX
1902  	SUBQ AX, R13
1903  	MOVQ (R13), DX
1904  	SUBQ AX, SI
1905  	ANDQ $0x07, BX
1906  	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
1907  
1908  sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
1909  	CMPQ    SI, $0x00
1910  	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
1911  	CMPQ    BX, $0x07
1912  	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
1913  	SHLQ    $0x08, DX
1914  	SUBQ    $0x01, R13
1915  	SUBQ    $0x01, SI
1916  	SUBQ    $0x08, BX
1917  	MOVBQZX (R13), AX
1918  	ORQ     AX, DX
1919  	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1920  
1921  sequenceDecs_decodeSync_amd64_fill_2_check_overread:
1922  	CMPQ BX, $0x40
1923  	JA   error_overread
1924  
1925  sequenceDecs_decodeSync_amd64_fill_2_end:
1926  	// Update literal length
      	// Same extraction again, using the literal length state in DI.
1927  	MOVQ  DI, AX
1928  	MOVQ  BX, CX
1929  	MOVQ  DX, R14
1930  	SHLQ  CL, R14
1931  	MOVB  AH, CL
1932  	SHRQ  $0x20, AX
1933  	TESTQ CX, CX
1934  	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
1935  	ADDQ  CX, BX
1936  	CMPQ  BX, $0x40
1937  	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
1938  	CMPQ  CX, $0x40
1939  	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
1940  	NEGQ  CX
1941  	SHRQ  CL, R14
1942  	ADDQ  R14, AX
1943  
1944  sequenceDecs_decodeSync_amd64_ll_update_zero:
1945  	MOVQ AX, 24(SP)
1946  
1947  	// Fill bitreader for state updates
      	// Save the read pointer for the next iteration. AX = byte 1 of the offset
      	// state (its extra bit count), kept for the offset adjustment below. The
      	// state update is skipped when 96(ctx) is zero, i.e. on the last iteration.
1948  	MOVQ    R13, (SP)
1949  	MOVQ    R9, AX
1950  	SHRQ    $0x08, AX
1951  	MOVBQZX AL, AX
1952  	MOVQ    ctx+16(FP), CX
1953  	CMPQ    96(CX), $0x00
1954  	JZ      sequenceDecs_decodeSync_amd64_skip_update
1955  
1956  	// Update Literal Length State
      	// R13 = low byte of the state word (bit count n), DI = bits 16..31 of the
      	// state word. The next n unread bits (rotate left by BX+n, mask with
      	// (1<<n)-1) are added to DI to form the index of the next table entry.
1957  	MOVBQZX DI, R13
1958  	SHRL    $0x10, DI
1959  	LEAQ    (BX)(R13*1), CX
1960  	MOVQ    DX, R14
1961  	MOVQ    CX, BX
1962  	ROLQ    CL, R14
1963  	MOVL    $0x00000001, R15
1964  	MOVB    R13, CL
1965  	SHLL    CL, R15
1966  	DECL    R15
1967  	ANDQ    R15, R14
1968  	ADDQ    R14, DI
1969  
1970  	// Load ctx.llTable
      	// Table entries are 8 bytes wide. The loaded entry is the new state word.
1971  	MOVQ ctx+16(FP), CX
1972  	MOVQ (CX), CX
1973  	MOVQ (CX)(DI*8), DI
1974  
1975  	// Update Match Length State
1976  	MOVBQZX R8, R13
1977  	SHRL    $0x10, R8
1978  	LEAQ    (BX)(R13*1), CX
1979  	MOVQ    DX, R14
1980  	MOVQ    CX, BX
1981  	ROLQ    CL, R14
1982  	MOVL    $0x00000001, R15
1983  	MOVB    R13, CL
1984  	SHLL    CL, R15
1985  	DECL    R15
1986  	ANDQ    R15, R14
1987  	ADDQ    R14, R8
1988  
1989  	// Load ctx.mlTable
1990  	MOVQ ctx+16(FP), CX
1991  	MOVQ 24(CX), CX
1992  	MOVQ (CX)(R8*8), R8
1993  
1994  	// Update Offset State
1995  	MOVBQZX R9, R13
1996  	SHRL    $0x10, R9
1997  	LEAQ    (BX)(R13*1), CX
1998  	MOVQ    DX, R14
1999  	MOVQ    CX, BX
2000  	ROLQ    CL, R14
2001  	MOVL    $0x00000001, R15
2002  	MOVB    R13, CL
2003  	SHLL    CL, R15
2004  	DECL    R15
2005  	ANDQ    R15, R14
2006  	ADDQ    R14, R9
2007  
2008  	// Load ctx.ofTable
2009  	MOVQ ctx+16(FP), CX
2010  	MOVQ 48(CX), CX
2011  	MOVQ (CX)(R9*8), R9
2012  
2013  sequenceDecs_decodeSync_amd64_skip_update:
2014  	// Adjust offset
      	// CX = s, R13 = decoded offset value, AX = extra bit count of the offset.
      	// With more than one extra bit the value is used as is: the slots at
      	// 144(s) and 152(s) move down to 152(s) and 160(s) (one 16-byte load and
      	// store) and the new offset goes into 144(s).
2015  	MOVQ   s+0(FP), CX
2016  	MOVQ   8(SP), R13
2017  	CMPQ   AX, $0x01
2018  	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
2019  	MOVUPS 144(CX), X0
2020  	MOVQ   R13, 144(CX)
2021  	MOVUPS X0, 152(CX)
2022  	JMP    sequenceDecs_decodeSync_amd64_after_adjust
2023  
2024  sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
      	// The value selects one of the stored offsets. With a literal length of
      	// zero the selector is incremented by one first.
2025  	CMPQ 24(SP), $0x00000000
2026  	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
2027  	INCQ R13
2028  	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2029  
2030  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
      	// Selector zero with literals present: reuse 144(s), slots stay untouched.
2031  	TESTQ R13, R13
2032  	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2033  	MOVQ  144(CX), R13
2034  	JMP   sequenceDecs_decodeSync_amd64_after_adjust
2035  
2036  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
      	// R14 = slot[selector], except that selector 3 yields slot[0] - 1
      	// (index forced to 0 and -1 added). A result of zero is replaced by 1.
      	// The JNZ consumes the flags of the ADDQ.
2037  	MOVQ    R13, AX
2038  	XORQ    R14, R14
2039  	MOVQ    $-1, R15
2040  	CMPQ    R13, $0x03
2041  	CMOVQEQ R14, AX
2042  	CMOVQEQ R15, R14
2043  	ADDQ    144(CX)(AX*8), R14
2044  	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
2045  	MOVQ    $0x00000001, R14
2046  
2047  sequenceDecs_decodeSync_amd64_adjust_temp_valid:
      	// Unless the selector is 1, 152(s) moves to 160(s).
2048  	CMPQ R13, $0x01
2049  	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
2050  	MOVQ 152(CX), AX
2051  	MOVQ AX, 160(CX)
2052  
2053  sequenceDecs_decodeSync_amd64_adjust_skip:
      	// 144(s) moves to 152(s) and the chosen offset becomes the new 144(s).
2054  	MOVQ 144(CX), AX
2055  	MOVQ AX, 152(CX)
2056  	MOVQ R14, 144(CX)
2057  	MOVQ R14, R13
2058  
2059  sequenceDecs_decodeSync_amd64_after_adjust:
2060  	MOVQ R13, 8(SP)
2061  
2062  	// Check values
      	// AX = match length, CX = literal length. 256(s) accumulates their sum and
      	// 104(ctx) is reduced by the literal length (a negative result means the
      	// literals ran out). A zero offset is only accepted with a zero match length.
2063  	MOVQ  16(SP), AX
2064  	MOVQ  24(SP), CX
2065  	LEAQ  (AX)(CX*1), R14
2066  	MOVQ  s+0(FP), R15
2067  	ADDQ  R14, 256(R15)
2068  	MOVQ  ctx+16(FP), R14
2069  	SUBQ  CX, 104(R14)
2070  	JS    error_not_enough_literals
2071  	CMPQ  AX, $0x00020002
2072  	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
2073  	TESTQ R13, R13
2074  	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
2075  	TESTQ AX, AX
2076  	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
2077  
2078  sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
      	// For the copy phase: AX = literal length, CX = match offset,
      	// R13 = match length.
2079  	MOVQ 24(SP), AX
2080  	MOVQ 8(SP), CX
2081  	MOVQ 16(SP), R13
2082  
2083  	// Check if we have enough space in s.out
      	// Write pointer + literal length + match length must not exceed the
      	// past-end pointer in 32(SP) (unsigned compare).
2084  	LEAQ (AX)(R13*1), R14
2085  	ADDQ R10, R14
2086  	CMPQ R14, 32(SP)
2087  	JA   error_not_enough_space
2088  
2089  	// Copy literals
2090  	TESTQ AX, AX
2091  	JZ    check_offset
2092  	XORQ  R14, R14
2093  
2094  copy_1:
      	// 16 bytes per iteration until at least AX bytes were moved. The last
      	// chunk may extend past the literal length.
2095  	MOVUPS (R11)(R14*1), X0
2096  	MOVUPS X0, (R10)(R14*1)
2097  	ADDQ   $0x10, R14
2098  	CMPQ   R14, AX
2099  	JB     copy_1
      	// Advance the pointers and the output position by the exact literal length.
2100  	ADDQ   AX, R11
2101  	ADDQ   AX, R10
2102  	ADDQ   AX, R12
2103  
2104  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
2105  check_offset:
      	// Both comparisons are signed (JG).
2106  	MOVQ R12, AX
2107  	ADDQ 40(SP), AX
2108  	CMPQ CX, AX
2109  	JG   error_match_off_too_big
2110  	CMPQ CX, 56(SP)
2111  	JG   error_match_off_too_big
2112  
2113  	// Copy match from history
      	// AX = offset - output position. When that is <= 0 as an unsigned
      	// subtraction (JLS) the match lies entirely in the current output.
      	// Otherwise AX bytes precede the output and are read at 48(SP) - AX.
2114  	MOVQ CX, AX
2115  	SUBQ R12, AX
2116  	JLS  copy_match
2117  	MOVQ 48(SP), R14
2118  	SUBQ AX, R14
      	// A match longer than the history part is split: history first, then the
      	// rest from the current output (copy_all_from_history).
2119  	CMPQ R13, AX
2120  	JG   copy_all_from_history
      	// The whole match comes from history. AX = length - 16, a borrow means
      	// the length is below 16.
2121  	MOVQ R13, AX
2122  	SUBQ $0x10, AX
2123  	JB   copy_4_small
2124  
2125  copy_4_loop:
2126  	MOVUPS (R14), X0
2127  	MOVUPS X0, (R10)
2128  	ADDQ   $0x10, R14
2129  	ADDQ   $0x10, R10
2130  	SUBQ   $0x10, AX
2131  	JAE    copy_4_loop
      	// Exact tail: move both pointers to the end of the range and copy the
      	// final 16 bytes ending there, overlapping the previous chunk.
2132  	LEAQ   16(R14)(AX*1), R14
2133  	LEAQ   16(R10)(AX*1), R10
2134  	MOVUPS -16(R14), X0
2135  	MOVUPS X0, -16(R10)
2136  	JMP    copy_4_end
2137  
2138  copy_4_small:
      	// Lengths below 16. There is no 1- or 2-byte case on this path.
      	// NOTE(review): presumably match lengths below 3 never reach here - confirm.
2139  	CMPQ R13, $0x03
2140  	JE   copy_4_move_3
2141  	CMPQ R13, $0x08
2142  	JB   copy_4_move_4through7
2143  	JMP  copy_4_move_8through16
2144  
2145  copy_4_move_3:
2146  	MOVW (R14), AX
2147  	MOVB 2(R14), CL
2148  	MOVW AX, (R10)
2149  	MOVB CL, 2(R10)
2150  	ADDQ R13, R14
2151  	ADDQ R13, R10
2152  	JMP  copy_4_end
2153  
2154  copy_4_move_4through7:
      	// Two 4-byte moves, one at the start and one ending at the last byte.
2155  	MOVL (R14), AX
2156  	MOVL -4(R14)(R13*1), CX
2157  	MOVL AX, (R10)
2158  	MOVL CX, -4(R10)(R13*1)
2159  	ADDQ R13, R14
2160  	ADDQ R13, R10
2161  	JMP  copy_4_end
2162  
2163  copy_4_move_8through16:
      	// Two 8-byte moves, one at the start and one ending at the last byte.
2164  	MOVQ (R14), AX
2165  	MOVQ -8(R14)(R13*1), CX
2166  	MOVQ AX, (R10)
2167  	MOVQ CX, -8(R10)(R13*1)
2168  	ADDQ R13, R14
2169  	ADDQ R13, R10
2170  
2171  copy_4_end:
      	// Match fully served from history. The second JMP below can never
      	// execute because the JMP before it is unconditional.
2172  	ADDQ R13, R12
2173  	JMP  handle_loop
2174  	JMP loop_finished
2175  
2176  copy_all_from_history:
      	// Copy exactly AX bytes (the part of the match that lies in history).
2177  	MOVQ AX, R15
2178  	SUBQ $0x10, R15
2179  	JB   copy_5_small
2180  
2181  copy_5_loop:
2182  	MOVUPS (R14), X0
2183  	MOVUPS X0, (R10)
2184  	ADDQ   $0x10, R14
2185  	ADDQ   $0x10, R10
2186  	SUBQ   $0x10, R15
2187  	JAE    copy_5_loop
      	// Exact tail: final 16 bytes ending at the last byte of the range.
2188  	LEAQ   16(R14)(R15*1), R14
2189  	LEAQ   16(R10)(R15*1), R10
2190  	MOVUPS -16(R14), X0
2191  	MOVUPS X0, -16(R10)
2192  	JMP    copy_5_end
2193  
2194  copy_5_small:
      	// The flags of the first CMPQ serve both JE (length 3) and JB (length 1 or 2).
2195  	CMPQ AX, $0x03
2196  	JE   copy_5_move_3
2197  	JB   copy_5_move_1or2
2198  	CMPQ AX, $0x08
2199  	JB   copy_5_move_4through7
2200  	JMP  copy_5_move_8through16
2201  
2202  copy_5_move_1or2:
      	// First and last byte (the same byte when the length is 1).
2203  	MOVB (R14), R15
2204  	MOVB -1(R14)(AX*1), BP
2205  	MOVB R15, (R10)
2206  	MOVB BP, -1(R10)(AX*1)
2207  	ADDQ AX, R14
2208  	ADDQ AX, R10
2209  	JMP  copy_5_end
2210  
2211  copy_5_move_3:
2212  	MOVW (R14), R15
2213  	MOVB 2(R14), BP
2214  	MOVW R15, (R10)
2215  	MOVB BP, 2(R10)
2216  	ADDQ AX, R14
2217  	ADDQ AX, R10
2218  	JMP  copy_5_end
2219  
2220  copy_5_move_4through7:
2221  	MOVL (R14), R15
2222  	MOVL -4(R14)(AX*1), BP
2223  	MOVL R15, (R10)
2224  	MOVL BP, -4(R10)(AX*1)
2225  	ADDQ AX, R14
2226  	ADDQ AX, R10
2227  	JMP  copy_5_end
2228  
2229  copy_5_move_8through16:
2230  	MOVQ (R14), R15
2231  	MOVQ -8(R14)(AX*1), BP
2232  	MOVQ R15, (R10)
2233  	MOVQ BP, -8(R10)(AX*1)
2234  	ADDQ AX, R14
2235  	ADDQ AX, R10
2236  
2237  copy_5_end:
      	// History part done: advance the output position and reduce the match
      	// length by the bytes already copied, then fall through for the rest.
2238  	ADDQ AX, R12
2239  	SUBQ AX, R13
2240  
2241  	// Copy match from the current buffer
2242  copy_match:
      	// AX = source pointer = write pointer - match offset.
2243  	MOVQ R10, AX
2244  	SUBQ CX, AX
2245  
2246  	// ml <= mo
2247  	CMPQ R13, CX
2248  	JA   copy_overlapping_match
2249  
2250  	// Copy non-overlapping match
      	// R10 is advanced by the exact length up front, CX is reused as the
      	// running destination for the chunked copy.
2251  	ADDQ R13, R12
2252  	MOVQ R10, CX
2253  	ADDQ R13, R10
2254  
2255  copy_2:
      	// 16 bytes per iteration while the remaining count stays above zero
      	// (JHI: no borrow and not zero). The last chunk may extend past the match.
2256  	MOVUPS (AX), X0
2257  	MOVUPS X0, (CX)
2258  	ADDQ   $0x10, AX
2259  	ADDQ   $0x10, CX
2260  	SUBQ   $0x10, R13
2261  	JHI    copy_2
2262  	JMP    handle_loop
2263  
2264  	// Copy overlapping match
2265  copy_overlapping_match:
2266  	ADDQ R13, R12
2267  
2268  copy_slow_3:
      	// Byte-by-byte so that bytes written earlier in this loop can be read
      	// again as source when the match length exceeds the offset.
2269  	MOVB (AX), CL
2270  	MOVB CL, (R10)
2271  	INCQ AX
2272  	INCQ R10
2273  	DECQ R13
2274  	JNZ  copy_slow_3
2275  
2276  handle_loop:
      	// One sequence done: decrement 96(ctx) and loop while it is not negative.
2277  	MOVQ ctx+16(FP), AX
2278  	DECQ 96(AX)
2279  	JNS  sequenceDecs_decodeSync_amd64_main_loop
2280  
2281  loop_finished:
      	// Write the bit reader state back: value, consumed bit count (low byte)
      	// and remaining byte count.
2282  	MOVQ br+8(FP), AX
2283  	MOVQ DX, 24(AX)
2284  	MOVB BL, 40(AX)
2285  	MOVQ SI, 32(AX)
2286  
2287  	// Update the context
      	// 136(ctx) = output position, 168(ctx) = literals pointer minus its
      	// starting value 144(ctx).
2288  	MOVQ ctx+16(FP), AX
2289  	MOVQ R12, 136(AX)
2290  	MOVQ 144(AX), CX
2291  	SUBQ CX, R11
2292  	MOVQ R11, 168(AX)
2293  
2294  	// Return success
2295  	MOVQ $0x00000000, ret+24(FP)
2296  	RET
2297  
2298  	// Return with match length error
2299  sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
      	// The offending match length is reported in 216(ctx).
2300  	MOVQ 16(SP), AX
2301  	MOVQ ctx+16(FP), CX
2302  	MOVQ AX, 216(CX)
2303  	MOVQ $0x00000001, ret+24(FP)
2304  	RET
2305  
2306  	// Return with match too long error
2307  sequenceDecs_decodeSync_amd64_error_match_len_too_big:
      	// The offending match length is reported in 216(ctx).
2308  	MOVQ ctx+16(FP), AX
2309  	MOVQ 16(SP), CX
2310  	MOVQ CX, 216(AX)
2311  	MOVQ $0x00000002, ret+24(FP)
2312  	RET
2313  
2314  	// Return with match offset too long error
2315  error_match_off_too_big:
      	// The offending offset is reported in 224(ctx) together with the current
      	// output position in 136(ctx).
2316  	MOVQ ctx+16(FP), AX
2317  	MOVQ 8(SP), CX
2318  	MOVQ CX, 224(AX)
2319  	MOVQ R12, 136(AX)
2320  	MOVQ $0x00000003, ret+24(FP)
2321  	RET
2322  
2323  	// Return with not enough literals error
2324  error_not_enough_literals:
      	// The requested literal length is reported in 208(ctx).
2325  	MOVQ ctx+16(FP), AX
2326  	MOVQ 24(SP), CX
2327  	MOVQ CX, 208(AX)
2328  	MOVQ $0x00000004, ret+24(FP)
2329  	RET
2330  
2331  	// Return with overread error
2332  error_overread:
      	// Nothing is written back to br or ctx on this path.
2333  	MOVQ $0x00000006, ret+24(FP)
2334  	RET
2335  
2336  	// Return with not enough output space error
2337  error_not_enough_space:
      	// Literal length, match length and output position are reported in
      	// 208(ctx), 216(ctx) and 136(ctx).
2338  	MOVQ ctx+16(FP), AX
2339  	MOVQ 24(SP), CX
2340  	MOVQ CX, 208(AX)
2341  	MOVQ 16(SP), CX
2342  	MOVQ CX, 216(AX)
2343  	MOVQ R12, 136(AX)
2344  	MOVQ $0x00000005, ret+24(FP)
2345  	RET
2346  
2347  // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2348  // Requires: BMI, BMI2, CMOV, SSE
2349  TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
2350  	MOVQ    br+8(FP), BX
2351  	MOVQ    24(BX), AX
2352  	MOVBQZX 40(BX), DX
2353  	MOVQ    (BX), CX
2354  	MOVQ    32(BX), BX
2355  	ADDQ    BX, CX
2356  	MOVQ    CX, (SP)
2357  	MOVQ    ctx+16(FP), CX
2358  	MOVQ    72(CX), SI
2359  	MOVQ    80(CX), DI
2360  	MOVQ    88(CX), R8
2361  	XORQ    R9, R9
2362  	MOVQ    R9, 8(SP)
2363  	MOVQ    R9, 16(SP)
2364  	MOVQ    R9, 24(SP)
2365  	MOVQ    112(CX), R9
2366  	MOVQ    128(CX), R10
2367  	MOVQ    R10, 32(SP)
2368  	MOVQ    144(CX), R10
2369  	MOVQ    136(CX), R11
2370  	MOVQ    200(CX), R12
2371  	MOVQ    R12, 56(SP)
2372  	MOVQ    176(CX), R12
2373  	MOVQ    R12, 48(SP)
2374  	MOVQ    184(CX), CX
2375  	MOVQ    CX, 40(SP)
2376  	MOVQ    40(SP), CX
2377  	ADDQ    CX, 48(SP)
2378  
2379  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2380  	ADDQ R9, 32(SP)
2381  
2382  	// outBase += outPosition
2383  	ADDQ R11, R9
2384  
2385  sequenceDecs_decodeSync_bmi2_main_loop:
2386  	MOVQ (SP), R12
2387  
2388  	// Fill bitreader to have enough for the offset and match length.
2389  	CMPQ BX, $0x08
2390  	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2391  	MOVQ DX, CX
2392  	SHRQ $0x03, CX
2393  	SUBQ CX, R12
2394  	MOVQ (R12), AX
2395  	SUBQ CX, BX
2396  	ANDQ $0x07, DX
2397  	JMP  sequenceDecs_decodeSync_bmi2_fill_end
2398  
2399  sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
2400  	CMPQ    BX, $0x00
2401  	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
2402  	CMPQ    DX, $0x07
2403  	JLE     sequenceDecs_decodeSync_bmi2_fill_end
2404  	SHLQ    $0x08, AX
2405  	SUBQ    $0x01, R12
2406  	SUBQ    $0x01, BX
2407  	SUBQ    $0x08, DX
2408  	MOVBQZX (R12), CX
2409  	ORQ     CX, AX
2410  	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2411  
2412  sequenceDecs_decodeSync_bmi2_fill_check_overread:
2413  	CMPQ DX, $0x40
2414  	JA   error_overread
2415  
2416  sequenceDecs_decodeSync_bmi2_fill_end:
2417  	// Update offset
2418  	MOVQ   $0x00000808, CX
2419  	BEXTRQ CX, R8, R13
2420  	MOVQ   AX, R14
2421  	LEAQ   (DX)(R13*1), CX
2422  	ROLQ   CL, R14
2423  	BZHIQ  R13, R14, R14
2424  	MOVQ   CX, DX
2425  	MOVQ   R8, CX
2426  	SHRQ   $0x20, CX
2427  	ADDQ   R14, CX
2428  	MOVQ   CX, 8(SP)
2429  
2430  	// Update match length
2431  	MOVQ   $0x00000808, CX
2432  	BEXTRQ CX, DI, R13
2433  	MOVQ   AX, R14
2434  	LEAQ   (DX)(R13*1), CX
2435  	ROLQ   CL, R14
2436  	BZHIQ  R13, R14, R14
2437  	MOVQ   CX, DX
2438  	MOVQ   DI, CX
2439  	SHRQ   $0x20, CX
2440  	ADDQ   R14, CX
2441  	MOVQ   CX, 16(SP)
2442  
2443  	// Fill bitreader to have enough for the remaining
2444  	CMPQ BX, $0x08
2445  	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2446  	MOVQ DX, CX
2447  	SHRQ $0x03, CX
2448  	SUBQ CX, R12
2449  	MOVQ (R12), AX
2450  	SUBQ CX, BX
2451  	ANDQ $0x07, DX
2452  	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
2453  
2454  sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
2455  	CMPQ    BX, $0x00
2456  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
2457  	CMPQ    DX, $0x07
2458  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
2459  	SHLQ    $0x08, AX
2460  	SUBQ    $0x01, R12
2461  	SUBQ    $0x01, BX
2462  	SUBQ    $0x08, DX
2463  	MOVBQZX (R12), CX
2464  	ORQ     CX, AX
2465  	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2466  
2467  sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
2468  	CMPQ DX, $0x40
2469  	JA   error_overread
2470  
2471  sequenceDecs_decodeSync_bmi2_fill_2_end:
2472  	// Update literal length
2473  	MOVQ   $0x00000808, CX
2474  	BEXTRQ CX, SI, R13
2475  	MOVQ   AX, R14
2476  	LEAQ   (DX)(R13*1), CX
2477  	ROLQ   CL, R14
2478  	BZHIQ  R13, R14, R14
2479  	MOVQ   CX, DX
2480  	MOVQ   SI, CX
2481  	SHRQ   $0x20, CX
2482  	ADDQ   R14, CX
2483  	MOVQ   CX, 24(SP)
2484  
2485  	// Fill bitreader for state updates
2486  	MOVQ    R12, (SP)
2487  	MOVQ    $0x00000808, CX
2488  	BEXTRQ  CX, R8, R12
2489  	MOVQ    ctx+16(FP), CX
2490  	CMPQ    96(CX), $0x00
2491  	JZ      sequenceDecs_decodeSync_bmi2_skip_update
2492  	LEAQ    (SI)(DI*1), R13
2493  	ADDQ    R8, R13
2494  	MOVBQZX R13, R13
2495  	LEAQ    (DX)(R13*1), CX
2496  	MOVQ    AX, R14
2497  	MOVQ    CX, DX
2498  	ROLQ    CL, R14
2499  	BZHIQ   R13, R14, R14
2500  
2501  	// Update Offset State
2502  	BZHIQ R8, R14, CX
2503  	SHRXQ R8, R14, R14
2504  	SHRL  $0x10, R8
2505  	ADDQ  CX, R8
2506  
2507  	// Load ctx.ofTable
2508  	MOVQ ctx+16(FP), CX
2509  	MOVQ 48(CX), CX
2510  	MOVQ (CX)(R8*8), R8
2511  
2512  	// Update Match Length State
2513  	BZHIQ DI, R14, CX
2514  	SHRXQ DI, R14, R14
2515  	SHRL  $0x10, DI
2516  	ADDQ  CX, DI
2517  
2518  	// Load ctx.mlTable
2519  	MOVQ ctx+16(FP), CX
2520  	MOVQ 24(CX), CX
2521  	MOVQ (CX)(DI*8), DI
2522  
2523  	// Update Literal Length State
2524  	BZHIQ SI, R14, CX
2525  	SHRL  $0x10, SI
2526  	ADDQ  CX, SI
2527  
2528  	// Load ctx.llTable
2529  	MOVQ ctx+16(FP), CX
2530  	MOVQ (CX), CX
2531  	MOVQ (CX)(SI*8), SI
2532  
2533  sequenceDecs_decodeSync_bmi2_skip_update:
2534  	// Adjust offset
2535  	MOVQ   s+0(FP), CX
2536  	MOVQ   8(SP), R13
2537  	CMPQ   R12, $0x01
2538  	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
2539  	MOVUPS 144(CX), X0
2540  	MOVQ   R13, 144(CX)
2541  	MOVUPS X0, 152(CX)
2542  	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
2543  
2544  sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
2545  	CMPQ 24(SP), $0x00000000
2546  	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
2547  	INCQ R13
2548  	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2549  
2550  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
2551  	TESTQ R13, R13
2552  	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2553  	MOVQ  144(CX), R13
2554  	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
2555  
2556  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
2557  	MOVQ    R13, R12
2558  	XORQ    R14, R14
2559  	MOVQ    $-1, R15
2560  	CMPQ    R13, $0x03
2561  	CMOVQEQ R14, R12
2562  	CMOVQEQ R15, R14
2563  	ADDQ    144(CX)(R12*8), R14
2564  	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
2565  	MOVQ    $0x00000001, R14
2566  
2567  sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
2568  	CMPQ R13, $0x01
2569  	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
2570  	MOVQ 152(CX), R12
2571  	MOVQ R12, 160(CX)
2572  
2573  sequenceDecs_decodeSync_bmi2_adjust_skip:
2574  	MOVQ 144(CX), R12
2575  	MOVQ R12, 152(CX)
2576  	MOVQ R14, 144(CX)
2577  	MOVQ R14, R13
2578  
2579  sequenceDecs_decodeSync_bmi2_after_adjust:
2580  	MOVQ R13, 8(SP)
2581  
2582  	// Check values
2583  	MOVQ  16(SP), CX
2584  	MOVQ  24(SP), R12
2585  	LEAQ  (CX)(R12*1), R14
2586  	MOVQ  s+0(FP), R15
2587  	ADDQ  R14, 256(R15)
2588  	MOVQ  ctx+16(FP), R14
2589  	SUBQ  R12, 104(R14)
2590  	JS    error_not_enough_literals
2591  	CMPQ  CX, $0x00020002
2592  	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
2593  	TESTQ R13, R13
2594  	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
2595  	TESTQ CX, CX
2596  	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
2597  
2598  sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
2599  	MOVQ 24(SP), CX
2600  	MOVQ 8(SP), R12
2601  	MOVQ 16(SP), R13
2602  
2603  	// Check if we have enough space in s.out
2604  	LEAQ (CX)(R13*1), R14
2605  	ADDQ R9, R14
2606  	CMPQ R14, 32(SP)
2607  	JA   error_not_enough_space
2608  
2609  	// Copy literals
2610  	TESTQ CX, CX
2611  	JZ    check_offset
2612  	XORQ  R14, R14
2613  
2614  copy_1:
2615  	MOVUPS (R10)(R14*1), X0
2616  	MOVUPS X0, (R9)(R14*1)
2617  	ADDQ   $0x10, R14
2618  	CMPQ   R14, CX
2619  	JB     copy_1
2620  	ADDQ   CX, R10
2621  	ADDQ   CX, R9
2622  	ADDQ   CX, R11
2623  
2624  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
2625  check_offset:
2626  	MOVQ R11, CX
2627  	ADDQ 40(SP), CX
2628  	CMPQ R12, CX
2629  	JG   error_match_off_too_big
2630  	CMPQ R12, 56(SP)
2631  	JG   error_match_off_too_big
2632  
2633  	// Copy match from history
2634  	MOVQ R12, CX
2635  	SUBQ R11, CX
2636  	JLS  copy_match
2637  	MOVQ 48(SP), R14
2638  	SUBQ CX, R14
2639  	CMPQ R13, CX
2640  	JG   copy_all_from_history
2641  	MOVQ R13, CX
2642  	SUBQ $0x10, CX
2643  	JB   copy_4_small
2644  
2645  copy_4_loop:
2646  	MOVUPS (R14), X0
2647  	MOVUPS X0, (R9)
2648  	ADDQ   $0x10, R14
2649  	ADDQ   $0x10, R9
2650  	SUBQ   $0x10, CX
2651  	JAE    copy_4_loop
2652  	LEAQ   16(R14)(CX*1), R14
2653  	LEAQ   16(R9)(CX*1), R9
2654  	MOVUPS -16(R14), X0
2655  	MOVUPS X0, -16(R9)
2656  	JMP    copy_4_end
2657  
2658  copy_4_small:
2659  	CMPQ R13, $0x03
2660  	JE   copy_4_move_3
2661  	CMPQ R13, $0x08
2662  	JB   copy_4_move_4through7
2663  	JMP  copy_4_move_8through16
2664  
2665  copy_4_move_3:
2666  	MOVW (R14), CX
2667  	MOVB 2(R14), R12
2668  	MOVW CX, (R9)
2669  	MOVB R12, 2(R9)
2670  	ADDQ R13, R14
2671  	ADDQ R13, R9
2672  	JMP  copy_4_end
2673  
2674  copy_4_move_4through7:
2675  	MOVL (R14), CX
2676  	MOVL -4(R14)(R13*1), R12
2677  	MOVL CX, (R9)
2678  	MOVL R12, -4(R9)(R13*1)
2679  	ADDQ R13, R14
2680  	ADDQ R13, R9
2681  	JMP  copy_4_end
2682  
2683  copy_4_move_8through16:
2684  	MOVQ (R14), CX
2685  	MOVQ -8(R14)(R13*1), R12
2686  	MOVQ CX, (R9)
2687  	MOVQ R12, -8(R9)(R13*1)
2688  	ADDQ R13, R14
2689  	ADDQ R13, R9
2690  
2691  copy_4_end:
2692  	ADDQ R13, R11
2693  	JMP  handle_loop
2694  	JMP loop_finished
2695  
2696  copy_all_from_history:
2697  	MOVQ CX, R15
2698  	SUBQ $0x10, R15
2699  	JB   copy_5_small
2700  
2701  copy_5_loop:
2702  	MOVUPS (R14), X0
2703  	MOVUPS X0, (R9)
2704  	ADDQ   $0x10, R14
2705  	ADDQ   $0x10, R9
2706  	SUBQ   $0x10, R15
2707  	JAE    copy_5_loop
2708  	LEAQ   16(R14)(R15*1), R14
2709  	LEAQ   16(R9)(R15*1), R9
2710  	MOVUPS -16(R14), X0
2711  	MOVUPS X0, -16(R9)
2712  	JMP    copy_5_end
2713  
2714  copy_5_small:
2715  	CMPQ CX, $0x03
2716  	JE   copy_5_move_3
2717  	JB   copy_5_move_1or2
2718  	CMPQ CX, $0x08
2719  	JB   copy_5_move_4through7
2720  	JMP  copy_5_move_8through16
2721  
2722  copy_5_move_1or2:
2723  	MOVB (R14), R15
2724  	MOVB -1(R14)(CX*1), BP
2725  	MOVB R15, (R9)
2726  	MOVB BP, -1(R9)(CX*1)
2727  	ADDQ CX, R14
2728  	ADDQ CX, R9
2729  	JMP  copy_5_end
2730  
2731  copy_5_move_3:
2732  	MOVW (R14), R15
2733  	MOVB 2(R14), BP
2734  	MOVW R15, (R9)
2735  	MOVB BP, 2(R9)
2736  	ADDQ CX, R14
2737  	ADDQ CX, R9
2738  	JMP  copy_5_end
2739  
2740  copy_5_move_4through7:
2741  	MOVL (R14), R15
2742  	MOVL -4(R14)(CX*1), BP
2743  	MOVL R15, (R9)
2744  	MOVL BP, -4(R9)(CX*1)
2745  	ADDQ CX, R14
2746  	ADDQ CX, R9
2747  	JMP  copy_5_end
2748  
2749  copy_5_move_8through16:
2750  	MOVQ (R14), R15
2751  	MOVQ -8(R14)(CX*1), BP
2752  	MOVQ R15, (R9)
2753  	MOVQ BP, -8(R9)(CX*1)
2754  	ADDQ CX, R14
2755  	ADDQ CX, R9
2756  
2757  copy_5_end:
2758  	ADDQ CX, R11
2759  	SUBQ CX, R13
2760  
2761  	// Copy match from the current buffer
2762  copy_match:
2763  	MOVQ R9, CX
2764  	SUBQ R12, CX
2765  
2766  	// ml <= mo
2767  	CMPQ R13, R12
2768  	JA   copy_overlapping_match
2769  
2770  	// Copy non-overlapping match
2771  	ADDQ R13, R11
2772  	MOVQ R9, R12
2773  	ADDQ R13, R9
2774  
2775  copy_2:
2776  	MOVUPS (CX), X0
2777  	MOVUPS X0, (R12)
2778  	ADDQ   $0x10, CX
2779  	ADDQ   $0x10, R12
2780  	SUBQ   $0x10, R13
2781  	JHI    copy_2
2782  	JMP    handle_loop
2783  
2784  	// Copy overlapping match
2785  copy_overlapping_match:
2786  	ADDQ R13, R11
2787  
2788  copy_slow_3:
2789  	MOVB (CX), R12
2790  	MOVB R12, (R9)
2791  	INCQ CX
2792  	INCQ R9
2793  	DECQ R13
2794  	JNZ  copy_slow_3
2795  
2796  handle_loop:
2797  	MOVQ ctx+16(FP), CX
2798  	DECQ 96(CX)
2799  	JNS  sequenceDecs_decodeSync_bmi2_main_loop
2800  
2801  loop_finished:
2802  	MOVQ br+8(FP), CX
2803  	MOVQ AX, 24(CX)
2804  	MOVB DL, 40(CX)
2805  	MOVQ BX, 32(CX)
2806  
2807  	// Update the context
2808  	MOVQ ctx+16(FP), AX
2809  	MOVQ R11, 136(AX)
2810  	MOVQ 144(AX), CX
2811  	SUBQ CX, R10
2812  	MOVQ R10, 168(AX)
2813  
2814  	// Return success
2815  	MOVQ $0x00000000, ret+24(FP)
2816  	RET
2817  
2818  	// Return with match length error
2819  sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
2820  	MOVQ 16(SP), AX
2821  	MOVQ ctx+16(FP), CX
2822  	MOVQ AX, 216(CX)
2823  	MOVQ $0x00000001, ret+24(FP)
2824  	RET
2825  
2826  	// Return with match too long error
2827  sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
2828  	MOVQ ctx+16(FP), AX
2829  	MOVQ 16(SP), CX
2830  	MOVQ CX, 216(AX)
2831  	MOVQ $0x00000002, ret+24(FP)
2832  	RET
2833  
2834  	// Return with match offset too long error
2835  error_match_off_too_big:
2836  	MOVQ ctx+16(FP), AX
2837  	MOVQ 8(SP), CX
2838  	MOVQ CX, 224(AX)
2839  	MOVQ R11, 136(AX)
2840  	MOVQ $0x00000003, ret+24(FP)
2841  	RET
2842  
2843  	// Return with not enough literals error
2844  error_not_enough_literals:
2845  	MOVQ ctx+16(FP), AX
2846  	MOVQ 24(SP), CX
2847  	MOVQ CX, 208(AX)
2848  	MOVQ $0x00000004, ret+24(FP)
2849  	RET
2850  
2851  	// Return with overread error
2852  error_overread:
2853  	MOVQ $0x00000006, ret+24(FP)
2854  	RET
2855  
2856  	// Return with not enough output space error
2857  error_not_enough_space:
2858  	MOVQ ctx+16(FP), AX
2859  	MOVQ 24(SP), CX
2860  	MOVQ CX, 208(AX)
2861  	MOVQ 16(SP), CX
2862  	MOVQ CX, 216(AX)
2863  	MOVQ R11, 136(AX)
2864  	MOVQ $0x00000005, ret+24(FP)
2865  	RET
2866  
// sequenceDecs_decodeSync_safe_amd64 decodes sequences (literal length, match
// length, match offset) from the bit stream described by br, which is consumed
// from high addresses towards low addresses, and executes each sequence right
// away: the literals are copied to the output, then the match is copied from
// the history buffer and/or from earlier output. The loop runs until the
// counter at ctx+96 becomes negative.
//
// Return value (ret+24(FP)): 0 on success, otherwise
//   1 = non-zero match length with a zero offset   2 = match length too big
//   3 = match offset too big                       4 = not enough literals
//   5 = not enough output space                    6 = bit stream over-read
//
// Register roles in the main loop:
//   DX  = 64-bit bit buffer        BX  = bits already consumed from DX
//   SI  = offset of the read position from the input base
//   DI, R8, R9 = literal-length, match-length and offset decoder states
//   R10 = output write pointer     R11 = literals read pointer
//   R12 = output position          R13 = input read pointer, later match length
//   AX, CX, R14, R15, BP, X0 = scratch
// Stack frame:
//    0(SP) = input read pointer     8(SP) = match offset (mo)
//   16(SP) = match length (ml)     24(SP) = literal length (ll)
//   32(SP) = end of output capacity 40(SP) = history length
//   48(SP) = end of history        56(SP) = window size
// NOTE(review): field names behind the numeric s/br/ctx offsets are not visible
// here; confirm them against the Go struct definitions.
2867  // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2868  // Requires: CMOV, SSE
2869  TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
2870  	MOVQ    br+8(FP), CX
2871  	MOVQ    24(CX), DX // DX = bit buffer
2872  	MOVBQZX 40(CX), BX // BX = number of bits already consumed from DX
2873  	MOVQ    (CX), AX // AX = input base pointer
2874  	MOVQ    32(CX), SI // SI = offset of the read position from the base
2875  	ADDQ    SI, AX
2876  	MOVQ    AX, (SP) // 0(SP) = input read pointer (base + offset)
2877  	MOVQ    ctx+16(FP), AX
2878  	MOVQ    72(AX), DI // DI = literal-length state
2879  	MOVQ    80(AX), R8 // R8 = match-length state
2880  	MOVQ    88(AX), R9 // R9 = offset state
2881  	XORQ    CX, CX
2882  	MOVQ    CX, 8(SP) // mo = 0
2883  	MOVQ    CX, 16(SP) // ml = 0
2884  	MOVQ    CX, 24(SP) // ll = 0
2885  	MOVQ    112(AX), R10 // R10 = output base pointer
2886  	MOVQ    128(AX), CX // output capacity; turned into an end pointer below
2887  	MOVQ    CX, 32(SP)
2888  	MOVQ    144(AX), R11 // R11 = literals read pointer
2889  	MOVQ    136(AX), R12 // R12 = output position
2890  	MOVQ    200(AX), CX
2891  	MOVQ    CX, 56(SP) // 56(SP) = window size
2892  	MOVQ    176(AX), CX
2893  	MOVQ    CX, 48(SP) // 48(SP) = history base pointer (for now)
2894  	MOVQ    184(AX), AX
2895  	MOVQ    AX, 40(SP) // 40(SP) = history length
2896  	MOVQ    40(SP), AX
2897  	ADDQ    AX, 48(SP) // 48(SP) = history base + length, one past the last history byte
2898  
2899  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2900  	ADDQ R10, 32(SP)
2901  
2902  	// outBase += outPosition
2903  	ADDQ R12, R10
2904  
2905  sequenceDecs_decodeSync_safe_amd64_main_loop:
2906  	MOVQ (SP), R13 // R13 = input read pointer for this iteration
2907  
2908  	// Fill bitreader to have enough for the offset and match length.
2909  	CMPQ SI, $0x08
2910  	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte // fewer than 8 input bytes below the read position: refill one byte at a time
2911  	MOVQ BX, AX
2912  	SHRQ $0x03, AX // AX = whole bytes consumed so far
2913  	SUBQ AX, R13 // step the read pointer back by that many bytes
2914  	MOVQ (R13), DX // reload a full 8-byte bit buffer
2915  	SUBQ AX, SI
2916  	ANDQ $0x07, BX // only the sub-byte part of the bit count remains consumed
2917  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
2918  
2919  sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
2920  	CMPQ    SI, $0x00
2921  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread // no input left
2922  	CMPQ    BX, $0x07
2923  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end // less than one whole byte consumed: nothing to refill
2924  	SHLQ    $0x08, DX // make room for one byte at the low end of the buffer
2925  	SUBQ    $0x01, R13
2926  	SUBQ    $0x01, SI
2927  	SUBQ    $0x08, BX
2928  	MOVBQZX (R13), AX
2929  	ORQ     AX, DX
2930  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2931  
2932  sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
2933  	CMPQ BX, $0x40
2934  	JA   error_overread // more than 64 bits consumed and no input left
2935  
2936  sequenceDecs_decodeSync_safe_amd64_fill_end:
2937  	// Update offset
2938  	MOVQ  R9, AX
2939  	MOVQ  BX, CX
2940  	MOVQ  DX, R14
2941  	SHLQ  CL, R14 // drop the bits already consumed; unread bits are now at the top
2942  	MOVB  AH, CL // CL = bits 8..15 of the state = number of extra bits to read
2943  	SHRQ  $0x20, AX // AX = upper 32 bits of the state = baseline value
2944  	TESTQ CX, CX
2945  	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero // no extra bits: the value is the baseline
2946  	ADDQ  CX, BX // account for the bits being read
2947  	CMPQ  BX, $0x40
2948  	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero // would read past the 64-bit buffer: add nothing
2949  	CMPQ  CX, $0x40
2950  	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
2951  	NEGQ  CX
2952  	SHRQ  CL, R14 // shift count is taken mod 64, so this shifts right by 64-n and keeps the top n bits
2953  	ADDQ  R14, AX
2954  
2955  sequenceDecs_decodeSync_safe_amd64_of_update_zero:
2956  	MOVQ AX, 8(SP) // mo
2957  
2958  	// Update match length
2959  	MOVQ  R8, AX
2960  	MOVQ  BX, CX
2961  	MOVQ  DX, R14
2962  	SHLQ  CL, R14 // drop the bits already consumed
2963  	MOVB  AH, CL // CL = number of extra bits to read
2964  	SHRQ  $0x20, AX // AX = baseline value
2965  	TESTQ CX, CX
2966  	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2967  	ADDQ  CX, BX
2968  	CMPQ  BX, $0x40
2969  	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2970  	CMPQ  CX, $0x40
2971  	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2972  	NEGQ  CX
2973  	SHRQ  CL, R14 // keep the top n unread bits
2974  	ADDQ  R14, AX
2975  
2976  sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
2977  	MOVQ AX, 16(SP) // ml
2978  
2979  	// Fill bitreader to have enough for the remaining
2980  	CMPQ SI, $0x08
2981  	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2982  	MOVQ BX, AX
2983  	SHRQ $0x03, AX // AX = whole bytes consumed so far
2984  	SUBQ AX, R13
2985  	MOVQ (R13), DX // reload a full 8-byte bit buffer
2986  	SUBQ AX, SI
2987  	ANDQ $0x07, BX
2988  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
2989  
2990  sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
2991  	CMPQ    SI, $0x00
2992  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread // no input left
2993  	CMPQ    BX, $0x07
2994  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end // nothing to refill
2995  	SHLQ    $0x08, DX
2996  	SUBQ    $0x01, R13
2997  	SUBQ    $0x01, SI
2998  	SUBQ    $0x08, BX
2999  	MOVBQZX (R13), AX
3000  	ORQ     AX, DX
3001  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
3002  
3003  sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
3004  	CMPQ BX, $0x40
3005  	JA   error_overread // more than 64 bits consumed and no input left
3006  
3007  sequenceDecs_decodeSync_safe_amd64_fill_2_end:
3008  	// Update literal length
3009  	MOVQ  DI, AX
3010  	MOVQ  BX, CX
3011  	MOVQ  DX, R14
3012  	SHLQ  CL, R14 // drop the bits already consumed
3013  	MOVB  AH, CL // CL = number of extra bits to read
3014  	SHRQ  $0x20, AX // AX = baseline value
3015  	TESTQ CX, CX
3016  	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3017  	ADDQ  CX, BX
3018  	CMPQ  BX, $0x40
3019  	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3020  	CMPQ  CX, $0x40
3021  	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3022  	NEGQ  CX
3023  	SHRQ  CL, R14 // keep the top n unread bits
3024  	ADDQ  R14, AX
3025  
3026  sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
3027  	MOVQ AX, 24(SP) // ll
3028  
3029  	// Fill bitreader for state updates
3030  	MOVQ    R13, (SP) // save the input read pointer; R13 is reused below
3031  	MOVQ    R9, AX
3032  	SHRQ    $0x08, AX
3033  	MOVBQZX AL, AX // AX = extra-bit count of the offset state, captured before the state changes
3034  	MOVQ    ctx+16(FP), CX
3035  	CMPQ    96(CX), $0x00
3036  	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update // counter is zero: final iteration, states are not advanced
3037  
3038  	// Update Literal Length State
3039  	MOVBQZX DI, R13 // R13 = low byte of the state = number of bits to read for the next state
3040  	SHRL    $0x10, DI // DI = bits 16..31 of the state (32-bit shift clears the upper half)
3041  	LEAQ    (BX)(R13*1), CX
3042  	MOVQ    DX, R14
3043  	MOVQ    CX, BX // account for the bits being read
3044  	ROLQ    CL, R14 // rotate so the newly read bits sit in the low end
3045  	MOVL    $0x00000001, R15
3046  	MOVB    R13, CL
3047  	SHLL    CL, R15
3048  	DECL    R15 // R15 = (1 << n) - 1
3049  	ANDQ    R15, R14
3050  	ADDQ    R14, DI // DI = index of the next table entry
3051  
3052  	// Load ctx.llTable
3053  	MOVQ ctx+16(FP), CX
3054  	MOVQ (CX), CX
3055  	MOVQ (CX)(DI*8), DI
3056  
3057  	// Update Match Length State
3058  	MOVBQZX R8, R13 // R13 = number of bits to read for the next state
3059  	SHRL    $0x10, R8
3060  	LEAQ    (BX)(R13*1), CX
3061  	MOVQ    DX, R14
3062  	MOVQ    CX, BX
3063  	ROLQ    CL, R14
3064  	MOVL    $0x00000001, R15
3065  	MOVB    R13, CL
3066  	SHLL    CL, R15
3067  	DECL    R15 // R15 = (1 << n) - 1
3068  	ANDQ    R15, R14
3069  	ADDQ    R14, R8 // R8 = index of the next table entry
3070  
3071  	// Load ctx.mlTable
3072  	MOVQ ctx+16(FP), CX
3073  	MOVQ 24(CX), CX
3074  	MOVQ (CX)(R8*8), R8
3075  
3076  	// Update Offset State
3077  	MOVBQZX R9, R13 // R13 = number of bits to read for the next state
3078  	SHRL    $0x10, R9
3079  	LEAQ    (BX)(R13*1), CX
3080  	MOVQ    DX, R14
3081  	MOVQ    CX, BX
3082  	ROLQ    CL, R14
3083  	MOVL    $0x00000001, R15
3084  	MOVB    R13, CL
3085  	SHLL    CL, R15
3086  	DECL    R15 // R15 = (1 << n) - 1
3087  	ANDQ    R15, R14
3088  	ADDQ    R14, R9 // R9 = index of the next table entry
3089  
3090  	// Load ctx.ofTable
3091  	MOVQ ctx+16(FP), CX
3092  	MOVQ 48(CX), CX
3093  	MOVQ (CX)(R9*8), R9
3094  
3095  sequenceDecs_decodeSync_safe_amd64_skip_update:
3096  	// Adjust offset
3097  	MOVQ   s+0(FP), CX
3098  	MOVQ   8(SP), R13 // R13 = decoded mo
3099  	CMPQ   AX, $0x01
3100  	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 // offset read with at most 1 extra bit: handled via the stored offsets
3101  	MOVUPS 144(CX), X0 // X0 = the two qwords at s+144 and s+152 (presumably the two most recent offsets)
3102  	MOVQ   R13, 144(CX) // the new offset becomes the most recent one
3103  	MOVUPS X0, 152(CX) // the previous two move down to s+152 and s+160
3104  	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
3105  
3106  sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
3107  	CMPQ 24(SP), $0x00000000
3108  	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
3109  	INCQ R13 // ll == 0: the offset value is bumped by one
3110  	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3111  
3112  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
3113  	TESTQ R13, R13
3114  	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3115  	MOVQ  144(CX), R13 // mo == 0: reuse the value at s+144; stored offsets stay untouched
3116  	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
3117  
3118  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
3119  	MOVQ    R13, AX
3120  	XORQ    R14, R14
3121  	MOVQ    $-1, R15
3122  	CMPQ    R13, $0x03
3123  	CMOVQEQ R14, AX // mo == 3: use slot 0 ...
3124  	CMOVQEQ R15, R14 // ... and subtract 1 from it
3125  	ADDQ    144(CX)(AX*8), R14 // R14 = stored offset in slot mo, or slot 0 minus 1 when mo == 3
3126  	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
3127  	MOVQ    $0x00000001, R14 // a zero result is replaced by 1
3128  
3129  sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
3130  	CMPQ R13, $0x01
3131  	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip // mo == 1: slot 2 keeps its value
3132  	MOVQ 152(CX), AX
3133  	MOVQ AX, 160(CX) // slot 1 -> slot 2
3134  
3135  sequenceDecs_decodeSync_safe_amd64_adjust_skip:
3136  	MOVQ 144(CX), AX
3137  	MOVQ AX, 152(CX) // slot 0 -> slot 1
3138  	MOVQ R14, 144(CX) // the selected offset becomes slot 0
3139  	MOVQ R14, R13
3140  
3141  sequenceDecs_decodeSync_safe_amd64_after_adjust:
3142  	MOVQ R13, 8(SP) // store the final mo
3143  
3144  	// Check values
3145  	MOVQ  16(SP), AX // AX = ml
3146  	MOVQ  24(SP), CX // CX = ll
3147  	LEAQ  (AX)(CX*1), R14
3148  	MOVQ  s+0(FP), R15
3149  	ADDQ  R14, 256(R15) // add ll + ml to the running total kept at s+256
3150  	MOVQ  ctx+16(FP), R14
3151  	SUBQ  CX, 104(R14) // take ll off the counter at ctx+104 (literals still available)
3152  	JS    error_not_enough_literals // counter went negative
3153  	CMPQ  AX, $0x00020002
3154  	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
3155  	TESTQ R13, R13
3156  	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
3157  	TESTQ AX, AX
3158  	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch // mo == 0 but ml != 0
3159  
3160  sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
3161  	MOVQ 24(SP), AX // AX = ll
3162  	MOVQ 8(SP), CX // CX = mo
3163  	MOVQ 16(SP), R13 // R13 = ml
3164  
3165  	// Check if we have enough space in s.out
3166  	LEAQ (AX)(R13*1), R14
3167  	ADDQ R10, R14 // R14 = write pointer after this sequence
3168  	CMPQ R14, 32(SP)
3169  	JA   error_not_enough_space
3170  
3171  	// Copy literals
3172  	TESTQ AX, AX
3173  	JZ    check_offset
3174  	MOVQ  AX, R14
3175  	SUBQ  $0x10, R14
3176  	JB    copy_1_small // ll < 16
3177  
3178  copy_1_loop:
3179  	MOVUPS (R11), X0
3180  	MOVUPS X0, (R10)
3181  	ADDQ   $0x10, R11
3182  	ADDQ   $0x10, R10
3183  	SUBQ   $0x10, R14
3184  	JAE    copy_1_loop // loop while at least 16 bytes remain
3185  	LEAQ   16(R11)(R14*1), R11 // R14 is now remaining-16 (negative): move both pointers to the exact end
3186  	LEAQ   16(R10)(R14*1), R10
3187  	MOVUPS -16(R11), X0 // copy the last 16 bytes; this may overlap bytes already copied
3188  	MOVUPS X0, -16(R10)
3189  	JMP    copy_1_end
3190  
3191  copy_1_small:
3192  	CMPQ AX, $0x03
3193  	JE   copy_1_move_3
3194  	JB   copy_1_move_1or2
3195  	CMPQ AX, $0x08
3196  	JB   copy_1_move_4through7
3197  	JMP  copy_1_move_8through16
3198  
3199  copy_1_move_1or2:
3200  	MOVB (R11), R14 // first byte
3201  	MOVB -1(R11)(AX*1), R15 // last byte (the same byte when ll == 1)
3202  	MOVB R14, (R10)
3203  	MOVB R15, -1(R10)(AX*1)
3204  	ADDQ AX, R11
3205  	ADDQ AX, R10
3206  	JMP  copy_1_end
3207  
3208  copy_1_move_3:
3209  	MOVW (R11), R14
3210  	MOVB 2(R11), R15
3211  	MOVW R14, (R10)
3212  	MOVB R15, 2(R10)
3213  	ADDQ AX, R11
3214  	ADDQ AX, R10
3215  	JMP  copy_1_end
3216  
3217  copy_1_move_4through7:
3218  	MOVL (R11), R14 // first 4 bytes
3219  	MOVL -4(R11)(AX*1), R15 // last 4 bytes, overlapping the first load
3220  	MOVL R14, (R10)
3221  	MOVL R15, -4(R10)(AX*1)
3222  	ADDQ AX, R11
3223  	ADDQ AX, R10
3224  	JMP  copy_1_end
3225  
3226  copy_1_move_8through16:
3227  	MOVQ (R11), R14 // first 8 bytes
3228  	MOVQ -8(R11)(AX*1), R15 // last 8 bytes, overlapping the first load
3229  	MOVQ R14, (R10)
3230  	MOVQ R15, -8(R10)(AX*1)
3231  	ADDQ AX, R11
3232  	ADDQ AX, R10
3233  
3234  copy_1_end:
3235  	ADDQ AX, R12 // outPosition += ll
3236  
3237  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
3238  check_offset:
3239  	MOVQ R12, AX
3240  	ADDQ 40(SP), AX // AX = output position + history length
3241  	CMPQ CX, AX
3242  	JG   error_match_off_too_big
3243  	CMPQ CX, 56(SP)
3244  	JG   error_match_off_too_big
3245  
3246  	// Copy match from history
3247  	MOVQ CX, AX
3248  	SUBQ R12, AX // AX = mo - output position = bytes of the match that lie in history
3249  	JLS  copy_match // mo <= output position: the match lies entirely in the output
3250  	MOVQ 48(SP), R14
3251  	SUBQ AX, R14 // R14 = start of the match inside history
3252  	CMPQ R13, AX
3253  	JG   copy_all_from_history // ml exceeds what history holds: take that part, then continue from the output
3254  	MOVQ R13, AX
3255  	SUBQ $0x10, AX
3256  	JB   copy_4_small // ml < 16
3257  
3258  copy_4_loop:
3259  	MOVUPS (R14), X0
3260  	MOVUPS X0, (R10)
3261  	ADDQ   $0x10, R14
3262  	ADDQ   $0x10, R10
3263  	SUBQ   $0x10, AX
3264  	JAE    copy_4_loop // loop while at least 16 bytes remain
3265  	LEAQ   16(R14)(AX*1), R14 // move both pointers to the exact end
3266  	LEAQ   16(R10)(AX*1), R10
3267  	MOVUPS -16(R14), X0 // copy the last 16 bytes; this may overlap bytes already copied
3268  	MOVUPS X0, -16(R10)
3269  	JMP    copy_4_end
3270  
3271  copy_4_small:
3272  	CMPQ R13, $0x03
3273  	JE   copy_4_move_3
3274  	CMPQ R13, $0x08
3275  	JB   copy_4_move_4through7 // NOTE(review): no 1-or-2 byte case here; appears to assume ml >= 3 - confirm
3276  	JMP  copy_4_move_8through16
3277  
3278  copy_4_move_3:
3279  	MOVW (R14), AX
3280  	MOVB 2(R14), CL
3281  	MOVW AX, (R10)
3282  	MOVB CL, 2(R10)
3283  	ADDQ R13, R14
3284  	ADDQ R13, R10
3285  	JMP  copy_4_end
3286  
3287  copy_4_move_4through7:
3288  	MOVL (R14), AX // first 4 bytes
3289  	MOVL -4(R14)(R13*1), CX // last 4 bytes, overlapping the first load
3290  	MOVL AX, (R10)
3291  	MOVL CX, -4(R10)(R13*1)
3292  	ADDQ R13, R14
3293  	ADDQ R13, R10
3294  	JMP  copy_4_end
3295  
3296  copy_4_move_8through16:
3297  	MOVQ (R14), AX // first 8 bytes
3298  	MOVQ -8(R14)(R13*1), CX // last 8 bytes, overlapping the first load
3299  	MOVQ AX, (R10)
3300  	MOVQ CX, -8(R10)(R13*1)
3301  	ADDQ R13, R14
3302  	ADDQ R13, R10
3303  
3304  copy_4_end:
3305  	ADDQ R13, R12 // outPosition += ml
3306  	JMP  handle_loop
3307  	JMP loop_finished // not reached: the jump above is unconditional
3308  
3309  copy_all_from_history:
3310  	MOVQ AX, R15 // AX = number of bytes to take from history
3311  	SUBQ $0x10, R15
3312  	JB   copy_5_small // fewer than 16 bytes
3313  
3314  copy_5_loop:
3315  	MOVUPS (R14), X0
3316  	MOVUPS X0, (R10)
3317  	ADDQ   $0x10, R14
3318  	ADDQ   $0x10, R10
3319  	SUBQ   $0x10, R15
3320  	JAE    copy_5_loop // loop while at least 16 bytes remain
3321  	LEAQ   16(R14)(R15*1), R14 // move both pointers to the exact end
3322  	LEAQ   16(R10)(R15*1), R10
3323  	MOVUPS -16(R14), X0 // copy the last 16 bytes; this may overlap bytes already copied
3324  	MOVUPS X0, -16(R10)
3325  	JMP    copy_5_end
3326  
3327  copy_5_small:
3328  	CMPQ AX, $0x03
3329  	JE   copy_5_move_3
3330  	JB   copy_5_move_1or2
3331  	CMPQ AX, $0x08
3332  	JB   copy_5_move_4through7
3333  	JMP  copy_5_move_8through16
3334  
3335  copy_5_move_1or2:
3336  	MOVB (R14), R15 // first byte
3337  	MOVB -1(R14)(AX*1), BP // last byte (the same byte when the length is 1)
3338  	MOVB R15, (R10)
3339  	MOVB BP, -1(R10)(AX*1)
3340  	ADDQ AX, R14
3341  	ADDQ AX, R10
3342  	JMP  copy_5_end
3343  
3344  copy_5_move_3:
3345  	MOVW (R14), R15
3346  	MOVB 2(R14), BP
3347  	MOVW R15, (R10)
3348  	MOVB BP, 2(R10)
3349  	ADDQ AX, R14
3350  	ADDQ AX, R10
3351  	JMP  copy_5_end
3352  
3353  copy_5_move_4through7:
3354  	MOVL (R14), R15 // first 4 bytes
3355  	MOVL -4(R14)(AX*1), BP // last 4 bytes, overlapping the first load
3356  	MOVL R15, (R10)
3357  	MOVL BP, -4(R10)(AX*1)
3358  	ADDQ AX, R14
3359  	ADDQ AX, R10
3360  	JMP  copy_5_end
3361  
3362  copy_5_move_8through16:
3363  	MOVQ (R14), R15 // first 8 bytes
3364  	MOVQ -8(R14)(AX*1), BP // last 8 bytes, overlapping the first load
3365  	MOVQ R15, (R10)
3366  	MOVQ BP, -8(R10)(AX*1)
3367  	ADDQ AX, R14
3368  	ADDQ AX, R10
3369  
3370  copy_5_end:
3371  	ADDQ AX, R12 // outPosition += bytes taken from history
3372  	SUBQ AX, R13 // ml = bytes still to copy from the output
3373  
3374  	// Copy match from the current buffer
3375  copy_match:
3376  	MOVQ R10, AX
3377  	SUBQ CX, AX // AX = write pointer - mo = source of the match in the output
3378  
3379  	// ml <= mo
3380  	CMPQ R13, CX
3381  	JA   copy_overlapping_match // ml > mo: source and destination overlap
3382  
3383  	// Copy non-overlapping match
3384  	ADDQ R13, R12 // outPosition += ml
3385  	MOVQ R13, CX
3386  	SUBQ $0x10, CX
3387  	JB   copy_2_small // ml < 16
3388  
3389  copy_2_loop:
3390  	MOVUPS (AX), X0
3391  	MOVUPS X0, (R10)
3392  	ADDQ   $0x10, AX
3393  	ADDQ   $0x10, R10
3394  	SUBQ   $0x10, CX
3395  	JAE    copy_2_loop // loop while at least 16 bytes remain
3396  	LEAQ   16(AX)(CX*1), AX // move both pointers to the exact end
3397  	LEAQ   16(R10)(CX*1), R10
3398  	MOVUPS -16(AX), X0 // copy the last 16 bytes; this may overlap bytes already copied
3399  	MOVUPS X0, -16(R10)
3400  	JMP    copy_2_end
3401  
3402  copy_2_small:
3403  	CMPQ R13, $0x03
3404  	JE   copy_2_move_3
3405  	JB   copy_2_move_1or2
3406  	CMPQ R13, $0x08
3407  	JB   copy_2_move_4through7
3408  	JMP  copy_2_move_8through16
3409  
3410  copy_2_move_1or2:
3411  	MOVB (AX), CL // first byte
3412  	MOVB -1(AX)(R13*1), R14 // last byte (the same byte when the length is 1)
3413  	MOVB CL, (R10)
3414  	MOVB R14, -1(R10)(R13*1)
3415  	ADDQ R13, AX
3416  	ADDQ R13, R10
3417  	JMP  copy_2_end
3418  
3419  copy_2_move_3:
3420  	MOVW (AX), CX
3421  	MOVB 2(AX), R14
3422  	MOVW CX, (R10)
3423  	MOVB R14, 2(R10)
3424  	ADDQ R13, AX
3425  	ADDQ R13, R10
3426  	JMP  copy_2_end
3427  
3428  copy_2_move_4through7:
3429  	MOVL (AX), CX // first 4 bytes
3430  	MOVL -4(AX)(R13*1), R14 // last 4 bytes, overlapping the first load
3431  	MOVL CX, (R10)
3432  	MOVL R14, -4(R10)(R13*1)
3433  	ADDQ R13, AX
3434  	ADDQ R13, R10
3435  	JMP  copy_2_end
3436  
3437  copy_2_move_8through16:
3438  	MOVQ (AX), CX // first 8 bytes
3439  	MOVQ -8(AX)(R13*1), R14 // last 8 bytes, overlapping the first load
3440  	MOVQ CX, (R10)
3441  	MOVQ R14, -8(R10)(R13*1)
3442  	ADDQ R13, AX
3443  	ADDQ R13, R10
3444  
3445  copy_2_end:
3446  	JMP handle_loop
3447  
3448  	// Copy overlapping match
3449  copy_overlapping_match:
3450  	ADDQ R13, R12 // outPosition += ml
3451  
3452  copy_slow_3:
3453  	MOVB (AX), CL // one byte at a time, so bytes written earlier in this loop can be read again
3454  	MOVB CL, (R10)
3455  	INCQ AX
3456  	INCQ R10
3457  	DECQ R13
3458  	JNZ  copy_slow_3
3459  
3460  handle_loop:
3461  	MOVQ ctx+16(FP), AX
3462  	DECQ 96(AX) // one sequence done
3463  	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop // continue while the counter is not negative
3464  
3465  loop_finished:
3466  	MOVQ br+8(FP), AX
3467  	MOVQ DX, 24(AX) // write back the bit buffer
3468  	MOVB BL, 40(AX) // write back the consumed-bit count
3469  	MOVQ SI, 32(AX) // write back the read offset
3470  
3471  	// Update the context
3472  	MOVQ ctx+16(FP), AX
3473  	MOVQ R12, 136(AX) // output position
3474  	MOVQ 144(AX), CX
3475  	SUBQ CX, R11 // literals consumed = current literals pointer - starting pointer at ctx+144
3476  	MOVQ R11, 168(AX)
3477  
3478  	// Return success
3479  	MOVQ $0x00000000, ret+24(FP)
3480  	RET
3481  
3482  	// Return with match length error
3483  sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
3484  	MOVQ 16(SP), AX
3485  	MOVQ ctx+16(FP), CX
3486  	MOVQ AX, 216(CX) // report ml at ctx+216
3487  	MOVQ $0x00000001, ret+24(FP)
3488  	RET
3489  
3490  	// Return with match too long error
3491  sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
3492  	MOVQ ctx+16(FP), AX
3493  	MOVQ 16(SP), CX
3494  	MOVQ CX, 216(AX) // report ml at ctx+216
3495  	MOVQ $0x00000002, ret+24(FP)
3496  	RET
3497  
3498  	// Return with match offset too long error
3499  error_match_off_too_big:
3500  	MOVQ ctx+16(FP), AX
3501  	MOVQ 8(SP), CX
3502  	MOVQ CX, 224(AX) // report mo at ctx+224
3503  	MOVQ R12, 136(AX) // report the output position reached so far
3504  	MOVQ $0x00000003, ret+24(FP)
3505  	RET
3506  
3507  	// Return with not enough literals error
3508  error_not_enough_literals:
3509  	MOVQ ctx+16(FP), AX
3510  	MOVQ 24(SP), CX
3511  	MOVQ CX, 208(AX) // report ll at ctx+208
3512  	MOVQ $0x00000004, ret+24(FP)
3513  	RET
3514  
3515  	// Return with overread error
3516  error_overread:
3517  	MOVQ $0x00000006, ret+24(FP)
3518  	RET
3519  
3520  	// Return with not enough output space error
3521  error_not_enough_space:
3522  	MOVQ ctx+16(FP), AX
3523  	MOVQ 24(SP), CX
3524  	MOVQ CX, 208(AX) // report ll at ctx+208
3525  	MOVQ 16(SP), CX
3526  	MOVQ CX, 216(AX) // report ml at ctx+216
3527  	MOVQ R12, 136(AX) // report the output position reached so far
3528  	MOVQ $0x00000005, ret+24(FP)
3529  	RET
3530  
3531  // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
3532  // Requires: BMI, BMI2, CMOV, SSE
3533  TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
3534  	MOVQ    br+8(FP), BX
3535  	MOVQ    24(BX), AX
3536  	MOVBQZX 40(BX), DX
3537  	MOVQ    (BX), CX
3538  	MOVQ    32(BX), BX
3539  	ADDQ    BX, CX
3540  	MOVQ    CX, (SP)
3541  	MOVQ    ctx+16(FP), CX
3542  	MOVQ    72(CX), SI
3543  	MOVQ    80(CX), DI
3544  	MOVQ    88(CX), R8
3545  	XORQ    R9, R9
3546  	MOVQ    R9, 8(SP)
3547  	MOVQ    R9, 16(SP)
3548  	MOVQ    R9, 24(SP)
3549  	MOVQ    112(CX), R9
3550  	MOVQ    128(CX), R10
3551  	MOVQ    R10, 32(SP)
3552  	MOVQ    144(CX), R10
3553  	MOVQ    136(CX), R11
3554  	MOVQ    200(CX), R12
3555  	MOVQ    R12, 56(SP)
3556  	MOVQ    176(CX), R12
3557  	MOVQ    R12, 48(SP)
3558  	MOVQ    184(CX), CX
3559  	MOVQ    CX, 40(SP)
3560  	MOVQ    40(SP), CX
3561  	ADDQ    CX, 48(SP)
3562  
3563  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
3564  	ADDQ R9, 32(SP)
3565  
3566  	// outBase += outPosition
3567  	ADDQ R11, R9
3568  
3569  sequenceDecs_decodeSync_safe_bmi2_main_loop:
3570  	MOVQ (SP), R12
3571  
3572  	// Fill bitreader to have enough for the offset and match length.
3573  	CMPQ BX, $0x08
3574  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3575  	MOVQ DX, CX
3576  	SHRQ $0x03, CX
3577  	SUBQ CX, R12
3578  	MOVQ (R12), AX
3579  	SUBQ CX, BX
3580  	ANDQ $0x07, DX
3581  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
3582  
3583  sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
3584  	CMPQ    BX, $0x00
3585  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
3586  	CMPQ    DX, $0x07
3587  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
3588  	SHLQ    $0x08, AX
3589  	SUBQ    $0x01, R12
3590  	SUBQ    $0x01, BX
3591  	SUBQ    $0x08, DX
3592  	MOVBQZX (R12), CX
3593  	ORQ     CX, AX
3594  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3595  
3596  sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
3597  	CMPQ DX, $0x40
3598  	JA   error_overread
3599  
3600  sequenceDecs_decodeSync_safe_bmi2_fill_end:
3601  	// Update offset
3602  	MOVQ   $0x00000808, CX
3603  	BEXTRQ CX, R8, R13
3604  	MOVQ   AX, R14
3605  	LEAQ   (DX)(R13*1), CX
3606  	ROLQ   CL, R14
3607  	BZHIQ  R13, R14, R14
3608  	MOVQ   CX, DX
3609  	MOVQ   R8, CX
3610  	SHRQ   $0x20, CX
3611  	ADDQ   R14, CX
3612  	MOVQ   CX, 8(SP)
3613  
3614  	// Update match length
3615  	MOVQ   $0x00000808, CX
3616  	BEXTRQ CX, DI, R13
3617  	MOVQ   AX, R14
3618  	LEAQ   (DX)(R13*1), CX
3619  	ROLQ   CL, R14
3620  	BZHIQ  R13, R14, R14
3621  	MOVQ   CX, DX
3622  	MOVQ   DI, CX
3623  	SHRQ   $0x20, CX
3624  	ADDQ   R14, CX
3625  	MOVQ   CX, 16(SP)
3626  
3627  	// Fill bitreader to have enough for the remaining
3628  	CMPQ BX, $0x08
3629  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3630  	MOVQ DX, CX
3631  	SHRQ $0x03, CX
3632  	SUBQ CX, R12
3633  	MOVQ (R12), AX
3634  	SUBQ CX, BX
3635  	ANDQ $0x07, DX
3636  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3637  
3638  sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
3639  	CMPQ    BX, $0x00
3640  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
3641  	CMPQ    DX, $0x07
3642  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3643  	SHLQ    $0x08, AX
3644  	SUBQ    $0x01, R12
3645  	SUBQ    $0x01, BX
3646  	SUBQ    $0x08, DX
3647  	MOVBQZX (R12), CX
3648  	ORQ     CX, AX
3649  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3650  
3651  sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
3652  	CMPQ DX, $0x40
3653  	JA   error_overread
3654  
3655  sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
3656  	// Update literal length
3657  	MOVQ   $0x00000808, CX
3658  	BEXTRQ CX, SI, R13
3659  	MOVQ   AX, R14
3660  	LEAQ   (DX)(R13*1), CX
3661  	ROLQ   CL, R14
3662  	BZHIQ  R13, R14, R14
3663  	MOVQ   CX, DX
3664  	MOVQ   SI, CX
3665  	SHRQ   $0x20, CX
3666  	ADDQ   R14, CX
3667  	MOVQ   CX, 24(SP)
3668  
3669  	// Fill bitreader for state updates
3670  	MOVQ    R12, (SP)
3671  	MOVQ    $0x00000808, CX
3672  	BEXTRQ  CX, R8, R12
3673  	MOVQ    ctx+16(FP), CX
3674  	CMPQ    96(CX), $0x00
3675  	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
3676  	LEAQ    (SI)(DI*1), R13
3677  	ADDQ    R8, R13
3678  	MOVBQZX R13, R13
3679  	LEAQ    (DX)(R13*1), CX
3680  	MOVQ    AX, R14
3681  	MOVQ    CX, DX
3682  	ROLQ    CL, R14
3683  	BZHIQ   R13, R14, R14
3684  
3685  	// Update Offset State
3686  	BZHIQ R8, R14, CX
3687  	SHRXQ R8, R14, R14
3688  	SHRL  $0x10, R8
3689  	ADDQ  CX, R8
3690  
3691  	// Load ctx.ofTable
3692  	MOVQ ctx+16(FP), CX
3693  	MOVQ 48(CX), CX
3694  	MOVQ (CX)(R8*8), R8
3695  
3696  	// Update Match Length State
3697  	BZHIQ DI, R14, CX
3698  	SHRXQ DI, R14, R14
3699  	SHRL  $0x10, DI
3700  	ADDQ  CX, DI
3701  
3702  	// Load ctx.mlTable
3703  	MOVQ ctx+16(FP), CX
3704  	MOVQ 24(CX), CX
3705  	MOVQ (CX)(DI*8), DI
3706  
3707  	// Update Literal Length State
3708  	BZHIQ SI, R14, CX
3709  	SHRL  $0x10, SI
3710  	ADDQ  CX, SI
3711  
3712  	// Load ctx.llTable
3713  	MOVQ ctx+16(FP), CX
3714  	MOVQ (CX), CX
3715  	MOVQ (CX)(SI*8), SI
3716  
3717  sequenceDecs_decodeSync_safe_bmi2_skip_update:
3718  	// Adjust offset
3719  	MOVQ   s+0(FP), CX
3720  	MOVQ   8(SP), R13
3721  	CMPQ   R12, $0x01
3722  	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
3723  	MOVUPS 144(CX), X0
3724  	MOVQ   R13, 144(CX)
3725  	MOVUPS X0, 152(CX)
3726  	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
3727  
3728  sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
3729  	CMPQ 24(SP), $0x00000000
3730  	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
3731  	INCQ R13
3732  	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3733  
3734  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
3735  	TESTQ R13, R13
3736  	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3737  	MOVQ  144(CX), R13
3738  	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
3739  
3740  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
3741  	MOVQ    R13, R12
3742  	XORQ    R14, R14
3743  	MOVQ    $-1, R15
3744  	CMPQ    R13, $0x03
3745  	CMOVQEQ R14, R12
3746  	CMOVQEQ R15, R14
3747  	ADDQ    144(CX)(R12*8), R14
3748  	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
3749  	MOVQ    $0x00000001, R14
3750  
3751  sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
3752  	CMPQ R13, $0x01
3753  	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
3754  	MOVQ 152(CX), R12
3755  	MOVQ R12, 160(CX)
3756  
3757  sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
3758  	MOVQ 144(CX), R12
3759  	MOVQ R12, 152(CX)
3760  	MOVQ R14, 144(CX)
3761  	MOVQ R14, R13
3762  
3763  sequenceDecs_decodeSync_safe_bmi2_after_adjust:
3764  	MOVQ R13, 8(SP)
3765  
3766  	// Check values
3767  	MOVQ  16(SP), CX
3768  	MOVQ  24(SP), R12
3769  	LEAQ  (CX)(R12*1), R14
3770  	MOVQ  s+0(FP), R15
3771  	ADDQ  R14, 256(R15)
3772  	MOVQ  ctx+16(FP), R14
3773  	SUBQ  R12, 104(R14)
3774  	JS    error_not_enough_literals
3775  	CMPQ  CX, $0x00020002
3776  	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
3777  	TESTQ R13, R13
3778  	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
3779  	TESTQ CX, CX
3780  	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
3781  
3782  sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
3783  	MOVQ 24(SP), CX
3784  	MOVQ 8(SP), R12
3785  	MOVQ 16(SP), R13
3786  
3787  	// Check if we have enough space in s.out
3788  	LEAQ (CX)(R13*1), R14
3789  	ADDQ R9, R14
3790  	CMPQ R14, 32(SP)
3791  	JA   error_not_enough_space
3792  
3793  	// Copy literals
3794  	TESTQ CX, CX
3795  	JZ    check_offset
3796  	MOVQ  CX, R14
3797  	SUBQ  $0x10, R14
3798  	JB    copy_1_small
3799  
3800  copy_1_loop:
3801  	MOVUPS (R10), X0
3802  	MOVUPS X0, (R9)
3803  	ADDQ   $0x10, R10
3804  	ADDQ   $0x10, R9
3805  	SUBQ   $0x10, R14
3806  	JAE    copy_1_loop
3807  	LEAQ   16(R10)(R14*1), R10
3808  	LEAQ   16(R9)(R14*1), R9
3809  	MOVUPS -16(R10), X0
3810  	MOVUPS X0, -16(R9)
3811  	JMP    copy_1_end
3812  
3813  copy_1_small:
3814  	CMPQ CX, $0x03
3815  	JE   copy_1_move_3
3816  	JB   copy_1_move_1or2
3817  	CMPQ CX, $0x08
3818  	JB   copy_1_move_4through7
3819  	JMP  copy_1_move_8through16
3820  
3821  copy_1_move_1or2:
3822  	MOVB (R10), R14
3823  	MOVB -1(R10)(CX*1), R15
3824  	MOVB R14, (R9)
3825  	MOVB R15, -1(R9)(CX*1)
3826  	ADDQ CX, R10
3827  	ADDQ CX, R9
3828  	JMP  copy_1_end
3829  
3830  copy_1_move_3:
3831  	MOVW (R10), R14
3832  	MOVB 2(R10), R15
3833  	MOVW R14, (R9)
3834  	MOVB R15, 2(R9)
3835  	ADDQ CX, R10
3836  	ADDQ CX, R9
3837  	JMP  copy_1_end
3838  
3839  copy_1_move_4through7:
3840  	MOVL (R10), R14
3841  	MOVL -4(R10)(CX*1), R15
3842  	MOVL R14, (R9)
3843  	MOVL R15, -4(R9)(CX*1)
3844  	ADDQ CX, R10
3845  	ADDQ CX, R9
3846  	JMP  copy_1_end
3847  
3848  copy_1_move_8through16:
3849  	MOVQ (R10), R14
3850  	MOVQ -8(R10)(CX*1), R15
3851  	MOVQ R14, (R9)
3852  	MOVQ R15, -8(R9)(CX*1)
3853  	ADDQ CX, R10
3854  	ADDQ CX, R9
3855  
3856  copy_1_end:
3857  	ADDQ CX, R11
3858  
3859  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
3860  check_offset:
3861  	MOVQ R11, CX
3862  	ADDQ 40(SP), CX
3863  	CMPQ R12, CX
3864  	JG   error_match_off_too_big
3865  	CMPQ R12, 56(SP)
3866  	JG   error_match_off_too_big
3867  
3868  	// Copy match from history
3869  	MOVQ R12, CX
3870  	SUBQ R11, CX
3871  	JLS  copy_match
3872  	MOVQ 48(SP), R14
3873  	SUBQ CX, R14
3874  	CMPQ R13, CX
3875  	JG   copy_all_from_history
3876  	MOVQ R13, CX
3877  	SUBQ $0x10, CX
3878  	JB   copy_4_small
3879  
3880  copy_4_loop:
3881  	MOVUPS (R14), X0
3882  	MOVUPS X0, (R9)
3883  	ADDQ   $0x10, R14
3884  	ADDQ   $0x10, R9
3885  	SUBQ   $0x10, CX
3886  	JAE    copy_4_loop
3887  	LEAQ   16(R14)(CX*1), R14
3888  	LEAQ   16(R9)(CX*1), R9
3889  	MOVUPS -16(R14), X0
3890  	MOVUPS X0, -16(R9)
3891  	JMP    copy_4_end
3892  
3893  copy_4_small:
3894  	CMPQ R13, $0x03
3895  	JE   copy_4_move_3
3896  	CMPQ R13, $0x08
3897  	JB   copy_4_move_4through7
3898  	JMP  copy_4_move_8through16
3899  
3900  copy_4_move_3:
3901  	MOVW (R14), CX
3902  	MOVB 2(R14), R12
3903  	MOVW CX, (R9)
3904  	MOVB R12, 2(R9)
3905  	ADDQ R13, R14
3906  	ADDQ R13, R9
3907  	JMP  copy_4_end
3908  
3909  copy_4_move_4through7:
3910  	MOVL (R14), CX
3911  	MOVL -4(R14)(R13*1), R12
3912  	MOVL CX, (R9)
3913  	MOVL R12, -4(R9)(R13*1)
3914  	ADDQ R13, R14
3915  	ADDQ R13, R9
3916  	JMP  copy_4_end
3917  
3918  copy_4_move_8through16:
3919  	MOVQ (R14), CX
3920  	MOVQ -8(R14)(R13*1), R12
3921  	MOVQ CX, (R9)
3922  	MOVQ R12, -8(R9)(R13*1)
3923  	ADDQ R13, R14
3924  	ADDQ R13, R9
3925  
3926  copy_4_end:
3927  	ADDQ R13, R11
3928  	JMP  handle_loop
3929  	JMP loop_finished
3930  
3931  copy_all_from_history:
3932  	MOVQ CX, R15
3933  	SUBQ $0x10, R15
3934  	JB   copy_5_small
3935  
3936  copy_5_loop:
3937  	MOVUPS (R14), X0
3938  	MOVUPS X0, (R9)
3939  	ADDQ   $0x10, R14
3940  	ADDQ   $0x10, R9
3941  	SUBQ   $0x10, R15
3942  	JAE    copy_5_loop
3943  	LEAQ   16(R14)(R15*1), R14
3944  	LEAQ   16(R9)(R15*1), R9
3945  	MOVUPS -16(R14), X0
3946  	MOVUPS X0, -16(R9)
3947  	JMP    copy_5_end
3948  
3949  copy_5_small:
3950  	CMPQ CX, $0x03
3951  	JE   copy_5_move_3
3952  	JB   copy_5_move_1or2
3953  	CMPQ CX, $0x08
3954  	JB   copy_5_move_4through7
3955  	JMP  copy_5_move_8through16
3956  
3957  copy_5_move_1or2:
3958  	MOVB (R14), R15
3959  	MOVB -1(R14)(CX*1), BP
3960  	MOVB R15, (R9)
3961  	MOVB BP, -1(R9)(CX*1)
3962  	ADDQ CX, R14
3963  	ADDQ CX, R9
3964  	JMP  copy_5_end
3965  
3966  copy_5_move_3:
3967  	MOVW (R14), R15
3968  	MOVB 2(R14), BP
3969  	MOVW R15, (R9)
3970  	MOVB BP, 2(R9)
3971  	ADDQ CX, R14
3972  	ADDQ CX, R9
3973  	JMP  copy_5_end
3974  
3975  copy_5_move_4through7:
3976  	MOVL (R14), R15
3977  	MOVL -4(R14)(CX*1), BP
3978  	MOVL R15, (R9)
3979  	MOVL BP, -4(R9)(CX*1)
3980  	ADDQ CX, R14
3981  	ADDQ CX, R9
3982  	JMP  copy_5_end
3983  
3984  copy_5_move_8through16:
3985  	MOVQ (R14), R15
3986  	MOVQ -8(R14)(CX*1), BP
3987  	MOVQ R15, (R9)
3988  	MOVQ BP, -8(R9)(CX*1)
3989  	ADDQ CX, R14
3990  	ADDQ CX, R9
3991  
3992  copy_5_end:
3993  	ADDQ CX, R11
3994  	SUBQ CX, R13
3995  
3996  	// Copy match from the current buffer
3997  copy_match:
3998  	MOVQ R9, CX
3999  	SUBQ R12, CX
4000  
4001  	// ml <= mo
4002  	CMPQ R13, R12
4003  	JA   copy_overlapping_match
4004  
4005  	// Copy non-overlapping match
4006  	ADDQ R13, R11
4007  	MOVQ R13, R12
4008  	SUBQ $0x10, R12
4009  	JB   copy_2_small
4010  
4011  copy_2_loop:
4012  	MOVUPS (CX), X0
4013  	MOVUPS X0, (R9)
4014  	ADDQ   $0x10, CX
4015  	ADDQ   $0x10, R9
4016  	SUBQ   $0x10, R12
4017  	JAE    copy_2_loop
4018  	LEAQ   16(CX)(R12*1), CX
4019  	LEAQ   16(R9)(R12*1), R9
4020  	MOVUPS -16(CX), X0
4021  	MOVUPS X0, -16(R9)
4022  	JMP    copy_2_end
4023  
4024  copy_2_small:
4025  	CMPQ R13, $0x03
4026  	JE   copy_2_move_3
4027  	JB   copy_2_move_1or2
4028  	CMPQ R13, $0x08
4029  	JB   copy_2_move_4through7
4030  	JMP  copy_2_move_8through16
4031  
4032  copy_2_move_1or2:
4033  	MOVB (CX), R12
4034  	MOVB -1(CX)(R13*1), R14
4035  	MOVB R12, (R9)
4036  	MOVB R14, -1(R9)(R13*1)
4037  	ADDQ R13, CX
4038  	ADDQ R13, R9
4039  	JMP  copy_2_end
4040  
4041  copy_2_move_3:
4042  	MOVW (CX), R12
4043  	MOVB 2(CX), R14
4044  	MOVW R12, (R9)
4045  	MOVB R14, 2(R9)
4046  	ADDQ R13, CX
4047  	ADDQ R13, R9
4048  	JMP  copy_2_end
4049  
4050  copy_2_move_4through7:
4051  	MOVL (CX), R12
4052  	MOVL -4(CX)(R13*1), R14
4053  	MOVL R12, (R9)
4054  	MOVL R14, -4(R9)(R13*1)
4055  	ADDQ R13, CX
4056  	ADDQ R13, R9
4057  	JMP  copy_2_end
4058  
4059  copy_2_move_8through16:
4060  	MOVQ (CX), R12
4061  	MOVQ -8(CX)(R13*1), R14
4062  	MOVQ R12, (R9)
4063  	MOVQ R14, -8(R9)(R13*1)
4064  	ADDQ R13, CX
4065  	ADDQ R13, R9
4066  
4067  copy_2_end:
4068  	JMP handle_loop
4069  
4070  	// Copy overlapping match
4071  copy_overlapping_match:
4072  	ADDQ R13, R11
4073  
4074  copy_slow_3:
4075  	MOVB (CX), R12
4076  	MOVB R12, (R9)
4077  	INCQ CX
4078  	INCQ R9
4079  	DECQ R13
4080  	JNZ  copy_slow_3
4081  
4082  handle_loop:
4083  	MOVQ ctx+16(FP), CX
4084  	DECQ 96(CX)
4085  	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
4086  
4087  loop_finished:
4088  	MOVQ br+8(FP), CX
4089  	MOVQ AX, 24(CX)
4090  	MOVB DL, 40(CX)
4091  	MOVQ BX, 32(CX)
4092  
4093  	// Update the context
4094  	MOVQ ctx+16(FP), AX
4095  	MOVQ R11, 136(AX)
4096  	MOVQ 144(AX), CX
4097  	SUBQ CX, R10
4098  	MOVQ R10, 168(AX)
4099  
4100  	// Return success
4101  	MOVQ $0x00000000, ret+24(FP)
4102  	RET
4103  
4104  	// Return with match length error
4105  sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
4106  	MOVQ 16(SP), AX
4107  	MOVQ ctx+16(FP), CX
4108  	MOVQ AX, 216(CX)
4109  	MOVQ $0x00000001, ret+24(FP)
4110  	RET
4111  
4112  	// Return with match too long error
4113  sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
4114  	MOVQ ctx+16(FP), AX
4115  	MOVQ 16(SP), CX
4116  	MOVQ CX, 216(AX)
4117  	MOVQ $0x00000002, ret+24(FP)
4118  	RET
4119  
4120  	// Return with match offset too long error
4121  error_match_off_too_big:
4122  	MOVQ ctx+16(FP), AX
4123  	MOVQ 8(SP), CX
4124  	MOVQ CX, 224(AX)
4125  	MOVQ R11, 136(AX)
4126  	MOVQ $0x00000003, ret+24(FP)
4127  	RET
4128  
4129  	// Return with not enough literals error
4130  error_not_enough_literals:
4131  	MOVQ ctx+16(FP), AX
4132  	MOVQ 24(SP), CX
4133  	MOVQ CX, 208(AX)
4134  	MOVQ $0x00000004, ret+24(FP)
4135  	RET
4136  
4137  	// Return with overread error
4138  error_overread:
4139  	MOVQ $0x00000006, ret+24(FP)
4140  	RET
4141  
4142  	// Return with not enough output space error
4143  error_not_enough_space:
4144  	MOVQ ctx+16(FP), AX
4145  	MOVQ 24(SP), CX
4146  	MOVQ CX, 208(AX)
4147  	MOVQ 16(SP), CX
4148  	MOVQ CX, 216(AX)
4149  	MOVQ R11, 136(AX)
4150  	MOVQ $0x00000005, ret+24(FP)
4151  	RET
4152