encode_better.go raw

   1  // Copyright 2016 The Snappy-Go Authors. All rights reserved.
   2  // Copyright (c) 2019 Klaus Post. All rights reserved.
   3  // Use of this source code is governed by a BSD-style
   4  // license that can be found in the LICENSE file.
   5  
   6  package s2
   7  
   8  import (
   9  	"bytes"
  10  	"fmt"
  11  	"math/bits"
  12  )
  13  
  14  // hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
  15  // Preferably h should be a constant and should always be <32.
  16  func hash4(u uint64, h uint8) uint32 {
  17  	const prime4bytes = 2654435761
  18  	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
  19  }
  20  
  21  // hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
  22  // Preferably h should be a constant and should always be <64.
  23  func hash5(u uint64, h uint8) uint32 {
  24  	const prime5bytes = 889523592379
  25  	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
  26  }
  27  
  28  // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
  29  // Preferably h should be a constant and should always be <64.
  30  func hash7(u uint64, h uint8) uint32 {
  31  	const prime7bytes = 58295818150454627
  32  	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
  33  }
  34  
  35  // hash8 returns the hash of u to fit in a hash table with h bits.
  36  // Preferably h should be a constant and should always be <64.
  37  func hash8(u uint64, h uint8) uint32 {
  38  	const prime8bytes = 0xcf1bbcdcb7a56463
  39  	return uint32((u * prime8bytes) >> ((64 - h) & 63))
  40  }
  41  
  42  // encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
  43  // assumes that the varint-encoded length of the decompressed bytes has already
  44  // been written.
  45  //
  46  // It also assumes that:
  47  //
  48  //	len(dst) >= MaxEncodedLen(len(src)) &&
  49  //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
  50  func encodeBlockBetterGo(dst, src []byte) (d int) {
  51  	// sLimit is when to stop looking for offset/length copies. The inputMargin
  52  	// lets us use a fast path for emitLiteral in the main loop, while we are
  53  	// looking for copies.
  54  	sLimit := len(src) - inputMargin
  55  	if len(src) < minNonLiteralBlockSize {
  56  		return 0
  57  	}
  58  
  59  	// Initialize the hash tables.
  60  	const (
  61  		// Long hash matches.
  62  		lTableBits    = 17
  63  		maxLTableSize = 1 << lTableBits
  64  
  65  		// Short hash matches.
  66  		sTableBits    = 14
  67  		maxSTableSize = 1 << sTableBits
  68  	)
  69  
  70  	var lTable [maxLTableSize]uint32
  71  	var sTable [maxSTableSize]uint32
  72  
  73  	// Bail if we can't compress to at least this.
  74  	dstLimit := len(src) - len(src)>>5 - 6
  75  
  76  	// nextEmit is where in src the next emitLiteral should start from.
  77  	nextEmit := 0
  78  
  79  	// The encoded form must start with a literal, as there are no previous
  80  	// bytes to copy, so we start looking for hash matches at s == 1.
  81  	s := 1
  82  	cv := load64(src, s)
  83  
  84  	// We initialize repeat to 0, so we never match on first attempt
  85  	repeat := 0
  86  
  87  	for {
  88  		candidateL := 0
  89  		nextS := 0
  90  		for {
  91  			// Next src position to check
  92  			nextS = s + (s-nextEmit)>>7 + 1
  93  			if nextS > sLimit {
  94  				goto emitRemainder
  95  			}
  96  			hashL := hash7(cv, lTableBits)
  97  			hashS := hash4(cv, sTableBits)
  98  			candidateL = int(lTable[hashL])
  99  			candidateS := int(sTable[hashS])
 100  			lTable[hashL] = uint32(s)
 101  			sTable[hashS] = uint32(s)
 102  
 103  			valLong := load64(src, candidateL)
 104  			valShort := load64(src, candidateS)
 105  
 106  			// If long matches at least 8 bytes, use that.
 107  			if cv == valLong {
 108  				break
 109  			}
 110  			if cv == valShort {
 111  				candidateL = candidateS
 112  				break
 113  			}
 114  
 115  			// Check repeat at offset checkRep.
 116  			const checkRep = 1
 117  			// Minimum length of a repeat. Tested with various values.
 118  			// While 4-5 offers improvements in some, 6 reduces
 119  			// regressions significantly.
 120  			const wantRepeatBytes = 6
 121  			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
 122  			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
 123  				base := s + checkRep
 124  				// Extend back
 125  				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
 126  					i--
 127  					base--
 128  				}
 129  				d += emitLiteral(dst[d:], src[nextEmit:base])
 130  
 131  				// Extend forward
 132  				candidate := s - repeat + wantRepeatBytes + checkRep
 133  				s += wantRepeatBytes + checkRep
 134  				for s < len(src) {
 135  					if len(src)-s < 8 {
 136  						if src[s] == src[candidate] {
 137  							s++
 138  							candidate++
 139  							continue
 140  						}
 141  						break
 142  					}
 143  					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
 144  						s += bits.TrailingZeros64(diff) >> 3
 145  						break
 146  					}
 147  					s += 8
 148  					candidate += 8
 149  				}
 150  				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
 151  				d += emitRepeat(dst[d:], repeat, s-base)
 152  				nextEmit = s
 153  				if s >= sLimit {
 154  					goto emitRemainder
 155  				}
 156  				// Index in-between
 157  				index0 := base + 1
 158  				index1 := s - 2
 159  
 160  				for index0 < index1 {
 161  					cv0 := load64(src, index0)
 162  					cv1 := load64(src, index1)
 163  					lTable[hash7(cv0, lTableBits)] = uint32(index0)
 164  					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
 165  
 166  					lTable[hash7(cv1, lTableBits)] = uint32(index1)
 167  					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
 168  					index0 += 2
 169  					index1 -= 2
 170  				}
 171  
 172  				cv = load64(src, s)
 173  				continue
 174  			}
 175  
 176  			// Long likely matches 7, so take that.
 177  			if uint32(cv) == uint32(valLong) {
 178  				break
 179  			}
 180  
 181  			// Check our short candidate
 182  			if uint32(cv) == uint32(valShort) {
 183  				// Try a long candidate at s+1
 184  				hashL = hash7(cv>>8, lTableBits)
 185  				candidateL = int(lTable[hashL])
 186  				lTable[hashL] = uint32(s + 1)
 187  				if uint32(cv>>8) == load32(src, candidateL) {
 188  					s++
 189  					break
 190  				}
 191  				// Use our short candidate.
 192  				candidateL = candidateS
 193  				break
 194  			}
 195  
 196  			cv = load64(src, nextS)
 197  			s = nextS
 198  		}
 199  
 200  		// Extend backwards
 201  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
 202  			candidateL--
 203  			s--
 204  		}
 205  
 206  		// Bail if we exceed the maximum size.
 207  		if d+(s-nextEmit) > dstLimit {
 208  			return 0
 209  		}
 210  
 211  		base := s
 212  		offset := base - candidateL
 213  
 214  		// Extend the 4-byte match as long as possible.
 215  		s += 4
 216  		candidateL += 4
 217  		for s < len(src) {
 218  			if len(src)-s < 8 {
 219  				if src[s] == src[candidateL] {
 220  					s++
 221  					candidateL++
 222  					continue
 223  				}
 224  				break
 225  			}
 226  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
 227  				s += bits.TrailingZeros64(diff) >> 3
 228  				break
 229  			}
 230  			s += 8
 231  			candidateL += 8
 232  		}
 233  
 234  		if offset > 65535 && s-base <= 5 && repeat != offset {
 235  			// Bail if the match is equal or worse to the encoding.
 236  			s = nextS + 1
 237  			if s >= sLimit {
 238  				goto emitRemainder
 239  			}
 240  			cv = load64(src, s)
 241  			continue
 242  		}
 243  
 244  		d += emitLiteral(dst[d:], src[nextEmit:base])
 245  		if repeat == offset {
 246  			d += emitRepeat(dst[d:], offset, s-base)
 247  		} else {
 248  			d += emitCopy(dst[d:], offset, s-base)
 249  			repeat = offset
 250  		}
 251  
 252  		nextEmit = s
 253  		if s >= sLimit {
 254  			goto emitRemainder
 255  		}
 256  
 257  		if d > dstLimit {
 258  			// Do we have space for more, if not bail.
 259  			return 0
 260  		}
 261  
 262  		// Index short & long
 263  		index0 := base + 1
 264  		index1 := s - 2
 265  
 266  		cv0 := load64(src, index0)
 267  		cv1 := load64(src, index1)
 268  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
 269  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
 270  
 271  		// lTable could be postponed, but very minor difference.
 272  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
 273  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
 274  		index0 += 1
 275  		index1 -= 1
 276  		cv = load64(src, s)
 277  
 278  		// Index large values sparsely in between.
 279  		// We do two starting from different offsets for speed.
 280  		index2 := (index0 + index1 + 1) >> 1
 281  		for index2 < index1 {
 282  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
 283  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
 284  			index0 += 2
 285  			index2 += 2
 286  		}
 287  	}
 288  
 289  emitRemainder:
 290  	if nextEmit < len(src) {
 291  		// Bail if we exceed the maximum size.
 292  		if d+len(src)-nextEmit > dstLimit {
 293  			return 0
 294  		}
 295  		d += emitLiteral(dst[d:], src[nextEmit:])
 296  	}
 297  	return d
 298  }
 299  
 300  // encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
 301  // assumes that the varint-encoded length of the decompressed bytes has already
 302  // been written.
 303  //
 304  // It also assumes that:
 305  //
 306  //	len(dst) >= MaxEncodedLen(len(src)) &&
 307  //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 308  func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
 309  	// sLimit is when to stop looking for offset/length copies. The inputMargin
 310  	// lets us use a fast path for emitLiteral in the main loop, while we are
 311  	// looking for copies.
 312  	sLimit := len(src) - inputMargin
 313  	if len(src) < minNonLiteralBlockSize {
 314  		return 0
 315  	}
 316  
 317  	// Initialize the hash tables.
 318  	const (
 319  		// Long hash matches.
 320  		lTableBits    = 16
 321  		maxLTableSize = 1 << lTableBits
 322  
 323  		// Short hash matches.
 324  		sTableBits    = 14
 325  		maxSTableSize = 1 << sTableBits
 326  	)
 327  
 328  	var lTable [maxLTableSize]uint32
 329  	var sTable [maxSTableSize]uint32
 330  
 331  	// Bail if we can't compress to at least this.
 332  	dstLimit := len(src) - len(src)>>5 - 6
 333  
 334  	// nextEmit is where in src the next emitLiteral should start from.
 335  	nextEmit := 0
 336  
 337  	// The encoded form must start with a literal, as there are no previous
 338  	// bytes to copy, so we start looking for hash matches at s == 1.
 339  	s := 1
 340  	cv := load64(src, s)
 341  
 342  	// We initialize repeat to 0, so we never match on first attempt
 343  	repeat := 0
 344  	const maxSkip = 100
 345  
 346  	for {
 347  		candidateL := 0
 348  		nextS := 0
 349  		for {
 350  			// Next src position to check
 351  			nextS = min(s+(s-nextEmit)>>7+1, s+maxSkip)
 352  
 353  			if nextS > sLimit {
 354  				goto emitRemainder
 355  			}
 356  			hashL := hash7(cv, lTableBits)
 357  			hashS := hash4(cv, sTableBits)
 358  			candidateL = int(lTable[hashL])
 359  			candidateS := int(sTable[hashS])
 360  			lTable[hashL] = uint32(s)
 361  			sTable[hashS] = uint32(s)
 362  
 363  			if uint32(cv) == load32(src, candidateL) {
 364  				break
 365  			}
 366  
 367  			// Check our short candidate
 368  			if uint32(cv) == load32(src, candidateS) {
 369  				// Try a long candidate at s+1
 370  				hashL = hash7(cv>>8, lTableBits)
 371  				candidateL = int(lTable[hashL])
 372  				lTable[hashL] = uint32(s + 1)
 373  				if uint32(cv>>8) == load32(src, candidateL) {
 374  					s++
 375  					break
 376  				}
 377  				// Use our short candidate.
 378  				candidateL = candidateS
 379  				break
 380  			}
 381  
 382  			cv = load64(src, nextS)
 383  			s = nextS
 384  		}
 385  
 386  		// Extend backwards
 387  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
 388  			candidateL--
 389  			s--
 390  		}
 391  
 392  		// Bail if we exceed the maximum size.
 393  		if d+(s-nextEmit) > dstLimit {
 394  			return 0
 395  		}
 396  
 397  		base := s
 398  		offset := base - candidateL
 399  
 400  		// Extend the 4-byte match as long as possible.
 401  		s += 4
 402  		candidateL += 4
 403  		for s < len(src) {
 404  			if len(src)-s < 8 {
 405  				if src[s] == src[candidateL] {
 406  					s++
 407  					candidateL++
 408  					continue
 409  				}
 410  				break
 411  			}
 412  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
 413  				s += bits.TrailingZeros64(diff) >> 3
 414  				break
 415  			}
 416  			s += 8
 417  			candidateL += 8
 418  		}
 419  
 420  		if offset > 65535 && s-base <= 5 && repeat != offset {
 421  			// Bail if the match is equal or worse to the encoding.
 422  			s = nextS + 1
 423  			if s >= sLimit {
 424  				goto emitRemainder
 425  			}
 426  			cv = load64(src, s)
 427  			continue
 428  		}
 429  
 430  		d += emitLiteral(dst[d:], src[nextEmit:base])
 431  		d += emitCopyNoRepeat(dst[d:], offset, s-base)
 432  		repeat = offset
 433  
 434  		nextEmit = s
 435  		if s >= sLimit {
 436  			goto emitRemainder
 437  		}
 438  
 439  		if d > dstLimit {
 440  			// Do we have space for more, if not bail.
 441  			return 0
 442  		}
 443  
 444  		// Index short & long
 445  		index0 := base + 1
 446  		index1 := s - 2
 447  
 448  		cv0 := load64(src, index0)
 449  		cv1 := load64(src, index1)
 450  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
 451  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
 452  
 453  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
 454  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
 455  		index0 += 1
 456  		index1 -= 1
 457  		cv = load64(src, s)
 458  
 459  		// Index large values sparsely in between.
 460  		// We do two starting from different offsets for speed.
 461  		index2 := (index0 + index1 + 1) >> 1
 462  		for index2 < index1 {
 463  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
 464  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
 465  			index0 += 2
 466  			index2 += 2
 467  		}
 468  	}
 469  
 470  emitRemainder:
 471  	if nextEmit < len(src) {
 472  		// Bail if we exceed the maximum size.
 473  		if d+len(src)-nextEmit > dstLimit {
 474  			return 0
 475  		}
 476  		d += emitLiteral(dst[d:], src[nextEmit:])
 477  	}
 478  	return d
 479  }
 480  
 481  func encodeBlockBetterGo64K(dst, src []byte) (d int) {
 482  	// sLimit is when to stop looking for offset/length copies. The inputMargin
 483  	// lets us use a fast path for emitLiteral in the main loop, while we are
 484  	// looking for copies.
 485  	sLimit := len(src) - inputMargin
 486  	if len(src) < minNonLiteralBlockSize {
 487  		return 0
 488  	}
 489  	// Initialize the hash tables.
 490  	// Use smaller tables for smaller blocks
 491  	const (
 492  		// Long hash matches.
 493  		lTableBits    = 16
 494  		maxLTableSize = 1 << lTableBits
 495  
 496  		// Short hash matches.
 497  		sTableBits    = 13
 498  		maxSTableSize = 1 << sTableBits
 499  	)
 500  
 501  	var lTable [maxLTableSize]uint16
 502  	var sTable [maxSTableSize]uint16
 503  
 504  	// Bail if we can't compress to at least this.
 505  	dstLimit := len(src) - len(src)>>5 - 6
 506  
 507  	// nextEmit is where in src the next emitLiteral should start from.
 508  	nextEmit := 0
 509  
 510  	// The encoded form must start with a literal, as there are no previous
 511  	// bytes to copy, so we start looking for hash matches at s == 1.
 512  	s := 1
 513  	cv := load64(src, s)
 514  
 515  	// We initialize repeat to 0, so we never match on first attempt
 516  	repeat := 0
 517  
 518  	for {
 519  		candidateL := 0
 520  		nextS := 0
 521  		for {
 522  			// Next src position to check
 523  			nextS = s + (s-nextEmit)>>6 + 1
 524  			if nextS > sLimit {
 525  				goto emitRemainder
 526  			}
 527  			hashL := hash7(cv, lTableBits)
 528  			hashS := hash4(cv, sTableBits)
 529  			candidateL = int(lTable[hashL])
 530  			candidateS := int(sTable[hashS])
 531  			lTable[hashL] = uint16(s)
 532  			sTable[hashS] = uint16(s)
 533  
 534  			valLong := load64(src, candidateL)
 535  			valShort := load64(src, candidateS)
 536  
 537  			// If long matches at least 8 bytes, use that.
 538  			if cv == valLong {
 539  				break
 540  			}
 541  			if cv == valShort {
 542  				candidateL = candidateS
 543  				break
 544  			}
 545  
 546  			// Check repeat at offset checkRep.
 547  			const checkRep = 1
 548  			// Minimum length of a repeat. Tested with various values.
 549  			// While 4-5 offers improvements in some, 6 reduces
 550  			// regressions significantly.
 551  			const wantRepeatBytes = 6
 552  			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
 553  			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
 554  				base := s + checkRep
 555  				// Extend back
 556  				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
 557  					i--
 558  					base--
 559  				}
 560  				d += emitLiteral(dst[d:], src[nextEmit:base])
 561  
 562  				// Extend forward
 563  				candidate := s - repeat + wantRepeatBytes + checkRep
 564  				s += wantRepeatBytes + checkRep
 565  				for s < len(src) {
 566  					if len(src)-s < 8 {
 567  						if src[s] == src[candidate] {
 568  							s++
 569  							candidate++
 570  							continue
 571  						}
 572  						break
 573  					}
 574  					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
 575  						s += bits.TrailingZeros64(diff) >> 3
 576  						break
 577  					}
 578  					s += 8
 579  					candidate += 8
 580  				}
 581  				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
 582  				d += emitRepeat(dst[d:], repeat, s-base)
 583  				nextEmit = s
 584  				if s >= sLimit {
 585  					goto emitRemainder
 586  				}
 587  				// Index in-between
 588  				index0 := base + 1
 589  				index1 := s - 2
 590  
 591  				for index0 < index1 {
 592  					cv0 := load64(src, index0)
 593  					cv1 := load64(src, index1)
 594  					lTable[hash7(cv0, lTableBits)] = uint16(index0)
 595  					sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
 596  
 597  					lTable[hash7(cv1, lTableBits)] = uint16(index1)
 598  					sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
 599  					index0 += 2
 600  					index1 -= 2
 601  				}
 602  
 603  				cv = load64(src, s)
 604  				continue
 605  			}
 606  
 607  			// Long likely matches 7, so take that.
 608  			if uint32(cv) == uint32(valLong) {
 609  				break
 610  			}
 611  
 612  			// Check our short candidate
 613  			if uint32(cv) == uint32(valShort) {
 614  				// Try a long candidate at s+1
 615  				hashL = hash7(cv>>8, lTableBits)
 616  				candidateL = int(lTable[hashL])
 617  				lTable[hashL] = uint16(s + 1)
 618  				if uint32(cv>>8) == load32(src, candidateL) {
 619  					s++
 620  					break
 621  				}
 622  				// Use our short candidate.
 623  				candidateL = candidateS
 624  				break
 625  			}
 626  
 627  			cv = load64(src, nextS)
 628  			s = nextS
 629  		}
 630  
 631  		// Extend backwards
 632  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
 633  			candidateL--
 634  			s--
 635  		}
 636  
 637  		// Bail if we exceed the maximum size.
 638  		if d+(s-nextEmit) > dstLimit {
 639  			return 0
 640  		}
 641  
 642  		base := s
 643  		offset := base - candidateL
 644  
 645  		// Extend the 4-byte match as long as possible.
 646  		s += 4
 647  		candidateL += 4
 648  		for s < len(src) {
 649  			if len(src)-s < 8 {
 650  				if src[s] == src[candidateL] {
 651  					s++
 652  					candidateL++
 653  					continue
 654  				}
 655  				break
 656  			}
 657  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
 658  				s += bits.TrailingZeros64(diff) >> 3
 659  				break
 660  			}
 661  			s += 8
 662  			candidateL += 8
 663  		}
 664  
 665  		d += emitLiteral(dst[d:], src[nextEmit:base])
 666  		if repeat == offset {
 667  			d += emitRepeat(dst[d:], offset, s-base)
 668  		} else {
 669  			d += emitCopy(dst[d:], offset, s-base)
 670  			repeat = offset
 671  		}
 672  
 673  		nextEmit = s
 674  		if s >= sLimit {
 675  			goto emitRemainder
 676  		}
 677  
 678  		if d > dstLimit {
 679  			// Do we have space for more, if not bail.
 680  			return 0
 681  		}
 682  
 683  		// Index short & long
 684  		index0 := base + 1
 685  		index1 := s - 2
 686  
 687  		cv0 := load64(src, index0)
 688  		cv1 := load64(src, index1)
 689  		lTable[hash7(cv0, lTableBits)] = uint16(index0)
 690  		sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
 691  
 692  		// lTable could be postponed, but very minor difference.
 693  		lTable[hash7(cv1, lTableBits)] = uint16(index1)
 694  		sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
 695  		index0 += 1
 696  		index1 -= 1
 697  		cv = load64(src, s)
 698  
 699  		// Index large values sparsely in between.
 700  		// We do two starting from different offsets for speed.
 701  		index2 := (index0 + index1 + 1) >> 1
 702  		for index2 < index1 {
 703  			lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0)
 704  			lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2)
 705  			index0 += 2
 706  			index2 += 2
 707  		}
 708  	}
 709  
 710  emitRemainder:
 711  	if nextEmit < len(src) {
 712  		// Bail if we exceed the maximum size.
 713  		if d+len(src)-nextEmit > dstLimit {
 714  			return 0
 715  		}
 716  		d += emitLiteral(dst[d:], src[nextEmit:])
 717  	}
 718  	return d
 719  }
 720  
 721  // encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
 722  // assumes that the varint-encoded length of the decompressed bytes has already
 723  // been written.
 724  //
 725  // It also assumes that:
 726  //
 727  //	len(dst) >= MaxEncodedLen(len(src)) &&
 728  //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 729  func encodeBlockBetterSnappyGo64K(dst, src []byte) (d int) {
 730  	// sLimit is when to stop looking for offset/length copies. The inputMargin
 731  	// lets us use a fast path for emitLiteral in the main loop, while we are
 732  	// looking for copies.
 733  	sLimit := len(src) - inputMargin
 734  	if len(src) < minNonLiteralBlockSize {
 735  		return 0
 736  	}
 737  
 738  	// Initialize the hash tables.
 739  	// Use smaller tables for smaller blocks
 740  	const (
 741  		// Long hash matches.
 742  		lTableBits    = 15
 743  		maxLTableSize = 1 << lTableBits
 744  
 745  		// Short hash matches.
 746  		sTableBits    = 13
 747  		maxSTableSize = 1 << sTableBits
 748  	)
 749  
 750  	var lTable [maxLTableSize]uint16
 751  	var sTable [maxSTableSize]uint16
 752  
 753  	// Bail if we can't compress to at least this.
 754  	dstLimit := len(src) - len(src)>>5 - 6
 755  
 756  	// nextEmit is where in src the next emitLiteral should start from.
 757  	nextEmit := 0
 758  
 759  	// The encoded form must start with a literal, as there are no previous
 760  	// bytes to copy, so we start looking for hash matches at s == 1.
 761  	s := 1
 762  	cv := load64(src, s)
 763  
 764  	const maxSkip = 100
 765  
 766  	for {
 767  		candidateL := 0
 768  		nextS := 0
 769  		for {
 770  			// Next src position to check
 771  			nextS = min(s+(s-nextEmit)>>6+1, s+maxSkip)
 772  
 773  			if nextS > sLimit {
 774  				goto emitRemainder
 775  			}
 776  			hashL := hash7(cv, lTableBits)
 777  			hashS := hash4(cv, sTableBits)
 778  			candidateL = int(lTable[hashL])
 779  			candidateS := int(sTable[hashS])
 780  			lTable[hashL] = uint16(s)
 781  			sTable[hashS] = uint16(s)
 782  
 783  			if uint32(cv) == load32(src, candidateL) {
 784  				break
 785  			}
 786  
 787  			// Check our short candidate
 788  			if uint32(cv) == load32(src, candidateS) {
 789  				// Try a long candidate at s+1
 790  				hashL = hash7(cv>>8, lTableBits)
 791  				candidateL = int(lTable[hashL])
 792  				lTable[hashL] = uint16(s + 1)
 793  				if uint32(cv>>8) == load32(src, candidateL) {
 794  					s++
 795  					break
 796  				}
 797  				// Use our short candidate.
 798  				candidateL = candidateS
 799  				break
 800  			}
 801  
 802  			cv = load64(src, nextS)
 803  			s = nextS
 804  		}
 805  
 806  		// Extend backwards
 807  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
 808  			candidateL--
 809  			s--
 810  		}
 811  
 812  		// Bail if we exceed the maximum size.
 813  		if d+(s-nextEmit) > dstLimit {
 814  			return 0
 815  		}
 816  
 817  		base := s
 818  		offset := base - candidateL
 819  
 820  		// Extend the 4-byte match as long as possible.
 821  		s += 4
 822  		candidateL += 4
 823  		for s < len(src) {
 824  			if len(src)-s < 8 {
 825  				if src[s] == src[candidateL] {
 826  					s++
 827  					candidateL++
 828  					continue
 829  				}
 830  				break
 831  			}
 832  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
 833  				s += bits.TrailingZeros64(diff) >> 3
 834  				break
 835  			}
 836  			s += 8
 837  			candidateL += 8
 838  		}
 839  
 840  		d += emitLiteral(dst[d:], src[nextEmit:base])
 841  		d += emitCopyNoRepeat(dst[d:], offset, s-base)
 842  
 843  		nextEmit = s
 844  		if s >= sLimit {
 845  			goto emitRemainder
 846  		}
 847  
 848  		if d > dstLimit {
 849  			// Do we have space for more, if not bail.
 850  			return 0
 851  		}
 852  
 853  		// Index short & long
 854  		index0 := base + 1
 855  		index1 := s - 2
 856  
 857  		cv0 := load64(src, index0)
 858  		cv1 := load64(src, index1)
 859  		lTable[hash7(cv0, lTableBits)] = uint16(index0)
 860  		sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
 861  
 862  		lTable[hash7(cv1, lTableBits)] = uint16(index1)
 863  		sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
 864  		index0 += 1
 865  		index1 -= 1
 866  		cv = load64(src, s)
 867  
 868  		// Index large values sparsely in between.
 869  		// We do two starting from different offsets for speed.
 870  		index2 := (index0 + index1 + 1) >> 1
 871  		for index2 < index1 {
 872  			lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0)
 873  			lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2)
 874  			index0 += 2
 875  			index2 += 2
 876  		}
 877  	}
 878  
 879  emitRemainder:
 880  	if nextEmit < len(src) {
 881  		// Bail if we exceed the maximum size.
 882  		if d+len(src)-nextEmit > dstLimit {
 883  			return 0
 884  		}
 885  		d += emitLiteral(dst[d:], src[nextEmit:])
 886  	}
 887  	return d
 888  }
 889  
 890  // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
 891  // assumes that the varint-encoded length of the decompressed bytes has already
 892  // been written.
 893  //
 894  // It also assumes that:
 895  //
 896  //	len(dst) >= MaxEncodedLen(len(src)) &&
 897  //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 898  func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
 899  	// sLimit is when to stop looking for offset/length copies. The inputMargin
 900  	// lets us use a fast path for emitLiteral in the main loop, while we are
 901  	// looking for copies.
 902  	// Initialize the hash tables.
 903  	const (
 904  		// Long hash matches.
 905  		lTableBits    = 17
 906  		maxLTableSize = 1 << lTableBits
 907  
 908  		// Short hash matches.
 909  		sTableBits    = 14
 910  		maxSTableSize = 1 << sTableBits
 911  
 912  		maxAhead = 8 // maximum bytes ahead without checking sLimit
 913  
 914  		debug = false
 915  	)
 916  
 917  	sLimit := min(len(src)-inputMargin, MaxDictSrcOffset-maxAhead)
 918  	if len(src) < minNonLiteralBlockSize {
 919  		return 0
 920  	}
 921  
 922  	dict.initBetter()
 923  
 924  	var lTable [maxLTableSize]uint32
 925  	var sTable [maxSTableSize]uint32
 926  
 927  	// Bail if we can't compress to at least this.
 928  	dstLimit := len(src) - len(src)>>5 - 6
 929  
 930  	// nextEmit is where in src the next emitLiteral should start from.
 931  	nextEmit := 0
 932  
 933  	// The encoded form must start with a literal, as there are no previous
 934  	// bytes to copy, so we start looking for hash matches at s == 1.
 935  	s := 0
 936  	cv := load64(src, s)
 937  
 938  	// We initialize repeat to 0, so we never match on first attempt
 939  	repeat := len(dict.dict) - dict.repeat
 940  
 941  	// While in dict
 942  searchDict:
 943  	for {
 944  		candidateL := 0
 945  		nextS := 0
 946  		for {
 947  			// Next src position to check
 948  			nextS = s + (s-nextEmit)>>7 + 1
 949  			if nextS > sLimit {
 950  				break searchDict
 951  			}
 952  			hashL := hash7(cv, lTableBits)
 953  			hashS := hash4(cv, sTableBits)
 954  			candidateL = int(lTable[hashL])
 955  			candidateS := int(sTable[hashS])
 956  			dictL := int(dict.betterTableLong[hashL])
 957  			dictS := int(dict.betterTableShort[hashS])
 958  			lTable[hashL] = uint32(s)
 959  			sTable[hashS] = uint32(s)
 960  
 961  			valLong := load64(src, candidateL)
 962  			valShort := load64(src, candidateS)
 963  
 964  			// If long matches at least 8 bytes, use that.
 965  			if s != 0 {
 966  				if cv == valLong {
 967  					goto emitMatch
 968  				}
 969  				if cv == valShort {
 970  					candidateL = candidateS
 971  					goto emitMatch
 972  				}
 973  			}
 974  
 975  			// Check dict repeat.
 976  			if repeat >= s+4 {
 977  				candidate := len(dict.dict) - repeat + s
 978  				if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
 979  					// Extend back
 980  					base := s
 981  					for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
 982  						i--
 983  						base--
 984  					}
 985  					d += emitLiteral(dst[d:], src[nextEmit:base])
 986  					if debug && nextEmit != base {
 987  						fmt.Println("emitted ", base-nextEmit, "literals")
 988  					}
 989  					s += 4
 990  					candidate += 4
 991  					for candidate < len(dict.dict)-8 && s <= len(src)-8 {
 992  						if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
 993  							s += bits.TrailingZeros64(diff) >> 3
 994  							break
 995  						}
 996  						s += 8
 997  						candidate += 8
 998  					}
 999  					d += emitRepeat(dst[d:], repeat, s-base)
1000  					if debug {
1001  						fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
1002  					}
1003  					nextEmit = s
1004  					if s >= sLimit {
1005  						break searchDict
1006  					}
1007  					// Index in-between
1008  					index0 := base + 1
1009  					index1 := s - 2
1010  
1011  					cv = load64(src, s)
1012  					for index0 < index1 {
1013  						cv0 := load64(src, index0)
1014  						cv1 := load64(src, index1)
1015  						lTable[hash7(cv0, lTableBits)] = uint32(index0)
1016  						sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1017  
1018  						lTable[hash7(cv1, lTableBits)] = uint32(index1)
1019  						sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1020  						index0 += 2
1021  						index1 -= 2
1022  					}
1023  					continue
1024  				}
1025  			}
1026  			// Don't try to find match at s==0
1027  			if s == 0 {
1028  				cv = load64(src, nextS)
1029  				s = nextS
1030  				continue
1031  			}
1032  
1033  			// Long likely matches 7, so take that.
1034  			if uint32(cv) == uint32(valLong) {
1035  				goto emitMatch
1036  			}
1037  
1038  			// Long dict...
1039  			if uint32(cv) == load32(dict.dict, dictL) {
1040  				candidateL = dictL
1041  				goto emitDict
1042  			}
1043  
1044  			// Check our short candidate
1045  			if uint32(cv) == uint32(valShort) {
1046  				// Try a long candidate at s+1
1047  				hashL = hash7(cv>>8, lTableBits)
1048  				candidateL = int(lTable[hashL])
1049  				lTable[hashL] = uint32(s + 1)
1050  				if uint32(cv>>8) == load32(src, candidateL) {
1051  					s++
1052  					goto emitMatch
1053  				}
1054  				// Use our short candidate.
1055  				candidateL = candidateS
1056  				goto emitMatch
1057  			}
1058  			if uint32(cv) == load32(dict.dict, dictS) {
1059  				// Try a long candidate at s+1
1060  				hashL = hash7(cv>>8, lTableBits)
1061  				candidateL = int(lTable[hashL])
1062  				lTable[hashL] = uint32(s + 1)
1063  				if uint32(cv>>8) == load32(src, candidateL) {
1064  					s++
1065  					goto emitMatch
1066  				}
1067  				candidateL = dictS
1068  				goto emitDict
1069  			}
1070  			cv = load64(src, nextS)
1071  			s = nextS
1072  		}
1073  	emitDict:
1074  		{
1075  			if debug {
1076  				if load32(dict.dict, candidateL) != load32(src, s) {
1077  					panic("dict emit mismatch")
1078  				}
1079  			}
1080  			// Extend backwards.
1081  			// The top bytes will be rechecked to get the full match.
1082  			for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
1083  				candidateL--
1084  				s--
1085  			}
1086  
1087  			// Bail if we exceed the maximum size.
1088  			if d+(s-nextEmit) > dstLimit {
1089  				return 0
1090  			}
1091  
1092  			// A 4-byte match has been found. We'll later see if more than 4 bytes
1093  			// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
1094  			// them as literal bytes.
1095  
1096  			d += emitLiteral(dst[d:], src[nextEmit:s])
1097  			if debug && nextEmit != s {
1098  				fmt.Println("emitted ", s-nextEmit, "literals")
1099  			}
1100  			{
1101  				// Invariant: we have a 4-byte match at s, and no need to emit any
1102  				// literal bytes prior to s.
1103  				base := s
1104  				offset := s + (len(dict.dict)) - candidateL
1105  
1106  				// Extend the 4-byte match as long as possible.
1107  				s += 4
1108  				candidateL += 4
1109  				for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
1110  					if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
1111  						s += bits.TrailingZeros64(diff) >> 3
1112  						break
1113  					}
1114  					s += 8
1115  					candidateL += 8
1116  				}
1117  
1118  				if repeat == offset {
1119  					if debug {
1120  						fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
1121  					}
1122  					d += emitRepeat(dst[d:], offset, s-base)
1123  				} else {
1124  					if debug {
1125  						fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
1126  					}
1127  					// Matches longer than 64 are split.
1128  					if s <= sLimit || s-base < 8 {
1129  						d += emitCopy(dst[d:], offset, s-base)
1130  					} else {
1131  						// Split to ensure we don't start a copy within next block.
1132  						d += emitCopy(dst[d:], offset, 4)
1133  						d += emitRepeat(dst[d:], offset, s-base-4)
1134  					}
1135  					repeat = offset
1136  				}
1137  				if false {
1138  					// Validate match.
1139  					if s <= candidateL {
1140  						panic("s <= candidate")
1141  					}
1142  					a := src[base:s]
1143  					b := dict.dict[base-repeat : base-repeat+(s-base)]
1144  					if !bytes.Equal(a, b) {
1145  						panic("mismatch")
1146  					}
1147  				}
1148  
1149  				nextEmit = s
1150  				if s >= sLimit {
1151  					break searchDict
1152  				}
1153  
1154  				if d > dstLimit {
1155  					// Do we have space for more, if not bail.
1156  					return 0
1157  				}
1158  
1159  				// Index short & long
1160  				index0 := base + 1
1161  				index1 := s - 2
1162  
1163  				cv0 := load64(src, index0)
1164  				cv1 := load64(src, index1)
1165  				lTable[hash7(cv0, lTableBits)] = uint32(index0)
1166  				sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1167  
1168  				lTable[hash7(cv1, lTableBits)] = uint32(index1)
1169  				sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1170  				index0 += 1
1171  				index1 -= 1
1172  				cv = load64(src, s)
1173  
1174  				// index every second long in between.
1175  				for index0 < index1 {
1176  					lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1177  					lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
1178  					index0 += 2
1179  					index1 -= 2
1180  				}
1181  			}
1182  			continue
1183  		}
1184  	emitMatch:
1185  
1186  		// Extend backwards
1187  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
1188  			candidateL--
1189  			s--
1190  		}
1191  
1192  		// Bail if we exceed the maximum size.
1193  		if d+(s-nextEmit) > dstLimit {
1194  			return 0
1195  		}
1196  
1197  		base := s
1198  		offset := base - candidateL
1199  
1200  		// Extend the 4-byte match as long as possible.
1201  		s += 4
1202  		candidateL += 4
1203  		for s < len(src) {
1204  			if len(src)-s < 8 {
1205  				if src[s] == src[candidateL] {
1206  					s++
1207  					candidateL++
1208  					continue
1209  				}
1210  				break
1211  			}
1212  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
1213  				s += bits.TrailingZeros64(diff) >> 3
1214  				break
1215  			}
1216  			s += 8
1217  			candidateL += 8
1218  		}
1219  
1220  		if offset > 65535 && s-base <= 5 && repeat != offset {
1221  			// Bail if the match is equal or worse to the encoding.
1222  			s = nextS + 1
1223  			if s >= sLimit {
1224  				goto emitRemainder
1225  			}
1226  			cv = load64(src, s)
1227  			continue
1228  		}
1229  
1230  		d += emitLiteral(dst[d:], src[nextEmit:base])
1231  		if debug && nextEmit != s {
1232  			fmt.Println("emitted ", s-nextEmit, "literals")
1233  		}
1234  		if repeat == offset {
1235  			if debug {
1236  				fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
1237  			}
1238  			d += emitRepeat(dst[d:], offset, s-base)
1239  		} else {
1240  			if debug {
1241  				fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
1242  			}
1243  			d += emitCopy(dst[d:], offset, s-base)
1244  			repeat = offset
1245  		}
1246  
1247  		nextEmit = s
1248  		if s >= sLimit {
1249  			goto emitRemainder
1250  		}
1251  
1252  		if d > dstLimit {
1253  			// Do we have space for more, if not bail.
1254  			return 0
1255  		}
1256  
1257  		// Index short & long
1258  		index0 := base + 1
1259  		index1 := s - 2
1260  
1261  		cv0 := load64(src, index0)
1262  		cv1 := load64(src, index1)
1263  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
1264  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1265  
1266  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
1267  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1268  		index0 += 1
1269  		index1 -= 1
1270  		cv = load64(src, s)
1271  
1272  		// Index large values sparsely in between.
1273  		// We do two starting from different offsets for speed.
1274  		index2 := (index0 + index1 + 1) >> 1
1275  		for index2 < index1 {
1276  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1277  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
1278  			index0 += 2
1279  			index2 += 2
1280  		}
1281  	}
1282  
1283  	// Search without dict:
1284  	if repeat > s {
1285  		repeat = 0
1286  	}
1287  
1288  	// No more dict
1289  	sLimit = len(src) - inputMargin
1290  	if s >= sLimit {
1291  		goto emitRemainder
1292  	}
1293  	cv = load64(src, s)
1294  	if debug {
1295  		fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
1296  	}
1297  	for {
1298  		candidateL := 0
1299  		nextS := 0
1300  		for {
1301  			// Next src position to check
1302  			nextS = s + (s-nextEmit)>>7 + 1
1303  			if nextS > sLimit {
1304  				goto emitRemainder
1305  			}
1306  			hashL := hash7(cv, lTableBits)
1307  			hashS := hash4(cv, sTableBits)
1308  			candidateL = int(lTable[hashL])
1309  			candidateS := int(sTable[hashS])
1310  			lTable[hashL] = uint32(s)
1311  			sTable[hashS] = uint32(s)
1312  
1313  			valLong := load64(src, candidateL)
1314  			valShort := load64(src, candidateS)
1315  
1316  			// If long matches at least 8 bytes, use that.
1317  			if cv == valLong {
1318  				break
1319  			}
1320  			if cv == valShort {
1321  				candidateL = candidateS
1322  				break
1323  			}
1324  
1325  			// Check repeat at offset checkRep.
1326  			const checkRep = 1
1327  			// Minimum length of a repeat. Tested with various values.
1328  			// While 4-5 offers improvements in some, 6 reduces
1329  			// regressions significantly.
1330  			const wantRepeatBytes = 6
1331  			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
1332  			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
1333  				base := s + checkRep
1334  				// Extend back
1335  				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
1336  					i--
1337  					base--
1338  				}
1339  				d += emitLiteral(dst[d:], src[nextEmit:base])
1340  
1341  				// Extend forward
1342  				candidate := s - repeat + wantRepeatBytes + checkRep
1343  				s += wantRepeatBytes + checkRep
1344  				for s < len(src) {
1345  					if len(src)-s < 8 {
1346  						if src[s] == src[candidate] {
1347  							s++
1348  							candidate++
1349  							continue
1350  						}
1351  						break
1352  					}
1353  					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
1354  						s += bits.TrailingZeros64(diff) >> 3
1355  						break
1356  					}
1357  					s += 8
1358  					candidate += 8
1359  				}
1360  				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
1361  				d += emitRepeat(dst[d:], repeat, s-base)
1362  				nextEmit = s
1363  				if s >= sLimit {
1364  					goto emitRemainder
1365  				}
1366  				// Index in-between
1367  				index0 := base + 1
1368  				index1 := s - 2
1369  
1370  				for index0 < index1 {
1371  					cv0 := load64(src, index0)
1372  					cv1 := load64(src, index1)
1373  					lTable[hash7(cv0, lTableBits)] = uint32(index0)
1374  					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1375  
1376  					lTable[hash7(cv1, lTableBits)] = uint32(index1)
1377  					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1378  					index0 += 2
1379  					index1 -= 2
1380  				}
1381  
1382  				cv = load64(src, s)
1383  				continue
1384  			}
1385  
1386  			// Long likely matches 7, so take that.
1387  			if uint32(cv) == uint32(valLong) {
1388  				break
1389  			}
1390  
1391  			// Check our short candidate
1392  			if uint32(cv) == uint32(valShort) {
1393  				// Try a long candidate at s+1
1394  				hashL = hash7(cv>>8, lTableBits)
1395  				candidateL = int(lTable[hashL])
1396  				lTable[hashL] = uint32(s + 1)
1397  				if uint32(cv>>8) == load32(src, candidateL) {
1398  					s++
1399  					break
1400  				}
1401  				// Use our short candidate.
1402  				candidateL = candidateS
1403  				break
1404  			}
1405  
1406  			cv = load64(src, nextS)
1407  			s = nextS
1408  		}
1409  
1410  		// Extend backwards
1411  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
1412  			candidateL--
1413  			s--
1414  		}
1415  
1416  		// Bail if we exceed the maximum size.
1417  		if d+(s-nextEmit) > dstLimit {
1418  			return 0
1419  		}
1420  
1421  		base := s
1422  		offset := base - candidateL
1423  
1424  		// Extend the 4-byte match as long as possible.
1425  		s += 4
1426  		candidateL += 4
1427  		for s < len(src) {
1428  			if len(src)-s < 8 {
1429  				if src[s] == src[candidateL] {
1430  					s++
1431  					candidateL++
1432  					continue
1433  				}
1434  				break
1435  			}
1436  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
1437  				s += bits.TrailingZeros64(diff) >> 3
1438  				break
1439  			}
1440  			s += 8
1441  			candidateL += 8
1442  		}
1443  
1444  		if offset > 65535 && s-base <= 5 && repeat != offset {
1445  			// Bail if the match is equal or worse to the encoding.
1446  			s = nextS + 1
1447  			if s >= sLimit {
1448  				goto emitRemainder
1449  			}
1450  			cv = load64(src, s)
1451  			continue
1452  		}
1453  
1454  		d += emitLiteral(dst[d:], src[nextEmit:base])
1455  		if repeat == offset {
1456  			d += emitRepeat(dst[d:], offset, s-base)
1457  		} else {
1458  			d += emitCopy(dst[d:], offset, s-base)
1459  			repeat = offset
1460  		}
1461  
1462  		nextEmit = s
1463  		if s >= sLimit {
1464  			goto emitRemainder
1465  		}
1466  
1467  		if d > dstLimit {
1468  			// Do we have space for more, if not bail.
1469  			return 0
1470  		}
1471  
1472  		// Index short & long
1473  		index0 := base + 1
1474  		index1 := s - 2
1475  
1476  		cv0 := load64(src, index0)
1477  		cv1 := load64(src, index1)
1478  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
1479  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1480  
1481  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
1482  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1483  		index0 += 1
1484  		index1 -= 1
1485  		cv = load64(src, s)
1486  
1487  		// Index large values sparsely in between.
1488  		// We do two starting from different offsets for speed.
1489  		index2 := (index0 + index1 + 1) >> 1
1490  		for index2 < index1 {
1491  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1492  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
1493  			index0 += 2
1494  			index2 += 2
1495  		}
1496  	}
1497  
1498  emitRemainder:
1499  	if nextEmit < len(src) {
1500  		// Bail if we exceed the maximum size.
1501  		if d+len(src)-nextEmit > dstLimit {
1502  			return 0
1503  		}
1504  		d += emitLiteral(dst[d:], src[nextEmit:])
1505  	}
1506  	return d
1507  }
1508