enc_better.go raw

   1  // Copyright 2019+ Klaus Post. All rights reserved.
   2  // License information can be found in the LICENSE file.
   3  // Based on work by Yann Collet, released under BSD License.
   4  
   5  package zstd
   6  
   7  import "fmt"
   8  
   9  const (
  10  	betterLongTableBits = 19                       // Bits used in the long match table
  11  	betterLongTableSize = 1 << betterLongTableBits // Size of the table
  12  	betterLongLen       = 8                        // Bytes used for table hash
  13  
  14  	// Note: Increasing the short table bits or making the hash shorter
  15  	// can actually lead to compression degradation since it will 'steal' more from the
  16  	// long match table and match offsets are quite big.
  17  	// This greatly depends on the type of input.
  18  	betterShortTableBits = 13                        // Bits used in the short match table
  19  	betterShortTableSize = 1 << betterShortTableBits // Size of the table
  20  	betterShortLen       = 5                         // Bytes used for table hash
  21  
  22  	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
  23  	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
  24  
  25  	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
  26  	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
  27  )
  28  
  29  type prevEntry struct {
  30  	offset int32
  31  	prev   int32
  32  }
  33  
  34  // betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
  35  // The long match table contains the previous entry with the same hash,
  36  // effectively making it a "chain" of length 2.
  37  // When we find a long match we choose between the two values and select the longest.
  38  // When we find a short match, after checking the long, we check if we can find a long at n+1
  39  // and that it is longer (lazy matching).
  40  type betterFastEncoder struct {
  41  	fastBase
  42  	table     [betterShortTableSize]tableEntry
  43  	longTable [betterLongTableSize]prevEntry
  44  }
  45  
  46  type betterFastEncoderDict struct {
  47  	betterFastEncoder
  48  	dictTable            []tableEntry
  49  	dictLongTable        []prevEntry
  50  	shortTableShardDirty [betterShortTableShardCnt]bool
  51  	longTableShardDirty  [betterLongTableShardCnt]bool
  52  	allDirty             bool
  53  }
  54  
  55  // Encode improves compression...
  56  func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
  57  	const (
  58  		// Input margin is the number of bytes we read (8)
  59  		// and the maximum we will read ahead (2)
  60  		inputMargin            = 8 + 2
  61  		minNonLiteralBlockSize = 16
  62  	)
  63  
  64  	// Protect against e.cur wraparound.
  65  	for e.cur >= e.bufferReset-int32(len(e.hist)) {
  66  		if len(e.hist) == 0 {
  67  			e.table = [betterShortTableSize]tableEntry{}
  68  			e.longTable = [betterLongTableSize]prevEntry{}
  69  			e.cur = e.maxMatchOff
  70  			break
  71  		}
  72  		// Shift down everything in the table that isn't already too far away.
  73  		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
  74  		for i := range e.table[:] {
  75  			v := e.table[i].offset
  76  			if v < minOff {
  77  				v = 0
  78  			} else {
  79  				v = v - e.cur + e.maxMatchOff
  80  			}
  81  			e.table[i].offset = v
  82  		}
  83  		for i := range e.longTable[:] {
  84  			v := e.longTable[i].offset
  85  			v2 := e.longTable[i].prev
  86  			if v < minOff {
  87  				v = 0
  88  				v2 = 0
  89  			} else {
  90  				v = v - e.cur + e.maxMatchOff
  91  				if v2 < minOff {
  92  					v2 = 0
  93  				} else {
  94  					v2 = v2 - e.cur + e.maxMatchOff
  95  				}
  96  			}
  97  			e.longTable[i] = prevEntry{
  98  				offset: v,
  99  				prev:   v2,
 100  			}
 101  		}
 102  		e.cur = e.maxMatchOff
 103  		break
 104  	}
 105  	// Add block to history
 106  	s := e.addBlock(src)
 107  	blk.size = len(src)
 108  
 109  	// Check RLE first
 110  	if len(src) > zstdMinMatch {
 111  		ml := matchLen(src[1:], src)
 112  		if ml == len(src)-1 {
 113  			blk.literals = append(blk.literals, src[0])
 114  			blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3})
 115  			return
 116  		}
 117  	}
 118  
 119  	if len(src) < minNonLiteralBlockSize {
 120  		blk.extraLits = len(src)
 121  		blk.literals = blk.literals[:len(src)]
 122  		copy(blk.literals, src)
 123  		return
 124  	}
 125  
 126  	// Override src
 127  	src = e.hist
 128  	sLimit := int32(len(src)) - inputMargin
 129  	// stepSize is the number of bytes to skip on every main loop iteration.
 130  	// It should be >= 1.
 131  	const stepSize = 1
 132  
 133  	const kSearchStrength = 9
 134  
 135  	// nextEmit is where in src the next emitLiteral should start from.
 136  	nextEmit := s
 137  	cv := load6432(src, s)
 138  
 139  	// Relative offsets
 140  	offset1 := int32(blk.recentOffsets[0])
 141  	offset2 := int32(blk.recentOffsets[1])
 142  
 143  	addLiterals := func(s *seq, until int32) {
 144  		if until == nextEmit {
 145  			return
 146  		}
 147  		blk.literals = append(blk.literals, src[nextEmit:until]...)
 148  		s.litLen = uint32(until - nextEmit)
 149  	}
 150  	if debugEncoder {
 151  		println("recent offsets:", blk.recentOffsets)
 152  	}
 153  
 154  encodeLoop:
 155  	for {
 156  		var t int32
 157  		// We allow the encoder to optionally turn off repeat offsets across blocks
 158  		canRepeat := len(blk.sequences) > 2
 159  		var matched, index0 int32
 160  
 161  		for {
 162  			if debugAsserts && canRepeat && offset1 == 0 {
 163  				panic("offset0 was 0")
 164  			}
 165  
 166  			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 167  			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 168  			candidateL := e.longTable[nextHashL]
 169  			candidateS := e.table[nextHashS]
 170  
 171  			const repOff = 1
 172  			repIndex := s - offset1 + repOff
 173  			off := s + e.cur
 174  			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
 175  			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
 176  			index0 = s + 1
 177  
 178  			if canRepeat {
 179  				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 180  					// Consider history as well.
 181  					var seq seq
 182  					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 183  
 184  					seq.matchLen = uint32(length - zstdMinMatch)
 185  
 186  					// We might be able to match backwards.
 187  					// Extend as long as we can.
 188  					start := s + repOff
 189  					// We end the search early, so we don't risk 0 literals
 190  					// and have to do special offset treatment.
 191  					startLimit := nextEmit + 1
 192  
 193  					tMin := max(s-e.maxMatchOff, 0)
 194  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 195  						repIndex--
 196  						start--
 197  						seq.matchLen++
 198  					}
 199  					addLiterals(&seq, start)
 200  
 201  					// rep 0
 202  					seq.offset = 1
 203  					if debugSequences {
 204  						println("repeat sequence", seq, "next s:", s)
 205  					}
 206  					blk.sequences = append(blk.sequences, seq)
 207  
 208  					// Index match start+1 (long) -> s - 1
 209  					index0 := s + repOff
 210  					s += length + repOff
 211  
 212  					nextEmit = s
 213  					if s >= sLimit {
 214  						if debugEncoder {
 215  							println("repeat ended", s, length)
 216  
 217  						}
 218  						break encodeLoop
 219  					}
 220  					// Index skipped...
 221  					for index0 < s-1 {
 222  						cv0 := load6432(src, index0)
 223  						cv1 := cv0 >> 8
 224  						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 225  						off := index0 + e.cur
 226  						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 227  						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 228  						index0 += 2
 229  					}
 230  					cv = load6432(src, s)
 231  					continue
 232  				}
 233  				const repOff2 = 1
 234  
 235  				// We deviate from the reference encoder and also check offset 2.
 236  				// Still slower and not much better, so disabled.
 237  				// repIndex = s - offset2 + repOff2
 238  				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
 239  					// Consider history as well.
 240  					var seq seq
 241  					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
 242  
 243  					seq.matchLen = uint32(length - zstdMinMatch)
 244  
 245  					// We might be able to match backwards.
 246  					// Extend as long as we can.
 247  					start := s + repOff2
 248  					// We end the search early, so we don't risk 0 literals
 249  					// and have to do special offset treatment.
 250  					startLimit := nextEmit + 1
 251  
 252  					tMin := max(s-e.maxMatchOff, 0)
 253  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 254  						repIndex--
 255  						start--
 256  						seq.matchLen++
 257  					}
 258  					addLiterals(&seq, start)
 259  
 260  					// rep 2
 261  					seq.offset = 2
 262  					if debugSequences {
 263  						println("repeat sequence 2", seq, "next s:", s)
 264  					}
 265  					blk.sequences = append(blk.sequences, seq)
 266  
 267  					s += length + repOff2
 268  					nextEmit = s
 269  					if s >= sLimit {
 270  						if debugEncoder {
 271  							println("repeat ended", s, length)
 272  
 273  						}
 274  						break encodeLoop
 275  					}
 276  
 277  					// Index skipped...
 278  					for index0 < s-1 {
 279  						cv0 := load6432(src, index0)
 280  						cv1 := cv0 >> 8
 281  						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 282  						off := index0 + e.cur
 283  						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 284  						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 285  						index0 += 2
 286  					}
 287  					cv = load6432(src, s)
 288  					// Swap offsets
 289  					offset1, offset2 = offset2, offset1
 290  					continue
 291  				}
 292  			}
 293  			// Find the offsets of our two matches.
 294  			coffsetL := candidateL.offset - e.cur
 295  			coffsetLP := candidateL.prev - e.cur
 296  
 297  			// Check if we have a long match.
 298  			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 299  				// Found a long match, at least 8 bytes.
 300  				matched = e.matchlen(s+8, coffsetL+8, src) + 8
 301  				t = coffsetL
 302  				if debugAsserts && s <= t {
 303  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 304  				}
 305  				if debugAsserts && s-t > e.maxMatchOff {
 306  					panic("s - t >e.maxMatchOff")
 307  				}
 308  				if debugMatches {
 309  					println("long match")
 310  				}
 311  
 312  				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 313  					// Found a long match, at least 8 bytes.
 314  					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
 315  					if prevMatch > matched {
 316  						matched = prevMatch
 317  						t = coffsetLP
 318  					}
 319  					if debugAsserts && s <= t {
 320  						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 321  					}
 322  					if debugAsserts && s-t > e.maxMatchOff {
 323  						panic("s - t >e.maxMatchOff")
 324  					}
 325  					if debugMatches {
 326  						println("long match")
 327  					}
 328  				}
 329  				break
 330  			}
 331  
 332  			// Check if we have a long match on prev.
 333  			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 334  				// Found a long match, at least 8 bytes.
 335  				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
 336  				t = coffsetLP
 337  				if debugAsserts && s <= t {
 338  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 339  				}
 340  				if debugAsserts && s-t > e.maxMatchOff {
 341  					panic("s - t >e.maxMatchOff")
 342  				}
 343  				if debugMatches {
 344  					println("long match")
 345  				}
 346  				break
 347  			}
 348  
 349  			coffsetS := candidateS.offset - e.cur
 350  
 351  			// Check if we have a short match.
 352  			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 353  				// found a regular match
 354  				matched = e.matchlen(s+4, coffsetS+4, src) + 4
 355  
 356  				// See if we can find a long match at s+1
 357  				const checkAt = 1
 358  				cv := load6432(src, s+checkAt)
 359  				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
 360  				candidateL = e.longTable[nextHashL]
 361  				coffsetL = candidateL.offset - e.cur
 362  
 363  				// We can store it, since we have at least a 4 byte match.
 364  				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
 365  				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 366  					// Found a long match, at least 8 bytes.
 367  					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 368  					if matchedNext > matched {
 369  						t = coffsetL
 370  						s += checkAt
 371  						matched = matchedNext
 372  						if debugMatches {
 373  							println("long match (after short)")
 374  						}
 375  						break
 376  					}
 377  				}
 378  
 379  				// Check prev long...
 380  				coffsetL = candidateL.prev - e.cur
 381  				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 382  					// Found a long match, at least 8 bytes.
 383  					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 384  					if matchedNext > matched {
 385  						t = coffsetL
 386  						s += checkAt
 387  						matched = matchedNext
 388  						if debugMatches {
 389  							println("prev long match (after short)")
 390  						}
 391  						break
 392  					}
 393  				}
 394  				t = coffsetS
 395  				if debugAsserts && s <= t {
 396  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 397  				}
 398  				if debugAsserts && s-t > e.maxMatchOff {
 399  					panic("s - t >e.maxMatchOff")
 400  				}
 401  				if debugAsserts && t < 0 {
 402  					panic("t<0")
 403  				}
 404  				if debugMatches {
 405  					println("short match")
 406  				}
 407  				break
 408  			}
 409  
 410  			// No match found, move forward in input.
 411  			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 412  			if s >= sLimit {
 413  				break encodeLoop
 414  			}
 415  			cv = load6432(src, s)
 416  		}
 417  
 418  		// Try to find a better match by searching for a long match at the end of the current best match
 419  		if s+matched < sLimit {
 420  			// Allow some bytes at the beginning to mismatch.
 421  			// Sweet spot is around 3 bytes, but depends on input.
 422  			// The skipped bytes are tested in Extend backwards,
 423  			// and still picked up as part of the match if they do.
 424  			const skipBeginning = 3
 425  
 426  			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
 427  			s2 := s + skipBeginning
 428  			cv := load3232(src, s2)
 429  			candidateL := e.longTable[nextHashL]
 430  			coffsetL := candidateL.offset - e.cur - matched + skipBeginning
 431  			if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 432  				// Found a long match, at least 4 bytes.
 433  				matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
 434  				if matchedNext > matched {
 435  					t = coffsetL
 436  					s = s2
 437  					matched = matchedNext
 438  					if debugMatches {
 439  						println("long match at end-of-match")
 440  					}
 441  				}
 442  			}
 443  
 444  			// Check prev long...
 445  			if true {
 446  				coffsetL = candidateL.prev - e.cur - matched + skipBeginning
 447  				if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 448  					// Found a long match, at least 4 bytes.
 449  					matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
 450  					if matchedNext > matched {
 451  						t = coffsetL
 452  						s = s2
 453  						matched = matchedNext
 454  						if debugMatches {
 455  							println("prev long match at end-of-match")
 456  						}
 457  					}
 458  				}
 459  			}
 460  		}
 461  		// A match has been found. Update recent offsets.
 462  		offset2 = offset1
 463  		offset1 = s - t
 464  
 465  		if debugAsserts && s <= t {
 466  			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 467  		}
 468  
 469  		if debugAsserts && canRepeat && int(offset1) > len(src) {
 470  			panic("invalid offset")
 471  		}
 472  
 473  		// Extend the n-byte match as long as possible.
 474  		l := matched
 475  
 476  		// Extend backwards
 477  		tMin := max(s-e.maxMatchOff, 0)
 478  		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
 479  			s--
 480  			t--
 481  			l++
 482  		}
 483  
 484  		// Write our sequence
 485  		var seq seq
 486  		seq.litLen = uint32(s - nextEmit)
 487  		seq.matchLen = uint32(l - zstdMinMatch)
 488  		if seq.litLen > 0 {
 489  			blk.literals = append(blk.literals, src[nextEmit:s]...)
 490  		}
 491  		seq.offset = uint32(s-t) + 3
 492  		s += l
 493  		if debugSequences {
 494  			println("sequence", seq, "next s:", s)
 495  		}
 496  		blk.sequences = append(blk.sequences, seq)
 497  		nextEmit = s
 498  		if s >= sLimit {
 499  			break encodeLoop
 500  		}
 501  
 502  		// Index match start+1 (long) -> s - 1
 503  		off := index0 + e.cur
 504  		for index0 < s-1 {
 505  			cv0 := load6432(src, index0)
 506  			cv1 := cv0 >> 8
 507  			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 508  			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 509  			e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 510  			index0 += 2
 511  			off += 2
 512  		}
 513  
 514  		cv = load6432(src, s)
 515  		if !canRepeat {
 516  			continue
 517  		}
 518  
 519  		// Check offset 2
 520  		for {
 521  			o2 := s - offset2
 522  			if load3232(src, o2) != uint32(cv) {
 523  				// Do regular search
 524  				break
 525  			}
 526  
 527  			// Store this, since we have it.
 528  			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 529  			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 530  
 531  			// We have at least 4 byte match.
 532  			// No need to check backwards. We come straight from a match
 533  			l := 4 + e.matchlen(s+4, o2+4, src)
 534  
 535  			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
 536  			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
 537  			seq.matchLen = uint32(l) - zstdMinMatch
 538  			seq.litLen = 0
 539  
 540  			// Since litlen is always 0, this is offset 1.
 541  			seq.offset = 1
 542  			s += l
 543  			nextEmit = s
 544  			if debugSequences {
 545  				println("sequence", seq, "next s:", s)
 546  			}
 547  			blk.sequences = append(blk.sequences, seq)
 548  
 549  			// Swap offset 1 and 2.
 550  			offset1, offset2 = offset2, offset1
 551  			if s >= sLimit {
 552  				// Finished
 553  				break encodeLoop
 554  			}
 555  			cv = load6432(src, s)
 556  		}
 557  	}
 558  
 559  	if int(nextEmit) < len(src) {
 560  		blk.literals = append(blk.literals, src[nextEmit:]...)
 561  		blk.extraLits = len(src) - int(nextEmit)
 562  	}
 563  	blk.recentOffsets[0] = uint32(offset1)
 564  	blk.recentOffsets[1] = uint32(offset2)
 565  	if debugEncoder {
 566  		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 567  	}
 568  }
 569  
 570  // EncodeNoHist will encode a block with no history and no following blocks.
 571  // Most notable difference is that src will not be copied for history and
 572  // we do not need to check for max match length.
 573  func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 574  	e.ensureHist(len(src))
 575  	e.Encode(blk, src)
 576  }
 577  
 578  // Encode improves compression...
 579  func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
 580  	const (
 581  		// Input margin is the number of bytes we read (8)
 582  		// and the maximum we will read ahead (2)
 583  		inputMargin            = 8 + 2
 584  		minNonLiteralBlockSize = 16
 585  	)
 586  
 587  	// Protect against e.cur wraparound.
 588  	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 589  		if len(e.hist) == 0 {
 590  			for i := range e.table[:] {
 591  				e.table[i] = tableEntry{}
 592  			}
 593  			for i := range e.longTable[:] {
 594  				e.longTable[i] = prevEntry{}
 595  			}
 596  			e.cur = e.maxMatchOff
 597  			e.allDirty = true
 598  			break
 599  		}
 600  		// Shift down everything in the table that isn't already too far away.
 601  		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
 602  		for i := range e.table[:] {
 603  			v := e.table[i].offset
 604  			if v < minOff {
 605  				v = 0
 606  			} else {
 607  				v = v - e.cur + e.maxMatchOff
 608  			}
 609  			e.table[i].offset = v
 610  		}
 611  		for i := range e.longTable[:] {
 612  			v := e.longTable[i].offset
 613  			v2 := e.longTable[i].prev
 614  			if v < minOff {
 615  				v = 0
 616  				v2 = 0
 617  			} else {
 618  				v = v - e.cur + e.maxMatchOff
 619  				if v2 < minOff {
 620  					v2 = 0
 621  				} else {
 622  					v2 = v2 - e.cur + e.maxMatchOff
 623  				}
 624  			}
 625  			e.longTable[i] = prevEntry{
 626  				offset: v,
 627  				prev:   v2,
 628  			}
 629  		}
 630  		e.allDirty = true
 631  		e.cur = e.maxMatchOff
 632  		break
 633  	}
 634  
 635  	s := e.addBlock(src)
 636  	blk.size = len(src)
 637  	if len(src) < minNonLiteralBlockSize {
 638  		blk.extraLits = len(src)
 639  		blk.literals = blk.literals[:len(src)]
 640  		copy(blk.literals, src)
 641  		return
 642  	}
 643  
 644  	// Override src
 645  	src = e.hist
 646  	sLimit := int32(len(src)) - inputMargin
 647  	// stepSize is the number of bytes to skip on every main loop iteration.
 648  	// It should be >= 1.
 649  	const stepSize = 1
 650  
 651  	const kSearchStrength = 9
 652  
 653  	// nextEmit is where in src the next emitLiteral should start from.
 654  	nextEmit := s
 655  	cv := load6432(src, s)
 656  
 657  	// Relative offsets
 658  	offset1 := int32(blk.recentOffsets[0])
 659  	offset2 := int32(blk.recentOffsets[1])
 660  
 661  	addLiterals := func(s *seq, until int32) {
 662  		if until == nextEmit {
 663  			return
 664  		}
 665  		blk.literals = append(blk.literals, src[nextEmit:until]...)
 666  		s.litLen = uint32(until - nextEmit)
 667  	}
 668  	if debugEncoder {
 669  		println("recent offsets:", blk.recentOffsets)
 670  	}
 671  
 672  encodeLoop:
 673  	for {
 674  		var t int32
 675  		// We allow the encoder to optionally turn off repeat offsets across blocks
 676  		canRepeat := len(blk.sequences) > 2
 677  		var matched, index0 int32
 678  
 679  		for {
 680  			if debugAsserts && canRepeat && offset1 == 0 {
 681  				panic("offset0 was 0")
 682  			}
 683  
 684  			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 685  			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 686  			candidateL := e.longTable[nextHashL]
 687  			candidateS := e.table[nextHashS]
 688  
 689  			const repOff = 1
 690  			repIndex := s - offset1 + repOff
 691  			off := s + e.cur
 692  			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
 693  			e.markLongShardDirty(nextHashL)
 694  			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
 695  			e.markShortShardDirty(nextHashS)
 696  			index0 = s + 1
 697  
 698  			if canRepeat {
 699  				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 700  					// Consider history as well.
 701  					var seq seq
 702  					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 703  
 704  					seq.matchLen = uint32(length - zstdMinMatch)
 705  
 706  					// We might be able to match backwards.
 707  					// Extend as long as we can.
 708  					start := s + repOff
 709  					// We end the search early, so we don't risk 0 literals
 710  					// and have to do special offset treatment.
 711  					startLimit := nextEmit + 1
 712  
 713  					tMin := max(s-e.maxMatchOff, 0)
 714  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 715  						repIndex--
 716  						start--
 717  						seq.matchLen++
 718  					}
 719  					addLiterals(&seq, start)
 720  
 721  					// rep 0
 722  					seq.offset = 1
 723  					if debugSequences {
 724  						println("repeat sequence", seq, "next s:", s)
 725  					}
 726  					blk.sequences = append(blk.sequences, seq)
 727  
 728  					// Index match start+1 (long) -> s - 1
 729  					s += length + repOff
 730  
 731  					nextEmit = s
 732  					if s >= sLimit {
 733  						if debugEncoder {
 734  							println("repeat ended", s, length)
 735  
 736  						}
 737  						break encodeLoop
 738  					}
 739  					// Index skipped...
 740  					for index0 < s-1 {
 741  						cv0 := load6432(src, index0)
 742  						cv1 := cv0 >> 8
 743  						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 744  						off := index0 + e.cur
 745  						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 746  						e.markLongShardDirty(h0)
 747  						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
 748  						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
 749  						e.markShortShardDirty(h1)
 750  						index0 += 2
 751  					}
 752  					cv = load6432(src, s)
 753  					continue
 754  				}
 755  				const repOff2 = 1
 756  
 757  				// We deviate from the reference encoder and also check offset 2.
 758  				// Still slower and not much better, so disabled.
 759  				// repIndex = s - offset2 + repOff2
 760  				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
 761  					// Consider history as well.
 762  					var seq seq
 763  					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
 764  
 765  					seq.matchLen = uint32(length - zstdMinMatch)
 766  
 767  					// We might be able to match backwards.
 768  					// Extend as long as we can.
 769  					start := s + repOff2
 770  					// We end the search early, so we don't risk 0 literals
 771  					// and have to do special offset treatment.
 772  					startLimit := nextEmit + 1
 773  
 774  					tMin := max(s-e.maxMatchOff, 0)
 775  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 776  						repIndex--
 777  						start--
 778  						seq.matchLen++
 779  					}
 780  					addLiterals(&seq, start)
 781  
 782  					// rep 2
 783  					seq.offset = 2
 784  					if debugSequences {
 785  						println("repeat sequence 2", seq, "next s:", s)
 786  					}
 787  					blk.sequences = append(blk.sequences, seq)
 788  
 789  					s += length + repOff2
 790  					nextEmit = s
 791  					if s >= sLimit {
 792  						if debugEncoder {
 793  							println("repeat ended", s, length)
 794  
 795  						}
 796  						break encodeLoop
 797  					}
 798  
 799  					// Index skipped...
 800  					for index0 < s-1 {
 801  						cv0 := load6432(src, index0)
 802  						cv1 := cv0 >> 8
 803  						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 804  						off := index0 + e.cur
 805  						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 806  						e.markLongShardDirty(h0)
 807  						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
 808  						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
 809  						e.markShortShardDirty(h1)
 810  						index0 += 2
 811  					}
 812  					cv = load6432(src, s)
 813  					// Swap offsets
 814  					offset1, offset2 = offset2, offset1
 815  					continue
 816  				}
 817  			}
 818  			// Find the offsets of our two matches.
 819  			coffsetL := candidateL.offset - e.cur
 820  			coffsetLP := candidateL.prev - e.cur
 821  
 822  			// Check if we have a long match.
 823  			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 824  				// Found a long match, at least 8 bytes.
 825  				matched = e.matchlen(s+8, coffsetL+8, src) + 8
 826  				t = coffsetL
 827  				if debugAsserts && s <= t {
 828  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 829  				}
 830  				if debugAsserts && s-t > e.maxMatchOff {
 831  					panic("s - t >e.maxMatchOff")
 832  				}
 833  				if debugMatches {
 834  					println("long match")
 835  				}
 836  
 837  				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 838  					// Found a long match, at least 8 bytes.
 839  					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
 840  					if prevMatch > matched {
 841  						matched = prevMatch
 842  						t = coffsetLP
 843  					}
 844  					if debugAsserts && s <= t {
 845  						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 846  					}
 847  					if debugAsserts && s-t > e.maxMatchOff {
 848  						panic("s - t >e.maxMatchOff")
 849  					}
 850  					if debugMatches {
 851  						println("long match")
 852  					}
 853  				}
 854  				break
 855  			}
 856  
 857  			// Check if we have a long match on prev.
 858  			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 859  				// Found a long match, at least 8 bytes.
 860  				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
 861  				t = coffsetLP
 862  				if debugAsserts && s <= t {
 863  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 864  				}
 865  				if debugAsserts && s-t > e.maxMatchOff {
 866  					panic("s - t >e.maxMatchOff")
 867  				}
 868  				if debugMatches {
 869  					println("long match")
 870  				}
 871  				break
 872  			}
 873  
 874  			coffsetS := candidateS.offset - e.cur
 875  
 876  			// Check if we have a short match.
 877  			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 878  				// found a regular match
 879  				matched = e.matchlen(s+4, coffsetS+4, src) + 4
 880  
 881  				// See if we can find a long match at s+1
 882  				const checkAt = 1
 883  				cv := load6432(src, s+checkAt)
 884  				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
 885  				candidateL = e.longTable[nextHashL]
 886  				coffsetL = candidateL.offset - e.cur
 887  
 888  				// We can store it, since we have at least a 4 byte match.
 889  				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
 890  				e.markLongShardDirty(nextHashL)
 891  				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 892  					// Found a long match, at least 8 bytes.
 893  					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 894  					if matchedNext > matched {
 895  						t = coffsetL
 896  						s += checkAt
 897  						matched = matchedNext
 898  						if debugMatches {
 899  							println("long match (after short)")
 900  						}
 901  						break
 902  					}
 903  				}
 904  
 905  				// Check prev long...
 906  				coffsetL = candidateL.prev - e.cur
 907  				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 908  					// Found a long match, at least 8 bytes.
 909  					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 910  					if matchedNext > matched {
 911  						t = coffsetL
 912  						s += checkAt
 913  						matched = matchedNext
 914  						if debugMatches {
 915  							println("prev long match (after short)")
 916  						}
 917  						break
 918  					}
 919  				}
 920  				t = coffsetS
 921  				if debugAsserts && s <= t {
 922  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 923  				}
 924  				if debugAsserts && s-t > e.maxMatchOff {
 925  					panic("s - t >e.maxMatchOff")
 926  				}
 927  				if debugAsserts && t < 0 {
 928  					panic("t<0")
 929  				}
 930  				if debugMatches {
 931  					println("short match")
 932  				}
 933  				break
 934  			}
 935  
 936  			// No match found, move forward in input.
 937  			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 938  			if s >= sLimit {
 939  				break encodeLoop
 940  			}
 941  			cv = load6432(src, s)
 942  		}
 943  		// Try to find a better match by searching for a long match at the end of the current best match
 944  		if s+matched < sLimit {
 945  			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
 946  			cv := load3232(src, s)
 947  			candidateL := e.longTable[nextHashL]
 948  			coffsetL := candidateL.offset - e.cur - matched
 949  			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 950  				// Found a long match, at least 4 bytes.
 951  				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
 952  				if matchedNext > matched {
 953  					t = coffsetL
 954  					matched = matchedNext
 955  					if debugMatches {
 956  						println("long match at end-of-match")
 957  					}
 958  				}
 959  			}
 960  
 961  			// Check prev long...
 962  			if true {
 963  				coffsetL = candidateL.prev - e.cur - matched
 964  				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 965  					// Found a long match, at least 4 bytes.
 966  					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
 967  					if matchedNext > matched {
 968  						t = coffsetL
 969  						matched = matchedNext
 970  						if debugMatches {
 971  							println("prev long match at end-of-match")
 972  						}
 973  					}
 974  				}
 975  			}
 976  		}
 977  		// A match has been found. Update recent offsets.
 978  		offset2 = offset1
 979  		offset1 = s - t
 980  
 981  		if debugAsserts && s <= t {
 982  			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 983  		}
 984  
 985  		if debugAsserts && canRepeat && int(offset1) > len(src) {
 986  			panic("invalid offset")
 987  		}
 988  
 989  		// Extend the n-byte match as long as possible.
 990  		l := matched
 991  
 992  		// Extend backwards
 993  		tMin := max(s-e.maxMatchOff, 0)
 994  		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
 995  			s--
 996  			t--
 997  			l++
 998  		}
 999  
1000  		// Write our sequence
1001  		var seq seq
1002  		seq.litLen = uint32(s - nextEmit)
1003  		seq.matchLen = uint32(l - zstdMinMatch)
1004  		if seq.litLen > 0 {
1005  			blk.literals = append(blk.literals, src[nextEmit:s]...)
1006  		}
1007  		seq.offset = uint32(s-t) + 3
1008  		s += l
1009  		if debugSequences {
1010  			println("sequence", seq, "next s:", s)
1011  		}
1012  		blk.sequences = append(blk.sequences, seq)
1013  		nextEmit = s
1014  		if s >= sLimit {
1015  			break encodeLoop
1016  		}
1017  
1018  		// Index match start+1 (long) -> s - 1
1019  		off := index0 + e.cur
1020  		for index0 < s-1 {
1021  			cv0 := load6432(src, index0)
1022  			cv1 := cv0 >> 8
1023  			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
1024  			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
1025  			e.markLongShardDirty(h0)
1026  			h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
1027  			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
1028  			e.markShortShardDirty(h1)
1029  			index0 += 2
1030  			off += 2
1031  		}
1032  
1033  		cv = load6432(src, s)
1034  		if !canRepeat {
1035  			continue
1036  		}
1037  
1038  		// Check offset 2
1039  		for {
1040  			o2 := s - offset2
1041  			if load3232(src, o2) != uint32(cv) {
1042  				// Do regular search
1043  				break
1044  			}
1045  
1046  			// Store this, since we have it.
1047  			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
1048  			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
1049  
1050  			// We have at least 4 byte match.
1051  			// No need to check backwards. We come straight from a match
1052  			l := 4 + e.matchlen(s+4, o2+4, src)
1053  
1054  			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
1055  			e.markLongShardDirty(nextHashL)
1056  			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
1057  			e.markShortShardDirty(nextHashS)
1058  			seq.matchLen = uint32(l) - zstdMinMatch
1059  			seq.litLen = 0
1060  
1061  			// Since litlen is always 0, this is offset 1.
1062  			seq.offset = 1
1063  			s += l
1064  			nextEmit = s
1065  			if debugSequences {
1066  				println("sequence", seq, "next s:", s)
1067  			}
1068  			blk.sequences = append(blk.sequences, seq)
1069  
1070  			// Swap offset 1 and 2.
1071  			offset1, offset2 = offset2, offset1
1072  			if s >= sLimit {
1073  				// Finished
1074  				break encodeLoop
1075  			}
1076  			cv = load6432(src, s)
1077  		}
1078  	}
1079  
1080  	if int(nextEmit) < len(src) {
1081  		blk.literals = append(blk.literals, src[nextEmit:]...)
1082  		blk.extraLits = len(src) - int(nextEmit)
1083  	}
1084  	blk.recentOffsets[0] = uint32(offset1)
1085  	blk.recentOffsets[1] = uint32(offset2)
1086  	if debugEncoder {
1087  		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
1088  	}
1089  }
1090  
1091  // ResetDict will reset and set a dictionary if not nil
1092  func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
1093  	e.resetBase(d, singleBlock)
1094  	if d != nil {
1095  		panic("betterFastEncoder: Reset with dict")
1096  	}
1097  }
1098  
1099  // ResetDict will reset and set a dictionary if not nil
1100  func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
1101  	e.resetBase(d, singleBlock)
1102  	if d == nil {
1103  		return
1104  	}
1105  	// Init or copy dict table
1106  	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
1107  		if len(e.dictTable) != len(e.table) {
1108  			e.dictTable = make([]tableEntry, len(e.table))
1109  		}
1110  		end := int32(len(d.content)) - 8 + e.maxMatchOff
1111  		for i := e.maxMatchOff; i < end; i += 4 {
1112  			const hashLog = betterShortTableBits
1113  
1114  			cv := load6432(d.content, i-e.maxMatchOff)
1115  			nextHash := hashLen(cv, hashLog, betterShortLen)      // 0 -> 4
1116  			nextHash1 := hashLen(cv>>8, hashLog, betterShortLen)  // 1 -> 5
1117  			nextHash2 := hashLen(cv>>16, hashLog, betterShortLen) // 2 -> 6
1118  			nextHash3 := hashLen(cv>>24, hashLog, betterShortLen) // 3 -> 7
1119  			e.dictTable[nextHash] = tableEntry{
1120  				val:    uint32(cv),
1121  				offset: i,
1122  			}
1123  			e.dictTable[nextHash1] = tableEntry{
1124  				val:    uint32(cv >> 8),
1125  				offset: i + 1,
1126  			}
1127  			e.dictTable[nextHash2] = tableEntry{
1128  				val:    uint32(cv >> 16),
1129  				offset: i + 2,
1130  			}
1131  			e.dictTable[nextHash3] = tableEntry{
1132  				val:    uint32(cv >> 24),
1133  				offset: i + 3,
1134  			}
1135  		}
1136  		e.lastDictID = d.id
1137  		e.allDirty = true
1138  	}
1139  
1140  	// Init or copy dict table
1141  	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
1142  		if len(e.dictLongTable) != len(e.longTable) {
1143  			e.dictLongTable = make([]prevEntry, len(e.longTable))
1144  		}
1145  		if len(d.content) >= 8 {
1146  			cv := load6432(d.content, 0)
1147  			h := hashLen(cv, betterLongTableBits, betterLongLen)
1148  			e.dictLongTable[h] = prevEntry{
1149  				offset: e.maxMatchOff,
1150  				prev:   e.dictLongTable[h].offset,
1151  			}
1152  
1153  			end := int32(len(d.content)) - 8 + e.maxMatchOff
1154  			off := 8 // First to read
1155  			for i := e.maxMatchOff + 1; i < end; i++ {
1156  				cv = cv>>8 | (uint64(d.content[off]) << 56)
1157  				h := hashLen(cv, betterLongTableBits, betterLongLen)
1158  				e.dictLongTable[h] = prevEntry{
1159  					offset: i,
1160  					prev:   e.dictLongTable[h].offset,
1161  				}
1162  				off++
1163  			}
1164  		}
1165  		e.lastDictID = d.id
1166  		e.allDirty = true
1167  	}
1168  
1169  	// Reset table to initial state
1170  	{
1171  		dirtyShardCnt := 0
1172  		if !e.allDirty {
1173  			for i := range e.shortTableShardDirty {
1174  				if e.shortTableShardDirty[i] {
1175  					dirtyShardCnt++
1176  				}
1177  			}
1178  		}
1179  		const shardCnt = betterShortTableShardCnt
1180  		const shardSize = betterShortTableShardSize
1181  		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
1182  			copy(e.table[:], e.dictTable)
1183  			for i := range e.shortTableShardDirty {
1184  				e.shortTableShardDirty[i] = false
1185  			}
1186  		} else {
1187  			for i := range e.shortTableShardDirty {
1188  				if !e.shortTableShardDirty[i] {
1189  					continue
1190  				}
1191  
1192  				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
1193  				e.shortTableShardDirty[i] = false
1194  			}
1195  		}
1196  	}
1197  	{
1198  		dirtyShardCnt := 0
1199  		if !e.allDirty {
1200  			for i := range e.shortTableShardDirty {
1201  				if e.shortTableShardDirty[i] {
1202  					dirtyShardCnt++
1203  				}
1204  			}
1205  		}
1206  		const shardCnt = betterLongTableShardCnt
1207  		const shardSize = betterLongTableShardSize
1208  		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
1209  			copy(e.longTable[:], e.dictLongTable)
1210  			for i := range e.longTableShardDirty {
1211  				e.longTableShardDirty[i] = false
1212  			}
1213  		} else {
1214  			for i := range e.longTableShardDirty {
1215  				if !e.longTableShardDirty[i] {
1216  					continue
1217  				}
1218  
1219  				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
1220  				e.longTableShardDirty[i] = false
1221  			}
1222  		}
1223  	}
1224  	e.cur = e.maxMatchOff
1225  	e.allDirty = false
1226  }
1227  
1228  func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
1229  	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
1230  }
1231  
1232  func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
1233  	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
1234  }
1235