enc_dfast.go raw

   1  // Copyright 2019+ Klaus Post. All rights reserved.
   2  // License information can be found in the LICENSE file.
   3  // Based on work by Yann Collet, released under BSD License.
   4  
   5  package zstd
   6  
   7  import "fmt"
   8  
   9  const (
  10  	dFastLongTableBits = 17                      // Bits used in the long match table
  11  	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
  12  	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
  13  	dFastLongLen       = 8                       // Bytes used for table hash
  14  
  15  	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
  16  	dLongTableShardSize = dFastLongTableSize / dLongTableShardCnt   // Size of an individual shard
  17  
  18  	dFastShortTableBits = tableBits                // Bits used in the short match table
  19  	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
  20  	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
  21  	dFastShortLen       = 5                        // Bytes used for table hash
  22  
  23  )
  24  
  25  type doubleFastEncoder struct {
  26  	fastEncoder
  27  	longTable [dFastLongTableSize]tableEntry
  28  }
  29  
  30  type doubleFastEncoderDict struct {
  31  	fastEncoderDict
  32  	longTable           [dFastLongTableSize]tableEntry
  33  	dictLongTable       []tableEntry
  34  	longTableShardDirty [dLongTableShardCnt]bool
  35  }
  36  
  37  // Encode mimmics functionality in zstd_dfast.c
  38  func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
  39  	const (
  40  		// Input margin is the number of bytes we read (8)
  41  		// and the maximum we will read ahead (2)
  42  		inputMargin            = 8 + 2
  43  		minNonLiteralBlockSize = 16
  44  	)
  45  
  46  	// Protect against e.cur wraparound.
  47  	for e.cur >= e.bufferReset-int32(len(e.hist)) {
  48  		if len(e.hist) == 0 {
  49  			e.table = [dFastShortTableSize]tableEntry{}
  50  			e.longTable = [dFastLongTableSize]tableEntry{}
  51  			e.cur = e.maxMatchOff
  52  			break
  53  		}
  54  		// Shift down everything in the table that isn't already too far away.
  55  		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
  56  		for i := range e.table[:] {
  57  			v := e.table[i].offset
  58  			if v < minOff {
  59  				v = 0
  60  			} else {
  61  				v = v - e.cur + e.maxMatchOff
  62  			}
  63  			e.table[i].offset = v
  64  		}
  65  		for i := range e.longTable[:] {
  66  			v := e.longTable[i].offset
  67  			if v < minOff {
  68  				v = 0
  69  			} else {
  70  				v = v - e.cur + e.maxMatchOff
  71  			}
  72  			e.longTable[i].offset = v
  73  		}
  74  		e.cur = e.maxMatchOff
  75  		break
  76  	}
  77  
  78  	s := e.addBlock(src)
  79  	blk.size = len(src)
  80  	if len(src) < minNonLiteralBlockSize {
  81  		blk.extraLits = len(src)
  82  		blk.literals = blk.literals[:len(src)]
  83  		copy(blk.literals, src)
  84  		return
  85  	}
  86  
  87  	// Override src
  88  	src = e.hist
  89  	sLimit := int32(len(src)) - inputMargin
  90  	// stepSize is the number of bytes to skip on every main loop iteration.
  91  	// It should be >= 1.
  92  	const stepSize = 1
  93  
  94  	const kSearchStrength = 8
  95  
  96  	// nextEmit is where in src the next emitLiteral should start from.
  97  	nextEmit := s
  98  	cv := load6432(src, s)
  99  
 100  	// Relative offsets
 101  	offset1 := int32(blk.recentOffsets[0])
 102  	offset2 := int32(blk.recentOffsets[1])
 103  
 104  	addLiterals := func(s *seq, until int32) {
 105  		if until == nextEmit {
 106  			return
 107  		}
 108  		blk.literals = append(blk.literals, src[nextEmit:until]...)
 109  		s.litLen = uint32(until - nextEmit)
 110  	}
 111  	if debugEncoder {
 112  		println("recent offsets:", blk.recentOffsets)
 113  	}
 114  
 115  encodeLoop:
 116  	for {
 117  		var t int32
 118  		// We allow the encoder to optionally turn off repeat offsets across blocks
 119  		canRepeat := len(blk.sequences) > 2
 120  
 121  		for {
 122  			if debugAsserts && canRepeat && offset1 == 0 {
 123  				panic("offset0 was 0")
 124  			}
 125  
 126  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 127  			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 128  			candidateL := e.longTable[nextHashL]
 129  			candidateS := e.table[nextHashS]
 130  
 131  			const repOff = 1
 132  			repIndex := s - offset1 + repOff
 133  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 134  			e.longTable[nextHashL] = entry
 135  			e.table[nextHashS] = entry
 136  
 137  			if canRepeat {
 138  				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 139  					// Consider history as well.
 140  					var seq seq
 141  					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 142  
 143  					seq.matchLen = uint32(length - zstdMinMatch)
 144  
 145  					// We might be able to match backwards.
 146  					// Extend as long as we can.
 147  					start := s + repOff
 148  					// We end the search early, so we don't risk 0 literals
 149  					// and have to do special offset treatment.
 150  					startLimit := nextEmit + 1
 151  
 152  					tMin := max(s-e.maxMatchOff, 0)
 153  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 154  						repIndex--
 155  						start--
 156  						seq.matchLen++
 157  					}
 158  					addLiterals(&seq, start)
 159  
 160  					// rep 0
 161  					seq.offset = 1
 162  					if debugSequences {
 163  						println("repeat sequence", seq, "next s:", s)
 164  					}
 165  					blk.sequences = append(blk.sequences, seq)
 166  					s += length + repOff
 167  					nextEmit = s
 168  					if s >= sLimit {
 169  						if debugEncoder {
 170  							println("repeat ended", s, length)
 171  
 172  						}
 173  						break encodeLoop
 174  					}
 175  					cv = load6432(src, s)
 176  					continue
 177  				}
 178  			}
 179  			// Find the offsets of our two matches.
 180  			coffsetL := s - (candidateL.offset - e.cur)
 181  			coffsetS := s - (candidateS.offset - e.cur)
 182  
 183  			// Check if we have a long match.
 184  			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 185  				// Found a long match, likely at least 8 bytes.
 186  				// Reference encoder checks all 8 bytes, we only check 4,
 187  				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 188  				t = candidateL.offset - e.cur
 189  				if debugAsserts && s <= t {
 190  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 191  				}
 192  				if debugAsserts && s-t > e.maxMatchOff {
 193  					panic("s - t >e.maxMatchOff")
 194  				}
 195  				if debugMatches {
 196  					println("long match")
 197  				}
 198  				break
 199  			}
 200  
 201  			// Check if we have a short match.
 202  			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 203  				// found a regular match
 204  				// See if we can find a long match at s+1
 205  				const checkAt = 1
 206  				cv := load6432(src, s+checkAt)
 207  				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
 208  				candidateL = e.longTable[nextHashL]
 209  				coffsetL = s - (candidateL.offset - e.cur) + checkAt
 210  
 211  				// We can store it, since we have at least a 4 byte match.
 212  				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
 213  				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 214  					// Found a long match, likely at least 8 bytes.
 215  					// Reference encoder checks all 8 bytes, we only check 4,
 216  					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 217  					t = candidateL.offset - e.cur
 218  					s += checkAt
 219  					if debugMatches {
 220  						println("long match (after short)")
 221  					}
 222  					break
 223  				}
 224  
 225  				t = candidateS.offset - e.cur
 226  				if debugAsserts && s <= t {
 227  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 228  				}
 229  				if debugAsserts && s-t > e.maxMatchOff {
 230  					panic("s - t >e.maxMatchOff")
 231  				}
 232  				if debugAsserts && t < 0 {
 233  					panic("t<0")
 234  				}
 235  				if debugMatches {
 236  					println("short match")
 237  				}
 238  				break
 239  			}
 240  
 241  			// No match found, move forward in input.
 242  			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 243  			if s >= sLimit {
 244  				break encodeLoop
 245  			}
 246  			cv = load6432(src, s)
 247  		}
 248  
 249  		// A 4-byte match has been found. Update recent offsets.
 250  		// We'll later see if more than 4 bytes.
 251  		offset2 = offset1
 252  		offset1 = s - t
 253  
 254  		if debugAsserts && s <= t {
 255  			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 256  		}
 257  
 258  		if debugAsserts && canRepeat && int(offset1) > len(src) {
 259  			panic("invalid offset")
 260  		}
 261  
 262  		// Extend the 4-byte match as long as possible.
 263  		l := e.matchlen(s+4, t+4, src) + 4
 264  
 265  		// Extend backwards
 266  		tMin := max(s-e.maxMatchOff, 0)
 267  		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
 268  			s--
 269  			t--
 270  			l++
 271  		}
 272  
 273  		// Write our sequence
 274  		var seq seq
 275  		seq.litLen = uint32(s - nextEmit)
 276  		seq.matchLen = uint32(l - zstdMinMatch)
 277  		if seq.litLen > 0 {
 278  			blk.literals = append(blk.literals, src[nextEmit:s]...)
 279  		}
 280  		seq.offset = uint32(s-t) + 3
 281  		s += l
 282  		if debugSequences {
 283  			println("sequence", seq, "next s:", s)
 284  		}
 285  		blk.sequences = append(blk.sequences, seq)
 286  		nextEmit = s
 287  		if s >= sLimit {
 288  			break encodeLoop
 289  		}
 290  
 291  		// Index match start+1 (long) and start+2 (short)
 292  		index0 := s - l + 1
 293  		// Index match end-2 (long) and end-1 (short)
 294  		index1 := s - 2
 295  
 296  		cv0 := load6432(src, index0)
 297  		cv1 := load6432(src, index1)
 298  		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 299  		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
 300  		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
 301  		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
 302  		cv0 >>= 8
 303  		cv1 >>= 8
 304  		te0.offset++
 305  		te1.offset++
 306  		te0.val = uint32(cv0)
 307  		te1.val = uint32(cv1)
 308  		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
 309  		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
 310  
 311  		cv = load6432(src, s)
 312  
 313  		if !canRepeat {
 314  			continue
 315  		}
 316  
 317  		// Check offset 2
 318  		for {
 319  			o2 := s - offset2
 320  			if load3232(src, o2) != uint32(cv) {
 321  				// Do regular search
 322  				break
 323  			}
 324  
 325  			// Store this, since we have it.
 326  			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 327  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 328  
 329  			// We have at least 4 byte match.
 330  			// No need to check backwards. We come straight from a match
 331  			l := 4 + e.matchlen(s+4, o2+4, src)
 332  
 333  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 334  			e.longTable[nextHashL] = entry
 335  			e.table[nextHashS] = entry
 336  			seq.matchLen = uint32(l) - zstdMinMatch
 337  			seq.litLen = 0
 338  
 339  			// Since litlen is always 0, this is offset 1.
 340  			seq.offset = 1
 341  			s += l
 342  			nextEmit = s
 343  			if debugSequences {
 344  				println("sequence", seq, "next s:", s)
 345  			}
 346  			blk.sequences = append(blk.sequences, seq)
 347  
 348  			// Swap offset 1 and 2.
 349  			offset1, offset2 = offset2, offset1
 350  			if s >= sLimit {
 351  				// Finished
 352  				break encodeLoop
 353  			}
 354  			cv = load6432(src, s)
 355  		}
 356  	}
 357  
 358  	if int(nextEmit) < len(src) {
 359  		blk.literals = append(blk.literals, src[nextEmit:]...)
 360  		blk.extraLits = len(src) - int(nextEmit)
 361  	}
 362  	blk.recentOffsets[0] = uint32(offset1)
 363  	blk.recentOffsets[1] = uint32(offset2)
 364  	if debugEncoder {
 365  		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 366  	}
 367  }
 368  
 369  // EncodeNoHist will encode a block with no history and no following blocks.
 370  // Most notable difference is that src will not be copied for history and
 371  // we do not need to check for max match length.
 372  func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 373  	const (
 374  		// Input margin is the number of bytes we read (8)
 375  		// and the maximum we will read ahead (2)
 376  		inputMargin            = 8 + 2
 377  		minNonLiteralBlockSize = 16
 378  	)
 379  
 380  	// Protect against e.cur wraparound.
 381  	if e.cur >= e.bufferReset {
 382  		for i := range e.table[:] {
 383  			e.table[i] = tableEntry{}
 384  		}
 385  		for i := range e.longTable[:] {
 386  			e.longTable[i] = tableEntry{}
 387  		}
 388  		e.cur = e.maxMatchOff
 389  	}
 390  
 391  	s := int32(0)
 392  	blk.size = len(src)
 393  	if len(src) < minNonLiteralBlockSize {
 394  		blk.extraLits = len(src)
 395  		blk.literals = blk.literals[:len(src)]
 396  		copy(blk.literals, src)
 397  		return
 398  	}
 399  
 400  	// Override src
 401  	sLimit := int32(len(src)) - inputMargin
 402  	// stepSize is the number of bytes to skip on every main loop iteration.
 403  	// It should be >= 1.
 404  	const stepSize = 1
 405  
 406  	const kSearchStrength = 8
 407  
 408  	// nextEmit is where in src the next emitLiteral should start from.
 409  	nextEmit := s
 410  	cv := load6432(src, s)
 411  
 412  	// Relative offsets
 413  	offset1 := int32(blk.recentOffsets[0])
 414  	offset2 := int32(blk.recentOffsets[1])
 415  
 416  	addLiterals := func(s *seq, until int32) {
 417  		if until == nextEmit {
 418  			return
 419  		}
 420  		blk.literals = append(blk.literals, src[nextEmit:until]...)
 421  		s.litLen = uint32(until - nextEmit)
 422  	}
 423  	if debugEncoder {
 424  		println("recent offsets:", blk.recentOffsets)
 425  	}
 426  
 427  encodeLoop:
 428  	for {
 429  		var t int32
 430  		for {
 431  
 432  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 433  			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 434  			candidateL := e.longTable[nextHashL]
 435  			candidateS := e.table[nextHashS]
 436  
 437  			const repOff = 1
 438  			repIndex := s - offset1 + repOff
 439  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 440  			e.longTable[nextHashL] = entry
 441  			e.table[nextHashS] = entry
 442  
 443  			if len(blk.sequences) > 2 {
 444  				if load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 445  					// Consider history as well.
 446  					var seq seq
 447  					//length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 448  					length := 4 + int32(matchLen(src[s+4+repOff:], src[repIndex+4:]))
 449  
 450  					seq.matchLen = uint32(length - zstdMinMatch)
 451  
 452  					// We might be able to match backwards.
 453  					// Extend as long as we can.
 454  					start := s + repOff
 455  					// We end the search early, so we don't risk 0 literals
 456  					// and have to do special offset treatment.
 457  					startLimit := nextEmit + 1
 458  
 459  					tMin := max(s-e.maxMatchOff, 0)
 460  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] {
 461  						repIndex--
 462  						start--
 463  						seq.matchLen++
 464  					}
 465  					addLiterals(&seq, start)
 466  
 467  					// rep 0
 468  					seq.offset = 1
 469  					if debugSequences {
 470  						println("repeat sequence", seq, "next s:", s)
 471  					}
 472  					blk.sequences = append(blk.sequences, seq)
 473  					s += length + repOff
 474  					nextEmit = s
 475  					if s >= sLimit {
 476  						if debugEncoder {
 477  							println("repeat ended", s, length)
 478  
 479  						}
 480  						break encodeLoop
 481  					}
 482  					cv = load6432(src, s)
 483  					continue
 484  				}
 485  			}
 486  			// Find the offsets of our two matches.
 487  			coffsetL := s - (candidateL.offset - e.cur)
 488  			coffsetS := s - (candidateS.offset - e.cur)
 489  
 490  			// Check if we have a long match.
 491  			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 492  				// Found a long match, likely at least 8 bytes.
 493  				// Reference encoder checks all 8 bytes, we only check 4,
 494  				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 495  				t = candidateL.offset - e.cur
 496  				if debugAsserts && s <= t {
 497  					panic(fmt.Sprintf("s (%d) <= t (%d). cur: %d", s, t, e.cur))
 498  				}
 499  				if debugAsserts && s-t > e.maxMatchOff {
 500  					panic("s - t >e.maxMatchOff")
 501  				}
 502  				if debugMatches {
 503  					println("long match")
 504  				}
 505  				break
 506  			}
 507  
 508  			// Check if we have a short match.
 509  			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 510  				// found a regular match
 511  				// See if we can find a long match at s+1
 512  				const checkAt = 1
 513  				cv := load6432(src, s+checkAt)
 514  				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
 515  				candidateL = e.longTable[nextHashL]
 516  				coffsetL = s - (candidateL.offset - e.cur) + checkAt
 517  
 518  				// We can store it, since we have at least a 4 byte match.
 519  				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
 520  				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 521  					// Found a long match, likely at least 8 bytes.
 522  					// Reference encoder checks all 8 bytes, we only check 4,
 523  					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 524  					t = candidateL.offset - e.cur
 525  					s += checkAt
 526  					if debugMatches {
 527  						println("long match (after short)")
 528  					}
 529  					break
 530  				}
 531  
 532  				t = candidateS.offset - e.cur
 533  				if debugAsserts && s <= t {
 534  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 535  				}
 536  				if debugAsserts && s-t > e.maxMatchOff {
 537  					panic("s - t >e.maxMatchOff")
 538  				}
 539  				if debugAsserts && t < 0 {
 540  					panic("t<0")
 541  				}
 542  				if debugMatches {
 543  					println("short match")
 544  				}
 545  				break
 546  			}
 547  
 548  			// No match found, move forward in input.
 549  			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 550  			if s >= sLimit {
 551  				break encodeLoop
 552  			}
 553  			cv = load6432(src, s)
 554  		}
 555  
 556  		// A 4-byte match has been found. Update recent offsets.
 557  		// We'll later see if more than 4 bytes.
 558  		offset2 = offset1
 559  		offset1 = s - t
 560  
 561  		if debugAsserts && s <= t {
 562  			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 563  		}
 564  
 565  		// Extend the 4-byte match as long as possible.
 566  		//l := e.matchlen(s+4, t+4, src) + 4
 567  		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
 568  
 569  		// Extend backwards
 570  		tMin := max(s-e.maxMatchOff, 0)
 571  		for t > tMin && s > nextEmit && src[t-1] == src[s-1] {
 572  			s--
 573  			t--
 574  			l++
 575  		}
 576  
 577  		// Write our sequence
 578  		var seq seq
 579  		seq.litLen = uint32(s - nextEmit)
 580  		seq.matchLen = uint32(l - zstdMinMatch)
 581  		if seq.litLen > 0 {
 582  			blk.literals = append(blk.literals, src[nextEmit:s]...)
 583  		}
 584  		seq.offset = uint32(s-t) + 3
 585  		s += l
 586  		if debugSequences {
 587  			println("sequence", seq, "next s:", s)
 588  		}
 589  		blk.sequences = append(blk.sequences, seq)
 590  		nextEmit = s
 591  		if s >= sLimit {
 592  			break encodeLoop
 593  		}
 594  
 595  		// Index match start+1 (long) and start+2 (short)
 596  		index0 := s - l + 1
 597  		// Index match end-2 (long) and end-1 (short)
 598  		index1 := s - 2
 599  
 600  		cv0 := load6432(src, index0)
 601  		cv1 := load6432(src, index1)
 602  		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 603  		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
 604  		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
 605  		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
 606  		cv0 >>= 8
 607  		cv1 >>= 8
 608  		te0.offset++
 609  		te1.offset++
 610  		te0.val = uint32(cv0)
 611  		te1.val = uint32(cv1)
 612  		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
 613  		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
 614  
 615  		cv = load6432(src, s)
 616  
 617  		if len(blk.sequences) <= 2 {
 618  			continue
 619  		}
 620  
 621  		// Check offset 2
 622  		for {
 623  			o2 := s - offset2
 624  			if load3232(src, o2) != uint32(cv) {
 625  				// Do regular search
 626  				break
 627  			}
 628  
 629  			// Store this, since we have it.
 630  			nextHashS := hashLen(cv1>>8, dFastShortTableBits, dFastShortLen)
 631  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 632  
 633  			// We have at least 4 byte match.
 634  			// No need to check backwards. We come straight from a match
 635  			//l := 4 + e.matchlen(s+4, o2+4, src)
 636  			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
 637  
 638  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 639  			e.longTable[nextHashL] = entry
 640  			e.table[nextHashS] = entry
 641  			seq.matchLen = uint32(l) - zstdMinMatch
 642  			seq.litLen = 0
 643  
 644  			// Since litlen is always 0, this is offset 1.
 645  			seq.offset = 1
 646  			s += l
 647  			nextEmit = s
 648  			if debugSequences {
 649  				println("sequence", seq, "next s:", s)
 650  			}
 651  			blk.sequences = append(blk.sequences, seq)
 652  
 653  			// Swap offset 1 and 2.
 654  			offset1, offset2 = offset2, offset1
 655  			if s >= sLimit {
 656  				// Finished
 657  				break encodeLoop
 658  			}
 659  			cv = load6432(src, s)
 660  		}
 661  	}
 662  
 663  	if int(nextEmit) < len(src) {
 664  		blk.literals = append(blk.literals, src[nextEmit:]...)
 665  		blk.extraLits = len(src) - int(nextEmit)
 666  	}
 667  	if debugEncoder {
 668  		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 669  	}
 670  
 671  	// We do not store history, so we must offset e.cur to avoid false matches for next user.
 672  	if e.cur < e.bufferReset {
 673  		e.cur += int32(len(src))
 674  	}
 675  }
 676  
 677  // Encode will encode the content, with a dictionary if initialized for it.
 678  func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
 679  	const (
 680  		// Input margin is the number of bytes we read (8)
 681  		// and the maximum we will read ahead (2)
 682  		inputMargin            = 8 + 2
 683  		minNonLiteralBlockSize = 16
 684  	)
 685  
 686  	// Protect against e.cur wraparound.
 687  	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 688  		if len(e.hist) == 0 {
 689  			for i := range e.table[:] {
 690  				e.table[i] = tableEntry{}
 691  			}
 692  			for i := range e.longTable[:] {
 693  				e.longTable[i] = tableEntry{}
 694  			}
 695  			e.markAllShardsDirty()
 696  			e.cur = e.maxMatchOff
 697  			break
 698  		}
 699  		// Shift down everything in the table that isn't already too far away.
 700  		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
 701  		for i := range e.table[:] {
 702  			v := e.table[i].offset
 703  			if v < minOff {
 704  				v = 0
 705  			} else {
 706  				v = v - e.cur + e.maxMatchOff
 707  			}
 708  			e.table[i].offset = v
 709  		}
 710  		for i := range e.longTable[:] {
 711  			v := e.longTable[i].offset
 712  			if v < minOff {
 713  				v = 0
 714  			} else {
 715  				v = v - e.cur + e.maxMatchOff
 716  			}
 717  			e.longTable[i].offset = v
 718  		}
 719  		e.markAllShardsDirty()
 720  		e.cur = e.maxMatchOff
 721  		break
 722  	}
 723  
 724  	s := e.addBlock(src)
 725  	blk.size = len(src)
 726  	if len(src) < minNonLiteralBlockSize {
 727  		blk.extraLits = len(src)
 728  		blk.literals = blk.literals[:len(src)]
 729  		copy(blk.literals, src)
 730  		return
 731  	}
 732  
 733  	// Override src
 734  	src = e.hist
 735  	sLimit := int32(len(src)) - inputMargin
 736  	// stepSize is the number of bytes to skip on every main loop iteration.
 737  	// It should be >= 1.
 738  	const stepSize = 1
 739  
 740  	const kSearchStrength = 8
 741  
 742  	// nextEmit is where in src the next emitLiteral should start from.
 743  	nextEmit := s
 744  	cv := load6432(src, s)
 745  
 746  	// Relative offsets
 747  	offset1 := int32(blk.recentOffsets[0])
 748  	offset2 := int32(blk.recentOffsets[1])
 749  
 750  	addLiterals := func(s *seq, until int32) {
 751  		if until == nextEmit {
 752  			return
 753  		}
 754  		blk.literals = append(blk.literals, src[nextEmit:until]...)
 755  		s.litLen = uint32(until - nextEmit)
 756  	}
 757  	if debugEncoder {
 758  		println("recent offsets:", blk.recentOffsets)
 759  	}
 760  
 761  encodeLoop:
 762  	for {
 763  		var t int32
 764  		// We allow the encoder to optionally turn off repeat offsets across blocks
 765  		canRepeat := len(blk.sequences) > 2
 766  
 767  		for {
 768  			if debugAsserts && canRepeat && offset1 == 0 {
 769  				panic("offset0 was 0")
 770  			}
 771  
 772  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 773  			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 774  			candidateL := e.longTable[nextHashL]
 775  			candidateS := e.table[nextHashS]
 776  
 777  			const repOff = 1
 778  			repIndex := s - offset1 + repOff
 779  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 780  			e.longTable[nextHashL] = entry
 781  			e.markLongShardDirty(nextHashL)
 782  			e.table[nextHashS] = entry
 783  			e.markShardDirty(nextHashS)
 784  
 785  			if canRepeat {
 786  				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 787  					// Consider history as well.
 788  					var seq seq
 789  					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 790  
 791  					seq.matchLen = uint32(length - zstdMinMatch)
 792  
 793  					// We might be able to match backwards.
 794  					// Extend as long as we can.
 795  					start := s + repOff
 796  					// We end the search early, so we don't risk 0 literals
 797  					// and have to do special offset treatment.
 798  					startLimit := nextEmit + 1
 799  
 800  					tMin := max(s-e.maxMatchOff, 0)
 801  					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 802  						repIndex--
 803  						start--
 804  						seq.matchLen++
 805  					}
 806  					addLiterals(&seq, start)
 807  
 808  					// rep 0
 809  					seq.offset = 1
 810  					if debugSequences {
 811  						println("repeat sequence", seq, "next s:", s)
 812  					}
 813  					blk.sequences = append(blk.sequences, seq)
 814  					s += length + repOff
 815  					nextEmit = s
 816  					if s >= sLimit {
 817  						if debugEncoder {
 818  							println("repeat ended", s, length)
 819  
 820  						}
 821  						break encodeLoop
 822  					}
 823  					cv = load6432(src, s)
 824  					continue
 825  				}
 826  			}
 827  			// Find the offsets of our two matches.
 828  			coffsetL := s - (candidateL.offset - e.cur)
 829  			coffsetS := s - (candidateS.offset - e.cur)
 830  
 831  			// Check if we have a long match.
 832  			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 833  				// Found a long match, likely at least 8 bytes.
 834  				// Reference encoder checks all 8 bytes, we only check 4,
 835  				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 836  				t = candidateL.offset - e.cur
 837  				if debugAsserts && s <= t {
 838  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 839  				}
 840  				if debugAsserts && s-t > e.maxMatchOff {
 841  					panic("s - t >e.maxMatchOff")
 842  				}
 843  				if debugMatches {
 844  					println("long match")
 845  				}
 846  				break
 847  			}
 848  
 849  			// Check if we have a short match.
 850  			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 851  				// found a regular match
 852  				// See if we can find a long match at s+1
 853  				const checkAt = 1
 854  				cv := load6432(src, s+checkAt)
 855  				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
 856  				candidateL = e.longTable[nextHashL]
 857  				coffsetL = s - (candidateL.offset - e.cur) + checkAt
 858  
 859  				// We can store it, since we have at least a 4 byte match.
 860  				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
 861  				e.markLongShardDirty(nextHashL)
 862  				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
 863  					// Found a long match, likely at least 8 bytes.
 864  					// Reference encoder checks all 8 bytes, we only check 4,
 865  					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 866  					t = candidateL.offset - e.cur
 867  					s += checkAt
 868  					if debugMatches {
 869  						println("long match (after short)")
 870  					}
 871  					break
 872  				}
 873  
 874  				t = candidateS.offset - e.cur
 875  				if debugAsserts && s <= t {
 876  					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 877  				}
 878  				if debugAsserts && s-t > e.maxMatchOff {
 879  					panic("s - t >e.maxMatchOff")
 880  				}
 881  				if debugAsserts && t < 0 {
 882  					panic("t<0")
 883  				}
 884  				if debugMatches {
 885  					println("short match")
 886  				}
 887  				break
 888  			}
 889  
 890  			// No match found, move forward in input.
 891  			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 892  			if s >= sLimit {
 893  				break encodeLoop
 894  			}
 895  			cv = load6432(src, s)
 896  		}
 897  
 898  		// A 4-byte match has been found. Update recent offsets.
 899  		// We'll later see if more than 4 bytes.
 900  		offset2 = offset1
 901  		offset1 = s - t
 902  
 903  		if debugAsserts && s <= t {
 904  			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 905  		}
 906  
 907  		if debugAsserts && canRepeat && int(offset1) > len(src) {
 908  			panic("invalid offset")
 909  		}
 910  
 911  		// Extend the 4-byte match as long as possible.
 912  		l := e.matchlen(s+4, t+4, src) + 4
 913  
 914  		// Extend backwards
 915  		tMin := max(s-e.maxMatchOff, 0)
 916  		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
 917  			s--
 918  			t--
 919  			l++
 920  		}
 921  
 922  		// Write our sequence
 923  		var seq seq
 924  		seq.litLen = uint32(s - nextEmit)
 925  		seq.matchLen = uint32(l - zstdMinMatch)
 926  		if seq.litLen > 0 {
 927  			blk.literals = append(blk.literals, src[nextEmit:s]...)
 928  		}
 929  		seq.offset = uint32(s-t) + 3
 930  		s += l
 931  		if debugSequences {
 932  			println("sequence", seq, "next s:", s)
 933  		}
 934  		blk.sequences = append(blk.sequences, seq)
 935  		nextEmit = s
 936  		if s >= sLimit {
 937  			break encodeLoop
 938  		}
 939  
 940  		// Index match start+1 (long) and start+2 (short)
 941  		index0 := s - l + 1
 942  		// Index match end-2 (long) and end-1 (short)
 943  		index1 := s - 2
 944  
 945  		cv0 := load6432(src, index0)
 946  		cv1 := load6432(src, index1)
 947  		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 948  		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
 949  		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
 950  		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
 951  		e.longTable[longHash1] = te0
 952  		e.longTable[longHash2] = te1
 953  		e.markLongShardDirty(longHash1)
 954  		e.markLongShardDirty(longHash2)
 955  		cv0 >>= 8
 956  		cv1 >>= 8
 957  		te0.offset++
 958  		te1.offset++
 959  		te0.val = uint32(cv0)
 960  		te1.val = uint32(cv1)
 961  		hashVal1 := hashLen(cv0, dFastShortTableBits, dFastShortLen)
 962  		hashVal2 := hashLen(cv1, dFastShortTableBits, dFastShortLen)
 963  		e.table[hashVal1] = te0
 964  		e.markShardDirty(hashVal1)
 965  		e.table[hashVal2] = te1
 966  		e.markShardDirty(hashVal2)
 967  
 968  		cv = load6432(src, s)
 969  
 970  		if !canRepeat {
 971  			continue
 972  		}
 973  
 974  		// Check offset 2
 975  		for {
 976  			o2 := s - offset2
 977  			if load3232(src, o2) != uint32(cv) {
 978  				// Do regular search
 979  				break
 980  			}
 981  
 982  			// Store this, since we have it.
 983  			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 984  			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 985  
 986  			// We have at least 4 byte match.
 987  			// No need to check backwards. We come straight from a match
 988  			l := 4 + e.matchlen(s+4, o2+4, src)
 989  
 990  			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
 991  			e.longTable[nextHashL] = entry
 992  			e.markLongShardDirty(nextHashL)
 993  			e.table[nextHashS] = entry
 994  			e.markShardDirty(nextHashS)
 995  			seq.matchLen = uint32(l) - zstdMinMatch
 996  			seq.litLen = 0
 997  
 998  			// Since litlen is always 0, this is offset 1.
 999  			seq.offset = 1
1000  			s += l
1001  			nextEmit = s
1002  			if debugSequences {
1003  				println("sequence", seq, "next s:", s)
1004  			}
1005  			blk.sequences = append(blk.sequences, seq)
1006  
1007  			// Swap offset 1 and 2.
1008  			offset1, offset2 = offset2, offset1
1009  			if s >= sLimit {
1010  				// Finished
1011  				break encodeLoop
1012  			}
1013  			cv = load6432(src, s)
1014  		}
1015  	}
1016  
1017  	if int(nextEmit) < len(src) {
1018  		blk.literals = append(blk.literals, src[nextEmit:]...)
1019  		blk.extraLits = len(src) - int(nextEmit)
1020  	}
1021  	blk.recentOffsets[0] = uint32(offset1)
1022  	blk.recentOffsets[1] = uint32(offset2)
1023  	if debugEncoder {
1024  		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
1025  	}
1026  	// If we encoded more than 64K mark all dirty.
1027  	if len(src) > 64<<10 {
1028  		e.markAllShardsDirty()
1029  	}
1030  }
1031  
1032  // ResetDict will reset and set a dictionary if not nil
1033  func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
1034  	e.fastEncoder.Reset(d, singleBlock)
1035  	if d != nil {
1036  		panic("doubleFastEncoder: Reset with dict not supported")
1037  	}
1038  }
1039  
1040  // ResetDict will reset and set a dictionary if not nil
1041  func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
1042  	allDirty := e.allDirty
1043  	e.fastEncoderDict.Reset(d, singleBlock)
1044  	if d == nil {
1045  		return
1046  	}
1047  
1048  	// Init or copy dict table
1049  	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
1050  		if len(e.dictLongTable) != len(e.longTable) {
1051  			e.dictLongTable = make([]tableEntry, len(e.longTable))
1052  		}
1053  		if len(d.content) >= 8 {
1054  			cv := load6432(d.content, 0)
1055  			e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
1056  				val:    uint32(cv),
1057  				offset: e.maxMatchOff,
1058  			}
1059  			end := int32(len(d.content)) - 8 + e.maxMatchOff
1060  			for i := e.maxMatchOff + 1; i < end; i++ {
1061  				cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56)
1062  				e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
1063  					val:    uint32(cv),
1064  					offset: i,
1065  				}
1066  			}
1067  		}
1068  		e.lastDictID = d.id
1069  		allDirty = true
1070  	}
1071  	// Reset table to initial state
1072  	e.cur = e.maxMatchOff
1073  
1074  	dirtyShardCnt := 0
1075  	if !allDirty {
1076  		for i := range e.longTableShardDirty {
1077  			if e.longTableShardDirty[i] {
1078  				dirtyShardCnt++
1079  			}
1080  		}
1081  	}
1082  
1083  	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
1084  		//copy(e.longTable[:], e.dictLongTable)
1085  		e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
1086  		for i := range e.longTableShardDirty {
1087  			e.longTableShardDirty[i] = false
1088  		}
1089  		return
1090  	}
1091  	for i := range e.longTableShardDirty {
1092  		if !e.longTableShardDirty[i] {
1093  			continue
1094  		}
1095  
1096  		// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
1097  		*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
1098  
1099  		e.longTableShardDirty[i] = false
1100  	}
1101  }
1102  
1103  func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
1104  	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
1105  }
1106