encode_better.go raw
1 // Copyright 2016 The Snappy-Go Authors. All rights reserved.
2 // Copyright (c) 2019 Klaus Post. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 package s2
7
8 import (
9 "bytes"
10 "fmt"
11 "math/bits"
12 )
13
14 // hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
15 // Preferably h should be a constant and should always be <32.
16 func hash4(u uint64, h uint8) uint32 {
17 const prime4bytes = 2654435761
18 return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
19 }
20
21 // hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
22 // Preferably h should be a constant and should always be <64.
23 func hash5(u uint64, h uint8) uint32 {
24 const prime5bytes = 889523592379
25 return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
26 }
27
28 // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
29 // Preferably h should be a constant and should always be <64.
30 func hash7(u uint64, h uint8) uint32 {
31 const prime7bytes = 58295818150454627
32 return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
33 }
34
35 // hash8 returns the hash of u to fit in a hash table with h bits.
36 // Preferably h should be a constant and should always be <64.
37 func hash8(u uint64, h uint8) uint32 {
38 const prime8bytes = 0xcf1bbcdcb7a56463
39 return uint32((u * prime8bytes) >> ((64 - h) & 63))
40 }
41
42 // encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
43 // assumes that the varint-encoded length of the decompressed bytes has already
44 // been written.
45 //
46 // It also assumes that:
47 //
48 // len(dst) >= MaxEncodedLen(len(src)) &&
49 // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
50 func encodeBlockBetterGo(dst, src []byte) (d int) {
51 // sLimit is when to stop looking for offset/length copies. The inputMargin
52 // lets us use a fast path for emitLiteral in the main loop, while we are
53 // looking for copies.
54 sLimit := len(src) - inputMargin
55 if len(src) < minNonLiteralBlockSize {
56 return 0
57 }
58
59 // Initialize the hash tables.
60 const (
61 // Long hash matches.
62 lTableBits = 17
63 maxLTableSize = 1 << lTableBits
64
65 // Short hash matches.
66 sTableBits = 14
67 maxSTableSize = 1 << sTableBits
68 )
69
70 var lTable [maxLTableSize]uint32
71 var sTable [maxSTableSize]uint32
72
73 // Bail if we can't compress to at least this.
74 dstLimit := len(src) - len(src)>>5 - 6
75
76 // nextEmit is where in src the next emitLiteral should start from.
77 nextEmit := 0
78
79 // The encoded form must start with a literal, as there are no previous
80 // bytes to copy, so we start looking for hash matches at s == 1.
81 s := 1
82 cv := load64(src, s)
83
84 // We initialize repeat to 0, so we never match on first attempt
85 repeat := 0
86
87 for {
88 candidateL := 0
89 nextS := 0
90 for {
91 // Next src position to check
92 nextS = s + (s-nextEmit)>>7 + 1
93 if nextS > sLimit {
94 goto emitRemainder
95 }
96 hashL := hash7(cv, lTableBits)
97 hashS := hash4(cv, sTableBits)
98 candidateL = int(lTable[hashL])
99 candidateS := int(sTable[hashS])
100 lTable[hashL] = uint32(s)
101 sTable[hashS] = uint32(s)
102
103 valLong := load64(src, candidateL)
104 valShort := load64(src, candidateS)
105
106 // If long matches at least 8 bytes, use that.
107 if cv == valLong {
108 break
109 }
110 if cv == valShort {
111 candidateL = candidateS
112 break
113 }
114
115 // Check repeat at offset checkRep.
116 const checkRep = 1
117 // Minimum length of a repeat. Tested with various values.
118 // While 4-5 offers improvements in some, 6 reduces
119 // regressions significantly.
120 const wantRepeatBytes = 6
121 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
122 if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
123 base := s + checkRep
124 // Extend back
125 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
126 i--
127 base--
128 }
129 d += emitLiteral(dst[d:], src[nextEmit:base])
130
131 // Extend forward
132 candidate := s - repeat + wantRepeatBytes + checkRep
133 s += wantRepeatBytes + checkRep
134 for s < len(src) {
135 if len(src)-s < 8 {
136 if src[s] == src[candidate] {
137 s++
138 candidate++
139 continue
140 }
141 break
142 }
143 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
144 s += bits.TrailingZeros64(diff) >> 3
145 break
146 }
147 s += 8
148 candidate += 8
149 }
150 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
151 d += emitRepeat(dst[d:], repeat, s-base)
152 nextEmit = s
153 if s >= sLimit {
154 goto emitRemainder
155 }
156 // Index in-between
157 index0 := base + 1
158 index1 := s - 2
159
160 for index0 < index1 {
161 cv0 := load64(src, index0)
162 cv1 := load64(src, index1)
163 lTable[hash7(cv0, lTableBits)] = uint32(index0)
164 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
165
166 lTable[hash7(cv1, lTableBits)] = uint32(index1)
167 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
168 index0 += 2
169 index1 -= 2
170 }
171
172 cv = load64(src, s)
173 continue
174 }
175
176 // Long likely matches 7, so take that.
177 if uint32(cv) == uint32(valLong) {
178 break
179 }
180
181 // Check our short candidate
182 if uint32(cv) == uint32(valShort) {
183 // Try a long candidate at s+1
184 hashL = hash7(cv>>8, lTableBits)
185 candidateL = int(lTable[hashL])
186 lTable[hashL] = uint32(s + 1)
187 if uint32(cv>>8) == load32(src, candidateL) {
188 s++
189 break
190 }
191 // Use our short candidate.
192 candidateL = candidateS
193 break
194 }
195
196 cv = load64(src, nextS)
197 s = nextS
198 }
199
200 // Extend backwards
201 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
202 candidateL--
203 s--
204 }
205
206 // Bail if we exceed the maximum size.
207 if d+(s-nextEmit) > dstLimit {
208 return 0
209 }
210
211 base := s
212 offset := base - candidateL
213
214 // Extend the 4-byte match as long as possible.
215 s += 4
216 candidateL += 4
217 for s < len(src) {
218 if len(src)-s < 8 {
219 if src[s] == src[candidateL] {
220 s++
221 candidateL++
222 continue
223 }
224 break
225 }
226 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
227 s += bits.TrailingZeros64(diff) >> 3
228 break
229 }
230 s += 8
231 candidateL += 8
232 }
233
234 if offset > 65535 && s-base <= 5 && repeat != offset {
235 // Bail if the match is equal or worse to the encoding.
236 s = nextS + 1
237 if s >= sLimit {
238 goto emitRemainder
239 }
240 cv = load64(src, s)
241 continue
242 }
243
244 d += emitLiteral(dst[d:], src[nextEmit:base])
245 if repeat == offset {
246 d += emitRepeat(dst[d:], offset, s-base)
247 } else {
248 d += emitCopy(dst[d:], offset, s-base)
249 repeat = offset
250 }
251
252 nextEmit = s
253 if s >= sLimit {
254 goto emitRemainder
255 }
256
257 if d > dstLimit {
258 // Do we have space for more, if not bail.
259 return 0
260 }
261
262 // Index short & long
263 index0 := base + 1
264 index1 := s - 2
265
266 cv0 := load64(src, index0)
267 cv1 := load64(src, index1)
268 lTable[hash7(cv0, lTableBits)] = uint32(index0)
269 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
270
271 // lTable could be postponed, but very minor difference.
272 lTable[hash7(cv1, lTableBits)] = uint32(index1)
273 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
274 index0 += 1
275 index1 -= 1
276 cv = load64(src, s)
277
278 // Index large values sparsely in between.
279 // We do two starting from different offsets for speed.
280 index2 := (index0 + index1 + 1) >> 1
281 for index2 < index1 {
282 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
283 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
284 index0 += 2
285 index2 += 2
286 }
287 }
288
289 emitRemainder:
290 if nextEmit < len(src) {
291 // Bail if we exceed the maximum size.
292 if d+len(src)-nextEmit > dstLimit {
293 return 0
294 }
295 d += emitLiteral(dst[d:], src[nextEmit:])
296 }
297 return d
298 }
299
300 // encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
301 // assumes that the varint-encoded length of the decompressed bytes has already
302 // been written.
303 //
304 // It also assumes that:
305 //
306 // len(dst) >= MaxEncodedLen(len(src)) &&
307 // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
308 func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
309 // sLimit is when to stop looking for offset/length copies. The inputMargin
310 // lets us use a fast path for emitLiteral in the main loop, while we are
311 // looking for copies.
312 sLimit := len(src) - inputMargin
313 if len(src) < minNonLiteralBlockSize {
314 return 0
315 }
316
317 // Initialize the hash tables.
318 const (
319 // Long hash matches.
320 lTableBits = 16
321 maxLTableSize = 1 << lTableBits
322
323 // Short hash matches.
324 sTableBits = 14
325 maxSTableSize = 1 << sTableBits
326 )
327
328 var lTable [maxLTableSize]uint32
329 var sTable [maxSTableSize]uint32
330
331 // Bail if we can't compress to at least this.
332 dstLimit := len(src) - len(src)>>5 - 6
333
334 // nextEmit is where in src the next emitLiteral should start from.
335 nextEmit := 0
336
337 // The encoded form must start with a literal, as there are no previous
338 // bytes to copy, so we start looking for hash matches at s == 1.
339 s := 1
340 cv := load64(src, s)
341
342 // We initialize repeat to 0, so we never match on first attempt
343 repeat := 0
344 const maxSkip = 100
345
346 for {
347 candidateL := 0
348 nextS := 0
349 for {
350 // Next src position to check
351 nextS = min(s+(s-nextEmit)>>7+1, s+maxSkip)
352
353 if nextS > sLimit {
354 goto emitRemainder
355 }
356 hashL := hash7(cv, lTableBits)
357 hashS := hash4(cv, sTableBits)
358 candidateL = int(lTable[hashL])
359 candidateS := int(sTable[hashS])
360 lTable[hashL] = uint32(s)
361 sTable[hashS] = uint32(s)
362
363 if uint32(cv) == load32(src, candidateL) {
364 break
365 }
366
367 // Check our short candidate
368 if uint32(cv) == load32(src, candidateS) {
369 // Try a long candidate at s+1
370 hashL = hash7(cv>>8, lTableBits)
371 candidateL = int(lTable[hashL])
372 lTable[hashL] = uint32(s + 1)
373 if uint32(cv>>8) == load32(src, candidateL) {
374 s++
375 break
376 }
377 // Use our short candidate.
378 candidateL = candidateS
379 break
380 }
381
382 cv = load64(src, nextS)
383 s = nextS
384 }
385
386 // Extend backwards
387 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
388 candidateL--
389 s--
390 }
391
392 // Bail if we exceed the maximum size.
393 if d+(s-nextEmit) > dstLimit {
394 return 0
395 }
396
397 base := s
398 offset := base - candidateL
399
400 // Extend the 4-byte match as long as possible.
401 s += 4
402 candidateL += 4
403 for s < len(src) {
404 if len(src)-s < 8 {
405 if src[s] == src[candidateL] {
406 s++
407 candidateL++
408 continue
409 }
410 break
411 }
412 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
413 s += bits.TrailingZeros64(diff) >> 3
414 break
415 }
416 s += 8
417 candidateL += 8
418 }
419
420 if offset > 65535 && s-base <= 5 && repeat != offset {
421 // Bail if the match is equal or worse to the encoding.
422 s = nextS + 1
423 if s >= sLimit {
424 goto emitRemainder
425 }
426 cv = load64(src, s)
427 continue
428 }
429
430 d += emitLiteral(dst[d:], src[nextEmit:base])
431 d += emitCopyNoRepeat(dst[d:], offset, s-base)
432 repeat = offset
433
434 nextEmit = s
435 if s >= sLimit {
436 goto emitRemainder
437 }
438
439 if d > dstLimit {
440 // Do we have space for more, if not bail.
441 return 0
442 }
443
444 // Index short & long
445 index0 := base + 1
446 index1 := s - 2
447
448 cv0 := load64(src, index0)
449 cv1 := load64(src, index1)
450 lTable[hash7(cv0, lTableBits)] = uint32(index0)
451 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
452
453 lTable[hash7(cv1, lTableBits)] = uint32(index1)
454 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
455 index0 += 1
456 index1 -= 1
457 cv = load64(src, s)
458
459 // Index large values sparsely in between.
460 // We do two starting from different offsets for speed.
461 index2 := (index0 + index1 + 1) >> 1
462 for index2 < index1 {
463 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
464 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
465 index0 += 2
466 index2 += 2
467 }
468 }
469
470 emitRemainder:
471 if nextEmit < len(src) {
472 // Bail if we exceed the maximum size.
473 if d+len(src)-nextEmit > dstLimit {
474 return 0
475 }
476 d += emitLiteral(dst[d:], src[nextEmit:])
477 }
478 return d
479 }
480
481 func encodeBlockBetterGo64K(dst, src []byte) (d int) {
482 // sLimit is when to stop looking for offset/length copies. The inputMargin
483 // lets us use a fast path for emitLiteral in the main loop, while we are
484 // looking for copies.
485 sLimit := len(src) - inputMargin
486 if len(src) < minNonLiteralBlockSize {
487 return 0
488 }
489 // Initialize the hash tables.
490 // Use smaller tables for smaller blocks
491 const (
492 // Long hash matches.
493 lTableBits = 16
494 maxLTableSize = 1 << lTableBits
495
496 // Short hash matches.
497 sTableBits = 13
498 maxSTableSize = 1 << sTableBits
499 )
500
501 var lTable [maxLTableSize]uint16
502 var sTable [maxSTableSize]uint16
503
504 // Bail if we can't compress to at least this.
505 dstLimit := len(src) - len(src)>>5 - 6
506
507 // nextEmit is where in src the next emitLiteral should start from.
508 nextEmit := 0
509
510 // The encoded form must start with a literal, as there are no previous
511 // bytes to copy, so we start looking for hash matches at s == 1.
512 s := 1
513 cv := load64(src, s)
514
515 // We initialize repeat to 0, so we never match on first attempt
516 repeat := 0
517
518 for {
519 candidateL := 0
520 nextS := 0
521 for {
522 // Next src position to check
523 nextS = s + (s-nextEmit)>>6 + 1
524 if nextS > sLimit {
525 goto emitRemainder
526 }
527 hashL := hash7(cv, lTableBits)
528 hashS := hash4(cv, sTableBits)
529 candidateL = int(lTable[hashL])
530 candidateS := int(sTable[hashS])
531 lTable[hashL] = uint16(s)
532 sTable[hashS] = uint16(s)
533
534 valLong := load64(src, candidateL)
535 valShort := load64(src, candidateS)
536
537 // If long matches at least 8 bytes, use that.
538 if cv == valLong {
539 break
540 }
541 if cv == valShort {
542 candidateL = candidateS
543 break
544 }
545
546 // Check repeat at offset checkRep.
547 const checkRep = 1
548 // Minimum length of a repeat. Tested with various values.
549 // While 4-5 offers improvements in some, 6 reduces
550 // regressions significantly.
551 const wantRepeatBytes = 6
552 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
553 if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
554 base := s + checkRep
555 // Extend back
556 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
557 i--
558 base--
559 }
560 d += emitLiteral(dst[d:], src[nextEmit:base])
561
562 // Extend forward
563 candidate := s - repeat + wantRepeatBytes + checkRep
564 s += wantRepeatBytes + checkRep
565 for s < len(src) {
566 if len(src)-s < 8 {
567 if src[s] == src[candidate] {
568 s++
569 candidate++
570 continue
571 }
572 break
573 }
574 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
575 s += bits.TrailingZeros64(diff) >> 3
576 break
577 }
578 s += 8
579 candidate += 8
580 }
581 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
582 d += emitRepeat(dst[d:], repeat, s-base)
583 nextEmit = s
584 if s >= sLimit {
585 goto emitRemainder
586 }
587 // Index in-between
588 index0 := base + 1
589 index1 := s - 2
590
591 for index0 < index1 {
592 cv0 := load64(src, index0)
593 cv1 := load64(src, index1)
594 lTable[hash7(cv0, lTableBits)] = uint16(index0)
595 sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
596
597 lTable[hash7(cv1, lTableBits)] = uint16(index1)
598 sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
599 index0 += 2
600 index1 -= 2
601 }
602
603 cv = load64(src, s)
604 continue
605 }
606
607 // Long likely matches 7, so take that.
608 if uint32(cv) == uint32(valLong) {
609 break
610 }
611
612 // Check our short candidate
613 if uint32(cv) == uint32(valShort) {
614 // Try a long candidate at s+1
615 hashL = hash7(cv>>8, lTableBits)
616 candidateL = int(lTable[hashL])
617 lTable[hashL] = uint16(s + 1)
618 if uint32(cv>>8) == load32(src, candidateL) {
619 s++
620 break
621 }
622 // Use our short candidate.
623 candidateL = candidateS
624 break
625 }
626
627 cv = load64(src, nextS)
628 s = nextS
629 }
630
631 // Extend backwards
632 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
633 candidateL--
634 s--
635 }
636
637 // Bail if we exceed the maximum size.
638 if d+(s-nextEmit) > dstLimit {
639 return 0
640 }
641
642 base := s
643 offset := base - candidateL
644
645 // Extend the 4-byte match as long as possible.
646 s += 4
647 candidateL += 4
648 for s < len(src) {
649 if len(src)-s < 8 {
650 if src[s] == src[candidateL] {
651 s++
652 candidateL++
653 continue
654 }
655 break
656 }
657 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
658 s += bits.TrailingZeros64(diff) >> 3
659 break
660 }
661 s += 8
662 candidateL += 8
663 }
664
665 d += emitLiteral(dst[d:], src[nextEmit:base])
666 if repeat == offset {
667 d += emitRepeat(dst[d:], offset, s-base)
668 } else {
669 d += emitCopy(dst[d:], offset, s-base)
670 repeat = offset
671 }
672
673 nextEmit = s
674 if s >= sLimit {
675 goto emitRemainder
676 }
677
678 if d > dstLimit {
679 // Do we have space for more, if not bail.
680 return 0
681 }
682
683 // Index short & long
684 index0 := base + 1
685 index1 := s - 2
686
687 cv0 := load64(src, index0)
688 cv1 := load64(src, index1)
689 lTable[hash7(cv0, lTableBits)] = uint16(index0)
690 sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
691
692 // lTable could be postponed, but very minor difference.
693 lTable[hash7(cv1, lTableBits)] = uint16(index1)
694 sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
695 index0 += 1
696 index1 -= 1
697 cv = load64(src, s)
698
699 // Index large values sparsely in between.
700 // We do two starting from different offsets for speed.
701 index2 := (index0 + index1 + 1) >> 1
702 for index2 < index1 {
703 lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0)
704 lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2)
705 index0 += 2
706 index2 += 2
707 }
708 }
709
710 emitRemainder:
711 if nextEmit < len(src) {
712 // Bail if we exceed the maximum size.
713 if d+len(src)-nextEmit > dstLimit {
714 return 0
715 }
716 d += emitLiteral(dst[d:], src[nextEmit:])
717 }
718 return d
719 }
720
721 // encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
722 // assumes that the varint-encoded length of the decompressed bytes has already
723 // been written.
724 //
725 // It also assumes that:
726 //
727 // len(dst) >= MaxEncodedLen(len(src)) &&
728 // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
729 func encodeBlockBetterSnappyGo64K(dst, src []byte) (d int) {
730 // sLimit is when to stop looking for offset/length copies. The inputMargin
731 // lets us use a fast path for emitLiteral in the main loop, while we are
732 // looking for copies.
733 sLimit := len(src) - inputMargin
734 if len(src) < minNonLiteralBlockSize {
735 return 0
736 }
737
738 // Initialize the hash tables.
739 // Use smaller tables for smaller blocks
740 const (
741 // Long hash matches.
742 lTableBits = 15
743 maxLTableSize = 1 << lTableBits
744
745 // Short hash matches.
746 sTableBits = 13
747 maxSTableSize = 1 << sTableBits
748 )
749
750 var lTable [maxLTableSize]uint16
751 var sTable [maxSTableSize]uint16
752
753 // Bail if we can't compress to at least this.
754 dstLimit := len(src) - len(src)>>5 - 6
755
756 // nextEmit is where in src the next emitLiteral should start from.
757 nextEmit := 0
758
759 // The encoded form must start with a literal, as there are no previous
760 // bytes to copy, so we start looking for hash matches at s == 1.
761 s := 1
762 cv := load64(src, s)
763
764 const maxSkip = 100
765
766 for {
767 candidateL := 0
768 nextS := 0
769 for {
770 // Next src position to check
771 nextS = min(s+(s-nextEmit)>>6+1, s+maxSkip)
772
773 if nextS > sLimit {
774 goto emitRemainder
775 }
776 hashL := hash7(cv, lTableBits)
777 hashS := hash4(cv, sTableBits)
778 candidateL = int(lTable[hashL])
779 candidateS := int(sTable[hashS])
780 lTable[hashL] = uint16(s)
781 sTable[hashS] = uint16(s)
782
783 if uint32(cv) == load32(src, candidateL) {
784 break
785 }
786
787 // Check our short candidate
788 if uint32(cv) == load32(src, candidateS) {
789 // Try a long candidate at s+1
790 hashL = hash7(cv>>8, lTableBits)
791 candidateL = int(lTable[hashL])
792 lTable[hashL] = uint16(s + 1)
793 if uint32(cv>>8) == load32(src, candidateL) {
794 s++
795 break
796 }
797 // Use our short candidate.
798 candidateL = candidateS
799 break
800 }
801
802 cv = load64(src, nextS)
803 s = nextS
804 }
805
806 // Extend backwards
807 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
808 candidateL--
809 s--
810 }
811
812 // Bail if we exceed the maximum size.
813 if d+(s-nextEmit) > dstLimit {
814 return 0
815 }
816
817 base := s
818 offset := base - candidateL
819
820 // Extend the 4-byte match as long as possible.
821 s += 4
822 candidateL += 4
823 for s < len(src) {
824 if len(src)-s < 8 {
825 if src[s] == src[candidateL] {
826 s++
827 candidateL++
828 continue
829 }
830 break
831 }
832 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
833 s += bits.TrailingZeros64(diff) >> 3
834 break
835 }
836 s += 8
837 candidateL += 8
838 }
839
840 d += emitLiteral(dst[d:], src[nextEmit:base])
841 d += emitCopyNoRepeat(dst[d:], offset, s-base)
842
843 nextEmit = s
844 if s >= sLimit {
845 goto emitRemainder
846 }
847
848 if d > dstLimit {
849 // Do we have space for more, if not bail.
850 return 0
851 }
852
853 // Index short & long
854 index0 := base + 1
855 index1 := s - 2
856
857 cv0 := load64(src, index0)
858 cv1 := load64(src, index1)
859 lTable[hash7(cv0, lTableBits)] = uint16(index0)
860 sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1)
861
862 lTable[hash7(cv1, lTableBits)] = uint16(index1)
863 sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1)
864 index0 += 1
865 index1 -= 1
866 cv = load64(src, s)
867
868 // Index large values sparsely in between.
869 // We do two starting from different offsets for speed.
870 index2 := (index0 + index1 + 1) >> 1
871 for index2 < index1 {
872 lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0)
873 lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2)
874 index0 += 2
875 index2 += 2
876 }
877 }
878
879 emitRemainder:
880 if nextEmit < len(src) {
881 // Bail if we exceed the maximum size.
882 if d+len(src)-nextEmit > dstLimit {
883 return 0
884 }
885 d += emitLiteral(dst[d:], src[nextEmit:])
886 }
887 return d
888 }
889
890 // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
891 // assumes that the varint-encoded length of the decompressed bytes has already
892 // been written.
893 //
894 // It also assumes that:
895 //
896 // len(dst) >= MaxEncodedLen(len(src)) &&
897 // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
898 func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
899 // sLimit is when to stop looking for offset/length copies. The inputMargin
900 // lets us use a fast path for emitLiteral in the main loop, while we are
901 // looking for copies.
902 // Initialize the hash tables.
903 const (
904 // Long hash matches.
905 lTableBits = 17
906 maxLTableSize = 1 << lTableBits
907
908 // Short hash matches.
909 sTableBits = 14
910 maxSTableSize = 1 << sTableBits
911
912 maxAhead = 8 // maximum bytes ahead without checking sLimit
913
914 debug = false
915 )
916
917 sLimit := min(len(src)-inputMargin, MaxDictSrcOffset-maxAhead)
918 if len(src) < minNonLiteralBlockSize {
919 return 0
920 }
921
922 dict.initBetter()
923
924 var lTable [maxLTableSize]uint32
925 var sTable [maxSTableSize]uint32
926
927 // Bail if we can't compress to at least this.
928 dstLimit := len(src) - len(src)>>5 - 6
929
930 // nextEmit is where in src the next emitLiteral should start from.
931 nextEmit := 0
932
933 // The encoded form must start with a literal, as there are no previous
934 // bytes to copy, so we start looking for hash matches at s == 1.
935 s := 0
936 cv := load64(src, s)
937
938 // We initialize repeat to 0, so we never match on first attempt
939 repeat := len(dict.dict) - dict.repeat
940
941 // While in dict
942 searchDict:
943 for {
944 candidateL := 0
945 nextS := 0
946 for {
947 // Next src position to check
948 nextS = s + (s-nextEmit)>>7 + 1
949 if nextS > sLimit {
950 break searchDict
951 }
952 hashL := hash7(cv, lTableBits)
953 hashS := hash4(cv, sTableBits)
954 candidateL = int(lTable[hashL])
955 candidateS := int(sTable[hashS])
956 dictL := int(dict.betterTableLong[hashL])
957 dictS := int(dict.betterTableShort[hashS])
958 lTable[hashL] = uint32(s)
959 sTable[hashS] = uint32(s)
960
961 valLong := load64(src, candidateL)
962 valShort := load64(src, candidateS)
963
964 // If long matches at least 8 bytes, use that.
965 if s != 0 {
966 if cv == valLong {
967 goto emitMatch
968 }
969 if cv == valShort {
970 candidateL = candidateS
971 goto emitMatch
972 }
973 }
974
975 // Check dict repeat.
976 if repeat >= s+4 {
977 candidate := len(dict.dict) - repeat + s
978 if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
979 // Extend back
980 base := s
981 for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
982 i--
983 base--
984 }
985 d += emitLiteral(dst[d:], src[nextEmit:base])
986 if debug && nextEmit != base {
987 fmt.Println("emitted ", base-nextEmit, "literals")
988 }
989 s += 4
990 candidate += 4
991 for candidate < len(dict.dict)-8 && s <= len(src)-8 {
992 if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
993 s += bits.TrailingZeros64(diff) >> 3
994 break
995 }
996 s += 8
997 candidate += 8
998 }
999 d += emitRepeat(dst[d:], repeat, s-base)
1000 if debug {
1001 fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
1002 }
1003 nextEmit = s
1004 if s >= sLimit {
1005 break searchDict
1006 }
1007 // Index in-between
1008 index0 := base + 1
1009 index1 := s - 2
1010
1011 cv = load64(src, s)
1012 for index0 < index1 {
1013 cv0 := load64(src, index0)
1014 cv1 := load64(src, index1)
1015 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1016 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1017
1018 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1019 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1020 index0 += 2
1021 index1 -= 2
1022 }
1023 continue
1024 }
1025 }
1026 // Don't try to find match at s==0
1027 if s == 0 {
1028 cv = load64(src, nextS)
1029 s = nextS
1030 continue
1031 }
1032
1033 // Long likely matches 7, so take that.
1034 if uint32(cv) == uint32(valLong) {
1035 goto emitMatch
1036 }
1037
1038 // Long dict...
1039 if uint32(cv) == load32(dict.dict, dictL) {
1040 candidateL = dictL
1041 goto emitDict
1042 }
1043
1044 // Check our short candidate
1045 if uint32(cv) == uint32(valShort) {
1046 // Try a long candidate at s+1
1047 hashL = hash7(cv>>8, lTableBits)
1048 candidateL = int(lTable[hashL])
1049 lTable[hashL] = uint32(s + 1)
1050 if uint32(cv>>8) == load32(src, candidateL) {
1051 s++
1052 goto emitMatch
1053 }
1054 // Use our short candidate.
1055 candidateL = candidateS
1056 goto emitMatch
1057 }
1058 if uint32(cv) == load32(dict.dict, dictS) {
1059 // Try a long candidate at s+1
1060 hashL = hash7(cv>>8, lTableBits)
1061 candidateL = int(lTable[hashL])
1062 lTable[hashL] = uint32(s + 1)
1063 if uint32(cv>>8) == load32(src, candidateL) {
1064 s++
1065 goto emitMatch
1066 }
1067 candidateL = dictS
1068 goto emitDict
1069 }
1070 cv = load64(src, nextS)
1071 s = nextS
1072 }
1073 emitDict:
1074 {
1075 if debug {
1076 if load32(dict.dict, candidateL) != load32(src, s) {
1077 panic("dict emit mismatch")
1078 }
1079 }
1080 // Extend backwards.
1081 // The top bytes will be rechecked to get the full match.
1082 for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
1083 candidateL--
1084 s--
1085 }
1086
1087 // Bail if we exceed the maximum size.
1088 if d+(s-nextEmit) > dstLimit {
1089 return 0
1090 }
1091
1092 // A 4-byte match has been found. We'll later see if more than 4 bytes
1093 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
1094 // them as literal bytes.
1095
1096 d += emitLiteral(dst[d:], src[nextEmit:s])
1097 if debug && nextEmit != s {
1098 fmt.Println("emitted ", s-nextEmit, "literals")
1099 }
1100 {
1101 // Invariant: we have a 4-byte match at s, and no need to emit any
1102 // literal bytes prior to s.
1103 base := s
1104 offset := s + (len(dict.dict)) - candidateL
1105
1106 // Extend the 4-byte match as long as possible.
1107 s += 4
1108 candidateL += 4
1109 for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
1110 if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
1111 s += bits.TrailingZeros64(diff) >> 3
1112 break
1113 }
1114 s += 8
1115 candidateL += 8
1116 }
1117
1118 if repeat == offset {
1119 if debug {
1120 fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
1121 }
1122 d += emitRepeat(dst[d:], offset, s-base)
1123 } else {
1124 if debug {
1125 fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
1126 }
1127 // Matches longer than 64 are split.
1128 if s <= sLimit || s-base < 8 {
1129 d += emitCopy(dst[d:], offset, s-base)
1130 } else {
1131 // Split to ensure we don't start a copy within next block.
1132 d += emitCopy(dst[d:], offset, 4)
1133 d += emitRepeat(dst[d:], offset, s-base-4)
1134 }
1135 repeat = offset
1136 }
1137 if false {
1138 // Validate match.
1139 if s <= candidateL {
1140 panic("s <= candidate")
1141 }
1142 a := src[base:s]
1143 b := dict.dict[base-repeat : base-repeat+(s-base)]
1144 if !bytes.Equal(a, b) {
1145 panic("mismatch")
1146 }
1147 }
1148
1149 nextEmit = s
1150 if s >= sLimit {
1151 break searchDict
1152 }
1153
1154 if d > dstLimit {
1155 // Do we have space for more, if not bail.
1156 return 0
1157 }
1158
1159 // Index short & long
1160 index0 := base + 1
1161 index1 := s - 2
1162
1163 cv0 := load64(src, index0)
1164 cv1 := load64(src, index1)
1165 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1166 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1167
1168 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1169 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1170 index0 += 1
1171 index1 -= 1
1172 cv = load64(src, s)
1173
1174 // index every second long in between.
1175 for index0 < index1 {
1176 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1177 lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
1178 index0 += 2
1179 index1 -= 2
1180 }
1181 }
1182 continue
1183 }
1184 emitMatch:
1185
1186 // Extend backwards
1187 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
1188 candidateL--
1189 s--
1190 }
1191
1192 // Bail if we exceed the maximum size.
1193 if d+(s-nextEmit) > dstLimit {
1194 return 0
1195 }
1196
1197 base := s
1198 offset := base - candidateL
1199
1200 // Extend the 4-byte match as long as possible.
1201 s += 4
1202 candidateL += 4
1203 for s < len(src) {
1204 if len(src)-s < 8 {
1205 if src[s] == src[candidateL] {
1206 s++
1207 candidateL++
1208 continue
1209 }
1210 break
1211 }
1212 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
1213 s += bits.TrailingZeros64(diff) >> 3
1214 break
1215 }
1216 s += 8
1217 candidateL += 8
1218 }
1219
1220 if offset > 65535 && s-base <= 5 && repeat != offset {
1221 // Bail if the match is equal or worse to the encoding.
1222 s = nextS + 1
1223 if s >= sLimit {
1224 goto emitRemainder
1225 }
1226 cv = load64(src, s)
1227 continue
1228 }
1229
1230 d += emitLiteral(dst[d:], src[nextEmit:base])
1231 if debug && nextEmit != s {
1232 fmt.Println("emitted ", s-nextEmit, "literals")
1233 }
1234 if repeat == offset {
1235 if debug {
1236 fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
1237 }
1238 d += emitRepeat(dst[d:], offset, s-base)
1239 } else {
1240 if debug {
1241 fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
1242 }
1243 d += emitCopy(dst[d:], offset, s-base)
1244 repeat = offset
1245 }
1246
1247 nextEmit = s
1248 if s >= sLimit {
1249 goto emitRemainder
1250 }
1251
1252 if d > dstLimit {
1253 // Do we have space for more, if not bail.
1254 return 0
1255 }
1256
1257 // Index short & long
1258 index0 := base + 1
1259 index1 := s - 2
1260
1261 cv0 := load64(src, index0)
1262 cv1 := load64(src, index1)
1263 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1264 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1265
1266 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1267 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1268 index0 += 1
1269 index1 -= 1
1270 cv = load64(src, s)
1271
1272 // Index large values sparsely in between.
1273 // We do two starting from different offsets for speed.
1274 index2 := (index0 + index1 + 1) >> 1
1275 for index2 < index1 {
1276 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1277 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
1278 index0 += 2
1279 index2 += 2
1280 }
1281 }
1282
1283 // Search without dict:
1284 if repeat > s {
1285 repeat = 0
1286 }
1287
1288 // No more dict
1289 sLimit = len(src) - inputMargin
1290 if s >= sLimit {
1291 goto emitRemainder
1292 }
1293 cv = load64(src, s)
1294 if debug {
1295 fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
1296 }
1297 for {
1298 candidateL := 0
1299 nextS := 0
1300 for {
1301 // Next src position to check
1302 nextS = s + (s-nextEmit)>>7 + 1
1303 if nextS > sLimit {
1304 goto emitRemainder
1305 }
1306 hashL := hash7(cv, lTableBits)
1307 hashS := hash4(cv, sTableBits)
1308 candidateL = int(lTable[hashL])
1309 candidateS := int(sTable[hashS])
1310 lTable[hashL] = uint32(s)
1311 sTable[hashS] = uint32(s)
1312
1313 valLong := load64(src, candidateL)
1314 valShort := load64(src, candidateS)
1315
1316 // If long matches at least 8 bytes, use that.
1317 if cv == valLong {
1318 break
1319 }
1320 if cv == valShort {
1321 candidateL = candidateS
1322 break
1323 }
1324
1325 // Check repeat at offset checkRep.
1326 const checkRep = 1
1327 // Minimum length of a repeat. Tested with various values.
1328 // While 4-5 offers improvements in some, 6 reduces
1329 // regressions significantly.
1330 const wantRepeatBytes = 6
1331 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
1332 if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
1333 base := s + checkRep
1334 // Extend back
1335 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
1336 i--
1337 base--
1338 }
1339 d += emitLiteral(dst[d:], src[nextEmit:base])
1340
1341 // Extend forward
1342 candidate := s - repeat + wantRepeatBytes + checkRep
1343 s += wantRepeatBytes + checkRep
1344 for s < len(src) {
1345 if len(src)-s < 8 {
1346 if src[s] == src[candidate] {
1347 s++
1348 candidate++
1349 continue
1350 }
1351 break
1352 }
1353 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
1354 s += bits.TrailingZeros64(diff) >> 3
1355 break
1356 }
1357 s += 8
1358 candidate += 8
1359 }
1360 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
1361 d += emitRepeat(dst[d:], repeat, s-base)
1362 nextEmit = s
1363 if s >= sLimit {
1364 goto emitRemainder
1365 }
1366 // Index in-between
1367 index0 := base + 1
1368 index1 := s - 2
1369
1370 for index0 < index1 {
1371 cv0 := load64(src, index0)
1372 cv1 := load64(src, index1)
1373 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1374 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1375
1376 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1377 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1378 index0 += 2
1379 index1 -= 2
1380 }
1381
1382 cv = load64(src, s)
1383 continue
1384 }
1385
1386 // Long likely matches 7, so take that.
1387 if uint32(cv) == uint32(valLong) {
1388 break
1389 }
1390
1391 // Check our short candidate
1392 if uint32(cv) == uint32(valShort) {
1393 // Try a long candidate at s+1
1394 hashL = hash7(cv>>8, lTableBits)
1395 candidateL = int(lTable[hashL])
1396 lTable[hashL] = uint32(s + 1)
1397 if uint32(cv>>8) == load32(src, candidateL) {
1398 s++
1399 break
1400 }
1401 // Use our short candidate.
1402 candidateL = candidateS
1403 break
1404 }
1405
1406 cv = load64(src, nextS)
1407 s = nextS
1408 }
1409
1410 // Extend backwards
1411 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
1412 candidateL--
1413 s--
1414 }
1415
1416 // Bail if we exceed the maximum size.
1417 if d+(s-nextEmit) > dstLimit {
1418 return 0
1419 }
1420
1421 base := s
1422 offset := base - candidateL
1423
1424 // Extend the 4-byte match as long as possible.
1425 s += 4
1426 candidateL += 4
1427 for s < len(src) {
1428 if len(src)-s < 8 {
1429 if src[s] == src[candidateL] {
1430 s++
1431 candidateL++
1432 continue
1433 }
1434 break
1435 }
1436 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
1437 s += bits.TrailingZeros64(diff) >> 3
1438 break
1439 }
1440 s += 8
1441 candidateL += 8
1442 }
1443
1444 if offset > 65535 && s-base <= 5 && repeat != offset {
1445 // Bail if the match is equal or worse to the encoding.
1446 s = nextS + 1
1447 if s >= sLimit {
1448 goto emitRemainder
1449 }
1450 cv = load64(src, s)
1451 continue
1452 }
1453
1454 d += emitLiteral(dst[d:], src[nextEmit:base])
1455 if repeat == offset {
1456 d += emitRepeat(dst[d:], offset, s-base)
1457 } else {
1458 d += emitCopy(dst[d:], offset, s-base)
1459 repeat = offset
1460 }
1461
1462 nextEmit = s
1463 if s >= sLimit {
1464 goto emitRemainder
1465 }
1466
1467 if d > dstLimit {
1468 // Do we have space for more, if not bail.
1469 return 0
1470 }
1471
1472 // Index short & long
1473 index0 := base + 1
1474 index1 := s - 2
1475
1476 cv0 := load64(src, index0)
1477 cv1 := load64(src, index1)
1478 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1479 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1480
1481 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1482 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1483 index0 += 1
1484 index1 -= 1
1485 cv = load64(src, s)
1486
1487 // Index large values sparsely in between.
1488 // We do two starting from different offsets for speed.
1489 index2 := (index0 + index1 + 1) >> 1
1490 for index2 < index1 {
1491 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1492 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
1493 index0 += 2
1494 index2 += 2
1495 }
1496 }
1497
1498 emitRemainder:
1499 if nextEmit < len(src) {
1500 // Bail if we exceed the maximum size.
1501 if d+len(src)-nextEmit > dstLimit {
1502 return 0
1503 }
1504 d += emitLiteral(dst[d:], src[nextEmit:])
1505 }
1506 return d
1507 }
1508