1 // Copyright 2016 The Snappy-Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package snapref
6 7 func load32(b []byte, i int) uint32 {
8 b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
9 return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
10 }
11 12 func load64(b []byte, i int) uint64 {
13 b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
14 return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
15 uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
16 }
17 18 // emitLiteral writes a literal chunk and returns the number of bytes written.
19 //
20 // It assumes that:
21 //
22 // dst is long enough to hold the encoded bytes
23 // 1 <= len(lit) && len(lit) <= 65536
24 func emitLiteral(dst, lit []byte) int {
25 i, n := 0, uint(len(lit)-1)
26 switch {
27 case n < 60:
28 dst[0] = uint8(n)<<2 | tagLiteral
29 i = 1
30 case n < 1<<8:
31 dst[0] = 60<<2 | tagLiteral
32 dst[1] = uint8(n)
33 i = 2
34 default:
35 dst[0] = 61<<2 | tagLiteral
36 dst[1] = uint8(n)
37 dst[2] = uint8(n >> 8)
38 i = 3
39 }
40 return i + copy(dst[i:], lit)
41 }
42 43 // emitCopy writes a copy chunk and returns the number of bytes written.
44 //
45 // It assumes that:
46 //
47 // dst is long enough to hold the encoded bytes
48 // 1 <= offset && offset <= 65535
49 // 4 <= length && length <= 65535
50 func emitCopy(dst []byte, offset, length int) int {
51 i := 0
52 // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
53 // threshold for this loop is a little higher (at 68 = 64 + 4), and the
54 // length emitted down below is a little lower (at 60 = 64 - 4), because
55 // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
56 // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
57 // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
58 // 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
59 // tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
60 // encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
61 for length >= 68 {
62 // Emit a length 64 copy, encoded as 3 bytes.
63 dst[i+0] = 63<<2 | tagCopy2
64 dst[i+1] = uint8(offset)
65 dst[i+2] = uint8(offset >> 8)
66 i += 3
67 length -= 64
68 }
69 if length > 64 {
70 // Emit a length 60 copy, encoded as 3 bytes.
71 dst[i+0] = 59<<2 | tagCopy2
72 dst[i+1] = uint8(offset)
73 dst[i+2] = uint8(offset >> 8)
74 i += 3
75 length -= 60
76 }
77 if length >= 12 || offset >= 2048 {
78 // Emit the remaining copy, encoded as 3 bytes.
79 dst[i+0] = uint8(length-1)<<2 | tagCopy2
80 dst[i+1] = uint8(offset)
81 dst[i+2] = uint8(offset >> 8)
82 return i + 3
83 }
84 // Emit the remaining copy, encoded as 2 bytes.
85 dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
86 dst[i+1] = uint8(offset)
87 return i + 2
88 }
89 90 func hash(u, shift uint32) uint32 {
91 return (u * 0x1e35a7bd) >> shift
92 }
93 94 // EncodeBlockInto exposes encodeBlock but checks dst size.
95 func EncodeBlockInto(dst, src []byte) (d int) {
96 if MaxEncodedLen(len(src)) > len(dst) {
97 return 0
98 }
99 100 // encodeBlock breaks on too big blocks, so split.
101 for len(src) > 0 {
102 p := src
103 src = nil
104 if len(p) > maxBlockSize {
105 p, src = p[:maxBlockSize], p[maxBlockSize:]
106 }
107 if len(p) < minNonLiteralBlockSize {
108 d += emitLiteral(dst[d:], p)
109 } else {
110 d += encodeBlock(dst[d:], p)
111 }
112 }
113 return d
114 }
115 116 // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
117 // assumes that the varint-encoded length of the decompressed bytes has already
118 // been written.
119 //
120 // It also assumes that:
121 //
122 // len(dst) >= MaxEncodedLen(len(src)) &&
123 // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
124 func encodeBlock(dst, src []byte) (d int) {
125 // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
126 // The table element type is uint16, as s < sLimit and sLimit < len(src)
127 // and len(src) <= maxBlockSize and maxBlockSize == 65536.
128 const (
129 maxTableSize = 1 << 14
130 // tableMask is redundant, but helps the compiler eliminate bounds
131 // checks.
132 tableMask = maxTableSize - 1
133 )
134 shift := uint32(32 - 8)
135 for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
136 shift--
137 }
138 // In Go, all array elements are zero-initialized, so there is no advantage
139 // to a smaller tableSize per se. However, it matches the C++ algorithm,
140 // and in the asm versions of this code, we can get away with zeroing only
141 // the first tableSize elements.
142 var table [maxTableSize]uint16
143 144 // sLimit is when to stop looking for offset/length copies. The inputMargin
145 // lets us use a fast path for emitLiteral in the main loop, while we are
146 // looking for copies.
147 sLimit := len(src) - inputMargin
148 149 // nextEmit is where in src the next emitLiteral should start from.
150 nextEmit := 0
151 152 // The encoded form must start with a literal, as there are no previous
153 // bytes to copy, so we start looking for hash matches at s == 1.
154 s := 1
155 nextHash := hash(load32(src, s), shift)
156 157 for {
158 // Copied from the C++ snappy implementation:
159 //
160 // Heuristic match skipping: If 32 bytes are scanned with no matches
161 // found, start looking only at every other byte. If 32 more bytes are
162 // scanned (or skipped), look at every third byte, etc.. When a match
163 // is found, immediately go back to looking at every byte. This is a
164 // small loss (~5% performance, ~0.1% density) for compressible data
165 // due to more bookkeeping, but for non-compressible data (such as
166 // JPEG) it's a huge win since the compressor quickly "realizes" the
167 // data is incompressible and doesn't bother looking for matches
168 // everywhere.
169 //
170 // The "skip" variable keeps track of how many bytes there are since
171 // the last match; dividing it by 32 (ie. right-shifting by five) gives
172 // the number of bytes to move ahead for each iteration.
173 skip := 32
174 175 nextS := s
176 candidate := 0
177 for {
178 s = nextS
179 bytesBetweenHashLookups := skip >> 5
180 nextS = s + bytesBetweenHashLookups
181 skip += bytesBetweenHashLookups
182 if nextS > sLimit {
183 goto emitRemainder
184 }
185 candidate = int(table[nextHash&tableMask])
186 table[nextHash&tableMask] = uint16(s)
187 nextHash = hash(load32(src, nextS), shift)
188 if load32(src, s) == load32(src, candidate) {
189 break
190 }
191 }
192 193 // A 4-byte match has been found. We'll later see if more than 4 bytes
194 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
195 // them as literal bytes.
196 d += emitLiteral(dst[d:], src[nextEmit:s])
197 198 // Call emitCopy, and then see if another emitCopy could be our next
199 // move. Repeat until we find no match for the input immediately after
200 // what was consumed by the last emitCopy call.
201 //
202 // If we exit this loop normally then we need to call emitLiteral next,
203 // though we don't yet know how big the literal will be. We handle that
204 // by proceeding to the next iteration of the main loop. We also can
205 // exit this loop via goto if we get close to exhausting the input.
206 for {
207 // Invariant: we have a 4-byte match at s, and no need to emit any
208 // literal bytes prior to s.
209 base := s
210 211 // Extend the 4-byte match as long as possible.
212 //
213 // This is an inlined version of:
214 // s = extendMatch(src, candidate+4, s+4)
215 s += 4
216 for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
217 }
218 219 d += emitCopy(dst[d:], base-candidate, s-base)
220 nextEmit = s
221 if s >= sLimit {
222 goto emitRemainder
223 }
224 225 // We could immediately start working at s now, but to improve
226 // compression we first update the hash table at s-1 and at s. If
227 // another emitCopy is not our next move, also calculate nextHash
228 // at s+1. At least on GOARCH=amd64, these three hash calculations
229 // are faster as one load64 call (with some shifts) instead of
230 // three load32 calls.
231 x := load64(src, s-1)
232 prevHash := hash(uint32(x>>0), shift)
233 table[prevHash&tableMask] = uint16(s - 1)
234 currHash := hash(uint32(x>>8), shift)
235 candidate = int(table[currHash&tableMask])
236 table[currHash&tableMask] = uint16(s)
237 if uint32(x>>8) != load32(src, candidate) {
238 nextHash = hash(uint32(x>>16), shift)
239 s++
240 break
241 }
242 }
243 }
244 245 emitRemainder:
246 if nextEmit < len(src) {
247 d += emitLiteral(dst[d:], src[nextEmit:])
248 }
249 return d
250 }
251