1 // Copyright 2011 The Snappy-Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package snapref
6 7 import (
8 "encoding/binary"
9 "errors"
10 "io"
11 )
12 13 // Encode returns the encoded form of src. The returned slice may be a sub-
14 // slice of dst if dst was large enough to hold the entire encoded block.
15 // Otherwise, a newly allocated slice will be returned.
16 //
17 // The dst and src must not overlap. It is valid to pass a nil dst.
18 //
19 // Encode handles the Snappy block format, not the Snappy stream format.
20 func Encode(dst, src []byte) []byte {
21 if n := MaxEncodedLen(len(src)); n < 0 {
22 panic(ErrTooLarge)
23 } else if cap(dst) < n {
24 dst = make([]byte, n)
25 } else {
26 dst = dst[:n]
27 }
28 29 // The block starts with the varint-encoded length of the decompressed bytes.
30 d := binary.PutUvarint(dst, uint64(len(src)))
31 32 for len(src) > 0 {
33 p := src
34 src = nil
35 if len(p) > maxBlockSize {
36 p, src = p[:maxBlockSize], p[maxBlockSize:]
37 }
38 if len(p) < minNonLiteralBlockSize {
39 d += emitLiteral(dst[d:], p)
40 } else {
41 d += encodeBlock(dst[d:], p)
42 }
43 }
44 return dst[:d]
45 }
46 47 // inputMargin is the minimum number of extra input bytes to keep, inside
48 // encodeBlock's inner loop. On some architectures, this margin lets us
49 // implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
50 // literals can be implemented as a single load to and store from a 16-byte
51 // register. That literal's actual length can be as short as 1 byte, so this
52 // can copy up to 15 bytes too much, but that's OK as subsequent iterations of
53 // the encoding loop will fix up the copy overrun, and this inputMargin ensures
54 // that we don't overrun the dst and src buffers.
55 const inputMargin = 16 - 1
56 57 // minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
58 // could be encoded with a copy tag. This is the minimum with respect to the
59 // algorithm used by encodeBlock, not a minimum enforced by the file format.
60 //
61 // The encoded output must start with at least a 1 byte literal, as there are
62 // no previous bytes to copy. A minimal (1 byte) copy after that, generated
63 // from an emitCopy call in encodeBlock's main loop, would require at least
64 // another inputMargin bytes, for the reason above: we want any emitLiteral
65 // calls inside encodeBlock's main loop to use the fast path if possible, which
66 // requires being able to overrun by inputMargin bytes. Thus,
67 // minNonLiteralBlockSize equals 1 + 1 + inputMargin.
68 //
69 // The C++ code doesn't use this exact threshold, but it could, as discussed at
70 // https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
71 // The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
72 // optimization. It should not affect the encoded form. This is tested by
73 // TestSameEncodingAsCppShortCopies.
74 const minNonLiteralBlockSize = 1 + 1 + inputMargin
75 76 // MaxEncodedLen returns the maximum length of a snappy block, given its
77 // uncompressed length.
78 //
79 // It will return a negative value if srcLen is too large to encode.
80 func MaxEncodedLen(srcLen int) int {
81 n := uint64(srcLen)
82 if n > 0xffffffff {
83 return -1
84 }
85 // Compressed data can be defined as:
86 // compressed := item* literal*
87 // item := literal* copy
88 //
89 // The trailing literal sequence has a space blowup of at most 62/60
90 // since a literal of length 60 needs one tag byte + one extra byte
91 // for length information.
92 //
93 // Item blowup is trickier to measure. Suppose the "copy" op copies
94 // 4 bytes of data. Because of a special check in the encoding code,
95 // we produce a 4-byte copy only if the offset is < 65536. Therefore
96 // the copy op takes 3 bytes to encode, and this type of item leads
97 // to at most the 62/60 blowup for representing literals.
98 //
99 // Suppose the "copy" op copies 5 bytes of data. If the offset is big
100 // enough, it will take 5 bytes to encode the copy op. Therefore the
101 // worst case here is a one-byte literal followed by a five-byte copy.
102 // That is, 6 bytes of input turn into 7 bytes of "compressed" data.
103 //
104 // This last factor dominates the blowup, so the final estimate is:
105 n = 32 + n + n/6
106 if n > 0xffffffff {
107 return -1
108 }
109 return int(n)
110 }
111 112 var errClosed = errors.New("snappy: Writer is closed")
113 114 // NewWriter returns a new Writer that compresses to w.
115 //
116 // The Writer returned does not buffer writes. There is no need to Flush or
117 // Close such a Writer.
118 //
119 // Deprecated: the Writer returned is not suitable for many small writes, only
120 // for few large writes. Use NewBufferedWriter instead, which is efficient
121 // regardless of the frequency and shape of the writes, and remember to Close
122 // that Writer when done.
123 func NewWriter(w io.Writer) *Writer {
124 return &Writer{
125 w: w,
126 obuf: make([]byte, obufLen),
127 }
128 }
129 130 // NewBufferedWriter returns a new Writer that compresses to w, using the
131 // framing format described at
132 // https://github.com/google/snappy/blob/master/framing_format.txt
133 //
134 // The Writer returned buffers writes. Users must call Close to guarantee all
135 // data has been forwarded to the underlying io.Writer. They may also call
136 // Flush zero or more times before calling Close.
137 func NewBufferedWriter(w io.Writer) *Writer {
138 return &Writer{
139 w: w,
140 ibuf: make([]byte, 0, maxBlockSize),
141 obuf: make([]byte, obufLen),
142 }
143 }
144 145 // Writer is an io.Writer that can write Snappy-compressed bytes.
146 //
147 // Writer handles the Snappy stream format, not the Snappy block format.
148 type Writer struct {
149 w io.Writer
150 err error
151 152 // ibuf is a buffer for the incoming (uncompressed) bytes.
153 //
154 // Its use is optional. For backwards compatibility, Writers created by the
155 // NewWriter function have ibuf == nil, do not buffer incoming bytes, and
156 // therefore do not need to be Flush'ed or Close'd.
157 ibuf []byte
158 159 // obuf is a buffer for the outgoing (compressed) bytes.
160 obuf []byte
161 162 // wroteStreamHeader is whether we have written the stream header.
163 wroteStreamHeader bool
164 }
165 166 // Reset discards the writer's state and switches the Snappy writer to write to
167 // w. This permits reusing a Writer rather than allocating a new one.
168 func (w *Writer) Reset(writer io.Writer) {
169 w.w = writer
170 w.err = nil
171 if w.ibuf != nil {
172 w.ibuf = w.ibuf[:0]
173 }
174 w.wroteStreamHeader = false
175 }
176 177 // Write satisfies the io.Writer interface.
178 func (w *Writer) Write(p []byte) (nRet int, errRet error) {
179 if w.ibuf == nil {
180 // Do not buffer incoming bytes. This does not perform or compress well
181 // if the caller of Writer.Write writes many small slices. This
182 // behavior is therefore deprecated, but still supported for backwards
183 // compatibility with code that doesn't explicitly Flush or Close.
184 return w.write(p)
185 }
186 187 // The remainder of this method is based on bufio.Writer.Write from the
188 // standard library.
189 190 for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
191 var n int
192 if len(w.ibuf) == 0 {
193 // Large write, empty buffer.
194 // Write directly from p to avoid copy.
195 n, _ = w.write(p)
196 } else {
197 n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
198 w.ibuf = w.ibuf[:len(w.ibuf)+n]
199 w.Flush()
200 }
201 nRet += n
202 p = p[n:]
203 }
204 if w.err != nil {
205 return nRet, w.err
206 }
207 n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
208 w.ibuf = w.ibuf[:len(w.ibuf)+n]
209 nRet += n
210 return nRet, nil
211 }
212 213 func (w *Writer) write(p []byte) (nRet int, errRet error) {
214 if w.err != nil {
215 return 0, w.err
216 }
217 for len(p) > 0 {
218 obufStart := len(magicChunk)
219 if !w.wroteStreamHeader {
220 w.wroteStreamHeader = true
221 copy(w.obuf, magicChunk)
222 obufStart = 0
223 }
224 225 var uncompressed []byte
226 if len(p) > maxBlockSize {
227 uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
228 } else {
229 uncompressed, p = p, nil
230 }
231 checksum := crc(uncompressed)
232 233 // Compress the buffer, discarding the result if the improvement
234 // isn't at least 12.5%.
235 compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
236 chunkType := uint8(chunkTypeCompressedData)
237 chunkLen := 4 + len(compressed)
238 obufEnd := obufHeaderLen + len(compressed)
239 if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
240 chunkType = chunkTypeUncompressedData
241 chunkLen = 4 + len(uncompressed)
242 obufEnd = obufHeaderLen
243 }
244 245 // Fill in the per-chunk header that comes before the body.
246 w.obuf[len(magicChunk)+0] = chunkType
247 w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
248 w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
249 w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
250 w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
251 w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
252 w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
253 w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
254 255 if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
256 w.err = err
257 return nRet, err
258 }
259 if chunkType == chunkTypeUncompressedData {
260 if _, err := w.w.Write(uncompressed); err != nil {
261 w.err = err
262 return nRet, err
263 }
264 }
265 nRet += len(uncompressed)
266 }
267 return nRet, nil
268 }
269 270 // Flush flushes the Writer to its underlying io.Writer.
271 func (w *Writer) Flush() error {
272 if w.err != nil {
273 return w.err
274 }
275 if len(w.ibuf) == 0 {
276 return nil
277 }
278 w.write(w.ibuf)
279 w.ibuf = w.ibuf[:0]
280 return w.err
281 }
282 283 // Close calls Flush and then closes the Writer.
284 func (w *Writer) Close() error {
285 w.Flush()
286 ret := w.err
287 if w.err == nil {
288 w.err = errClosed
289 }
290 return ret
291 }
292