1 // Copyright 2011 The Snappy-Go Authors. All rights reserved.
2 // Copyright (c) 2019 Klaus Post. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 6 // Package s2 implements the S2 compression format.
7 //
8 // S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput,
9 // which is why it features concurrent compression for bigger payloads.
10 //
11 // Decoding is compatible with Snappy compressed content,
12 // but content compressed with S2 cannot be decompressed by Snappy.
13 //
14 // For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
15 //
16 // There are actually two S2 formats: block and stream. They are related,
17 // but different: trying to decompress block-compressed data as a S2 stream
18 // will fail, and vice versa. The block format is the Decode and Encode
19 // functions and the stream format is the Reader and Writer types.
20 //
21 // A "better" compression option is available. This will trade some compression
22 // speed
23 //
24 // The block format, the more common case, is used when the complete size (the
25 // number of bytes) of the original data is known upfront, at the time
26 // compression starts. The stream format, also known as the framing format, is
27 // for when that isn't always true.
28 //
29 // Blocks to not offer much data protection, so it is up to you to
30 // add data validation of decompressed blocks.
31 //
32 // Streams perform CRC validation of the decompressed data.
33 // Stream compression will also be performed on multiple CPU cores concurrently
34 // significantly improving throughput.
35 package s2
36 37 import (
38 "bytes"
39 "hash/crc32"
40 41 "github.com/klauspost/compress/internal/race"
42 )
43 44 /*
45 Each encoded block begins with the varint-encoded length of the decoded data,
46 followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
47 first byte of each chunk is broken into its 2 least and 6 most significant bits
48 called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
49 Zero means a literal tag. All other values mean a copy tag.
50 51 For literal tags:
52 - If m < 60, the next 1 + m bytes are literal bytes.
53 - Otherwise, let n be the little-endian unsigned integer denoted by the next
54 m - 59 bytes. The next 1 + n bytes after that are literal bytes.
55 56 For copy tags, length bytes are copied from offset bytes ago, in the style of
57 Lempel-Ziv compression algorithms. In particular:
58 - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
59 The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
60 of the offset. The next byte is bits 0-7 of the offset.
61 - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
62 The length is 1 + m. The offset is the little-endian unsigned integer
63 denoted by the next 2 bytes.
64 - For l == 3, the offset ranges in [0, 1<<32) and the length in
65 [1, 65). The length is 1 + m. The offset is the little-endian unsigned
66 integer denoted by the next 4 bytes.
67 */
68 const (
69 tagLiteral = 0x00
70 tagCopy1 = 0x01
71 tagCopy2 = 0x02
72 tagCopy4 = 0x03
73 )
74 75 const (
76 checksumSize = 4
77 chunkHeaderSize = 4
78 magicChunk = "\xff\x06\x00\x00" + magicBody
79 magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
80 magicBodySnappy = "sNaPpY"
81 magicBody = "S2sTwO"
82 83 // maxBlockSize is the maximum size of the input to encodeBlock.
84 //
85 // For the framing format (Writer type instead of Encode function),
86 // this is the maximum uncompressed size of a block.
87 maxBlockSize = 4 << 20
88 89 // minBlockSize is the minimum size of block setting when creating a writer.
90 minBlockSize = 4 << 10
91 92 skippableFrameHeader = 4
93 maxChunkSize = 1<<24 - 1 // 16777215
94 95 // Default block size
96 defaultBlockSize = 1 << 20
97 98 // maxSnappyBlockSize is the maximum snappy block size.
99 maxSnappyBlockSize = 1 << 16
100 101 obufHeaderLen = checksumSize + chunkHeaderSize
102 )
103 104 const (
105 chunkTypeCompressedData = 0x00
106 chunkTypeUncompressedData = 0x01
107 ChunkTypeIndex = 0x99
108 chunkTypePadding = 0xfe
109 chunkTypeStreamIdentifier = 0xff
110 )
111 112 var (
113 crcTable = crc32.MakeTable(crc32.Castagnoli)
114 magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes.
115 magicChunkBytes = []byte(magicChunk) // Can be passed to functions where it escapes.
116 )
117 118 // crc implements the checksum specified in section 3 of
119 // https://github.com/google/snappy/blob/master/framing_format.txt
120 func crc(b []byte) uint32 {
121 race.ReadSlice(b)
122 123 c := crc32.Update(0, crcTable, b)
124 return c>>15 | c<<17 + 0xa282ead8
125 }
126 127 // literalExtraSize returns the extra size of encoding n literals.
128 // n should be >= 0 and <= math.MaxUint32.
129 func literalExtraSize(n int64) int64 {
130 if n == 0 {
131 return 0
132 }
133 switch {
134 case n < 60:
135 return 1
136 case n < 1<<8:
137 return 2
138 case n < 1<<16:
139 return 3
140 case n < 1<<24:
141 return 4
142 default:
143 return 5
144 }
145 }
146 147 type byter interface {
148 Bytes() []byte
149 }
150 151 var _ byter = &bytes.Buffer{}
152