s2.go raw

   1  // Copyright 2011 The Snappy-Go Authors. All rights reserved.
   2  // Copyright (c) 2019 Klaus Post. All rights reserved.
   3  // Use of this source code is governed by a BSD-style
   4  // license that can be found in the LICENSE file.
   5  
   6  // Package s2 implements the S2 compression format.
   7  //
   8  // S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput,
   9  // which is why it features concurrent compression for bigger payloads.
  10  //
  11  // Decoding is compatible with Snappy compressed content,
  12  // but content compressed with S2 cannot be decompressed by Snappy.
  13  //
  14  // For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
  15  //
  16  // There are actually two S2 formats: block and stream. They are related,
  17  // but different: trying to decompress block-compressed data as a S2 stream
  18  // will fail, and vice versa. The block format is the Decode and Encode
  19  // functions and the stream format is the Reader and Writer types.
  20  //
  21  // A "better" compression option is available. This will trade some compression
  22  // speed
  23  //
  24  // The block format, the more common case, is used when the complete size (the
  25  // number of bytes) of the original data is known upfront, at the time
  26  // compression starts. The stream format, also known as the framing format, is
  27  // for when that isn't always true.
  28  //
  29  // Blocks to not offer much data protection, so it is up to you to
  30  // add data validation of decompressed blocks.
  31  //
  32  // Streams perform CRC validation of the decompressed data.
  33  // Stream compression will also be performed on multiple CPU cores concurrently
  34  // significantly improving throughput.
  35  package s2
  36  
  37  import (
  38  	"bytes"
  39  	"hash/crc32"
  40  
  41  	"github.com/klauspost/compress/internal/race"
  42  )
  43  
  44  /*
  45  Each encoded block begins with the varint-encoded length of the decoded data,
  46  followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
  47  first byte of each chunk is broken into its 2 least and 6 most significant bits
  48  called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
  49  Zero means a literal tag. All other values mean a copy tag.
  50  
  51  For literal tags:
  52    - If m < 60, the next 1 + m bytes are literal bytes.
  53    - Otherwise, let n be the little-endian unsigned integer denoted by the next
  54      m - 59 bytes. The next 1 + n bytes after that are literal bytes.
  55  
  56  For copy tags, length bytes are copied from offset bytes ago, in the style of
  57  Lempel-Ziv compression algorithms. In particular:
  58    - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
  59      The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
  60      of the offset. The next byte is bits 0-7 of the offset.
  61    - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
  62      The length is 1 + m. The offset is the little-endian unsigned integer
  63      denoted by the next 2 bytes.
  64    - For l == 3, the offset ranges in [0, 1<<32) and the length in
  65      [1, 65). The length is 1 + m. The offset is the little-endian unsigned
  66      integer denoted by the next 4 bytes.
  67  */
  68  const (
  69  	tagLiteral = 0x00
  70  	tagCopy1   = 0x01
  71  	tagCopy2   = 0x02
  72  	tagCopy4   = 0x03
  73  )
  74  
  75  const (
  76  	checksumSize     = 4
  77  	chunkHeaderSize  = 4
  78  	magicChunk       = "\xff\x06\x00\x00" + magicBody
  79  	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
  80  	magicBodySnappy  = "sNaPpY"
  81  	magicBody        = "S2sTwO"
  82  
  83  	// maxBlockSize is the maximum size of the input to encodeBlock.
  84  	//
  85  	// For the framing format (Writer type instead of Encode function),
  86  	// this is the maximum uncompressed size of a block.
  87  	maxBlockSize = 4 << 20
  88  
  89  	// minBlockSize is the minimum size of block setting when creating a writer.
  90  	minBlockSize = 4 << 10
  91  
  92  	skippableFrameHeader = 4
  93  	maxChunkSize         = 1<<24 - 1 // 16777215
  94  
  95  	// Default block size
  96  	defaultBlockSize = 1 << 20
  97  
  98  	// maxSnappyBlockSize is the maximum snappy block size.
  99  	maxSnappyBlockSize = 1 << 16
 100  
 101  	obufHeaderLen = checksumSize + chunkHeaderSize
 102  )
 103  
 104  const (
 105  	chunkTypeCompressedData   = 0x00
 106  	chunkTypeUncompressedData = 0x01
 107  	ChunkTypeIndex            = 0x99
 108  	chunkTypePadding          = 0xfe
 109  	chunkTypeStreamIdentifier = 0xff
 110  )
 111  
 112  var (
 113  	crcTable              = crc32.MakeTable(crc32.Castagnoli)
 114  	magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes.
 115  	magicChunkBytes       = []byte(magicChunk)       // Can be passed to functions where it escapes.
 116  )
 117  
 118  // crc implements the checksum specified in section 3 of
 119  // https://github.com/google/snappy/blob/master/framing_format.txt
 120  func crc(b []byte) uint32 {
 121  	race.ReadSlice(b)
 122  
 123  	c := crc32.Update(0, crcTable, b)
 124  	return c>>15 | c<<17 + 0xa282ead8
 125  }
 126  
 127  // literalExtraSize returns the extra size of encoding n literals.
 128  // n should be >= 0 and <= math.MaxUint32.
 129  func literalExtraSize(n int64) int64 {
 130  	if n == 0 {
 131  		return 0
 132  	}
 133  	switch {
 134  	case n < 60:
 135  		return 1
 136  	case n < 1<<8:
 137  		return 2
 138  	case n < 1<<16:
 139  		return 3
 140  	case n < 1<<24:
 141  		return 4
 142  	default:
 143  		return 5
 144  	}
 145  }
 146  
 147  type byter interface {
 148  	Bytes() []byte
 149  }
 150  
 151  var _ byter = &bytes.Buffer{}
 152