dict.go raw

   1  // Copyright (c) 2022+ Klaus Post. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package s2
   6  
   7  import (
   8  	"bytes"
   9  	"encoding/binary"
  10  	"sync"
  11  )
  12  
  13  const (
  14  	// MinDictSize is the minimum dictionary size when repeat has been read.
  15  	MinDictSize = 16
  16  
  17  	// MaxDictSize is the maximum dictionary size when repeat has been read.
  18  	MaxDictSize = 65536
  19  
  20  	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
  21  	MaxDictSrcOffset = 65535
  22  )
  23  
  24  // Dict contains a dictionary that can be used for encoding and decoding s2
  25  type Dict struct {
  26  	dict   []byte
  27  	repeat int // Repeat as index of dict
  28  
  29  	fast, better, best sync.Once
  30  	fastTable          *[1 << 14]uint16
  31  
  32  	betterTableShort *[1 << 14]uint16
  33  	betterTableLong  *[1 << 17]uint16
  34  
  35  	bestTableShort *[1 << 16]uint32
  36  	bestTableLong  *[1 << 19]uint32
  37  }
  38  
  39  // NewDict will read a dictionary.
  40  // It will return nil if the dictionary is invalid.
  41  func NewDict(dict []byte) *Dict {
  42  	if len(dict) == 0 {
  43  		return nil
  44  	}
  45  	var d Dict
  46  	// Repeat is the first value of the dict
  47  	r, n := binary.Uvarint(dict)
  48  	if n <= 0 {
  49  		return nil
  50  	}
  51  	dict = dict[n:]
  52  	d.dict = dict
  53  	if cap(d.dict) < len(d.dict)+16 {
  54  		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
  55  	}
  56  	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
  57  		return nil
  58  	}
  59  	d.repeat = int(r)
  60  	if d.repeat > len(dict) {
  61  		return nil
  62  	}
  63  	return &d
  64  }
  65  
  66  // Bytes will return a serialized version of the dictionary.
  67  // The output can be sent to NewDict.
  68  func (d *Dict) Bytes() []byte {
  69  	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
  70  	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
  71  }
  72  
  73  // MakeDict will create a dictionary.
  74  // 'data' must be at least MinDictSize.
  75  // If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
  76  // If searchStart is set the start repeat value will be set to the last
  77  // match of this content.
  78  // If no matches are found, it will attempt to find shorter matches.
  79  // This content should match the typical start of a block.
  80  // If at least 4 bytes cannot be matched, repeat is set to start of block.
  81  func MakeDict(data []byte, searchStart []byte) *Dict {
  82  	if len(data) == 0 {
  83  		return nil
  84  	}
  85  	if len(data) > MaxDictSize {
  86  		data = data[len(data)-MaxDictSize:]
  87  	}
  88  	var d Dict
  89  	dict := data
  90  	d.dict = dict
  91  	if cap(d.dict) < len(d.dict)+16 {
  92  		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
  93  	}
  94  	if len(dict) < MinDictSize {
  95  		return nil
  96  	}
  97  
  98  	// Find the longest match possible, last entry if multiple.
  99  	for s := len(searchStart); s > 4; s-- {
 100  		if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
 101  			d.repeat = idx
 102  			break
 103  		}
 104  	}
 105  
 106  	return &d
 107  }
 108  
 109  // MakeDictManual will create a dictionary.
 110  // 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
 111  // A manual first repeat index into data must be provided.
 112  // It must be less than len(data)-8.
 113  func MakeDictManual(data []byte, firstIdx uint16) *Dict {
 114  	if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize {
 115  		return nil
 116  	}
 117  	var d Dict
 118  	dict := data
 119  	d.dict = dict
 120  	if cap(d.dict) < len(d.dict)+16 {
 121  		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
 122  	}
 123  
 124  	d.repeat = int(firstIdx)
 125  	return &d
 126  }
 127  
 128  // Encode returns the encoded form of src. The returned slice may be a sub-
 129  // slice of dst if dst was large enough to hold the entire encoded block.
 130  // Otherwise, a newly allocated slice will be returned.
 131  //
 132  // The dst and src must not overlap. It is valid to pass a nil dst.
 133  //
 134  // The blocks will require the same amount of memory to decode as encoding,
 135  // and does not make for concurrent decoding.
 136  // Also note that blocks do not contain CRC information, so corruption may be undetected.
 137  //
 138  // If you need to encode larger amounts of data, consider using
 139  // the streaming interface which gives all of these features.
 140  func (d *Dict) Encode(dst, src []byte) []byte {
 141  	if n := MaxEncodedLen(len(src)); n < 0 {
 142  		panic(ErrTooLarge)
 143  	} else if cap(dst) < n {
 144  		dst = make([]byte, n)
 145  	} else {
 146  		dst = dst[:n]
 147  	}
 148  
 149  	// The block starts with the varint-encoded length of the decompressed bytes.
 150  	dstP := binary.PutUvarint(dst, uint64(len(src)))
 151  
 152  	if len(src) == 0 {
 153  		return dst[:dstP]
 154  	}
 155  	if len(src) < minNonLiteralBlockSize {
 156  		dstP += emitLiteral(dst[dstP:], src)
 157  		return dst[:dstP]
 158  	}
 159  	n := encodeBlockDictGo(dst[dstP:], src, d)
 160  	if n > 0 {
 161  		dstP += n
 162  		return dst[:dstP]
 163  	}
 164  	// Not compressible
 165  	dstP += emitLiteral(dst[dstP:], src)
 166  	return dst[:dstP]
 167  }
 168  
 169  // EncodeBetter returns the encoded form of src. The returned slice may be a sub-
 170  // slice of dst if dst was large enough to hold the entire encoded block.
 171  // Otherwise, a newly allocated slice will be returned.
 172  //
 173  // EncodeBetter compresses better than Encode but typically with a
 174  // 10-40% speed decrease on both compression and decompression.
 175  //
 176  // The dst and src must not overlap. It is valid to pass a nil dst.
 177  //
 178  // The blocks will require the same amount of memory to decode as encoding,
 179  // and does not make for concurrent decoding.
 180  // Also note that blocks do not contain CRC information, so corruption may be undetected.
 181  //
 182  // If you need to encode larger amounts of data, consider using
 183  // the streaming interface which gives all of these features.
 184  func (d *Dict) EncodeBetter(dst, src []byte) []byte {
 185  	if n := MaxEncodedLen(len(src)); n < 0 {
 186  		panic(ErrTooLarge)
 187  	} else if len(dst) < n {
 188  		dst = make([]byte, n)
 189  	}
 190  
 191  	// The block starts with the varint-encoded length of the decompressed bytes.
 192  	dstP := binary.PutUvarint(dst, uint64(len(src)))
 193  
 194  	if len(src) == 0 {
 195  		return dst[:dstP]
 196  	}
 197  	if len(src) < minNonLiteralBlockSize {
 198  		dstP += emitLiteral(dst[dstP:], src)
 199  		return dst[:dstP]
 200  	}
 201  	n := encodeBlockBetterDict(dst[dstP:], src, d)
 202  	if n > 0 {
 203  		dstP += n
 204  		return dst[:dstP]
 205  	}
 206  	// Not compressible
 207  	dstP += emitLiteral(dst[dstP:], src)
 208  	return dst[:dstP]
 209  }
 210  
 211  // EncodeBest returns the encoded form of src. The returned slice may be a sub-
 212  // slice of dst if dst was large enough to hold the entire encoded block.
 213  // Otherwise, a newly allocated slice will be returned.
 214  //
 215  // EncodeBest compresses as good as reasonably possible but with a
 216  // big speed decrease.
 217  //
 218  // The dst and src must not overlap. It is valid to pass a nil dst.
 219  //
 220  // The blocks will require the same amount of memory to decode as encoding,
 221  // and does not make for concurrent decoding.
 222  // Also note that blocks do not contain CRC information, so corruption may be undetected.
 223  //
 224  // If you need to encode larger amounts of data, consider using
 225  // the streaming interface which gives all of these features.
 226  func (d *Dict) EncodeBest(dst, src []byte) []byte {
 227  	if n := MaxEncodedLen(len(src)); n < 0 {
 228  		panic(ErrTooLarge)
 229  	} else if len(dst) < n {
 230  		dst = make([]byte, n)
 231  	}
 232  
 233  	// The block starts with the varint-encoded length of the decompressed bytes.
 234  	dstP := binary.PutUvarint(dst, uint64(len(src)))
 235  
 236  	if len(src) == 0 {
 237  		return dst[:dstP]
 238  	}
 239  	if len(src) < minNonLiteralBlockSize {
 240  		dstP += emitLiteral(dst[dstP:], src)
 241  		return dst[:dstP]
 242  	}
 243  	n := encodeBlockBest(dst[dstP:], src, d)
 244  	if n > 0 {
 245  		dstP += n
 246  		return dst[:dstP]
 247  	}
 248  	// Not compressible
 249  	dstP += emitLiteral(dst[dstP:], src)
 250  	return dst[:dstP]
 251  }
 252  
 253  // Decode returns the decoded form of src. The returned slice may be a sub-
 254  // slice of dst if dst was large enough to hold the entire decoded block.
 255  // Otherwise, a newly allocated slice will be returned.
 256  //
 257  // The dst and src must not overlap. It is valid to pass a nil dst.
 258  func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
 259  	dLen, s, err := decodedLen(src)
 260  	if err != nil {
 261  		return nil, err
 262  	}
 263  	if dLen <= cap(dst) {
 264  		dst = dst[:dLen]
 265  	} else {
 266  		dst = make([]byte, dLen)
 267  	}
 268  	if s2DecodeDict(dst, src[s:], d) != 0 {
 269  		return nil, ErrCorrupt
 270  	}
 271  	return dst, nil
 272  }
 273  
 274  func (d *Dict) initFast() {
 275  	d.fast.Do(func() {
 276  		const (
 277  			tableBits    = 14
 278  			maxTableSize = 1 << tableBits
 279  		)
 280  
 281  		var table [maxTableSize]uint16
 282  		// We stop so any entry of length 8 can always be read.
 283  		for i := 0; i < len(d.dict)-8-2; i += 3 {
 284  			x0 := load64(d.dict, i)
 285  			h0 := hash6(x0, tableBits)
 286  			h1 := hash6(x0>>8, tableBits)
 287  			h2 := hash6(x0>>16, tableBits)
 288  			table[h0] = uint16(i)
 289  			table[h1] = uint16(i + 1)
 290  			table[h2] = uint16(i + 2)
 291  		}
 292  		d.fastTable = &table
 293  	})
 294  }
 295  
 296  func (d *Dict) initBetter() {
 297  	d.better.Do(func() {
 298  		const (
 299  			// Long hash matches.
 300  			lTableBits    = 17
 301  			maxLTableSize = 1 << lTableBits
 302  
 303  			// Short hash matches.
 304  			sTableBits    = 14
 305  			maxSTableSize = 1 << sTableBits
 306  		)
 307  
 308  		var lTable [maxLTableSize]uint16
 309  		var sTable [maxSTableSize]uint16
 310  
 311  		// We stop so any entry of length 8 can always be read.
 312  		for i := 0; i < len(d.dict)-8; i++ {
 313  			cv := load64(d.dict, i)
 314  			lTable[hash7(cv, lTableBits)] = uint16(i)
 315  			sTable[hash4(cv, sTableBits)] = uint16(i)
 316  		}
 317  		d.betterTableShort = &sTable
 318  		d.betterTableLong = &lTable
 319  	})
 320  }
 321  
 322  func (d *Dict) initBest() {
 323  	d.best.Do(func() {
 324  		const (
 325  			// Long hash matches.
 326  			lTableBits    = 19
 327  			maxLTableSize = 1 << lTableBits
 328  
 329  			// Short hash matches.
 330  			sTableBits    = 16
 331  			maxSTableSize = 1 << sTableBits
 332  		)
 333  
 334  		var lTable [maxLTableSize]uint32
 335  		var sTable [maxSTableSize]uint32
 336  
 337  		// We stop so any entry of length 8 can always be read.
 338  		for i := 0; i < len(d.dict)-8; i++ {
 339  			cv := load64(d.dict, i)
 340  			hashL := hash8(cv, lTableBits)
 341  			hashS := hash4(cv, sTableBits)
 342  			candidateL := lTable[hashL]
 343  			candidateS := sTable[hashS]
 344  			lTable[hashL] = uint32(i) | candidateL<<16
 345  			sTable[hashS] = uint32(i) | candidateS<<16
 346  		}
 347  		d.bestTableShort = &sTable
 348  		d.bestTableLong = &lTable
 349  	})
 350  }
 351