suffixarray.mx raw

   1  // Copyright 2010 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package suffixarray implements substring search in logarithmic time using
   6  // an in-memory suffix array.
   7  //
   8  // Example use:
   9  //
  10  //	// create index for some data
  11  //	index := suffixarray.New(data)
  12  //
  13  //	// lookup byte slice s
  14  //	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
  15  //	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
  16  package suffixarray
  17  
  18  import (
  19  	"bytes"
  20  	"encoding/binary"
  21  	"errors"
  22  	"io"
  23  	"math"
  24  	"regexp"
  25  	"slices"
  26  	"sort"
  27  )
  28  
  29  // Can change for testing
  30  var maxData32 int = realMaxData32
  31  
  32  const realMaxData32 = math.MaxInt32
  33  
  34  // Index implements a suffix array for fast substring search.
  35  type Index struct {
  36  	data []byte
  37  	sa   ints // suffix array for data; sa.len() == len(data)
  38  }
  39  
  40  // An ints is either an []int32 or an []int64.
  41  // That is, one of them is empty, and one is the real data.
  42  // The int64 form is used when len(data) > maxData32
  43  type ints struct {
  44  	int32 []int32
  45  	int64 []int64
  46  }
  47  
  48  func (a *ints) len() int {
  49  	return len(a.int32) + len(a.int64)
  50  }
  51  
  52  func (a *ints) get(i int) int64 {
  53  	if a.int32 != nil {
  54  		return int64(a.int32[i])
  55  	}
  56  	return a.int64[i]
  57  }
  58  
  59  func (a *ints) set(i int, v int64) {
  60  	if a.int32 != nil {
  61  		a.int32[i] = int32(v)
  62  	} else {
  63  		a.int64[i] = v
  64  	}
  65  }
  66  
  67  func (a *ints) slice(i, j int) ints {
  68  	if a.int32 != nil {
  69  		return ints{a.int32[i:j], nil}
  70  	}
  71  	return ints{nil, a.int64[i:j]}
  72  }
  73  
  74  // New creates a new [Index] for data.
  75  // [Index] creation time is O(N) for N = len(data).
  76  func New(data []byte) *Index {
  77  	ix := &Index{data: data}
  78  	if len(data) <= maxData32 {
  79  		ix.sa.int32 = []int32{:len(data)}
  80  		text_32(data, ix.sa.int32)
  81  	} else {
  82  		ix.sa.int64 = []int64{:len(data)}
  83  		text_64(data, ix.sa.int64)
  84  	}
  85  	return ix
  86  }
  87  
  88  // writeInt writes an int x to w using buf to buffer the write.
  89  func writeInt(w io.Writer, buf []byte, x int) error {
  90  	binary.PutVarint(buf, int64(x))
  91  	_, err := w.Write(buf[0:binary.MaxVarintLen64])
  92  	return err
  93  }
  94  
  95  // readInt reads an int x from r using buf to buffer the read and returns x.
  96  func readInt(r io.Reader, buf []byte) (int64, error) {
  97  	_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
  98  	x, _ := binary.Varint(buf)
  99  	return x, err
 100  }
 101  
 102  // writeSlice writes data[:n] to w and returns n.
 103  // It uses buf to buffer the write.
 104  func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
 105  	// encode as many elements as fit into buf
 106  	p := binary.MaxVarintLen64
 107  	m := data.len()
 108  	for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
 109  		p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
 110  	}
 111  
 112  	// update buffer size
 113  	binary.PutVarint(buf, int64(p))
 114  
 115  	// write buffer
 116  	_, err = w.Write(buf[0:p])
 117  	return
 118  }
 119  
 120  var errTooBig = errors.New("suffixarray: data too large")
 121  
 122  // readSlice reads data[:n] from r and returns n.
 123  // It uses buf to buffer the read.
 124  func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
 125  	// read buffer size
 126  	var size64 int64
 127  	size64, err = readInt(r, buf)
 128  	if err != nil {
 129  		return
 130  	}
 131  	if int64(int(size64)) != size64 || int(size64) < 0 {
 132  		// We never write chunks this big anyway.
 133  		return 0, errTooBig
 134  	}
 135  	size := int(size64)
 136  
 137  	// read buffer w/o the size
 138  	if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
 139  		return
 140  	}
 141  
 142  	// decode as many elements as present in buf
 143  	for p := binary.MaxVarintLen64; p < size; n++ {
 144  		x, w := binary.Uvarint(buf[p:])
 145  		data.set(n, int64(x))
 146  		p += w
 147  	}
 148  
 149  	return
 150  }
 151  
 152  const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
 153  
 154  // Read reads the index from r into x; x must not be nil.
 155  func (x *Index) Read(r io.Reader) error {
 156  	// buffer for all reads
 157  	buf := []byte{:bufSize}
 158  
 159  	// read length
 160  	n64, err := readInt(r, buf)
 161  	if err != nil {
 162  		return err
 163  	}
 164  	if int64(int(n64)) != n64 || int(n64) < 0 {
 165  		return errTooBig
 166  	}
 167  	n := int(n64)
 168  
 169  	// allocate space
 170  	if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
 171  		// new data is significantly smaller or larger than
 172  		// existing buffers - allocate new ones
 173  		x.data = []byte{:n}
 174  		x.sa.int32 = nil
 175  		x.sa.int64 = nil
 176  		if n <= maxData32 {
 177  			x.sa.int32 = []int32{:n}
 178  		} else {
 179  			x.sa.int64 = []int64{:n}
 180  		}
 181  	} else {
 182  		// re-use existing buffers
 183  		x.data = x.data[0:n]
 184  		x.sa = x.sa.slice(0, n)
 185  	}
 186  
 187  	// read data
 188  	if _, err := io.ReadFull(r, x.data); err != nil {
 189  		return err
 190  	}
 191  
 192  	// read index
 193  	sa := x.sa
 194  	for sa.len() > 0 {
 195  		n, err := readSlice(r, buf, sa)
 196  		if err != nil {
 197  			return err
 198  		}
 199  		sa = sa.slice(n, sa.len())
 200  	}
 201  	return nil
 202  }
 203  
 204  // Write writes the index x to w.
 205  func (x *Index) Write(w io.Writer) error {
 206  	// buffer for all writes
 207  	buf := []byte{:bufSize}
 208  
 209  	// write length
 210  	if err := writeInt(w, buf, len(x.data)); err != nil {
 211  		return err
 212  	}
 213  
 214  	// write data
 215  	if _, err := w.Write(x.data); err != nil {
 216  		return err
 217  	}
 218  
 219  	// write index
 220  	sa := x.sa
 221  	for sa.len() > 0 {
 222  		n, err := writeSlice(w, buf, sa)
 223  		if err != nil {
 224  			return err
 225  		}
 226  		sa = sa.slice(n, sa.len())
 227  	}
 228  	return nil
 229  }
 230  
 231  // Bytes returns the data over which the index was created.
 232  // It must not be modified.
 233  func (x *Index) Bytes() []byte {
 234  	return x.data
 235  }
 236  
 237  func (x *Index) at(i int) []byte {
 238  	return x.data[x.sa.get(i):]
 239  }
 240  
 241  // lookupAll returns a slice into the matching region of the index.
 242  // The runtime is O(log(N)*len(s)).
 243  func (x *Index) lookupAll(s []byte) ints {
 244  	// find matching suffix index range [i:j]
 245  	// find the first index where s would be the prefix
 246  	i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
 247  	// starting at i, find the first index at which s is not a prefix
 248  	j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
 249  	return x.sa.slice(i, j)
 250  }
 251  
 252  // Lookup returns an unsorted list of at most n indices where the byte string s
 253  // occurs in the indexed data. If n < 0, all occurrences are returned.
 254  // The result is nil if s is empty, s is not found, or n == 0.
 255  // Lookup time is O(log(N)*len(s) + len(result)) where N is the
 256  // size of the indexed data.
 257  func (x *Index) Lookup(s []byte, n int) (result []int) {
 258  	if len(s) > 0 && n != 0 {
 259  		matches := x.lookupAll(s)
 260  		count := matches.len()
 261  		if n < 0 || count < n {
 262  			n = count
 263  		}
 264  		// 0 <= n <= count
 265  		if n > 0 {
 266  			result = []int{:n}
 267  			if matches.int32 != nil {
 268  				for i := range result {
 269  					result[i] = int(matches.int32[i])
 270  				}
 271  			} else {
 272  				for i := range result {
 273  					result[i] = int(matches.int64[i])
 274  				}
 275  			}
 276  		}
 277  	}
 278  	return
 279  }
 280  
 281  // FindAllIndex returns a sorted list of non-overlapping matches of the
 282  // regular expression r, where a match is a pair of indices specifying
 283  // the matched slice of x.Bytes(). If n < 0, all matches are returned
 284  // in successive order. Otherwise, at most n matches are returned and
 285  // they may not be successive. The result is nil if there are no matches,
 286  // or if n == 0.
 287  func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
 288  	// a non-empty literal prefix is used to determine possible
 289  	// match start indices with Lookup
 290  	prefix, complete := r.LiteralPrefix()
 291  	lit := []byte(prefix)
 292  
 293  	// worst-case scenario: no literal prefix
 294  	if prefix == "" {
 295  		return r.FindAllIndex(x.data, n)
 296  	}
 297  
 298  	// if regexp is a literal just use Lookup and convert its
 299  	// result into match pairs
 300  	if complete {
 301  		// Lookup returns indices that may belong to overlapping matches.
 302  		// After eliminating them, we may end up with fewer than n matches.
 303  		// If we don't have enough at the end, redo the search with an
 304  		// increased value n1, but only if Lookup returned all the requested
 305  		// indices in the first place (if it returned fewer than that then
 306  		// there cannot be more).
 307  		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
 308  			indices := x.Lookup(lit, n1)
 309  			if len(indices) == 0 {
 310  				return
 311  			}
 312  			slices.Sort(indices)
 313  			pairs := []int{:2*len(indices)}
 314  			result = [][]int{:len(indices)}
 315  			count := 0
 316  			prev := 0
 317  			for _, i := range indices {
 318  				if count == n {
 319  					break
 320  				}
 321  				// ignore indices leading to overlapping matches
 322  				if prev <= i {
 323  					j := 2 * count
 324  					pairs[j+0] = i
 325  					pairs[j+1] = i + len(lit)
 326  					result[count] = pairs[j : j+2]
 327  					count++
 328  					prev = i + len(lit)
 329  				}
 330  			}
 331  			result = result[0:count]
 332  			if len(result) >= n || len(indices) != n1 {
 333  				// found all matches or there's no chance to find more
 334  				// (n and n1 can be negative)
 335  				break
 336  			}
 337  		}
 338  		if len(result) == 0 {
 339  			result = nil
 340  		}
 341  		return
 342  	}
 343  
 344  	// regexp has a non-empty literal prefix; Lookup(lit) computes
 345  	// the indices of possible complete matches; use these as starting
 346  	// points for anchored searches
 347  	// (regexp "^" matches beginning of input, not beginning of line)
 348  	r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
 349  
 350  	// same comment about Lookup applies here as in the loop above
 351  	for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
 352  		indices := x.Lookup(lit, n1)
 353  		if len(indices) == 0 {
 354  			return
 355  		}
 356  		slices.Sort(indices)
 357  		result = result[0:0]
 358  		prev := 0
 359  		for _, i := range indices {
 360  			if len(result) == n {
 361  				break
 362  			}
 363  			m := r.FindIndex(x.data[i:]) // anchored search - will not run off
 364  			// ignore indices leading to overlapping matches
 365  			if m != nil && prev <= i {
 366  				m[0] = i // correct m
 367  				m[1] += i
 368  				result = append(result, m)
 369  				prev = m[1]
 370  			}
 371  		}
 372  		if len(result) >= n || len(indices) != n1 {
 373  			// found all matches or there's no chance to find more
 374  			// (n and n1 can be negative)
 375  			break
 376  		}
 377  	}
 378  	if len(result) == 0 {
 379  		result = nil
 380  	}
 381  	return
 382  }
 383