sais.mx raw

   1  // Copyright 2019 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Suffix array construction by induced sorting (SAIS).
   6  // See Ge Nong, Sen Zhang, and Wai Hong Chen,
   7  // "Two Efficient Algorithms for Linear Time Suffix Array Construction",
   8  // especially section 3 (https://ieeexplore.ieee.org/document/5582081).
   9  // See also http://zork.net/~st/jottings/sais.html.
  10  //
  11  // With optimizations inspired by Yuta Mori's sais-lite
  12  // (https://sites.google.com/site/yuta256/sais).
  13  //
  14  // And with other new optimizations.
  15  
  16  // Many of these functions are parameterized by the sizes of
  17  // the types they operate on. The generator gen.go makes
  18  // copies of these functions for use with other sizes.
  19  // Specifically:
  20  //
  21  // - A function with a name ending in _8_32 takes []byte and []int32 arguments
  22  //   and is duplicated into _32_32, _8_64, and _64_64 forms.
  23  //   The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64.
  24  //   Any lines in the function body that contain the text "byte-only" or "256"
  25  //   are stripped when creating _32_32 and _64_64 forms.
  26  //   (Those lines are typically 8-bit-specific optimizations.)
  27  //
  28  // - A function with a name ending only in _32 operates on []int32
  29  //   and is duplicated into a _64 form. (Note that it may still take a []byte,
  30  //   but there is no need for a version of the function in which the []byte
  31  //   is widened to a full integer array.)
  32  
  33  // The overall runtime of this code is linear in the input size:
  34  // it runs a sequence of linear passes to reduce the problem to
  35  // a subproblem at most half as big, invokes itself recursively,
  36  // and then runs a sequence of linear passes to turn the answer
  37  // for the subproblem into the answer for the original problem.
  38  // This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
  39  //
  40  // The outline of the code, with the forward and backward scans
  41  // through O(N)-sized arrays called out, is:
  42  //
  43  // sais_I_N
  44  //	placeLMS_I_B
  45  //		bucketMax_I_B
  46  //			freq_I_B
  47  //				<scan +text> (1)
  48  //			<scan +freq> (2)
  49  //		<scan -text, random bucket> (3)
  50  //	induceSubL_I_B
  51  //		bucketMin_I_B
  52  //			freq_I_B
  53  //				<scan +text, often optimized away> (4)
  54  //			<scan +freq> (5)
  55  //		<scan +sa, random text, random bucket> (6)
  56  //	induceSubS_I_B
  57  //		bucketMax_I_B
  58  //			freq_I_B
  59  //				<scan +text, often optimized away> (7)
  60  //			<scan +freq> (8)
  61  //		<scan -sa, random text, random bucket> (9)
  62  //	assignID_I_B
  63  //		<scan +sa, random text substrings> (10)
  64  //	map_B
  65  //		<scan -sa> (11)
  66  //	recurse_B
  67  //		(recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
  68  //	unmap_I_B
  69  //		<scan -text> (12)
  70  //		<scan +sa> (13)
  71  //	expand_I_B
  72  //		bucketMax_I_B
  73  //			freq_I_B
  74  //				<scan +text, often optimized away> (14)
  75  //			<scan +freq> (15)
  76  //		<scan -sa, random text, random bucket> (16)
  77  //	induceL_I_B
  78  //		bucketMin_I_B
  79  //			freq_I_B
  80  //				<scan +text, often optimized away> (17)
  81  //			<scan +freq> (18)
  82  //		<scan +sa, random text, random bucket> (19)
  83  //	induceS_I_B
  84  //		bucketMax_I_B
  85  //			freq_I_B
  86  //				<scan +text, often optimized away> (20)
  87  //			<scan +freq> (21)
  88  //		<scan -sa, random text, random bucket> (22)
  89  //
  90  // Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
  91  //
  92  // The outline shows there are in general 22 scans through
  93  // O(N)-sized arrays for a given level of the recursion.
  94  // In the top level, operating on 8-bit input text,
  95  // the six freq scans are fixed size (256) instead of potentially
  96  // input-sized. Also, the frequency is counted once and cached
  97  // whenever there is room to do so (there is nearly always room in general,
  98  // and always room at the top level), which eliminates all but
  99  // the first freq_I_B text scans (that is, 5 of the 6).
 100  // So the top level of the recursion only does 22 - 6 - 5 = 11
 101  // input-sized scans and a typical level does 16 scans.
 102  //
 103  // The linear scans do not cost anywhere near as much as
 104  // the random accesses to the text made during a few of
 105  // the scans (specifically #6, #9, #16, #19, #22 marked above).
 106  // In real texts, there is not much but some locality to
 107  // the accesses, due to the repetitive structure of the text
 108  // (the same reason Burrows-Wheeler compression is so effective).
 109  // For random inputs, there is no locality, which makes those
 110  // accesses even more expensive, especially once the text
 111  // no longer fits in cache.
 112  // For example, running on 50 MB of Go source code, induceSubL_8_32
 113  // (which runs only once, at the top level of the recursion)
 114  // takes 0.44s, while on 50 MB of random input, it takes 2.55s.
 115  // Nearly all the relative slowdown is explained by the text access:
 116  //
 117  //		c0, c1 := text[k-1], text[k]
 118  //
 119  // That line runs for 0.23s on the Go text and 2.02s on random text.
 120  
 121  //go:generate go run gen.go
 122  
 123  package suffixarray
 124  
 125  // text_32 returns the suffix array for the input text.
 126  // It requires that len(text) fit in an int32
 127  // and that the caller zero sa.
 128  func text_32(text []byte, sa []int32) {
 129  	if int(int32(len(text))) != len(text) || len(text) != len(sa) {
 130  		panic("suffixarray: misuse of text_32")
 131  	}
 132  	sais_8_32(text, 256, sa, []int32{:2*256})
 133  }
 134  
 135  // sais_8_32 computes the suffix array of text.
 136  // The text must contain only values in [0, textMax).
 137  // The suffix array is stored in sa, which the caller
 138  // must ensure is already zeroed.
 139  // The caller must also provide temporary space tmp
 140  // with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
 141  // then the algorithm runs a little faster.
 142  // If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
 143  func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
 144  	if len(sa) != len(text) || len(tmp) < textMax {
 145  		panic("suffixarray: misuse of sais_8_32")
 146  	}
 147  
 148  	// Trivial base cases. Sorting 0 or 1 things is easy.
 149  	if len(text) == 0 {
 150  		return
 151  	}
 152  	if len(text) == 1 {
 153  		sa[0] = 0
 154  		return
 155  	}
 156  
 157  	// Establish slices indexed by text character
 158  	// holding character frequency and bucket-sort offsets.
 159  	// If there's only enough tmp for one slice,
 160  	// we make it the bucket offsets and recompute
 161  	// the character frequency each time we need it.
 162  	var freq, bucket []int32
 163  	if len(tmp) >= 2*textMax {
 164  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
 165  		freq[0] = -1 // mark as uninitialized
 166  	} else {
 167  		freq, bucket = nil, tmp[:textMax]
 168  	}
 169  
 170  	// The SAIS algorithm.
 171  	// Each of these calls makes one scan through sa.
 172  	// See the individual functions for documentation
 173  	// about each's role in the algorithm.
 174  	numLMS := placeLMS_8_32(text, sa, freq, bucket)
 175  	if numLMS <= 1 {
 176  		// 0 or 1 items are already sorted. Do nothing.
 177  	} else {
 178  		induceSubL_8_32(text, sa, freq, bucket)
 179  		induceSubS_8_32(text, sa, freq, bucket)
 180  		length_8_32(text, sa, numLMS)
 181  		maxID := assignID_8_32(text, sa, numLMS)
 182  		if maxID < numLMS {
 183  			map_32(sa, numLMS)
 184  			recurse_32(sa, tmp, numLMS, maxID)
 185  			unmap_8_32(text, sa, numLMS)
 186  		} else {
 187  			// If maxID == numLMS, then each LMS-substring
 188  			// is unique, so the relative ordering of two LMS-suffixes
 189  			// is determined by just the leading LMS-substring.
 190  			// That is, the LMS-suffix sort order matches the
 191  			// (simpler) LMS-substring sort order.
 192  			// Copy the original LMS-substring order into the
 193  			// suffix array destination.
 194  			copy(sa, sa[len(sa)-numLMS:])
 195  		}
 196  		expand_8_32(text, freq, bucket, sa, numLMS)
 197  	}
 198  	induceL_8_32(text, sa, freq, bucket)
 199  	induceS_8_32(text, sa, freq, bucket)
 200  
 201  	// Mark for caller that we overwrote tmp.
 202  	tmp[0] = -1
 203  }
 204  
 205  // freq_8_32 returns the character frequencies
 206  // for text, as a slice indexed by character value.
 207  // If freq is nil, freq_8_32 uses and returns bucket.
 208  // If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
 209  // means the frequencies are already computed.
 210  // If the frequency data is overwritten or uninitialized,
 211  // the caller must set freq[0] = -1 to force recomputation
 212  // the next time it is needed.
 213  func freq_8_32(text []byte, freq, bucket []int32) []int32 {
 214  	if freq != nil && freq[0] >= 0 {
 215  		return freq // already computed
 216  	}
 217  	if freq == nil {
 218  		freq = bucket
 219  	}
 220  
 221  	freq = freq[:256] // eliminate bounds check for freq[c] below
 222  	clear(freq)
 223  	for _, c := range text {
 224  		freq[c]++
 225  	}
 226  	return freq
 227  }
 228  
 229  // bucketMin_8_32 stores into bucket[c] the minimum index
 230  // in the bucket for character c in a bucket-sort of text.
 231  func bucketMin_8_32(text []byte, freq, bucket []int32) {
 232  	freq = freq_8_32(text, freq, bucket)
 233  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
 234  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
 235  	total := int32(0)
 236  	for i, n := range freq {
 237  		bucket[i] = total
 238  		total += n
 239  	}
 240  }
 241  
 242  // bucketMax_8_32 stores into bucket[c] the maximum index
 243  // in the bucket for character c in a bucket-sort of text.
 244  // The bucket indexes for c are [min, max).
 245  // That is, max is one past the final index in that bucket.
 246  func bucketMax_8_32(text []byte, freq, bucket []int32) {
 247  	freq = freq_8_32(text, freq, bucket)
 248  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
 249  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
 250  	total := int32(0)
 251  	for i, n := range freq {
 252  		total += n
 253  		bucket[i] = total
 254  	}
 255  }
 256  
 257  // The SAIS algorithm proceeds in a sequence of scans through sa.
 258  // Each of the following functions implements one scan,
 259  // and the functions appear here in the order they execute in the algorithm.
 260  
 261  // placeLMS_8_32 places into sa the indexes of the
 262  // final characters of the LMS substrings of text,
 263  // sorted into the rightmost ends of their correct buckets
 264  // in the suffix array.
 265  //
 266  // The imaginary sentinel character at the end of the text
 267  // is the final character of the final LMS substring, but there
 268  // is no bucket for the imaginary sentinel character,
 269  // which has a smaller value than any real character.
 270  // The caller must therefore pretend that sa[-1] == len(text).
 271  //
 272  // The text indexes of LMS-substring characters are always ≥ 1
 273  // (the first LMS-substring must be preceded by one or more L-type
 274  // characters that are not part of any LMS-substring),
 275  // so using 0 as a “not present” suffix array entry is safe,
 276  // both in this function and in most later functions
 277  // (until induceL_8_32 below).
 278  func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
 279  	bucketMax_8_32(text, freq, bucket)
 280  
 281  	numLMS := 0
 282  	lastB := int32(-1)
 283  	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
 284  
 285  	// The next stanza of code (until the blank line) loop backward
 286  	// over text, stopping to execute a code body at each position i
 287  	// such that text[i] is an L-character and text[i+1] is an S-character.
 288  	// That is, i+1 is the position of the start of an LMS-substring.
 289  	// These could be hoisted out into a function with a callback,
 290  	// but at a significant speed cost. Instead, we just write these
 291  	// seven lines a few times in this source file. The copies below
 292  	// refer back to the pattern established by this original as the
 293  	// "LMS-substring iterator".
 294  	//
 295  	// In every scan through the text, c0, c1 are successive characters of text.
 296  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
 297  	// By scanning backward, we can keep track of whether the current
 298  	// position is type-S or type-L according to the usual definition:
 299  	//
 300  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
 301  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
 302  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
 303  	//
 304  	// The backward scan lets us maintain the current type,
 305  	// update it when we see c0 != c1, and otherwise leave it alone.
 306  	// We want to identify all S positions with a preceding L.
 307  	// Position len(text) is one such position by definition, but we have
 308  	// nowhere to write it down, so we eliminate it by untruthfully
 309  	// setting isTypeS = false at the start of the loop.
 310  	c0, c1, isTypeS := byte(0), byte(0), false
 311  	for i := len(text) - 1; i >= 0; i-- {
 312  		c0, c1 = text[i], c0
 313  		if c0 < c1 {
 314  			isTypeS = true
 315  		} else if c0 > c1 && isTypeS {
 316  			isTypeS = false
 317  
 318  			// Bucket the index i+1 for the start of an LMS-substring.
 319  			b := bucket[c1] - 1
 320  			bucket[c1] = b
 321  			sa[b] = int32(i + 1)
 322  			lastB = b
 323  			numLMS++
 324  		}
 325  	}
 326  
 327  	// We recorded the LMS-substring starts but really want the ends.
 328  	// Luckily, with two differences, the start indexes and the end indexes are the same.
 329  	// The first difference is that the rightmost LMS-substring's end index is len(text),
 330  	// so the caller must pretend that sa[-1] == len(text), as noted above.
 331  	// The second difference is that the first leftmost LMS-substring start index
 332  	// does not end an earlier LMS-substring, so as an optimization we can omit
 333  	// that leftmost LMS-substring start index (the last one we wrote).
 334  	//
 335  	// Exception: if numLMS <= 1, the caller is not going to bother with
 336  	// the recursion at all and will treat the result as containing LMS-substring starts.
 337  	// In that case, we don't remove the final entry.
 338  	if numLMS > 1 {
 339  		sa[lastB] = 0
 340  	}
 341  	return numLMS
 342  }
 343  
 344  // induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
 345  // into sa, assuming that the final characters of the LMS-substrings
 346  // are already inserted into sa, sorted by final character, and at the
 347  // right (not left) end of the corresponding character bucket.
 348  // Each LMS-substring has the form (as a regexp) /S+L+S/:
 349  // one or more S-type, one or more L-type, final S-type.
 350  // induceSubL_8_32 leaves behind only the leftmost L-type text
 351  // index for each LMS-substring. That is, it removes the final S-type
 352  // indexes that are present on entry, and it inserts but then removes
 353  // the interior L-type indexes too.
 354  // (Only the leftmost L-type index is needed by induceSubS_8_32.)
 355  func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
 356  	// Initialize positions for left side of character buckets.
 357  	bucketMin_8_32(text, freq, bucket)
 358  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
 359  
 360  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
 361  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
 362  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
 363  	// But we want to distinguish a j-1 with j-2 of type L from type S.
 364  	// We can process the former but want to leave the latter for the caller.
 365  	// We record the difference by negating j-1 if it is preceded by type S.
 366  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
 367  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
 368  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
 369  	// and so on, in sorted but not necessarily adjacent order, until it finds
 370  	// one preceded by an index of type S, at which point it must stop.
 371  	//
 372  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
 373  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
 374  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
 375  	//
 376  	// The suffix array sa therefore serves simultaneously as input, output,
 377  	// and a miraculously well-tailored work queue.
 378  
 379  	// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
 380  	// corresponding to the identified type-L index len(text)-1.
 381  	// Process it before the left-to-right scan of sa proper.
 382  	// See body in loop for commentary.
 383  	k := len(text) - 1
 384  	c0, c1 := text[k-1], text[k]
 385  	if c0 < c1 {
 386  		k = -k
 387  	}
 388  
 389  	// Cache recently used bucket index:
 390  	// we're processing suffixes in sorted order
 391  	// and accessing buckets indexed by the
 392  	// byte before the sorted order, which still
 393  	// has very good locality.
 394  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
 395  	cB := c1
 396  	b := bucket[cB]
 397  	sa[b] = int32(k)
 398  	b++
 399  
 400  	for i := 0; i < len(sa); i++ {
 401  		j := int(sa[i])
 402  		if j == 0 {
 403  			// Skip empty entry.
 404  			continue
 405  		}
 406  		if j < 0 {
 407  			// Leave discovered type-S index for caller.
 408  			sa[i] = int32(-j)
 409  			continue
 410  		}
 411  		sa[i] = 0
 412  
 413  		// Index j was on work queue, meaning k := j-1 is L-type,
 414  		// so we can now place k correctly into sa.
 415  		// If k-1 is L-type, queue k for processing later in this loop.
 416  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
 417  		k := j - 1
 418  		c0, c1 := text[k-1], text[k]
 419  		if c0 < c1 {
 420  			k = -k
 421  		}
 422  
 423  		if cB != c1 {
 424  			bucket[cB] = b
 425  			cB = c1
 426  			b = bucket[cB]
 427  		}
 428  		sa[b] = int32(k)
 429  		b++
 430  	}
 431  }
 432  
 433  // induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
 434  // into sa, assuming that the leftmost L-type text indexes are already
 435  // inserted into sa, sorted by LMS-substring suffix, and at the
 436  // left end of the corresponding character bucket.
 437  // Each LMS-substring has the form (as a regexp) /S+L+S/:
 438  // one or more S-type, one or more L-type, final S-type.
 439  // induceSubS_8_32 leaves behind only the leftmost S-type text
 440  // index for each LMS-substring, in sorted order, at the right end of sa.
 441  // That is, it removes the L-type indexes that are present on entry,
 442  // and it inserts but then removes the interior S-type indexes too,
 443  // leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
 444  // (Only the LMS-substring start indexes are processed by the recursion.)
 445  func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
 446  	// Initialize positions for right side of character buckets.
 447  	bucketMax_8_32(text, freq, bucket)
 448  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
 449  
 450  	// Analogous to induceSubL_8_32 above,
 451  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
 452  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
 453  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
 454  	// But we want to distinguish a j-1 with j-2 of type S from type L.
 455  	// We can process the former but want to leave the latter for the caller.
 456  	// We record the difference by negating j-1 if it is preceded by type L.
 457  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
 458  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
 459  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
 460  	// and so on, in sorted but not necessarily adjacent order, until it finds
 461  	// one preceded by an index of type L, at which point it must stop.
 462  	// That index (preceded by one of type L) is an LMS-substring start.
 463  	//
 464  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
 465  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
 466  	// so that the loop finishes with the top of sa containing exactly
 467  	// the LMS-substring start indexes, sorted by LMS-substring.
 468  
 469  	// Cache recently used bucket index:
 470  	cB := byte(0)
 471  	b := bucket[cB]
 472  
 473  	top := len(sa)
 474  	for i := len(sa) - 1; i >= 0; i-- {
 475  		j := int(sa[i])
 476  		if j == 0 {
 477  			// Skip empty entry.
 478  			continue
 479  		}
 480  		sa[i] = 0
 481  		if j < 0 {
 482  			// Leave discovered LMS-substring start index for caller.
 483  			top--
 484  			sa[top] = int32(-j)
 485  			continue
 486  		}
 487  
 488  		// Index j was on work queue, meaning k := j-1 is S-type,
 489  		// so we can now place k correctly into sa.
 490  		// If k-1 is S-type, queue k for processing later in this loop.
 491  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
 492  		k := j - 1
 493  		c1 := text[k]
 494  		c0 := text[k-1]
 495  		if c0 > c1 {
 496  			k = -k
 497  		}
 498  
 499  		if cB != c1 {
 500  			bucket[cB] = b
 501  			cB = c1
 502  			b = bucket[cB]
 503  		}
 504  		b--
 505  		sa[b] = int32(k)
 506  	}
 507  }
 508  
 509  // length_8_32 computes and records the length of each LMS-substring in text.
 510  // The length of the LMS-substring at index j is stored at sa[j/2],
 511  // avoiding the LMS-substring indexes already stored in the top half of sa.
 512  // (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
 513  // There are two exceptions, made for optimizations in name_8_32 below.
 514  //
 515  // First, the final LMS-substring is recorded as having length 0, which is otherwise
 516  // impossible, instead of giving it a length that includes the implicit sentinel.
 517  // This ensures the final LMS-substring has length unequal to all others
 518  // and therefore can be detected as different without text comparison
 519  // (it is unequal because it is the only one that ends in the implicit sentinel,
 520  // and the text comparison would be problematic since the implicit sentinel
 521  // is not actually present at text[len(text)]).
 522  //
 523  // Second, to avoid text comparison entirely, if an LMS-substring is very short,
 524  // sa[j/2] records its actual text instead of its length, so that if two such
 525  // substrings have matching “length,” the text need not be read at all.
 526  // The definition of “very short” is that the text bytes must pack into a uint32,
 527  // and the unsigned encoding e must be ≥ len(text), so that it can be
 528  // distinguished from a valid length.
 529  func length_8_32(text []byte, sa []int32, numLMS int) {
 530  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
 531  
 532  	// The encoding of N text bytes into a “length” word
 533  	// adds 1 to each byte, packs them into the bottom
 534  	// N*8 bits of a word, and then bitwise inverts the result.
 535  	// That is, the text sequence A B C (hex 41 42 43)
 536  	// encodes as ^uint32(0x42_43_44).
 537  	// LMS-substrings can never start or end with 0xFF.
 538  	// Adding 1 ensures the encoded byte sequence never
 539  	// starts or ends with 0x00, so that present bytes can be
 540  	// distinguished from zero-padding in the top bits,
 541  	// so the length need not be separately encoded.
 542  	// Inverting the bytes increases the chance that a
 543  	// 4-byte encoding will still be ≥ len(text).
 544  	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
 545  	// then the high bit of the inversion will be set,
 546  	// making it clearly not a valid length (it would be a negative one).
 547  	//
 548  	// cx holds the pre-inverted encoding (the packed incremented bytes).
 549  	cx := uint32(0) // byte-only
 550  
 551  	// This stanza (until the blank line) is the "LMS-substring iterator",
 552  	// described in placeLMS_8_32 above, with one line added to maintain cx.
 553  	c0, c1, isTypeS := byte(0), byte(0), false
 554  	for i := len(text) - 1; i >= 0; i-- {
 555  		c0, c1 = text[i], c0
 556  		cx = cx<<8 | uint32(c1+1) // byte-only
 557  		if c0 < c1 {
 558  			isTypeS = true
 559  		} else if c0 > c1 && isTypeS {
 560  			isTypeS = false
 561  
 562  			// Index j = i+1 is the start of an LMS-substring.
 563  			// Compute length or encoded text to store in sa[j/2].
 564  			j := i + 1
 565  			var code int32
 566  			if end == 0 {
 567  				code = 0
 568  			} else {
 569  				code = int32(end - j)
 570  				if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
 571  					code = int32(^cx) // byte-only
 572  				} // byte-only
 573  			}
 574  			sa[j>>1] = code
 575  			end = j + 1
 576  			cx = uint32(c1 + 1) // byte-only
 577  		}
 578  	}
 579  }
 580  
 581  // assignID_8_32 assigns a dense ID numbering to the
 582  // set of LMS-substrings respecting string ordering and equality,
 583  // returning the maximum assigned ID.
 584  // For example given the input "ababab", the LMS-substrings
 585  // are "aba", "aba", and "ab", renumbered as 2 2 1.
 586  // sa[len(sa)-numLMS:] holds the LMS-substring indexes
 587  // sorted in string order, so to assign numbers we can
 588  // consider each in turn, removing adjacent duplicates.
 589  // The new ID for the LMS-substring at index j is written to sa[j/2],
 590  // overwriting the length previously stored there (by length_8_32 above).
 591  func assignID_8_32(text []byte, sa []int32, numLMS int) int {
 592  	id := 0
 593  	lastLen := int32(-1) // impossible
 594  	lastPos := int32(0)
 595  	for _, j := range sa[len(sa)-numLMS:] {
 596  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
 597  		n := sa[j/2]
 598  		if n != lastLen {
 599  			goto New
 600  		}
 601  		if uint32(n) >= uint32(len(text)) {
 602  			// “Length” is really encoded full text, and they match.
 603  			goto Same
 604  		}
 605  		{
 606  			// Compare actual texts.
 607  			n := int(n)
 608  			this := text[j:][:n]
 609  			last := text[lastPos:][:n]
 610  			for i := 0; i < n; i++ {
 611  				if this[i] != last[i] {
 612  					goto New
 613  				}
 614  			}
 615  			goto Same
 616  		}
 617  	New:
 618  		id++
 619  		lastPos = j
 620  		lastLen = n
 621  	Same:
 622  		sa[j/2] = int32(id)
 623  	}
 624  	return id
 625  }
 626  
 627  // map_32 maps the LMS-substrings in text to their new IDs,
 628  // producing the subproblem for the recursion.
 629  // The mapping itself was mostly applied by assignID_8_32:
 630  // sa[i] is either 0, the ID for the LMS-substring at index 2*i,
 631  // or the ID for the LMS-substring at index 2*i+1.
 632  // To produce the subproblem we need only remove the zeros
 633  // and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
 634  //
 635  // map_32 packs the result, which is the input to the recursion,
 636  // into the top of sa, so that the recursion result can be stored
 637  // in the bottom of sa, which sets up for expand_8_32 well.
 638  func map_32(sa []int32, numLMS int) {
 639  	w := len(sa)
 640  	for i := len(sa) / 2; i >= 0; i-- {
 641  		j := sa[i]
 642  		if j > 0 {
 643  			w--
 644  			sa[w] = j - 1
 645  		}
 646  	}
 647  }
 648  
 649  // recurse_32 calls sais_32 recursively to solve the subproblem we've built.
 650  // The subproblem is at the right end of sa, the suffix array result will be
 651  // written at the left end of sa, and the middle of sa is available for use as
 652  // temporary frequency and bucket storage.
 653  func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
 654  	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
 655  
 656  	// Set up temporary space for recursive call.
 657  	// We must pass sais_32 a tmp buffer with at least maxID entries.
 658  	//
 659  	// The subproblem is guaranteed to have length at most len(sa)/2,
 660  	// so that sa can hold both the subproblem and its suffix array.
 661  	// Nearly all the time, however, the subproblem has length < len(sa)/3,
 662  	// in which case there is a subproblem-sized middle of sa that
 663  	// we can reuse for temporary space (saTmp).
 664  	// When recurse_32 is called from sais_8_32, oldTmp is length 512
 665  	// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
 666  	// When deeper recursions come back to recurse_32, now oldTmp is
 667  	// the saTmp from the top-most recursion, it is typically larger than
 668  	// the current saTmp (because the current sa gets smaller and smaller
 669  	// as the recursion gets deeper), and we keep reusing that top-most
 670  	// large saTmp instead of the offered smaller ones.
 671  	//
 672  	// Why is the subproblem length so often just under len(sa)/3?
 673  	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
 674  	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
 675  	// in the input, perfect alternation of larger and smaller input bytes.
 676  	// Real text doesn't do that. If each L-type index is randomly followed
 677  	// by either an L-type or S-type index, then half the substrings will
 678  	// be of the form SLS, but the other half will be longer. Of that half,
 679  	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
 680  	// Not counting the final S in each (which overlaps the first S in the next),
 681  	// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
 682  	// The space we need is further reduced by the fact that many of the
 683  	// short patterns like SLS will often be the same character sequences
 684  	// repeated throughout the text, reducing maxID relative to numLMS.
 685  	//
 686  	// For short inputs, the averages may not run in our favor, but then we
 687  	// can often fall back to using the length-512 tmp available in the
 688  	// top-most call. (Also a short allocation would not be a big deal.)
 689  	//
 690  	// For pathological inputs, we fall back to allocating a new tmp of length
 691  	// max(maxID, numLMS/2). This level of the recursion needs maxID,
 692  	// and all deeper levels of the recursion will need no more than numLMS/2,
 693  	// so this one allocation is guaranteed to suffice for the entire stack
 694  	// of recursive calls.
 695  	tmp := oldTmp
 696  	if len(tmp) < len(saTmp) {
 697  		tmp = saTmp
 698  	}
 699  	if len(tmp) < numLMS {
 700  		// TestSAIS/forcealloc reaches this code.
 701  		n := maxID
 702  		if n < numLMS/2 {
 703  			n = numLMS / 2
 704  		}
 705  		tmp = []int32{:n}
 706  	}
 707  
 708  	// sais_32 requires that the caller arrange to clear dst,
 709  	// because in general the caller may know dst is
 710  	// freshly-allocated and already cleared. But this one is not.
 711  	clear(dst)
 712  	sais_32(text, maxID, dst, tmp)
 713  }
 714  
 715  // unmap_8_32 unmaps the subproblem back to the original.
 716  // sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
 717  // sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
 718  // The key part is that if the list says K that means the K'th substring.
 719  // We can replace sa[:numLMS] with the indexes of the LMS-substrings.
 720  // Then if the list says K it really means sa[K].
 721  // Having mapped the list back to LMS-substring indexes,
 722  // we can place those into the right buckets.
 723  func unmap_8_32(text []byte, sa []int32, numLMS int) {
 724  	unmap := sa[len(sa)-numLMS:]
 725  	j := len(unmap)
 726  
 727  	// "LMS-substring iterator" (see placeLMS_8_32 above).
 728  	c0, c1, isTypeS := byte(0), byte(0), false
 729  	for i := len(text) - 1; i >= 0; i-- {
 730  		c0, c1 = text[i], c0
 731  		if c0 < c1 {
 732  			isTypeS = true
 733  		} else if c0 > c1 && isTypeS {
 734  			isTypeS = false
 735  
 736  			// Populate inverse map.
 737  			j--
 738  			unmap[j] = int32(i + 1)
 739  		}
 740  	}
 741  
 742  	// Apply inverse map to subproblem suffix array.
 743  	sa = sa[:numLMS]
 744  	for i := 0; i < len(sa); i++ {
 745  		sa[i] = unmap[sa[i]]
 746  	}
 747  }
 748  
 749  // expand_8_32 distributes the compacted, sorted LMS-suffix indexes
 750  // from sa[:numLMS] into the tops of the appropriate buckets in sa,
 751  // preserving the sorted order and making room for the L-type indexes
 752  // to be slotted into the sorted sequence by induceL_8_32.
 753  func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
 754  	bucketMax_8_32(text, freq, bucket)
 755  	bucket = bucket[:256] // eliminate bound check for bucket[c] below
 756  
 757  	// Loop backward through sa, always tracking
 758  	// the next index to populate from sa[:numLMS].
 759  	// When we get to one, populate it.
 760  	// Zero the rest of the slots; they have dead values in them.
 761  	x := numLMS - 1
 762  	saX := sa[x]
 763  	c := text[saX]
 764  	b := bucket[c] - 1
 765  	bucket[c] = b
 766  
 767  	for i := len(sa) - 1; i >= 0; i-- {
 768  		if i != int(b) {
 769  			sa[i] = 0
 770  			continue
 771  		}
 772  		sa[i] = saX
 773  
 774  		// Load next entry to put down (if any).
 775  		if x > 0 {
 776  			x--
 777  			saX = sa[x] // TODO bounds check
 778  			c = text[saX]
 779  			b = bucket[c] - 1
 780  			bucket[c] = b
 781  		}
 782  	}
 783  }
 784  
 785  // induceL_8_32 inserts L-type text indexes into sa,
 786  // assuming that the leftmost S-type indexes are inserted
 787  // into sa, in sorted order, in the right bucket halves.
 788  // It leaves all the L-type indexes in sa, but the
 789  // leftmost L-type indexes are negated, to mark them
 790  // for processing by induceS_8_32.
 791  func induceL_8_32(text []byte, sa, freq, bucket []int32) {
 792  	// Initialize positions for left side of character buckets.
 793  	bucketMin_8_32(text, freq, bucket)
 794  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
 795  
 796  	// This scan is similar to the one in induceSubL_8_32 above.
 797  	// That one arranges to clear all but the leftmost L-type indexes.
 798  	// This scan leaves all the L-type indexes and the original S-type
 799  	// indexes, but it negates the positive leftmost L-type indexes
 800  	// (the ones that induceS_8_32 needs to process).
 801  
 802  	// expand_8_32 left out the implicit entry sa[-1] == len(text),
 803  	// corresponding to the identified type-L index len(text)-1.
 804  	// Process it before the left-to-right scan of sa proper.
 805  	// See body in loop for commentary.
 806  	k := len(text) - 1
 807  	c0, c1 := text[k-1], text[k]
 808  	if c0 < c1 {
 809  		k = -k
 810  	}
 811  
 812  	// Cache recently used bucket index.
 813  	cB := c1
 814  	b := bucket[cB]
 815  	sa[b] = int32(k)
 816  	b++
 817  
 818  	for i := 0; i < len(sa); i++ {
 819  		j := int(sa[i])
 820  		if j <= 0 {
 821  			// Skip empty or negated entry (including negated zero).
 822  			continue
 823  		}
 824  
 825  		// Index j was on work queue, meaning k := j-1 is L-type,
 826  		// so we can now place k correctly into sa.
 827  		// If k-1 is L-type, queue k for processing later in this loop.
 828  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
 829  		// If k is zero, k-1 doesn't exist, so we only need to leave it
 830  		// for the caller. The caller can't tell the difference between
 831  		// an empty slot and a non-empty zero, but there's no need
 832  		// to distinguish them anyway: the final suffix array will end up
 833  		// with one zero somewhere, and that will be a real zero.
 834  		k := j - 1
 835  		c1 := text[k]
 836  		if k > 0 {
 837  			if c0 := text[k-1]; c0 < c1 {
 838  				k = -k
 839  			}
 840  		}
 841  
 842  		if cB != c1 {
 843  			bucket[cB] = b
 844  			cB = c1
 845  			b = bucket[cB]
 846  		}
 847  		sa[b] = int32(k)
 848  		b++
 849  	}
 850  }
 851  
 852  func induceS_8_32(text []byte, sa, freq, bucket []int32) {
 853  	// Initialize positions for right side of character buckets.
 854  	bucketMax_8_32(text, freq, bucket)
 855  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
 856  
 857  	cB := byte(0)
 858  	b := bucket[cB]
 859  
 860  	for i := len(sa) - 1; i >= 0; i-- {
 861  		j := int(sa[i])
 862  		if j >= 0 {
 863  			// Skip non-flagged entry.
 864  			// (This loop can't see an empty entry; 0 means the real zero index.)
 865  			continue
 866  		}
 867  
 868  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
 869  		j = -j
 870  		sa[i] = int32(j)
 871  
 872  		// Index j was on work queue (encoded as -j but now decoded),
 873  		// meaning k := j-1 is L-type,
 874  		// so we can now place k correctly into sa.
 875  		// If k-1 is S-type, queue -k for processing later in this loop.
 876  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
 877  		// If k is zero, k-1 doesn't exist, so we only need to leave it
 878  		// for the caller.
 879  		k := j - 1
 880  		c1 := text[k]
 881  		if k > 0 {
 882  			if c0 := text[k-1]; c0 <= c1 {
 883  				k = -k
 884  			}
 885  		}
 886  
 887  		if cB != c1 {
 888  			bucket[cB] = b
 889  			cB = c1
 890  			b = bucket[cB]
 891  		}
 892  		b--
 893  		sa[b] = int32(k)
 894  	}
 895  }
 896