1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Suffix array construction by induced sorting (SAIS).
6 // See Ge Nong, Sen Zhang, and Wai Hong Chen,
7 // "Two Efficient Algorithms for Linear Time Suffix Array Construction",
8 // especially section 3 (https://ieeexplore.ieee.org/document/5582081).
9 // See also http://zork.net/~st/jottings/sais.html.
10 //
11 // With optimizations inspired by Yuta Mori's sais-lite
12 // (https://sites.google.com/site/yuta256/sais).
13 //
14 // And with other new optimizations.
15 16 // Many of these functions are parameterized by the sizes of
17 // the types they operate on. The generator gen.go makes
18 // copies of these functions for use with other sizes.
19 // Specifically:
20 //
21 // - A function with a name ending in _8_32 takes []byte and []int32 arguments
22 // and is duplicated into _32_32, _8_64, and _64_64 forms.
23 // The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64.
24 // Any lines in the function body that contain the text "byte-only" or "256"
25 // are stripped when creating _32_32 and _64_64 forms.
26 // (Those lines are typically 8-bit-specific optimizations.)
27 //
28 // - A function with a name ending only in _32 operates on []int32
29 // and is duplicated into a _64 form. (Note that it may still take a []byte,
30 // but there is no need for a version of the function in which the []byte
31 // is widened to a full integer array.)
32 33 // The overall runtime of this code is linear in the input size:
34 // it runs a sequence of linear passes to reduce the problem to
35 // a subproblem at most half as big, invokes itself recursively,
36 // and then runs a sequence of linear passes to turn the answer
37 // for the subproblem into the answer for the original problem.
38 // This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
39 //
40 // The outline of the code, with the forward and backward scans
41 // through O(N)-sized arrays called out, is:
42 //
43 // sais_I_N
44 // placeLMS_I_B
45 // bucketMax_I_B
46 // freq_I_B
47 // <scan +text> (1)
48 // <scan +freq> (2)
49 // <scan -text, random bucket> (3)
50 // induceSubL_I_B
51 // bucketMin_I_B
52 // freq_I_B
53 // <scan +text, often optimized away> (4)
54 // <scan +freq> (5)
55 // <scan +sa, random text, random bucket> (6)
56 // induceSubS_I_B
57 // bucketMax_I_B
58 // freq_I_B
59 // <scan +text, often optimized away> (7)
60 // <scan +freq> (8)
61 // <scan -sa, random text, random bucket> (9)
62 // assignID_I_B
63 // <scan +sa, random text substrings> (10)
64 // map_B
65 // <scan -sa> (11)
66 // recurse_B
67 // (recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
68 // unmap_I_B
69 // <scan -text> (12)
70 // <scan +sa> (13)
71 // expand_I_B
72 // bucketMax_I_B
73 // freq_I_B
74 // <scan +text, often optimized away> (14)
75 // <scan +freq> (15)
76 // <scan -sa, random text, random bucket> (16)
77 // induceL_I_B
78 // bucketMin_I_B
79 // freq_I_B
80 // <scan +text, often optimized away> (17)
81 // <scan +freq> (18)
82 // <scan +sa, random text, random bucket> (19)
83 // induceS_I_B
84 // bucketMax_I_B
85 // freq_I_B
86 // <scan +text, often optimized away> (20)
87 // <scan +freq> (21)
88 // <scan -sa, random text, random bucket> (22)
89 //
90 // Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
91 //
92 // The outline shows there are in general 22 scans through
93 // O(N)-sized arrays for a given level of the recursion.
94 // In the top level, operating on 8-bit input text,
95 // the six freq scans are fixed size (256) instead of potentially
96 // input-sized. Also, the frequency is counted once and cached
97 // whenever there is room to do so (there is nearly always room in general,
98 // and always room at the top level), which eliminates all but
99 // the first freq_I_B text scans (that is, 5 of the 6).
100 // So the top level of the recursion only does 22 - 6 - 5 = 11
101 // input-sized scans and a typical level does 16 scans.
102 //
103 // The linear scans do not cost anywhere near as much as
104 // the random accesses to the text made during a few of
105 // the scans (specifically #6, #9, #16, #19, #22 marked above).
106 // In real texts, there is not much but some locality to
107 // the accesses, due to the repetitive structure of the text
108 // (the same reason Burrows-Wheeler compression is so effective).
109 // For random inputs, there is no locality, which makes those
110 // accesses even more expensive, especially once the text
111 // no longer fits in cache.
112 // For example, running on 50 MB of Go source code, induceSubL_8_32
113 // (which runs only once, at the top level of the recursion)
114 // takes 0.44s, while on 50 MB of random input, it takes 2.55s.
115 // Nearly all the relative slowdown is explained by the text access:
116 //
117 // c0, c1 := text[k-1], text[k]
118 //
119 // That line runs for 0.23s on the Go text and 2.02s on random text.
120 121 //go:generate go run gen.go
122 123 package suffixarray
124 125 // text_32 returns the suffix array for the input text.
126 // It requires that len(text) fit in an int32
127 // and that the caller zero sa.
128 func text_32(text []byte, sa []int32) {
129 if int(int32(len(text))) != len(text) || len(text) != len(sa) {
130 panic("suffixarray: misuse of text_32")
131 }
132 sais_8_32(text, 256, sa, []int32{:2*256})
133 }
134 135 // sais_8_32 computes the suffix array of text.
136 // The text must contain only values in [0, textMax).
137 // The suffix array is stored in sa, which the caller
138 // must ensure is already zeroed.
139 // The caller must also provide temporary space tmp
140 // with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
141 // then the algorithm runs a little faster.
142 // If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
143 func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
144 if len(sa) != len(text) || len(tmp) < textMax {
145 panic("suffixarray: misuse of sais_8_32")
146 }
147 148 // Trivial base cases. Sorting 0 or 1 things is easy.
149 if len(text) == 0 {
150 return
151 }
152 if len(text) == 1 {
153 sa[0] = 0
154 return
155 }
156 157 // Establish slices indexed by text character
158 // holding character frequency and bucket-sort offsets.
159 // If there's only enough tmp for one slice,
160 // we make it the bucket offsets and recompute
161 // the character frequency each time we need it.
162 var freq, bucket []int32
163 if len(tmp) >= 2*textMax {
164 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
165 freq[0] = -1 // mark as uninitialized
166 } else {
167 freq, bucket = nil, tmp[:textMax]
168 }
169 170 // The SAIS algorithm.
171 // Each of these calls makes one scan through sa.
172 // See the individual functions for documentation
173 // about each's role in the algorithm.
174 numLMS := placeLMS_8_32(text, sa, freq, bucket)
175 if numLMS <= 1 {
176 // 0 or 1 items are already sorted. Do nothing.
177 } else {
178 induceSubL_8_32(text, sa, freq, bucket)
179 induceSubS_8_32(text, sa, freq, bucket)
180 length_8_32(text, sa, numLMS)
181 maxID := assignID_8_32(text, sa, numLMS)
182 if maxID < numLMS {
183 map_32(sa, numLMS)
184 recurse_32(sa, tmp, numLMS, maxID)
185 unmap_8_32(text, sa, numLMS)
186 } else {
187 // If maxID == numLMS, then each LMS-substring
188 // is unique, so the relative ordering of two LMS-suffixes
189 // is determined by just the leading LMS-substring.
190 // That is, the LMS-suffix sort order matches the
191 // (simpler) LMS-substring sort order.
192 // Copy the original LMS-substring order into the
193 // suffix array destination.
194 copy(sa, sa[len(sa)-numLMS:])
195 }
196 expand_8_32(text, freq, bucket, sa, numLMS)
197 }
198 induceL_8_32(text, sa, freq, bucket)
199 induceS_8_32(text, sa, freq, bucket)
200 201 // Mark for caller that we overwrote tmp.
202 tmp[0] = -1
203 }
204 205 // freq_8_32 returns the character frequencies
206 // for text, as a slice indexed by character value.
207 // If freq is nil, freq_8_32 uses and returns bucket.
208 // If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
209 // means the frequencies are already computed.
210 // If the frequency data is overwritten or uninitialized,
211 // the caller must set freq[0] = -1 to force recomputation
212 // the next time it is needed.
213 func freq_8_32(text []byte, freq, bucket []int32) []int32 {
214 if freq != nil && freq[0] >= 0 {
215 return freq // already computed
216 }
217 if freq == nil {
218 freq = bucket
219 }
220 221 freq = freq[:256] // eliminate bounds check for freq[c] below
222 clear(freq)
223 for _, c := range text {
224 freq[c]++
225 }
226 return freq
227 }
228 229 // bucketMin_8_32 stores into bucket[c] the minimum index
230 // in the bucket for character c in a bucket-sort of text.
231 func bucketMin_8_32(text []byte, freq, bucket []int32) {
232 freq = freq_8_32(text, freq, bucket)
233 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
234 bucket = bucket[:256] // eliminate bounds check for bucket[i] below
235 total := int32(0)
236 for i, n := range freq {
237 bucket[i] = total
238 total += n
239 }
240 }
241 242 // bucketMax_8_32 stores into bucket[c] the maximum index
243 // in the bucket for character c in a bucket-sort of text.
244 // The bucket indexes for c are [min, max).
245 // That is, max is one past the final index in that bucket.
246 func bucketMax_8_32(text []byte, freq, bucket []int32) {
247 freq = freq_8_32(text, freq, bucket)
248 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
249 bucket = bucket[:256] // eliminate bounds check for bucket[i] below
250 total := int32(0)
251 for i, n := range freq {
252 total += n
253 bucket[i] = total
254 }
255 }
256 257 // The SAIS algorithm proceeds in a sequence of scans through sa.
258 // Each of the following functions implements one scan,
259 // and the functions appear here in the order they execute in the algorithm.
260 261 // placeLMS_8_32 places into sa the indexes of the
262 // final characters of the LMS substrings of text,
263 // sorted into the rightmost ends of their correct buckets
264 // in the suffix array.
265 //
266 // The imaginary sentinel character at the end of the text
267 // is the final character of the final LMS substring, but there
268 // is no bucket for the imaginary sentinel character,
269 // which has a smaller value than any real character.
270 // The caller must therefore pretend that sa[-1] == len(text).
271 //
272 // The text indexes of LMS-substring characters are always ≥ 1
273 // (the first LMS-substring must be preceded by one or more L-type
274 // characters that are not part of any LMS-substring),
275 // so using 0 as a “not present” suffix array entry is safe,
276 // both in this function and in most later functions
277 // (until induceL_8_32 below).
278 func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
279 bucketMax_8_32(text, freq, bucket)
280 281 numLMS := 0
282 lastB := int32(-1)
283 bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
284 285 // The next stanza of code (until the blank line) loop backward
286 // over text, stopping to execute a code body at each position i
287 // such that text[i] is an L-character and text[i+1] is an S-character.
288 // That is, i+1 is the position of the start of an LMS-substring.
289 // These could be hoisted out into a function with a callback,
290 // but at a significant speed cost. Instead, we just write these
291 // seven lines a few times in this source file. The copies below
292 // refer back to the pattern established by this original as the
293 // "LMS-substring iterator".
294 //
295 // In every scan through the text, c0, c1 are successive characters of text.
296 // In this backward scan, c0 == text[i] and c1 == text[i+1].
297 // By scanning backward, we can keep track of whether the current
298 // position is type-S or type-L according to the usual definition:
299 //
300 // - position len(text) is type S with text[len(text)] == -1 (the sentinel)
301 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
302 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
303 //
304 // The backward scan lets us maintain the current type,
305 // update it when we see c0 != c1, and otherwise leave it alone.
306 // We want to identify all S positions with a preceding L.
307 // Position len(text) is one such position by definition, but we have
308 // nowhere to write it down, so we eliminate it by untruthfully
309 // setting isTypeS = false at the start of the loop.
310 c0, c1, isTypeS := byte(0), byte(0), false
311 for i := len(text) - 1; i >= 0; i-- {
312 c0, c1 = text[i], c0
313 if c0 < c1 {
314 isTypeS = true
315 } else if c0 > c1 && isTypeS {
316 isTypeS = false
317 318 // Bucket the index i+1 for the start of an LMS-substring.
319 b := bucket[c1] - 1
320 bucket[c1] = b
321 sa[b] = int32(i + 1)
322 lastB = b
323 numLMS++
324 }
325 }
326 327 // We recorded the LMS-substring starts but really want the ends.
328 // Luckily, with two differences, the start indexes and the end indexes are the same.
329 // The first difference is that the rightmost LMS-substring's end index is len(text),
330 // so the caller must pretend that sa[-1] == len(text), as noted above.
331 // The second difference is that the first leftmost LMS-substring start index
332 // does not end an earlier LMS-substring, so as an optimization we can omit
333 // that leftmost LMS-substring start index (the last one we wrote).
334 //
335 // Exception: if numLMS <= 1, the caller is not going to bother with
336 // the recursion at all and will treat the result as containing LMS-substring starts.
337 // In that case, we don't remove the final entry.
338 if numLMS > 1 {
339 sa[lastB] = 0
340 }
341 return numLMS
342 }
343 344 // induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
345 // into sa, assuming that the final characters of the LMS-substrings
346 // are already inserted into sa, sorted by final character, and at the
347 // right (not left) end of the corresponding character bucket.
348 // Each LMS-substring has the form (as a regexp) /S+L+S/:
349 // one or more S-type, one or more L-type, final S-type.
350 // induceSubL_8_32 leaves behind only the leftmost L-type text
351 // index for each LMS-substring. That is, it removes the final S-type
352 // indexes that are present on entry, and it inserts but then removes
353 // the interior L-type indexes too.
354 // (Only the leftmost L-type index is needed by induceSubS_8_32.)
355 func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
356 // Initialize positions for left side of character buckets.
357 bucketMin_8_32(text, freq, bucket)
358 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
359 360 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
361 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
362 // Because j-1 is type L, inserting it into sa now will sort it correctly.
363 // But we want to distinguish a j-1 with j-2 of type L from type S.
364 // We can process the former but want to leave the latter for the caller.
365 // We record the difference by negating j-1 if it is preceded by type S.
366 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
367 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
368 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
369 // and so on, in sorted but not necessarily adjacent order, until it finds
370 // one preceded by an index of type S, at which point it must stop.
371 //
372 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
373 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
374 // only the indexes of the leftmost L-type indexes for each LMS-substring.
375 //
376 // The suffix array sa therefore serves simultaneously as input, output,
377 // and a miraculously well-tailored work queue.
378 379 // placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
380 // corresponding to the identified type-L index len(text)-1.
381 // Process it before the left-to-right scan of sa proper.
382 // See body in loop for commentary.
383 k := len(text) - 1
384 c0, c1 := text[k-1], text[k]
385 if c0 < c1 {
386 k = -k
387 }
388 389 // Cache recently used bucket index:
390 // we're processing suffixes in sorted order
391 // and accessing buckets indexed by the
392 // byte before the sorted order, which still
393 // has very good locality.
394 // Invariant: b is cached, possibly dirty copy of bucket[cB].
395 cB := c1
396 b := bucket[cB]
397 sa[b] = int32(k)
398 b++
399 400 for i := 0; i < len(sa); i++ {
401 j := int(sa[i])
402 if j == 0 {
403 // Skip empty entry.
404 continue
405 }
406 if j < 0 {
407 // Leave discovered type-S index for caller.
408 sa[i] = int32(-j)
409 continue
410 }
411 sa[i] = 0
412 413 // Index j was on work queue, meaning k := j-1 is L-type,
414 // so we can now place k correctly into sa.
415 // If k-1 is L-type, queue k for processing later in this loop.
416 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
417 k := j - 1
418 c0, c1 := text[k-1], text[k]
419 if c0 < c1 {
420 k = -k
421 }
422 423 if cB != c1 {
424 bucket[cB] = b
425 cB = c1
426 b = bucket[cB]
427 }
428 sa[b] = int32(k)
429 b++
430 }
431 }
432 433 // induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
434 // into sa, assuming that the leftmost L-type text indexes are already
435 // inserted into sa, sorted by LMS-substring suffix, and at the
436 // left end of the corresponding character bucket.
437 // Each LMS-substring has the form (as a regexp) /S+L+S/:
438 // one or more S-type, one or more L-type, final S-type.
439 // induceSubS_8_32 leaves behind only the leftmost S-type text
440 // index for each LMS-substring, in sorted order, at the right end of sa.
441 // That is, it removes the L-type indexes that are present on entry,
442 // and it inserts but then removes the interior S-type indexes too,
443 // leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
444 // (Only the LMS-substring start indexes are processed by the recursion.)
445 func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
446 // Initialize positions for right side of character buckets.
447 bucketMax_8_32(text, freq, bucket)
448 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
449 450 // Analogous to induceSubL_8_32 above,
451 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
452 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
453 // Because j-1 is type S, inserting it into sa now will sort it correctly.
454 // But we want to distinguish a j-1 with j-2 of type S from type L.
455 // We can process the former but want to leave the latter for the caller.
456 // We record the difference by negating j-1 if it is preceded by type L.
457 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
458 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
459 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
460 // and so on, in sorted but not necessarily adjacent order, until it finds
461 // one preceded by an index of type L, at which point it must stop.
462 // That index (preceded by one of type L) is an LMS-substring start.
463 //
464 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
465 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
466 // so that the loop finishes with the top of sa containing exactly
467 // the LMS-substring start indexes, sorted by LMS-substring.
468 469 // Cache recently used bucket index:
470 cB := byte(0)
471 b := bucket[cB]
472 473 top := len(sa)
474 for i := len(sa) - 1; i >= 0; i-- {
475 j := int(sa[i])
476 if j == 0 {
477 // Skip empty entry.
478 continue
479 }
480 sa[i] = 0
481 if j < 0 {
482 // Leave discovered LMS-substring start index for caller.
483 top--
484 sa[top] = int32(-j)
485 continue
486 }
487 488 // Index j was on work queue, meaning k := j-1 is S-type,
489 // so we can now place k correctly into sa.
490 // If k-1 is S-type, queue k for processing later in this loop.
491 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
492 k := j - 1
493 c1 := text[k]
494 c0 := text[k-1]
495 if c0 > c1 {
496 k = -k
497 }
498 499 if cB != c1 {
500 bucket[cB] = b
501 cB = c1
502 b = bucket[cB]
503 }
504 b--
505 sa[b] = int32(k)
506 }
507 }
508 509 // length_8_32 computes and records the length of each LMS-substring in text.
510 // The length of the LMS-substring at index j is stored at sa[j/2],
511 // avoiding the LMS-substring indexes already stored in the top half of sa.
512 // (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
513 // There are two exceptions, made for optimizations in name_8_32 below.
514 //
515 // First, the final LMS-substring is recorded as having length 0, which is otherwise
516 // impossible, instead of giving it a length that includes the implicit sentinel.
517 // This ensures the final LMS-substring has length unequal to all others
518 // and therefore can be detected as different without text comparison
519 // (it is unequal because it is the only one that ends in the implicit sentinel,
520 // and the text comparison would be problematic since the implicit sentinel
521 // is not actually present at text[len(text)]).
522 //
523 // Second, to avoid text comparison entirely, if an LMS-substring is very short,
524 // sa[j/2] records its actual text instead of its length, so that if two such
525 // substrings have matching “length,” the text need not be read at all.
526 // The definition of “very short” is that the text bytes must pack into a uint32,
527 // and the unsigned encoding e must be ≥ len(text), so that it can be
528 // distinguished from a valid length.
529 func length_8_32(text []byte, sa []int32, numLMS int) {
530 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
531 532 // The encoding of N text bytes into a “length” word
533 // adds 1 to each byte, packs them into the bottom
534 // N*8 bits of a word, and then bitwise inverts the result.
535 // That is, the text sequence A B C (hex 41 42 43)
536 // encodes as ^uint32(0x42_43_44).
537 // LMS-substrings can never start or end with 0xFF.
538 // Adding 1 ensures the encoded byte sequence never
539 // starts or ends with 0x00, so that present bytes can be
540 // distinguished from zero-padding in the top bits,
541 // so the length need not be separately encoded.
542 // Inverting the bytes increases the chance that a
543 // 4-byte encoding will still be ≥ len(text).
544 // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
545 // then the high bit of the inversion will be set,
546 // making it clearly not a valid length (it would be a negative one).
547 //
548 // cx holds the pre-inverted encoding (the packed incremented bytes).
549 cx := uint32(0) // byte-only
550 551 // This stanza (until the blank line) is the "LMS-substring iterator",
552 // described in placeLMS_8_32 above, with one line added to maintain cx.
553 c0, c1, isTypeS := byte(0), byte(0), false
554 for i := len(text) - 1; i >= 0; i-- {
555 c0, c1 = text[i], c0
556 cx = cx<<8 | uint32(c1+1) // byte-only
557 if c0 < c1 {
558 isTypeS = true
559 } else if c0 > c1 && isTypeS {
560 isTypeS = false
561 562 // Index j = i+1 is the start of an LMS-substring.
563 // Compute length or encoded text to store in sa[j/2].
564 j := i + 1
565 var code int32
566 if end == 0 {
567 code = 0
568 } else {
569 code = int32(end - j)
570 if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
571 code = int32(^cx) // byte-only
572 } // byte-only
573 }
574 sa[j>>1] = code
575 end = j + 1
576 cx = uint32(c1 + 1) // byte-only
577 }
578 }
579 }
580 581 // assignID_8_32 assigns a dense ID numbering to the
582 // set of LMS-substrings respecting string ordering and equality,
583 // returning the maximum assigned ID.
584 // For example given the input "ababab", the LMS-substrings
585 // are "aba", "aba", and "ab", renumbered as 2 2 1.
586 // sa[len(sa)-numLMS:] holds the LMS-substring indexes
587 // sorted in string order, so to assign numbers we can
588 // consider each in turn, removing adjacent duplicates.
589 // The new ID for the LMS-substring at index j is written to sa[j/2],
590 // overwriting the length previously stored there (by length_8_32 above).
591 func assignID_8_32(text []byte, sa []int32, numLMS int) int {
592 id := 0
593 lastLen := int32(-1) // impossible
594 lastPos := int32(0)
595 for _, j := range sa[len(sa)-numLMS:] {
596 // Is the LMS-substring at index j new, or is it the same as the last one we saw?
597 n := sa[j/2]
598 if n != lastLen {
599 goto New
600 }
601 if uint32(n) >= uint32(len(text)) {
602 // “Length” is really encoded full text, and they match.
603 goto Same
604 }
605 {
606 // Compare actual texts.
607 n := int(n)
608 this := text[j:][:n]
609 last := text[lastPos:][:n]
610 for i := 0; i < n; i++ {
611 if this[i] != last[i] {
612 goto New
613 }
614 }
615 goto Same
616 }
617 New:
618 id++
619 lastPos = j
620 lastLen = n
621 Same:
622 sa[j/2] = int32(id)
623 }
624 return id
625 }
626 627 // map_32 maps the LMS-substrings in text to their new IDs,
628 // producing the subproblem for the recursion.
629 // The mapping itself was mostly applied by assignID_8_32:
630 // sa[i] is either 0, the ID for the LMS-substring at index 2*i,
631 // or the ID for the LMS-substring at index 2*i+1.
632 // To produce the subproblem we need only remove the zeros
633 // and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
634 //
635 // map_32 packs the result, which is the input to the recursion,
636 // into the top of sa, so that the recursion result can be stored
637 // in the bottom of sa, which sets up for expand_8_32 well.
638 func map_32(sa []int32, numLMS int) {
639 w := len(sa)
640 for i := len(sa) / 2; i >= 0; i-- {
641 j := sa[i]
642 if j > 0 {
643 w--
644 sa[w] = j - 1
645 }
646 }
647 }
648 649 // recurse_32 calls sais_32 recursively to solve the subproblem we've built.
650 // The subproblem is at the right end of sa, the suffix array result will be
651 // written at the left end of sa, and the middle of sa is available for use as
652 // temporary frequency and bucket storage.
653 func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
654 dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
655 656 // Set up temporary space for recursive call.
657 // We must pass sais_32 a tmp buffer with at least maxID entries.
658 //
659 // The subproblem is guaranteed to have length at most len(sa)/2,
660 // so that sa can hold both the subproblem and its suffix array.
661 // Nearly all the time, however, the subproblem has length < len(sa)/3,
662 // in which case there is a subproblem-sized middle of sa that
663 // we can reuse for temporary space (saTmp).
664 // When recurse_32 is called from sais_8_32, oldTmp is length 512
665 // (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
666 // When deeper recursions come back to recurse_32, now oldTmp is
667 // the saTmp from the top-most recursion, it is typically larger than
668 // the current saTmp (because the current sa gets smaller and smaller
669 // as the recursion gets deeper), and we keep reusing that top-most
670 // large saTmp instead of the offered smaller ones.
671 //
672 // Why is the subproblem length so often just under len(sa)/3?
673 // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
674 // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
675 // in the input, perfect alternation of larger and smaller input bytes.
676 // Real text doesn't do that. If each L-type index is randomly followed
677 // by either an L-type or S-type index, then half the substrings will
678 // be of the form SLS, but the other half will be longer. Of that half,
679 // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
680 // Not counting the final S in each (which overlaps the first S in the next),
681 // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
682 // The space we need is further reduced by the fact that many of the
683 // short patterns like SLS will often be the same character sequences
684 // repeated throughout the text, reducing maxID relative to numLMS.
685 //
686 // For short inputs, the averages may not run in our favor, but then we
687 // can often fall back to using the length-512 tmp available in the
688 // top-most call. (Also a short allocation would not be a big deal.)
689 //
690 // For pathological inputs, we fall back to allocating a new tmp of length
691 // max(maxID, numLMS/2). This level of the recursion needs maxID,
692 // and all deeper levels of the recursion will need no more than numLMS/2,
693 // so this one allocation is guaranteed to suffice for the entire stack
694 // of recursive calls.
695 tmp := oldTmp
696 if len(tmp) < len(saTmp) {
697 tmp = saTmp
698 }
699 if len(tmp) < numLMS {
700 // TestSAIS/forcealloc reaches this code.
701 n := maxID
702 if n < numLMS/2 {
703 n = numLMS / 2
704 }
705 tmp = []int32{:n}
706 }
707 708 // sais_32 requires that the caller arrange to clear dst,
709 // because in general the caller may know dst is
710 // freshly-allocated and already cleared. But this one is not.
711 clear(dst)
712 sais_32(text, maxID, dst, tmp)
713 }
714 715 // unmap_8_32 unmaps the subproblem back to the original.
716 // sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
717 // sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
718 // The key part is that if the list says K that means the K'th substring.
719 // We can replace sa[:numLMS] with the indexes of the LMS-substrings.
720 // Then if the list says K it really means sa[K].
721 // Having mapped the list back to LMS-substring indexes,
722 // we can place those into the right buckets.
723 func unmap_8_32(text []byte, sa []int32, numLMS int) {
724 unmap := sa[len(sa)-numLMS:]
725 j := len(unmap)
726 727 // "LMS-substring iterator" (see placeLMS_8_32 above).
728 c0, c1, isTypeS := byte(0), byte(0), false
729 for i := len(text) - 1; i >= 0; i-- {
730 c0, c1 = text[i], c0
731 if c0 < c1 {
732 isTypeS = true
733 } else if c0 > c1 && isTypeS {
734 isTypeS = false
735 736 // Populate inverse map.
737 j--
738 unmap[j] = int32(i + 1)
739 }
740 }
741 742 // Apply inverse map to subproblem suffix array.
743 sa = sa[:numLMS]
744 for i := 0; i < len(sa); i++ {
745 sa[i] = unmap[sa[i]]
746 }
747 }
748 749 // expand_8_32 distributes the compacted, sorted LMS-suffix indexes
750 // from sa[:numLMS] into the tops of the appropriate buckets in sa,
751 // preserving the sorted order and making room for the L-type indexes
752 // to be slotted into the sorted sequence by induceL_8_32.
753 func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
754 bucketMax_8_32(text, freq, bucket)
755 bucket = bucket[:256] // eliminate bound check for bucket[c] below
756 757 // Loop backward through sa, always tracking
758 // the next index to populate from sa[:numLMS].
759 // When we get to one, populate it.
760 // Zero the rest of the slots; they have dead values in them.
761 x := numLMS - 1
762 saX := sa[x]
763 c := text[saX]
764 b := bucket[c] - 1
765 bucket[c] = b
766 767 for i := len(sa) - 1; i >= 0; i-- {
768 if i != int(b) {
769 sa[i] = 0
770 continue
771 }
772 sa[i] = saX
773 774 // Load next entry to put down (if any).
775 if x > 0 {
776 x--
777 saX = sa[x] // TODO bounds check
778 c = text[saX]
779 b = bucket[c] - 1
780 bucket[c] = b
781 }
782 }
783 }
784 785 // induceL_8_32 inserts L-type text indexes into sa,
786 // assuming that the leftmost S-type indexes are inserted
787 // into sa, in sorted order, in the right bucket halves.
788 // It leaves all the L-type indexes in sa, but the
789 // leftmost L-type indexes are negated, to mark them
790 // for processing by induceS_8_32.
791 func induceL_8_32(text []byte, sa, freq, bucket []int32) {
792 // Initialize positions for left side of character buckets.
793 bucketMin_8_32(text, freq, bucket)
794 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
795 796 // This scan is similar to the one in induceSubL_8_32 above.
797 // That one arranges to clear all but the leftmost L-type indexes.
798 // This scan leaves all the L-type indexes and the original S-type
799 // indexes, but it negates the positive leftmost L-type indexes
800 // (the ones that induceS_8_32 needs to process).
801 802 // expand_8_32 left out the implicit entry sa[-1] == len(text),
803 // corresponding to the identified type-L index len(text)-1.
804 // Process it before the left-to-right scan of sa proper.
805 // See body in loop for commentary.
806 k := len(text) - 1
807 c0, c1 := text[k-1], text[k]
808 if c0 < c1 {
809 k = -k
810 }
811 812 // Cache recently used bucket index.
813 cB := c1
814 b := bucket[cB]
815 sa[b] = int32(k)
816 b++
817 818 for i := 0; i < len(sa); i++ {
819 j := int(sa[i])
820 if j <= 0 {
821 // Skip empty or negated entry (including negated zero).
822 continue
823 }
824 825 // Index j was on work queue, meaning k := j-1 is L-type,
826 // so we can now place k correctly into sa.
827 // If k-1 is L-type, queue k for processing later in this loop.
828 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
829 // If k is zero, k-1 doesn't exist, so we only need to leave it
830 // for the caller. The caller can't tell the difference between
831 // an empty slot and a non-empty zero, but there's no need
832 // to distinguish them anyway: the final suffix array will end up
833 // with one zero somewhere, and that will be a real zero.
834 k := j - 1
835 c1 := text[k]
836 if k > 0 {
837 if c0 := text[k-1]; c0 < c1 {
838 k = -k
839 }
840 }
841 842 if cB != c1 {
843 bucket[cB] = b
844 cB = c1
845 b = bucket[cB]
846 }
847 sa[b] = int32(k)
848 b++
849 }
850 }
851 852 func induceS_8_32(text []byte, sa, freq, bucket []int32) {
853 // Initialize positions for right side of character buckets.
854 bucketMax_8_32(text, freq, bucket)
855 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
856 857 cB := byte(0)
858 b := bucket[cB]
859 860 for i := len(sa) - 1; i >= 0; i-- {
861 j := int(sa[i])
862 if j >= 0 {
863 // Skip non-flagged entry.
864 // (This loop can't see an empty entry; 0 means the real zero index.)
865 continue
866 }
867 868 // Negative j is a work queue entry; rewrite to positive j for final suffix array.
869 j = -j
870 sa[i] = int32(j)
871 872 // Index j was on work queue (encoded as -j but now decoded),
873 // meaning k := j-1 is L-type,
874 // so we can now place k correctly into sa.
875 // If k-1 is S-type, queue -k for processing later in this loop.
876 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
877 // If k is zero, k-1 doesn't exist, so we only need to leave it
878 // for the caller.
879 k := j - 1
880 c1 := text[k]
881 if k > 0 {
882 if c0 := text[k-1]; c0 <= c1 {
883 k = -k
884 }
885 }
886 887 if cB != c1 {
888 bucket[cB] = b
889 cB = c1
890 b = bucket[cB]
891 }
892 b--
893 sa[b] = int32(k)
894 }
895 }
896