1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Code generated by go generate; DO NOT EDIT.
6 7 package suffixarray
8 9 func text_64(text []byte, sa []int64) {
10 if int(int64(len(text))) != len(text) || len(text) != len(sa) {
11 panic("suffixarray: misuse of text_64")
12 }
13 sais_8_64(text, 256, sa, []int64{:2*256})
14 }
15 16 func sais_8_64(text []byte, textMax int, sa, tmp []int64) {
17 if len(sa) != len(text) || len(tmp) < textMax {
18 panic("suffixarray: misuse of sais_8_64")
19 }
20 21 // Trivial base cases. Sorting 0 or 1 things is easy.
22 if len(text) == 0 {
23 return
24 }
25 if len(text) == 1 {
26 sa[0] = 0
27 return
28 }
29 30 // Establish slices indexed by text character
31 // holding character frequency and bucket-sort offsets.
32 // If there's only enough tmp for one slice,
33 // we make it the bucket offsets and recompute
34 // the character frequency each time we need it.
35 var freq, bucket []int64
36 if len(tmp) >= 2*textMax {
37 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
38 freq[0] = -1 // mark as uninitialized
39 } else {
40 freq, bucket = nil, tmp[:textMax]
41 }
42 43 // The SAIS algorithm.
44 // Each of these calls makes one scan through sa.
45 // See the individual functions for documentation
46 // about each's role in the algorithm.
47 numLMS := placeLMS_8_64(text, sa, freq, bucket)
48 if numLMS <= 1 {
49 // 0 or 1 items are already sorted. Do nothing.
50 } else {
51 induceSubL_8_64(text, sa, freq, bucket)
52 induceSubS_8_64(text, sa, freq, bucket)
53 length_8_64(text, sa, numLMS)
54 maxID := assignID_8_64(text, sa, numLMS)
55 if maxID < numLMS {
56 map_64(sa, numLMS)
57 recurse_64(sa, tmp, numLMS, maxID)
58 unmap_8_64(text, sa, numLMS)
59 } else {
60 // If maxID == numLMS, then each LMS-substring
61 // is unique, so the relative ordering of two LMS-suffixes
62 // is determined by just the leading LMS-substring.
63 // That is, the LMS-suffix sort order matches the
64 // (simpler) LMS-substring sort order.
65 // Copy the original LMS-substring order into the
66 // suffix array destination.
67 copy(sa, sa[len(sa)-numLMS:])
68 }
69 expand_8_64(text, freq, bucket, sa, numLMS)
70 }
71 induceL_8_64(text, sa, freq, bucket)
72 induceS_8_64(text, sa, freq, bucket)
73 74 // Mark for caller that we overwrote tmp.
75 tmp[0] = -1
76 }
77 78 func sais_32(text []int32, textMax int, sa, tmp []int32) {
79 if len(sa) != len(text) || len(tmp) < textMax {
80 panic("suffixarray: misuse of sais_32")
81 }
82 83 // Trivial base cases. Sorting 0 or 1 things is easy.
84 if len(text) == 0 {
85 return
86 }
87 if len(text) == 1 {
88 sa[0] = 0
89 return
90 }
91 92 // Establish slices indexed by text character
93 // holding character frequency and bucket-sort offsets.
94 // If there's only enough tmp for one slice,
95 // we make it the bucket offsets and recompute
96 // the character frequency each time we need it.
97 var freq, bucket []int32
98 if len(tmp) >= 2*textMax {
99 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
100 freq[0] = -1 // mark as uninitialized
101 } else {
102 freq, bucket = nil, tmp[:textMax]
103 }
104 105 // The SAIS algorithm.
106 // Each of these calls makes one scan through sa.
107 // See the individual functions for documentation
108 // about each's role in the algorithm.
109 numLMS := placeLMS_32(text, sa, freq, bucket)
110 if numLMS <= 1 {
111 // 0 or 1 items are already sorted. Do nothing.
112 } else {
113 induceSubL_32(text, sa, freq, bucket)
114 induceSubS_32(text, sa, freq, bucket)
115 length_32(text, sa, numLMS)
116 maxID := assignID_32(text, sa, numLMS)
117 if maxID < numLMS {
118 map_32(sa, numLMS)
119 recurse_32(sa, tmp, numLMS, maxID)
120 unmap_32(text, sa, numLMS)
121 } else {
122 // If maxID == numLMS, then each LMS-substring
123 // is unique, so the relative ordering of two LMS-suffixes
124 // is determined by just the leading LMS-substring.
125 // That is, the LMS-suffix sort order matches the
126 // (simpler) LMS-substring sort order.
127 // Copy the original LMS-substring order into the
128 // suffix array destination.
129 copy(sa, sa[len(sa)-numLMS:])
130 }
131 expand_32(text, freq, bucket, sa, numLMS)
132 }
133 induceL_32(text, sa, freq, bucket)
134 induceS_32(text, sa, freq, bucket)
135 136 // Mark for caller that we overwrote tmp.
137 tmp[0] = -1
138 }
139 140 func sais_64(text []int64, textMax int, sa, tmp []int64) {
141 if len(sa) != len(text) || len(tmp) < textMax {
142 panic("suffixarray: misuse of sais_64")
143 }
144 145 // Trivial base cases. Sorting 0 or 1 things is easy.
146 if len(text) == 0 {
147 return
148 }
149 if len(text) == 1 {
150 sa[0] = 0
151 return
152 }
153 154 // Establish slices indexed by text character
155 // holding character frequency and bucket-sort offsets.
156 // If there's only enough tmp for one slice,
157 // we make it the bucket offsets and recompute
158 // the character frequency each time we need it.
159 var freq, bucket []int64
160 if len(tmp) >= 2*textMax {
161 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
162 freq[0] = -1 // mark as uninitialized
163 } else {
164 freq, bucket = nil, tmp[:textMax]
165 }
166 167 // The SAIS algorithm.
168 // Each of these calls makes one scan through sa.
169 // See the individual functions for documentation
170 // about each's role in the algorithm.
171 numLMS := placeLMS_64(text, sa, freq, bucket)
172 if numLMS <= 1 {
173 // 0 or 1 items are already sorted. Do nothing.
174 } else {
175 induceSubL_64(text, sa, freq, bucket)
176 induceSubS_64(text, sa, freq, bucket)
177 length_64(text, sa, numLMS)
178 maxID := assignID_64(text, sa, numLMS)
179 if maxID < numLMS {
180 map_64(sa, numLMS)
181 recurse_64(sa, tmp, numLMS, maxID)
182 unmap_64(text, sa, numLMS)
183 } else {
184 // If maxID == numLMS, then each LMS-substring
185 // is unique, so the relative ordering of two LMS-suffixes
186 // is determined by just the leading LMS-substring.
187 // That is, the LMS-suffix sort order matches the
188 // (simpler) LMS-substring sort order.
189 // Copy the original LMS-substring order into the
190 // suffix array destination.
191 copy(sa, sa[len(sa)-numLMS:])
192 }
193 expand_64(text, freq, bucket, sa, numLMS)
194 }
195 induceL_64(text, sa, freq, bucket)
196 induceS_64(text, sa, freq, bucket)
197 198 // Mark for caller that we overwrote tmp.
199 tmp[0] = -1
200 }
201 202 func freq_8_64(text []byte, freq, bucket []int64) []int64 {
203 if freq != nil && freq[0] >= 0 {
204 return freq // already computed
205 }
206 if freq == nil {
207 freq = bucket
208 }
209 210 freq = freq[:256] // eliminate bounds check for freq[c] below
211 clear(freq)
212 for _, c := range text {
213 freq[c]++
214 }
215 return freq
216 }
217 218 func freq_32(text []int32, freq, bucket []int32) []int32 {
219 if freq != nil && freq[0] >= 0 {
220 return freq // already computed
221 }
222 if freq == nil {
223 freq = bucket
224 }
225 226 clear(freq)
227 for _, c := range text {
228 freq[c]++
229 }
230 return freq
231 }
232 233 func freq_64(text []int64, freq, bucket []int64) []int64 {
234 if freq != nil && freq[0] >= 0 {
235 return freq // already computed
236 }
237 if freq == nil {
238 freq = bucket
239 }
240 241 clear(freq)
242 for _, c := range text {
243 freq[c]++
244 }
245 return freq
246 }
247 248 func bucketMin_8_64(text []byte, freq, bucket []int64) {
249 freq = freq_8_64(text, freq, bucket)
250 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
251 bucket = bucket[:256] // eliminate bounds check for bucket[i] below
252 total := int64(0)
253 for i, n := range freq {
254 bucket[i] = total
255 total += n
256 }
257 }
258 259 func bucketMin_32(text []int32, freq, bucket []int32) {
260 freq = freq_32(text, freq, bucket)
261 total := int32(0)
262 for i, n := range freq {
263 bucket[i] = total
264 total += n
265 }
266 }
267 268 func bucketMin_64(text []int64, freq, bucket []int64) {
269 freq = freq_64(text, freq, bucket)
270 total := int64(0)
271 for i, n := range freq {
272 bucket[i] = total
273 total += n
274 }
275 }
276 277 func bucketMax_8_64(text []byte, freq, bucket []int64) {
278 freq = freq_8_64(text, freq, bucket)
279 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
280 bucket = bucket[:256] // eliminate bounds check for bucket[i] below
281 total := int64(0)
282 for i, n := range freq {
283 total += n
284 bucket[i] = total
285 }
286 }
287 288 func bucketMax_32(text []int32, freq, bucket []int32) {
289 freq = freq_32(text, freq, bucket)
290 total := int32(0)
291 for i, n := range freq {
292 total += n
293 bucket[i] = total
294 }
295 }
296 297 func bucketMax_64(text []int64, freq, bucket []int64) {
298 freq = freq_64(text, freq, bucket)
299 total := int64(0)
300 for i, n := range freq {
301 total += n
302 bucket[i] = total
303 }
304 }
305 306 func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int {
307 bucketMax_8_64(text, freq, bucket)
308 309 numLMS := 0
310 lastB := int64(-1)
311 bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
312 313 // The next stanza of code (until the blank line) loop backward
314 // over text, stopping to execute a code body at each position i
315 // such that text[i] is an L-character and text[i+1] is an S-character.
316 // That is, i+1 is the position of the start of an LMS-substring.
317 // These could be hoisted out into a function with a callback,
318 // but at a significant speed cost. Instead, we just write these
319 // seven lines a few times in this source file. The copies below
320 // refer back to the pattern established by this original as the
321 // "LMS-substring iterator".
322 //
323 // In every scan through the text, c0, c1 are successive characters of text.
324 // In this backward scan, c0 == text[i] and c1 == text[i+1].
325 // By scanning backward, we can keep track of whether the current
326 // position is type-S or type-L according to the usual definition:
327 //
328 // - position len(text) is type S with text[len(text)] == -1 (the sentinel)
329 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
330 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
331 //
332 // The backward scan lets us maintain the current type,
333 // update it when we see c0 != c1, and otherwise leave it alone.
334 // We want to identify all S positions with a preceding L.
335 // Position len(text) is one such position by definition, but we have
336 // nowhere to write it down, so we eliminate it by untruthfully
337 // setting isTypeS = false at the start of the loop.
338 c0, c1, isTypeS := byte(0), byte(0), false
339 for i := len(text) - 1; i >= 0; i-- {
340 c0, c1 = text[i], c0
341 if c0 < c1 {
342 isTypeS = true
343 } else if c0 > c1 && isTypeS {
344 isTypeS = false
345 346 // Bucket the index i+1 for the start of an LMS-substring.
347 b := bucket[c1] - 1
348 bucket[c1] = b
349 sa[b] = int64(i + 1)
350 lastB = b
351 numLMS++
352 }
353 }
354 355 // We recorded the LMS-substring starts but really want the ends.
356 // Luckily, with two differences, the start indexes and the end indexes are the same.
357 // The first difference is that the rightmost LMS-substring's end index is len(text),
358 // so the caller must pretend that sa[-1] == len(text), as noted above.
359 // The second difference is that the first leftmost LMS-substring start index
360 // does not end an earlier LMS-substring, so as an optimization we can omit
361 // that leftmost LMS-substring start index (the last one we wrote).
362 //
363 // Exception: if numLMS <= 1, the caller is not going to bother with
364 // the recursion at all and will treat the result as containing LMS-substring starts.
365 // In that case, we don't remove the final entry.
366 if numLMS > 1 {
367 sa[lastB] = 0
368 }
369 return numLMS
370 }
371 372 func placeLMS_32(text []int32, sa, freq, bucket []int32) int {
373 bucketMax_32(text, freq, bucket)
374 375 numLMS := 0
376 lastB := int32(-1)
377 378 // The next stanza of code (until the blank line) loop backward
379 // over text, stopping to execute a code body at each position i
380 // such that text[i] is an L-character and text[i+1] is an S-character.
381 // That is, i+1 is the position of the start of an LMS-substring.
382 // These could be hoisted out into a function with a callback,
383 // but at a significant speed cost. Instead, we just write these
384 // seven lines a few times in this source file. The copies below
385 // refer back to the pattern established by this original as the
386 // "LMS-substring iterator".
387 //
388 // In every scan through the text, c0, c1 are successive characters of text.
389 // In this backward scan, c0 == text[i] and c1 == text[i+1].
390 // By scanning backward, we can keep track of whether the current
391 // position is type-S or type-L according to the usual definition:
392 //
393 // - position len(text) is type S with text[len(text)] == -1 (the sentinel)
394 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
395 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
396 //
397 // The backward scan lets us maintain the current type,
398 // update it when we see c0 != c1, and otherwise leave it alone.
399 // We want to identify all S positions with a preceding L.
400 // Position len(text) is one such position by definition, but we have
401 // nowhere to write it down, so we eliminate it by untruthfully
402 // setting isTypeS = false at the start of the loop.
403 c0, c1, isTypeS := int32(0), int32(0), false
404 for i := len(text) - 1; i >= 0; i-- {
405 c0, c1 = text[i], c0
406 if c0 < c1 {
407 isTypeS = true
408 } else if c0 > c1 && isTypeS {
409 isTypeS = false
410 411 // Bucket the index i+1 for the start of an LMS-substring.
412 b := bucket[c1] - 1
413 bucket[c1] = b
414 sa[b] = int32(i + 1)
415 lastB = b
416 numLMS++
417 }
418 }
419 420 // We recorded the LMS-substring starts but really want the ends.
421 // Luckily, with two differences, the start indexes and the end indexes are the same.
422 // The first difference is that the rightmost LMS-substring's end index is len(text),
423 // so the caller must pretend that sa[-1] == len(text), as noted above.
424 // The second difference is that the first leftmost LMS-substring start index
425 // does not end an earlier LMS-substring, so as an optimization we can omit
426 // that leftmost LMS-substring start index (the last one we wrote).
427 //
428 // Exception: if numLMS <= 1, the caller is not going to bother with
429 // the recursion at all and will treat the result as containing LMS-substring starts.
430 // In that case, we don't remove the final entry.
431 if numLMS > 1 {
432 sa[lastB] = 0
433 }
434 return numLMS
435 }
436 437 func placeLMS_64(text []int64, sa, freq, bucket []int64) int {
438 bucketMax_64(text, freq, bucket)
439 440 numLMS := 0
441 lastB := int64(-1)
442 443 // The next stanza of code (until the blank line) loop backward
444 // over text, stopping to execute a code body at each position i
445 // such that text[i] is an L-character and text[i+1] is an S-character.
446 // That is, i+1 is the position of the start of an LMS-substring.
447 // These could be hoisted out into a function with a callback,
448 // but at a significant speed cost. Instead, we just write these
449 // seven lines a few times in this source file. The copies below
450 // refer back to the pattern established by this original as the
451 // "LMS-substring iterator".
452 //
453 // In every scan through the text, c0, c1 are successive characters of text.
454 // In this backward scan, c0 == text[i] and c1 == text[i+1].
455 // By scanning backward, we can keep track of whether the current
456 // position is type-S or type-L according to the usual definition:
457 //
458 // - position len(text) is type S with text[len(text)] == -1 (the sentinel)
459 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
460 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
461 //
462 // The backward scan lets us maintain the current type,
463 // update it when we see c0 != c1, and otherwise leave it alone.
464 // We want to identify all S positions with a preceding L.
465 // Position len(text) is one such position by definition, but we have
466 // nowhere to write it down, so we eliminate it by untruthfully
467 // setting isTypeS = false at the start of the loop.
468 c0, c1, isTypeS := int64(0), int64(0), false
469 for i := len(text) - 1; i >= 0; i-- {
470 c0, c1 = text[i], c0
471 if c0 < c1 {
472 isTypeS = true
473 } else if c0 > c1 && isTypeS {
474 isTypeS = false
475 476 // Bucket the index i+1 for the start of an LMS-substring.
477 b := bucket[c1] - 1
478 bucket[c1] = b
479 sa[b] = int64(i + 1)
480 lastB = b
481 numLMS++
482 }
483 }
484 485 // We recorded the LMS-substring starts but really want the ends.
486 // Luckily, with two differences, the start indexes and the end indexes are the same.
487 // The first difference is that the rightmost LMS-substring's end index is len(text),
488 // so the caller must pretend that sa[-1] == len(text), as noted above.
489 // The second difference is that the first leftmost LMS-substring start index
490 // does not end an earlier LMS-substring, so as an optimization we can omit
491 // that leftmost LMS-substring start index (the last one we wrote).
492 //
493 // Exception: if numLMS <= 1, the caller is not going to bother with
494 // the recursion at all and will treat the result as containing LMS-substring starts.
495 // In that case, we don't remove the final entry.
496 if numLMS > 1 {
497 sa[lastB] = 0
498 }
499 return numLMS
500 }
501 502 func induceSubL_8_64(text []byte, sa, freq, bucket []int64) {
503 // Initialize positions for left side of character buckets.
504 bucketMin_8_64(text, freq, bucket)
505 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
506 507 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
508 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
509 // Because j-1 is type L, inserting it into sa now will sort it correctly.
510 // But we want to distinguish a j-1 with j-2 of type L from type S.
511 // We can process the former but want to leave the latter for the caller.
512 // We record the difference by negating j-1 if it is preceded by type S.
513 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
514 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
515 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
516 // and so on, in sorted but not necessarily adjacent order, until it finds
517 // one preceded by an index of type S, at which point it must stop.
518 //
519 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
520 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
521 // only the indexes of the leftmost L-type indexes for each LMS-substring.
522 //
523 // The suffix array sa therefore serves simultaneously as input, output,
524 // and a miraculously well-tailored work queue.
525 526 // placeLMS_8_64 left out the implicit entry sa[-1] == len(text),
527 // corresponding to the identified type-L index len(text)-1.
528 // Process it before the left-to-right scan of sa proper.
529 // See body in loop for commentary.
530 k := len(text) - 1
531 c0, c1 := text[k-1], text[k]
532 if c0 < c1 {
533 k = -k
534 }
535 536 // Cache recently used bucket index:
537 // we're processing suffixes in sorted order
538 // and accessing buckets indexed by the
539 // byte before the sorted order, which still
540 // has very good locality.
541 // Invariant: b is cached, possibly dirty copy of bucket[cB].
542 cB := c1
543 b := bucket[cB]
544 sa[b] = int64(k)
545 b++
546 547 for i := 0; i < len(sa); i++ {
548 j := int(sa[i])
549 if j == 0 {
550 // Skip empty entry.
551 continue
552 }
553 if j < 0 {
554 // Leave discovered type-S index for caller.
555 sa[i] = int64(-j)
556 continue
557 }
558 sa[i] = 0
559 560 // Index j was on work queue, meaning k := j-1 is L-type,
561 // so we can now place k correctly into sa.
562 // If k-1 is L-type, queue k for processing later in this loop.
563 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
564 k := j - 1
565 c0, c1 := text[k-1], text[k]
566 if c0 < c1 {
567 k = -k
568 }
569 570 if cB != c1 {
571 bucket[cB] = b
572 cB = c1
573 b = bucket[cB]
574 }
575 sa[b] = int64(k)
576 b++
577 }
578 }
579 580 func induceSubL_32(text []int32, sa, freq, bucket []int32) {
581 // Initialize positions for left side of character buckets.
582 bucketMin_32(text, freq, bucket)
583 584 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
585 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
586 // Because j-1 is type L, inserting it into sa now will sort it correctly.
587 // But we want to distinguish a j-1 with j-2 of type L from type S.
588 // We can process the former but want to leave the latter for the caller.
589 // We record the difference by negating j-1 if it is preceded by type S.
590 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
591 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
592 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
593 // and so on, in sorted but not necessarily adjacent order, until it finds
594 // one preceded by an index of type S, at which point it must stop.
595 //
596 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
597 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
598 // only the indexes of the leftmost L-type indexes for each LMS-substring.
599 //
600 // The suffix array sa therefore serves simultaneously as input, output,
601 // and a miraculously well-tailored work queue.
602 603 // placeLMS_32 left out the implicit entry sa[-1] == len(text),
604 // corresponding to the identified type-L index len(text)-1.
605 // Process it before the left-to-right scan of sa proper.
606 // See body in loop for commentary.
607 k := len(text) - 1
608 c0, c1 := text[k-1], text[k]
609 if c0 < c1 {
610 k = -k
611 }
612 613 // Cache recently used bucket index:
614 // we're processing suffixes in sorted order
615 // and accessing buckets indexed by the
616 // int32 before the sorted order, which still
617 // has very good locality.
618 // Invariant: b is cached, possibly dirty copy of bucket[cB].
619 cB := c1
620 b := bucket[cB]
621 sa[b] = int32(k)
622 b++
623 624 for i := 0; i < len(sa); i++ {
625 j := int(sa[i])
626 if j == 0 {
627 // Skip empty entry.
628 continue
629 }
630 if j < 0 {
631 // Leave discovered type-S index for caller.
632 sa[i] = int32(-j)
633 continue
634 }
635 sa[i] = 0
636 637 // Index j was on work queue, meaning k := j-1 is L-type,
638 // so we can now place k correctly into sa.
639 // If k-1 is L-type, queue k for processing later in this loop.
640 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
641 k := j - 1
642 c0, c1 := text[k-1], text[k]
643 if c0 < c1 {
644 k = -k
645 }
646 647 if cB != c1 {
648 bucket[cB] = b
649 cB = c1
650 b = bucket[cB]
651 }
652 sa[b] = int32(k)
653 b++
654 }
655 }
656 657 func induceSubL_64(text []int64, sa, freq, bucket []int64) {
658 // Initialize positions for left side of character buckets.
659 bucketMin_64(text, freq, bucket)
660 661 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
662 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
663 // Because j-1 is type L, inserting it into sa now will sort it correctly.
664 // But we want to distinguish a j-1 with j-2 of type L from type S.
665 // We can process the former but want to leave the latter for the caller.
666 // We record the difference by negating j-1 if it is preceded by type S.
667 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
668 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
669 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
670 // and so on, in sorted but not necessarily adjacent order, until it finds
671 // one preceded by an index of type S, at which point it must stop.
672 //
673 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
674 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
675 // only the indexes of the leftmost L-type indexes for each LMS-substring.
676 //
677 // The suffix array sa therefore serves simultaneously as input, output,
678 // and a miraculously well-tailored work queue.
679 680 // placeLMS_64 left out the implicit entry sa[-1] == len(text),
681 // corresponding to the identified type-L index len(text)-1.
682 // Process it before the left-to-right scan of sa proper.
683 // See body in loop for commentary.
684 k := len(text) - 1
685 c0, c1 := text[k-1], text[k]
686 if c0 < c1 {
687 k = -k
688 }
689 690 // Cache recently used bucket index:
691 // we're processing suffixes in sorted order
692 // and accessing buckets indexed by the
693 // int64 before the sorted order, which still
694 // has very good locality.
695 // Invariant: b is cached, possibly dirty copy of bucket[cB].
696 cB := c1
697 b := bucket[cB]
698 sa[b] = int64(k)
699 b++
700 701 for i := 0; i < len(sa); i++ {
702 j := int(sa[i])
703 if j == 0 {
704 // Skip empty entry.
705 continue
706 }
707 if j < 0 {
708 // Leave discovered type-S index for caller.
709 sa[i] = int64(-j)
710 continue
711 }
712 sa[i] = 0
713 714 // Index j was on work queue, meaning k := j-1 is L-type,
715 // so we can now place k correctly into sa.
716 // If k-1 is L-type, queue k for processing later in this loop.
717 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
718 k := j - 1
719 c0, c1 := text[k-1], text[k]
720 if c0 < c1 {
721 k = -k
722 }
723 724 if cB != c1 {
725 bucket[cB] = b
726 cB = c1
727 b = bucket[cB]
728 }
729 sa[b] = int64(k)
730 b++
731 }
732 }
733 734 func induceSubS_8_64(text []byte, sa, freq, bucket []int64) {
735 // Initialize positions for right side of character buckets.
736 bucketMax_8_64(text, freq, bucket)
737 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
738 739 // Analogous to induceSubL_8_64 above,
740 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
741 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
742 // Because j-1 is type S, inserting it into sa now will sort it correctly.
743 // But we want to distinguish a j-1 with j-2 of type S from type L.
744 // We can process the former but want to leave the latter for the caller.
745 // We record the difference by negating j-1 if it is preceded by type L.
746 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
747 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
748 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
749 // and so on, in sorted but not necessarily adjacent order, until it finds
750 // one preceded by an index of type L, at which point it must stop.
751 // That index (preceded by one of type L) is an LMS-substring start.
752 //
753 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
754 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
755 // so that the loop finishes with the top of sa containing exactly
756 // the LMS-substring start indexes, sorted by LMS-substring.
757 758 // Cache recently used bucket index:
759 cB := byte(0)
760 b := bucket[cB]
761 762 top := len(sa)
763 for i := len(sa) - 1; i >= 0; i-- {
764 j := int(sa[i])
765 if j == 0 {
766 // Skip empty entry.
767 continue
768 }
769 sa[i] = 0
770 if j < 0 {
771 // Leave discovered LMS-substring start index for caller.
772 top--
773 sa[top] = int64(-j)
774 continue
775 }
776 777 // Index j was on work queue, meaning k := j-1 is S-type,
778 // so we can now place k correctly into sa.
779 // If k-1 is S-type, queue k for processing later in this loop.
780 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
781 k := j - 1
782 c1 := text[k]
783 c0 := text[k-1]
784 if c0 > c1 {
785 k = -k
786 }
787 788 if cB != c1 {
789 bucket[cB] = b
790 cB = c1
791 b = bucket[cB]
792 }
793 b--
794 sa[b] = int64(k)
795 }
796 }
797 798 func induceSubS_32(text []int32, sa, freq, bucket []int32) {
799 // Initialize positions for right side of character buckets.
800 bucketMax_32(text, freq, bucket)
801 802 // Analogous to induceSubL_32 above,
803 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
804 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
805 // Because j-1 is type S, inserting it into sa now will sort it correctly.
806 // But we want to distinguish a j-1 with j-2 of type S from type L.
807 // We can process the former but want to leave the latter for the caller.
808 // We record the difference by negating j-1 if it is preceded by type L.
809 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
810 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
811 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
812 // and so on, in sorted but not necessarily adjacent order, until it finds
813 // one preceded by an index of type L, at which point it must stop.
814 // That index (preceded by one of type L) is an LMS-substring start.
815 //
816 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
817 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
818 // so that the loop finishes with the top of sa containing exactly
819 // the LMS-substring start indexes, sorted by LMS-substring.
820 821 // Cache recently used bucket index:
822 cB := int32(0)
823 b := bucket[cB]
824 825 top := len(sa)
826 for i := len(sa) - 1; i >= 0; i-- {
827 j := int(sa[i])
828 if j == 0 {
829 // Skip empty entry.
830 continue
831 }
832 sa[i] = 0
833 if j < 0 {
834 // Leave discovered LMS-substring start index for caller.
835 top--
836 sa[top] = int32(-j)
837 continue
838 }
839 840 // Index j was on work queue, meaning k := j-1 is S-type,
841 // so we can now place k correctly into sa.
842 // If k-1 is S-type, queue k for processing later in this loop.
843 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
844 k := j - 1
845 c1 := text[k]
846 c0 := text[k-1]
847 if c0 > c1 {
848 k = -k
849 }
850 851 if cB != c1 {
852 bucket[cB] = b
853 cB = c1
854 b = bucket[cB]
855 }
856 b--
857 sa[b] = int32(k)
858 }
859 }
860 861 func induceSubS_64(text []int64, sa, freq, bucket []int64) {
862 // Initialize positions for right side of character buckets.
863 bucketMax_64(text, freq, bucket)
864 865 // Analogous to induceSubL_64 above,
866 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
867 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
868 // Because j-1 is type S, inserting it into sa now will sort it correctly.
869 // But we want to distinguish a j-1 with j-2 of type S from type L.
870 // We can process the former but want to leave the latter for the caller.
871 // We record the difference by negating j-1 if it is preceded by type L.
872 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to
873 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
874 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
875 // and so on, in sorted but not necessarily adjacent order, until it finds
876 // one preceded by an index of type L, at which point it must stop.
877 // That index (preceded by one of type L) is an LMS-substring start.
878 //
879 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
880 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
881 // so that the loop finishes with the top of sa containing exactly
882 // the LMS-substring start indexes, sorted by LMS-substring.
883 884 // Cache recently used bucket index:
885 cB := int64(0)
886 b := bucket[cB]
887 888 top := len(sa)
889 for i := len(sa) - 1; i >= 0; i-- {
890 j := int(sa[i])
891 if j == 0 {
892 // Skip empty entry.
893 continue
894 }
895 sa[i] = 0
896 if j < 0 {
897 // Leave discovered LMS-substring start index for caller.
898 top--
899 sa[top] = int64(-j)
900 continue
901 }
902 903 // Index j was on work queue, meaning k := j-1 is S-type,
904 // so we can now place k correctly into sa.
905 // If k-1 is S-type, queue k for processing later in this loop.
906 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
907 k := j - 1
908 c1 := text[k]
909 c0 := text[k-1]
910 if c0 > c1 {
911 k = -k
912 }
913 914 if cB != c1 {
915 bucket[cB] = b
916 cB = c1
917 b = bucket[cB]
918 }
919 b--
920 sa[b] = int64(k)
921 }
922 }
923 924 func length_8_64(text []byte, sa []int64, numLMS int) {
925 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
926 927 // The encoding of N text bytes into a “length” word
928 // adds 1 to each byte, packs them into the bottom
929 // N*8 bits of a word, and then bitwise inverts the result.
930 // That is, the text sequence A B C (hex 41 42 43)
931 // encodes as ^uint64(0x42_43_44).
932 // LMS-substrings can never start or end with 0xFF.
933 // Adding 1 ensures the encoded byte sequence never
934 // starts or ends with 0x00, so that present bytes can be
935 // distinguished from zero-padding in the top bits,
936 // so the length need not be separately encoded.
937 // Inverting the bytes increases the chance that a
938 // 4-byte encoding will still be ≥ len(text).
939 // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
940 // then the high bit of the inversion will be set,
941 // making it clearly not a valid length (it would be a negative one).
942 //
943 // cx holds the pre-inverted encoding (the packed incremented bytes).
944 cx := uint64(0) // byte-only
945 946 // This stanza (until the blank line) is the "LMS-substring iterator",
947 // described in placeLMS_8_64 above, with one line added to maintain cx.
948 c0, c1, isTypeS := byte(0), byte(0), false
949 for i := len(text) - 1; i >= 0; i-- {
950 c0, c1 = text[i], c0
951 cx = cx<<8 | uint64(c1+1) // byte-only
952 if c0 < c1 {
953 isTypeS = true
954 } else if c0 > c1 && isTypeS {
955 isTypeS = false
956 957 // Index j = i+1 is the start of an LMS-substring.
958 // Compute length or encoded text to store in sa[j/2].
959 j := i + 1
960 var code int64
961 if end == 0 {
962 code = 0
963 } else {
964 code = int64(end - j)
965 if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only
966 code = int64(^cx) // byte-only
967 } // byte-only
968 }
969 sa[j>>1] = code
970 end = j + 1
971 cx = uint64(c1 + 1) // byte-only
972 }
973 }
974 }
975 976 func length_32(text []int32, sa []int32, numLMS int) {
977 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
978 979 // The encoding of N text int32s into a “length” word
980 // adds 1 to each int32, packs them into the bottom
981 // N*8 bits of a word, and then bitwise inverts the result.
982 // That is, the text sequence A B C (hex 41 42 43)
983 // encodes as ^uint32(0x42_43_44).
984 // LMS-substrings can never start or end with 0xFF.
985 // Adding 1 ensures the encoded int32 sequence never
986 // starts or ends with 0x00, so that present int32s can be
987 // distinguished from zero-padding in the top bits,
988 // so the length need not be separately encoded.
989 // Inverting the int32s increases the chance that a
990 // 4-int32 encoding will still be ≥ len(text).
991 // In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)
992 // then the high bit of the inversion will be set,
993 // making it clearly not a valid length (it would be a negative one).
994 //
995 // cx holds the pre-inverted encoding (the packed incremented int32s).
996 997 // This stanza (until the blank line) is the "LMS-substring iterator",
998 // described in placeLMS_32 above, with one line added to maintain cx.
999 c0, c1, isTypeS := int32(0), int32(0), false
1000 for i := len(text) - 1; i >= 0; i-- {
1001 c0, c1 = text[i], c0
1002 if c0 < c1 {
1003 isTypeS = true
1004 } else if c0 > c1 && isTypeS {
1005 isTypeS = false
1006 1007 // Index j = i+1 is the start of an LMS-substring.
1008 // Compute length or encoded text to store in sa[j/2].
1009 j := i + 1
1010 var code int32
1011 if end == 0 {
1012 code = 0
1013 } else {
1014 code = int32(end - j)
1015 }
1016 sa[j>>1] = code
1017 end = j + 1
1018 }
1019 }
1020 }
1021 1022 func length_64(text []int64, sa []int64, numLMS int) {
1023 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
1024 1025 // The encoding of N text int64s into a “length” word
1026 // adds 1 to each int64, packs them into the bottom
1027 // N*8 bits of a word, and then bitwise inverts the result.
1028 // That is, the text sequence A B C (hex 41 42 43)
1029 // encodes as ^uint64(0x42_43_44).
1030 // LMS-substrings can never start or end with 0xFF.
1031 // Adding 1 ensures the encoded int64 sequence never
1032 // starts or ends with 0x00, so that present int64s can be
1033 // distinguished from zero-padding in the top bits,
1034 // so the length need not be separately encoded.
1035 // Inverting the int64s increases the chance that a
1036 // 4-int64 encoding will still be ≥ len(text).
1037 // In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)
1038 // then the high bit of the inversion will be set,
1039 // making it clearly not a valid length (it would be a negative one).
1040 //
1041 // cx holds the pre-inverted encoding (the packed incremented int64s).
1042 1043 // This stanza (until the blank line) is the "LMS-substring iterator",
1044 // described in placeLMS_64 above, with one line added to maintain cx.
1045 c0, c1, isTypeS := int64(0), int64(0), false
1046 for i := len(text) - 1; i >= 0; i-- {
1047 c0, c1 = text[i], c0
1048 if c0 < c1 {
1049 isTypeS = true
1050 } else if c0 > c1 && isTypeS {
1051 isTypeS = false
1052 1053 // Index j = i+1 is the start of an LMS-substring.
1054 // Compute length or encoded text to store in sa[j/2].
1055 j := i + 1
1056 var code int64
1057 if end == 0 {
1058 code = 0
1059 } else {
1060 code = int64(end - j)
1061 }
1062 sa[j>>1] = code
1063 end = j + 1
1064 }
1065 }
1066 }
1067 1068 func assignID_8_64(text []byte, sa []int64, numLMS int) int {
1069 id := 0
1070 lastLen := int64(-1) // impossible
1071 lastPos := int64(0)
1072 for _, j := range sa[len(sa)-numLMS:] {
1073 // Is the LMS-substring at index j new, or is it the same as the last one we saw?
1074 n := sa[j/2]
1075 if n != lastLen {
1076 goto New
1077 }
1078 if uint64(n) >= uint64(len(text)) {
1079 // “Length” is really encoded full text, and they match.
1080 goto Same
1081 }
1082 {
1083 // Compare actual texts.
1084 n := int(n)
1085 this := text[j:][:n]
1086 last := text[lastPos:][:n]
1087 for i := 0; i < n; i++ {
1088 if this[i] != last[i] {
1089 goto New
1090 }
1091 }
1092 goto Same
1093 }
1094 New:
1095 id++
1096 lastPos = j
1097 lastLen = n
1098 Same:
1099 sa[j/2] = int64(id)
1100 }
1101 return id
1102 }
1103 1104 func assignID_32(text []int32, sa []int32, numLMS int) int {
1105 id := 0
1106 lastLen := int32(-1) // impossible
1107 lastPos := int32(0)
1108 for _, j := range sa[len(sa)-numLMS:] {
1109 // Is the LMS-substring at index j new, or is it the same as the last one we saw?
1110 n := sa[j/2]
1111 if n != lastLen {
1112 goto New
1113 }
1114 if uint32(n) >= uint32(len(text)) {
1115 // “Length” is really encoded full text, and they match.
1116 goto Same
1117 }
1118 {
1119 // Compare actual texts.
1120 n := int(n)
1121 this := text[j:][:n]
1122 last := text[lastPos:][:n]
1123 for i := 0; i < n; i++ {
1124 if this[i] != last[i] {
1125 goto New
1126 }
1127 }
1128 goto Same
1129 }
1130 New:
1131 id++
1132 lastPos = j
1133 lastLen = n
1134 Same:
1135 sa[j/2] = int32(id)
1136 }
1137 return id
1138 }
1139 1140 func assignID_64(text []int64, sa []int64, numLMS int) int {
1141 id := 0
1142 lastLen := int64(-1) // impossible
1143 lastPos := int64(0)
1144 for _, j := range sa[len(sa)-numLMS:] {
1145 // Is the LMS-substring at index j new, or is it the same as the last one we saw?
1146 n := sa[j/2]
1147 if n != lastLen {
1148 goto New
1149 }
1150 if uint64(n) >= uint64(len(text)) {
1151 // “Length” is really encoded full text, and they match.
1152 goto Same
1153 }
1154 {
1155 // Compare actual texts.
1156 n := int(n)
1157 this := text[j:][:n]
1158 last := text[lastPos:][:n]
1159 for i := 0; i < n; i++ {
1160 if this[i] != last[i] {
1161 goto New
1162 }
1163 }
1164 goto Same
1165 }
1166 New:
1167 id++
1168 lastPos = j
1169 lastLen = n
1170 Same:
1171 sa[j/2] = int64(id)
1172 }
1173 return id
1174 }
1175 1176 func map_64(sa []int64, numLMS int) {
1177 w := len(sa)
1178 for i := len(sa) / 2; i >= 0; i-- {
1179 j := sa[i]
1180 if j > 0 {
1181 w--
1182 sa[w] = j - 1
1183 }
1184 }
1185 }
1186 1187 func recurse_64(sa, oldTmp []int64, numLMS, maxID int) {
1188 dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
1189 1190 // Set up temporary space for recursive call.
1191 // We must pass sais_64 a tmp buffer with at least maxID entries.
1192 //
1193 // The subproblem is guaranteed to have length at most len(sa)/2,
1194 // so that sa can hold both the subproblem and its suffix array.
1195 // Nearly all the time, however, the subproblem has length < len(sa)/3,
1196 // in which case there is a subproblem-sized middle of sa that
1197 // we can reuse for temporary space (saTmp).
1198 // When recurse_64 is called from sais_8_64, oldTmp is length 512
1199 // (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
1200 // When deeper recursions come back to recurse_64, now oldTmp is
1201 // the saTmp from the top-most recursion, it is typically larger than
1202 // the current saTmp (because the current sa gets smaller and smaller
1203 // as the recursion gets deeper), and we keep reusing that top-most
1204 // large saTmp instead of the offered smaller ones.
1205 //
1206 // Why is the subproblem length so often just under len(sa)/3?
1207 // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
1208 // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
1209 // in the input, perfect alternation of larger and smaller input bytes.
1210 // Real text doesn't do that. If each L-type index is randomly followed
1211 // by either an L-type or S-type index, then half the substrings will
1212 // be of the form SLS, but the other half will be longer. Of that half,
1213 // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
1214 // Not counting the final S in each (which overlaps the first S in the next),
1215 // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
1216 // The space we need is further reduced by the fact that many of the
1217 // short patterns like SLS will often be the same character sequences
1218 // repeated throughout the text, reducing maxID relative to numLMS.
1219 //
1220 // For short inputs, the averages may not run in our favor, but then we
1221 // can often fall back to using the length-512 tmp available in the
1222 // top-most call. (Also a short allocation would not be a big deal.)
1223 //
1224 // For pathological inputs, we fall back to allocating a new tmp of length
1225 // max(maxID, numLMS/2). This level of the recursion needs maxID,
1226 // and all deeper levels of the recursion will need no more than numLMS/2,
1227 // so this one allocation is guaranteed to suffice for the entire stack
1228 // of recursive calls.
1229 tmp := oldTmp
1230 if len(tmp) < len(saTmp) {
1231 tmp = saTmp
1232 }
1233 if len(tmp) < numLMS {
1234 // TestSAIS/forcealloc reaches this code.
1235 n := maxID
1236 if n < numLMS/2 {
1237 n = numLMS / 2
1238 }
1239 tmp = []int64{:n}
1240 }
1241 1242 // sais_64 requires that the caller arrange to clear dst,
1243 // because in general the caller may know dst is
1244 // freshly-allocated and already cleared. But this one is not.
1245 clear(dst)
1246 sais_64(text, maxID, dst, tmp)
1247 }
1248 1249 func unmap_8_64(text []byte, sa []int64, numLMS int) {
1250 unmap := sa[len(sa)-numLMS:]
1251 j := len(unmap)
1252 1253 // "LMS-substring iterator" (see placeLMS_8_64 above).
1254 c0, c1, isTypeS := byte(0), byte(0), false
1255 for i := len(text) - 1; i >= 0; i-- {
1256 c0, c1 = text[i], c0
1257 if c0 < c1 {
1258 isTypeS = true
1259 } else if c0 > c1 && isTypeS {
1260 isTypeS = false
1261 1262 // Populate inverse map.
1263 j--
1264 unmap[j] = int64(i + 1)
1265 }
1266 }
1267 1268 // Apply inverse map to subproblem suffix array.
1269 sa = sa[:numLMS]
1270 for i := 0; i < len(sa); i++ {
1271 sa[i] = unmap[sa[i]]
1272 }
1273 }
1274 1275 func unmap_32(text []int32, sa []int32, numLMS int) {
1276 unmap := sa[len(sa)-numLMS:]
1277 j := len(unmap)
1278 1279 // "LMS-substring iterator" (see placeLMS_32 above).
1280 c0, c1, isTypeS := int32(0), int32(0), false
1281 for i := len(text) - 1; i >= 0; i-- {
1282 c0, c1 = text[i], c0
1283 if c0 < c1 {
1284 isTypeS = true
1285 } else if c0 > c1 && isTypeS {
1286 isTypeS = false
1287 1288 // Populate inverse map.
1289 j--
1290 unmap[j] = int32(i + 1)
1291 }
1292 }
1293 1294 // Apply inverse map to subproblem suffix array.
1295 sa = sa[:numLMS]
1296 for i := 0; i < len(sa); i++ {
1297 sa[i] = unmap[sa[i]]
1298 }
1299 }
1300 1301 func unmap_64(text []int64, sa []int64, numLMS int) {
1302 unmap := sa[len(sa)-numLMS:]
1303 j := len(unmap)
1304 1305 // "LMS-substring iterator" (see placeLMS_64 above).
1306 c0, c1, isTypeS := int64(0), int64(0), false
1307 for i := len(text) - 1; i >= 0; i-- {
1308 c0, c1 = text[i], c0
1309 if c0 < c1 {
1310 isTypeS = true
1311 } else if c0 > c1 && isTypeS {
1312 isTypeS = false
1313 1314 // Populate inverse map.
1315 j--
1316 unmap[j] = int64(i + 1)
1317 }
1318 }
1319 1320 // Apply inverse map to subproblem suffix array.
1321 sa = sa[:numLMS]
1322 for i := 0; i < len(sa); i++ {
1323 sa[i] = unmap[sa[i]]
1324 }
1325 }
1326 1327 func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) {
1328 bucketMax_8_64(text, freq, bucket)
1329 bucket = bucket[:256] // eliminate bound check for bucket[c] below
1330 1331 // Loop backward through sa, always tracking
1332 // the next index to populate from sa[:numLMS].
1333 // When we get to one, populate it.
1334 // Zero the rest of the slots; they have dead values in them.
1335 x := numLMS - 1
1336 saX := sa[x]
1337 c := text[saX]
1338 b := bucket[c] - 1
1339 bucket[c] = b
1340 1341 for i := len(sa) - 1; i >= 0; i-- {
1342 if i != int(b) {
1343 sa[i] = 0
1344 continue
1345 }
1346 sa[i] = saX
1347 1348 // Load next entry to put down (if any).
1349 if x > 0 {
1350 x--
1351 saX = sa[x] // TODO bounds check
1352 c = text[saX]
1353 b = bucket[c] - 1
1354 bucket[c] = b
1355 }
1356 }
1357 }
1358 1359 func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) {
1360 bucketMax_32(text, freq, bucket)
1361 1362 // Loop backward through sa, always tracking
1363 // the next index to populate from sa[:numLMS].
1364 // When we get to one, populate it.
1365 // Zero the rest of the slots; they have dead values in them.
1366 x := numLMS - 1
1367 saX := sa[x]
1368 c := text[saX]
1369 b := bucket[c] - 1
1370 bucket[c] = b
1371 1372 for i := len(sa) - 1; i >= 0; i-- {
1373 if i != int(b) {
1374 sa[i] = 0
1375 continue
1376 }
1377 sa[i] = saX
1378 1379 // Load next entry to put down (if any).
1380 if x > 0 {
1381 x--
1382 saX = sa[x] // TODO bounds check
1383 c = text[saX]
1384 b = bucket[c] - 1
1385 bucket[c] = b
1386 }
1387 }
1388 }
1389 1390 func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) {
1391 bucketMax_64(text, freq, bucket)
1392 1393 // Loop backward through sa, always tracking
1394 // the next index to populate from sa[:numLMS].
1395 // When we get to one, populate it.
1396 // Zero the rest of the slots; they have dead values in them.
1397 x := numLMS - 1
1398 saX := sa[x]
1399 c := text[saX]
1400 b := bucket[c] - 1
1401 bucket[c] = b
1402 1403 for i := len(sa) - 1; i >= 0; i-- {
1404 if i != int(b) {
1405 sa[i] = 0
1406 continue
1407 }
1408 sa[i] = saX
1409 1410 // Load next entry to put down (if any).
1411 if x > 0 {
1412 x--
1413 saX = sa[x] // TODO bounds check
1414 c = text[saX]
1415 b = bucket[c] - 1
1416 bucket[c] = b
1417 }
1418 }
1419 }
1420 1421 func induceL_8_64(text []byte, sa, freq, bucket []int64) {
1422 // Initialize positions for left side of character buckets.
1423 bucketMin_8_64(text, freq, bucket)
1424 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
1425 1426 // This scan is similar to the one in induceSubL_8_64 above.
1427 // That one arranges to clear all but the leftmost L-type indexes.
1428 // This scan leaves all the L-type indexes and the original S-type
1429 // indexes, but it negates the positive leftmost L-type indexes
1430 // (the ones that induceS_8_64 needs to process).
1431 1432 // expand_8_64 left out the implicit entry sa[-1] == len(text),
1433 // corresponding to the identified type-L index len(text)-1.
1434 // Process it before the left-to-right scan of sa proper.
1435 // See body in loop for commentary.
1436 k := len(text) - 1
1437 c0, c1 := text[k-1], text[k]
1438 if c0 < c1 {
1439 k = -k
1440 }
1441 1442 // Cache recently used bucket index.
1443 cB := c1
1444 b := bucket[cB]
1445 sa[b] = int64(k)
1446 b++
1447 1448 for i := 0; i < len(sa); i++ {
1449 j := int(sa[i])
1450 if j <= 0 {
1451 // Skip empty or negated entry (including negated zero).
1452 continue
1453 }
1454 1455 // Index j was on work queue, meaning k := j-1 is L-type,
1456 // so we can now place k correctly into sa.
1457 // If k-1 is L-type, queue k for processing later in this loop.
1458 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
1459 // If k is zero, k-1 doesn't exist, so we only need to leave it
1460 // for the caller. The caller can't tell the difference between
1461 // an empty slot and a non-empty zero, but there's no need
1462 // to distinguish them anyway: the final suffix array will end up
1463 // with one zero somewhere, and that will be a real zero.
1464 k := j - 1
1465 c1 := text[k]
1466 if k > 0 {
1467 if c0 := text[k-1]; c0 < c1 {
1468 k = -k
1469 }
1470 }
1471 1472 if cB != c1 {
1473 bucket[cB] = b
1474 cB = c1
1475 b = bucket[cB]
1476 }
1477 sa[b] = int64(k)
1478 b++
1479 }
1480 }
1481 1482 func induceL_32(text []int32, sa, freq, bucket []int32) {
1483 // Initialize positions for left side of character buckets.
1484 bucketMin_32(text, freq, bucket)
1485 1486 // This scan is similar to the one in induceSubL_32 above.
1487 // That one arranges to clear all but the leftmost L-type indexes.
1488 // This scan leaves all the L-type indexes and the original S-type
1489 // indexes, but it negates the positive leftmost L-type indexes
1490 // (the ones that induceS_32 needs to process).
1491 1492 // expand_32 left out the implicit entry sa[-1] == len(text),
1493 // corresponding to the identified type-L index len(text)-1.
1494 // Process it before the left-to-right scan of sa proper.
1495 // See body in loop for commentary.
1496 k := len(text) - 1
1497 c0, c1 := text[k-1], text[k]
1498 if c0 < c1 {
1499 k = -k
1500 }
1501 1502 // Cache recently used bucket index.
1503 cB := c1
1504 b := bucket[cB]
1505 sa[b] = int32(k)
1506 b++
1507 1508 for i := 0; i < len(sa); i++ {
1509 j := int(sa[i])
1510 if j <= 0 {
1511 // Skip empty or negated entry (including negated zero).
1512 continue
1513 }
1514 1515 // Index j was on work queue, meaning k := j-1 is L-type,
1516 // so we can now place k correctly into sa.
1517 // If k-1 is L-type, queue k for processing later in this loop.
1518 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
1519 // If k is zero, k-1 doesn't exist, so we only need to leave it
1520 // for the caller. The caller can't tell the difference between
1521 // an empty slot and a non-empty zero, but there's no need
1522 // to distinguish them anyway: the final suffix array will end up
1523 // with one zero somewhere, and that will be a real zero.
1524 k := j - 1
1525 c1 := text[k]
1526 if k > 0 {
1527 if c0 := text[k-1]; c0 < c1 {
1528 k = -k
1529 }
1530 }
1531 1532 if cB != c1 {
1533 bucket[cB] = b
1534 cB = c1
1535 b = bucket[cB]
1536 }
1537 sa[b] = int32(k)
1538 b++
1539 }
1540 }
1541 1542 func induceL_64(text []int64, sa, freq, bucket []int64) {
1543 // Initialize positions for left side of character buckets.
1544 bucketMin_64(text, freq, bucket)
1545 1546 // This scan is similar to the one in induceSubL_64 above.
1547 // That one arranges to clear all but the leftmost L-type indexes.
1548 // This scan leaves all the L-type indexes and the original S-type
1549 // indexes, but it negates the positive leftmost L-type indexes
1550 // (the ones that induceS_64 needs to process).
1551 1552 // expand_64 left out the implicit entry sa[-1] == len(text),
1553 // corresponding to the identified type-L index len(text)-1.
1554 // Process it before the left-to-right scan of sa proper.
1555 // See body in loop for commentary.
1556 k := len(text) - 1
1557 c0, c1 := text[k-1], text[k]
1558 if c0 < c1 {
1559 k = -k
1560 }
1561 1562 // Cache recently used bucket index.
1563 cB := c1
1564 b := bucket[cB]
1565 sa[b] = int64(k)
1566 b++
1567 1568 for i := 0; i < len(sa); i++ {
1569 j := int(sa[i])
1570 if j <= 0 {
1571 // Skip empty or negated entry (including negated zero).
1572 continue
1573 }
1574 1575 // Index j was on work queue, meaning k := j-1 is L-type,
1576 // so we can now place k correctly into sa.
1577 // If k-1 is L-type, queue k for processing later in this loop.
1578 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
1579 // If k is zero, k-1 doesn't exist, so we only need to leave it
1580 // for the caller. The caller can't tell the difference between
1581 // an empty slot and a non-empty zero, but there's no need
1582 // to distinguish them anyway: the final suffix array will end up
1583 // with one zero somewhere, and that will be a real zero.
1584 k := j - 1
1585 c1 := text[k]
1586 if k > 0 {
1587 if c0 := text[k-1]; c0 < c1 {
1588 k = -k
1589 }
1590 }
1591 1592 if cB != c1 {
1593 bucket[cB] = b
1594 cB = c1
1595 b = bucket[cB]
1596 }
1597 sa[b] = int64(k)
1598 b++
1599 }
1600 }
1601 1602 func induceS_8_64(text []byte, sa, freq, bucket []int64) {
1603 // Initialize positions for right side of character buckets.
1604 bucketMax_8_64(text, freq, bucket)
1605 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
1606 1607 cB := byte(0)
1608 b := bucket[cB]
1609 1610 for i := len(sa) - 1; i >= 0; i-- {
1611 j := int(sa[i])
1612 if j >= 0 {
1613 // Skip non-flagged entry.
1614 // (This loop can't see an empty entry; 0 means the real zero index.)
1615 continue
1616 }
1617 1618 // Negative j is a work queue entry; rewrite to positive j for final suffix array.
1619 j = -j
1620 sa[i] = int64(j)
1621 1622 // Index j was on work queue (encoded as -j but now decoded),
1623 // meaning k := j-1 is L-type,
1624 // so we can now place k correctly into sa.
1625 // If k-1 is S-type, queue -k for processing later in this loop.
1626 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
1627 // If k is zero, k-1 doesn't exist, so we only need to leave it
1628 // for the caller.
1629 k := j - 1
1630 c1 := text[k]
1631 if k > 0 {
1632 if c0 := text[k-1]; c0 <= c1 {
1633 k = -k
1634 }
1635 }
1636 1637 if cB != c1 {
1638 bucket[cB] = b
1639 cB = c1
1640 b = bucket[cB]
1641 }
1642 b--
1643 sa[b] = int64(k)
1644 }
1645 }
1646 1647 func induceS_32(text []int32, sa, freq, bucket []int32) {
1648 // Initialize positions for right side of character buckets.
1649 bucketMax_32(text, freq, bucket)
1650 1651 cB := int32(0)
1652 b := bucket[cB]
1653 1654 for i := len(sa) - 1; i >= 0; i-- {
1655 j := int(sa[i])
1656 if j >= 0 {
1657 // Skip non-flagged entry.
1658 // (This loop can't see an empty entry; 0 means the real zero index.)
1659 continue
1660 }
1661 1662 // Negative j is a work queue entry; rewrite to positive j for final suffix array.
1663 j = -j
1664 sa[i] = int32(j)
1665 1666 // Index j was on work queue (encoded as -j but now decoded),
1667 // meaning k := j-1 is L-type,
1668 // so we can now place k correctly into sa.
1669 // If k-1 is S-type, queue -k for processing later in this loop.
1670 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
1671 // If k is zero, k-1 doesn't exist, so we only need to leave it
1672 // for the caller.
1673 k := j - 1
1674 c1 := text[k]
1675 if k > 0 {
1676 if c0 := text[k-1]; c0 <= c1 {
1677 k = -k
1678 }
1679 }
1680 1681 if cB != c1 {
1682 bucket[cB] = b
1683 cB = c1
1684 b = bucket[cB]
1685 }
1686 b--
1687 sa[b] = int32(k)
1688 }
1689 }
1690 1691 func induceS_64(text []int64, sa, freq, bucket []int64) {
1692 // Initialize positions for right side of character buckets.
1693 bucketMax_64(text, freq, bucket)
1694 1695 cB := int64(0)
1696 b := bucket[cB]
1697 1698 for i := len(sa) - 1; i >= 0; i-- {
1699 j := int(sa[i])
1700 if j >= 0 {
1701 // Skip non-flagged entry.
1702 // (This loop can't see an empty entry; 0 means the real zero index.)
1703 continue
1704 }
1705 1706 // Negative j is a work queue entry; rewrite to positive j for final suffix array.
1707 j = -j
1708 sa[i] = int64(j)
1709 1710 // Index j was on work queue (encoded as -j but now decoded),
1711 // meaning k := j-1 is L-type,
1712 // so we can now place k correctly into sa.
1713 // If k-1 is S-type, queue -k for processing later in this loop.
1714 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
1715 // If k is zero, k-1 doesn't exist, so we only need to leave it
1716 // for the caller.
1717 k := j - 1
1718 c1 := text[k]
1719 if k > 0 {
1720 if c0 := text[k-1]; c0 <= c1 {
1721 k = -k
1722 }
1723 }
1724 1725 if cB != c1 {
1726 bucket[cB] = b
1727 cB = c1
1728 b = bucket[cB]
1729 }
1730 b--
1731 sa[b] = int64(k)
1732 }
1733 }
1734