iter.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package norm
   6  
   7  import (
   8  	"fmt"
   9  	"unicode/utf8"
  10  )
  11  
  12  // MaxSegmentSize is the maximum size of a byte buffer needed to consider any
  13  // sequence of starter and non-starter runes for the purpose of normalization.
  14  const MaxSegmentSize = maxByteBufferSize
  15  
  16  // An Iter iterates over a string or byte slice, while normalizing it
  17  // to a given Form.
  18  type Iter struct {
  19  	rb     reorderBuffer
  20  	buf    [maxByteBufferSize]byte
  21  	info   Properties // first character saved from previous iteration
  22  	next   iterFunc   // implementation of next depends on form
  23  	asciiF iterFunc
  24  
  25  	p        int    // current position in input source
  26  	multiSeg []byte // remainder of multi-segment decomposition
  27  }
  28  
  29  type iterFunc func(*Iter) []byte
  30  
  31  // Init initializes i to iterate over src after normalizing it to Form f.
  32  func (i *Iter) Init(f Form, src []byte) {
  33  	i.p = 0
  34  	if len(src) == 0 {
  35  		i.setDone()
  36  		i.rb.nsrc = 0
  37  		return
  38  	}
  39  	i.multiSeg = nil
  40  	i.rb.init(f, src)
  41  	i.next = i.rb.f.nextMain
  42  	i.asciiF = nextASCIIBytes
  43  	i.info = i.rb.f.info(i.rb.src, i.p)
  44  	i.rb.ss.first(i.info)
  45  }
  46  
  47  // InitString initializes i to iterate over src after normalizing it to Form f.
  48  func (i *Iter) InitString(f Form, src []byte) {
  49  	i.p = 0
  50  	if len(src) == 0 {
  51  		i.setDone()
  52  		i.rb.nsrc = 0
  53  		return
  54  	}
  55  	i.multiSeg = nil
  56  	i.rb.initString(f, src)
  57  	i.next = i.rb.f.nextMain
  58  	i.asciiF = nextASCIIString
  59  	i.info = i.rb.f.info(i.rb.src, i.p)
  60  	i.rb.ss.first(i.info)
  61  }
  62  
  63  // Seek sets the segment to be returned by the next call to Next to start
  64  // at position p.  It is the responsibility of the caller to set p to the
  65  // start of a segment.
  66  func (i *Iter) Seek(offset int64, whence int) (int64, error) {
  67  	var abs int64
  68  	switch whence {
  69  	case 0:
  70  		abs = offset
  71  	case 1:
  72  		abs = int64(i.p) + offset
  73  	case 2:
  74  		abs = int64(i.rb.nsrc) + offset
  75  	default:
  76  		return 0, fmt.Errorf("norm: invalid whence")
  77  	}
  78  	if abs < 0 {
  79  		return 0, fmt.Errorf("norm: negative position")
  80  	}
  81  	if int(abs) >= i.rb.nsrc {
  82  		i.setDone()
  83  		return int64(i.p), nil
  84  	}
  85  	i.p = int(abs)
  86  	i.multiSeg = nil
  87  	i.next = i.rb.f.nextMain
  88  	i.info = i.rb.f.info(i.rb.src, i.p)
  89  	i.rb.ss.first(i.info)
  90  	return abs, nil
  91  }
  92  
  93  // returnSlice returns a slice of the underlying input type as a byte slice.
  94  // If the underlying is of type []byte, it will simply return a slice.
  95  // If the underlying is of type string, it will copy the slice to the buffer
  96  // and return that.
  97  func (i *Iter) returnSlice(a, b int) []byte {
  98  	if i.rb.src.bytes == nil {
  99  		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
 100  	}
 101  	return i.rb.src.bytes[a:b]
 102  }
 103  
 104  // Pos returns the byte position at which the next call to Next will commence processing.
 105  func (i *Iter) Pos() int {
 106  	return i.p
 107  }
 108  
 109  func (i *Iter) setDone() {
 110  	i.next = nextDone
 111  	i.p = i.rb.nsrc
 112  }
 113  
 114  // Done returns true if there is no more input to process.
 115  func (i *Iter) Done() bool {
 116  	return i.p >= i.rb.nsrc
 117  }
 118  
 119  // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
 120  // For any input a and b for which f(a) == f(b), subsequent calls
 121  // to Next will return the same segments.
 122  // Modifying runes are grouped together with the preceding starter, if such a starter exists.
 123  // Although not guaranteed, n will typically be the smallest possible n.
 124  func (i *Iter) Next() []byte {
 125  	return i.next(i)
 126  }
 127  
 128  func nextASCIIBytes(i *Iter) []byte {
 129  	p := i.p + 1
 130  	if p >= i.rb.nsrc {
 131  		p0 := i.p
 132  		i.setDone()
 133  		return i.rb.src.bytes[p0:p]
 134  	}
 135  	if i.rb.src.bytes[p] < utf8.RuneSelf {
 136  		p0 := i.p
 137  		i.p = p
 138  		return i.rb.src.bytes[p0:p]
 139  	}
 140  	i.info = i.rb.f.info(i.rb.src, i.p)
 141  	i.next = i.rb.f.nextMain
 142  	return i.next(i)
 143  }
 144  
 145  func nextASCIIString(i *Iter) []byte {
 146  	p := i.p + 1
 147  	if p >= i.rb.nsrc {
 148  		i.buf[0] = i.rb.src.str[i.p]
 149  		i.setDone()
 150  		return i.buf[:1]
 151  	}
 152  	if i.rb.src.str[p] < utf8.RuneSelf {
 153  		i.buf[0] = i.rb.src.str[i.p]
 154  		i.p = p
 155  		return i.buf[:1]
 156  	}
 157  	i.info = i.rb.f.info(i.rb.src, i.p)
 158  	i.next = i.rb.f.nextMain
 159  	return i.next(i)
 160  }
 161  
 162  func nextHangul(i *Iter) []byte {
 163  	p := i.p
 164  	next := p + hangulUTF8Size
 165  	if next >= i.rb.nsrc {
 166  		i.setDone()
 167  	} else if i.rb.src.hangul(next) == 0 {
 168  		i.rb.ss.next(i.info)
 169  		i.info = i.rb.f.info(i.rb.src, i.p)
 170  		i.next = i.rb.f.nextMain
 171  		return i.next(i)
 172  	}
 173  	i.p = next
 174  	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
 175  }
 176  
 177  func nextDone(i *Iter) []byte {
 178  	return nil
 179  }
 180  
 181  // nextMulti is used for iterating over multi-segment decompositions
 182  // for decomposing normal forms.
 183  func nextMulti(i *Iter) []byte {
 184  	j := 0
 185  	d := i.multiSeg
 186  	// skip first rune
 187  	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
 188  	}
 189  	for j < len(d) {
 190  		info := i.rb.f.info(input{bytes: d}, j)
 191  		if info.BoundaryBefore() {
 192  			i.multiSeg = d[j:]
 193  			return d[:j]
 194  		}
 195  		j += int(info.size)
 196  	}
 197  	// treat last segment as normal decomposition
 198  	i.next = i.rb.f.nextMain
 199  	return i.next(i)
 200  }
 201  
 202  // nextMultiNorm is used for iterating over multi-segment decompositions
 203  // for composing normal forms.
 204  func nextMultiNorm(i *Iter) []byte {
 205  	j := 0
 206  	d := i.multiSeg
 207  	for j < len(d) {
 208  		info := i.rb.f.info(input{bytes: d}, j)
 209  		if info.BoundaryBefore() {
 210  			i.rb.compose()
 211  			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
 212  			i.rb.insertUnsafe(input{bytes: d}, j, info)
 213  			i.multiSeg = d[j+int(info.size):]
 214  			return seg
 215  		}
 216  		i.rb.insertUnsafe(input{bytes: d}, j, info)
 217  		j += int(info.size)
 218  	}
 219  	i.multiSeg = nil
 220  	i.next = nextComposed
 221  	return doNormComposed(i)
 222  }
 223  
 224  // nextDecomposed is the implementation of Next for forms NFD and NFKD.
 225  func nextDecomposed(i *Iter) (next []byte) {
 226  	outp := 0
 227  	inCopyStart, outCopyStart := i.p, 0
 228  	for {
 229  		if sz := int(i.info.size); sz <= 1 {
 230  			i.rb.ss = 0
 231  			p := i.p
 232  			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
 233  			if i.p >= i.rb.nsrc {
 234  				i.setDone()
 235  				return i.returnSlice(p, i.p)
 236  			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
 237  				i.next = i.asciiF
 238  				return i.returnSlice(p, i.p)
 239  			}
 240  			outp++
 241  		} else if d := i.info.Decomposition(); d != nil {
 242  			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
 243  			// Case 1: there is a leftover to copy.  In this case the decomposition
 244  			// must begin with a modifier and should always be appended.
 245  			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
 246  			p := outp + len(d)
 247  			if outp > 0 {
 248  				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 249  				// TODO: this condition should not be possible, but we leave it
 250  				// in for defensive purposes.
 251  				if p > len(i.buf) {
 252  					return i.buf[:outp]
 253  				}
 254  			} else if i.info.multiSegment() {
 255  				// outp must be 0 as multi-segment decompositions always
 256  				// start a new segment.
 257  				if i.multiSeg == nil {
 258  					i.multiSeg = d
 259  					i.next = nextMulti
 260  					return nextMulti(i)
 261  				}
 262  				// We are in the last segment.  Treat as normal decomposition.
 263  				d = i.multiSeg
 264  				i.multiSeg = nil
 265  				p = len(d)
 266  			}
 267  			prevCC := i.info.tccc
 268  			if i.p += sz; i.p >= i.rb.nsrc {
 269  				i.setDone()
 270  				i.info = Properties{} // Force BoundaryBefore to succeed.
 271  			} else {
 272  				i.info = i.rb.f.info(i.rb.src, i.p)
 273  			}
 274  			switch i.rb.ss.next(i.info) {
 275  			case ssOverflow:
 276  				i.next = nextCGJDecompose
 277  				if outp > 0 {
 278  					copy(i.buf[outp:], d)
 279  					return i.buf[:p]
 280  				}
 281  				return d
 282  			case ssStarter:
 283  				if outp > 0 {
 284  					copy(i.buf[outp:], d)
 285  					return i.buf[:p]
 286  				}
 287  				return d
 288  			}
 289  			copy(i.buf[outp:], d)
 290  			outp = p
 291  			inCopyStart, outCopyStart = i.p, outp
 292  			if i.info.ccc < prevCC {
 293  				goto doNorm
 294  			}
 295  			continue
 296  		} else if r := i.rb.src.hangul(i.p); r != 0 {
 297  			outp = decomposeHangul(i.buf[:], r)
 298  			i.p += hangulUTF8Size
 299  			inCopyStart, outCopyStart = i.p, outp
 300  			if i.p >= i.rb.nsrc {
 301  				i.setDone()
 302  				break
 303  			} else if i.rb.src.hangul(i.p) != 0 {
 304  				i.next = nextHangul
 305  				return i.buf[:outp]
 306  			}
 307  		} else {
 308  			p := outp + sz
 309  			if p > len(i.buf) {
 310  				break
 311  			}
 312  			outp = p
 313  			i.p += sz
 314  		}
 315  		if i.p >= i.rb.nsrc {
 316  			i.setDone()
 317  			break
 318  		}
 319  		prevCC := i.info.tccc
 320  		i.info = i.rb.f.info(i.rb.src, i.p)
 321  		if v := i.rb.ss.next(i.info); v == ssStarter {
 322  			break
 323  		} else if v == ssOverflow {
 324  			i.next = nextCGJDecompose
 325  			break
 326  		}
 327  		if i.info.ccc < prevCC {
 328  			goto doNorm
 329  		}
 330  	}
 331  	if outCopyStart == 0 {
 332  		return i.returnSlice(inCopyStart, i.p)
 333  	} else if inCopyStart < i.p {
 334  		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 335  	}
 336  	return i.buf[:outp]
 337  doNorm:
 338  	// Insert what we have decomposed so far in the reorderBuffer.
 339  	// As we will only reorder, there will always be enough room.
 340  	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 341  	i.rb.insertDecomposed(i.buf[0:outp])
 342  	return doNormDecomposed(i)
 343  }
 344  
 345  func doNormDecomposed(i *Iter) []byte {
 346  	for {
 347  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
 348  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
 349  			i.setDone()
 350  			break
 351  		}
 352  		i.info = i.rb.f.info(i.rb.src, i.p)
 353  		if i.info.ccc == 0 {
 354  			break
 355  		}
 356  		if s := i.rb.ss.next(i.info); s == ssOverflow {
 357  			i.next = nextCGJDecompose
 358  			break
 359  		}
 360  	}
 361  	// new segment or too many combining characters: exit normalization
 362  	return i.buf[:i.rb.flushCopy(i.buf[:])]
 363  }
 364  
 365  func nextCGJDecompose(i *Iter) []byte {
 366  	i.rb.ss = 0
 367  	i.rb.insertCGJ()
 368  	i.next = nextDecomposed
 369  	i.rb.ss.first(i.info)
 370  	buf := doNormDecomposed(i)
 371  	return buf
 372  }
 373  
 374  // nextComposed is the implementation of Next for forms NFC and NFKC.
 375  func nextComposed(i *Iter) []byte {
 376  	outp, startp := 0, i.p
 377  	var prevCC uint8
 378  	for {
 379  		if !i.info.isYesC() {
 380  			goto doNorm
 381  		}
 382  		prevCC = i.info.tccc
 383  		sz := int(i.info.size)
 384  		if sz == 0 {
 385  			sz = 1 // illegal rune: copy byte-by-byte
 386  		}
 387  		p := outp + sz
 388  		if p > len(i.buf) {
 389  			break
 390  		}
 391  		outp = p
 392  		i.p += sz
 393  		if i.p >= i.rb.nsrc {
 394  			i.setDone()
 395  			break
 396  		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
 397  			i.rb.ss = 0
 398  			i.next = i.asciiF
 399  			break
 400  		}
 401  		i.info = i.rb.f.info(i.rb.src, i.p)
 402  		if v := i.rb.ss.next(i.info); v == ssStarter {
 403  			break
 404  		} else if v == ssOverflow {
 405  			i.next = nextCGJCompose
 406  			break
 407  		}
 408  		if i.info.ccc < prevCC {
 409  			goto doNorm
 410  		}
 411  	}
 412  	return i.returnSlice(startp, i.p)
 413  doNorm:
 414  	// reset to start position
 415  	i.p = startp
 416  	i.info = i.rb.f.info(i.rb.src, i.p)
 417  	i.rb.ss.first(i.info)
 418  	if i.info.multiSegment() {
 419  		d := i.info.Decomposition()
 420  		info := i.rb.f.info(input{bytes: d}, 0)
 421  		i.rb.insertUnsafe(input{bytes: d}, 0, info)
 422  		i.multiSeg = d[int(info.size):]
 423  		i.next = nextMultiNorm
 424  		return nextMultiNorm(i)
 425  	}
 426  	i.rb.ss.first(i.info)
 427  	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
 428  	return doNormComposed(i)
 429  }
 430  
 431  func doNormComposed(i *Iter) []byte {
 432  	// First rune should already be inserted.
 433  	for {
 434  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
 435  			i.setDone()
 436  			break
 437  		}
 438  		i.info = i.rb.f.info(i.rb.src, i.p)
 439  		if s := i.rb.ss.next(i.info); s == ssStarter {
 440  			break
 441  		} else if s == ssOverflow {
 442  			i.next = nextCGJCompose
 443  			break
 444  		}
 445  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
 446  	}
 447  	i.rb.compose()
 448  	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
 449  	return seg
 450  }
 451  
 452  func nextCGJCompose(i *Iter) []byte {
 453  	i.rb.ss = 0 // instead of first
 454  	i.rb.insertCGJ()
 455  	i.next = nextComposed
 456  	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
 457  	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
 458  	// If we ever change that, insert a check here.
 459  	i.rb.ss.first(i.info)
 460  	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
 461  	return doNormComposed(i)
 462  }
 463