normalize.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Note: the file data_test.go that is generated should not be checked in.
   6  //go:generate go run maketables.go triegen.go
   7  //go:generate go test -tags test
   8  
   9  // Package norm contains types and functions for normalizing Unicode strings.
  10  package norm // import "golang.org/x/text/unicode/norm"
  11  
  12  import (
  13  	"unicode/utf8"
  14  
  15  	"golang.org/x/text/transform"
  16  )
  17  
  18  // A Form denotes a canonical representation of Unicode code points.
  19  // The Unicode-defined normalization and equivalence forms are:
  20  //
  21  //	NFC   Unicode Normalization Form C
  22  //	NFD   Unicode Normalization Form D
  23  //	NFKC  Unicode Normalization Form KC
  24  //	NFKD  Unicode Normalization Form KD
  25  //
  26  // For a Form f, this documentation uses the notation f(x) to mean
  27  // the bytes or string x converted to the given form.
  28  // A position n in x is called a boundary if conversion to the form can
  29  // proceed independently on both sides:
  30  //
  31  //	f(x) == append(f(x[0:n]), f(x[n:])...)
  32  //
  33  // References: https://unicode.org/reports/tr15/ and
  34  // https://unicode.org/notes/tn5/.
  35  type Form int
  36  
  37  const (
  38  	NFC Form = iota
  39  	NFD
  40  	NFKC
  41  	NFKD
  42  )
  43  
  44  // Bytes returns f(b). May return b if f(b) = b.
  45  func (f Form) Bytes(b []byte) []byte {
  46  	src := inputBytes(b)
  47  	ft := formTable[f]
  48  	n, ok := ft.quickSpan(src, 0, len(b), true)
  49  	if ok {
  50  		return b
  51  	}
  52  	out := []byte{:n:len(b)}
  53  	copy(out, b[0:n])
  54  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
  55  	return doAppendInner(&rb, n)
  56  }
  57  
  58  // String returns f(s).
  59  func (f Form) String(s string) string {
  60  	src := inputString(s)
  61  	ft := formTable[f]
  62  	n, ok := ft.quickSpan(src, 0, len(s), true)
  63  	if ok {
  64  		return s
  65  	}
  66  	out := []byte{:n:len(s)}
  67  	copy(out, s[0:n])
  68  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
  69  	return string(doAppendInner(&rb, n))
  70  }
  71  
  72  // IsNormal returns true if b == f(b).
  73  func (f Form) IsNormal(b []byte) bool {
  74  	src := inputBytes(b)
  75  	ft := formTable[f]
  76  	bp, ok := ft.quickSpan(src, 0, len(b), true)
  77  	if ok {
  78  		return true
  79  	}
  80  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
  81  	rb.setFlusher(nil, cmpNormalBytes)
  82  	for bp < len(b) {
  83  		rb.out = b[bp:]
  84  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  85  			return false
  86  		}
  87  		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
  88  	}
  89  	return true
  90  }
  91  
  92  func cmpNormalBytes(rb *reorderBuffer) bool {
  93  	b := rb.out
  94  	for i := 0; i < rb.nrune; i++ {
  95  		info := rb.rune[i]
  96  		if int(info.size) > len(b) {
  97  			return false
  98  		}
  99  		p := info.pos
 100  		pe := p + info.size
 101  		for ; p < pe; p++ {
 102  			if b[0] != rb.byte[p] {
 103  				return false
 104  			}
 105  			b = b[1:]
 106  		}
 107  	}
 108  	return true
 109  }
 110  
 111  // IsNormalString returns true if s == f(s).
 112  func (f Form) IsNormalString(s string) bool {
 113  	src := inputString(s)
 114  	ft := formTable[f]
 115  	bp, ok := ft.quickSpan(src, 0, len(s), true)
 116  	if ok {
 117  		return true
 118  	}
 119  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
 120  	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
 121  		for i := 0; i < rb.nrune; i++ {
 122  			info := rb.rune[i]
 123  			if bp+int(info.size) > len(s) {
 124  				return false
 125  			}
 126  			p := info.pos
 127  			pe := p + info.size
 128  			for ; p < pe; p++ {
 129  				if s[bp] != rb.byte[p] {
 130  					return false
 131  				}
 132  				bp++
 133  			}
 134  		}
 135  		return true
 136  	})
 137  	for bp < len(s) {
 138  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
 139  			return false
 140  		}
 141  		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
 142  	}
 143  	return true
 144  }
 145  
 146  // patchTail fixes a case where a rune may be incorrectly normalized
 147  // if it is followed by illegal continuation bytes. It returns the
 148  // patched buffer and whether the decomposition is still in progress.
 149  func patchTail(rb *reorderBuffer) bool {
 150  	info, p := lastRuneStart(&rb.f, rb.out)
 151  	if p == -1 || info.size == 0 {
 152  		return true
 153  	}
 154  	end := p + int(info.size)
 155  	extra := len(rb.out) - end
 156  	if extra > 0 {
 157  		// Potentially allocating memory. However, this only
 158  		// happens with ill-formed UTF-8.
 159  		x := []byte{:0}
 160  		x = append(x, rb.out[len(rb.out)-extra:]...)
 161  		rb.out = rb.out[:end]
 162  		decomposeToLastBoundary(rb)
 163  		rb.doFlush()
 164  		rb.out = append(rb.out, x...)
 165  		return false
 166  	}
 167  	buf := rb.out[p:]
 168  	rb.out = rb.out[:p]
 169  	decomposeToLastBoundary(rb)
 170  	if s := rb.ss.next(info); s == ssStarter {
 171  		rb.doFlush()
 172  		rb.ss.first(info)
 173  	} else if s == ssOverflow {
 174  		rb.doFlush()
 175  		rb.insertCGJ()
 176  		rb.ss = 0
 177  	}
 178  	rb.insertUnsafe(inputBytes(buf), 0, info)
 179  	return true
 180  }
 181  
 182  func appendQuick(rb *reorderBuffer, i int) int {
 183  	if rb.nsrc == i {
 184  		return i
 185  	}
 186  	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
 187  	rb.out = rb.src.appendSlice(rb.out, i, end)
 188  	return end
 189  }
 190  
 191  // Append returns f(append(out, b...)).
 192  // The buffer out must be nil, empty, or equal to f(out).
 193  func (f Form) Append(out []byte, src ...byte) []byte {
 194  	return f.doAppend(out, inputBytes(src), len(src))
 195  }
 196  
 197  func (f Form) doAppend(out []byte, src input, n int) []byte {
 198  	if n == 0 {
 199  		return out
 200  	}
 201  	ft := formTable[f]
 202  	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
 203  	if len(out) == 0 {
 204  		p, _ := ft.quickSpan(src, 0, n, true)
 205  		out = src.appendSlice(out, 0, p)
 206  		if p == n {
 207  			return out
 208  		}
 209  		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
 210  		return doAppendInner(&rb, p)
 211  	}
 212  	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
 213  	return doAppend(&rb, out, 0)
 214  }
 215  
 216  func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
 217  	rb.setFlusher(out, appendFlush)
 218  	src, n := rb.src, rb.nsrc
 219  	doMerge := len(out) > 0
 220  	if q := src.skipContinuationBytes(p); q > p {
 221  		// Move leading non-starters to destination.
 222  		rb.out = src.appendSlice(rb.out, p, q)
 223  		p = q
 224  		doMerge = patchTail(rb)
 225  	}
 226  	fd := &rb.f
 227  	if doMerge {
 228  		var info Properties
 229  		if p < n {
 230  			info = fd.info(src, p)
 231  			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
 232  				if p == 0 {
 233  					decomposeToLastBoundary(rb)
 234  				}
 235  				p = decomposeSegment(rb, p, true)
 236  			}
 237  		}
 238  		if info.size == 0 {
 239  			rb.doFlush()
 240  			// Append incomplete UTF-8 encoding.
 241  			return src.appendSlice(rb.out, p, n)
 242  		}
 243  		if rb.nrune > 0 {
 244  			return doAppendInner(rb, p)
 245  		}
 246  	}
 247  	p = appendQuick(rb, p)
 248  	return doAppendInner(rb, p)
 249  }
 250  
 251  func doAppendInner(rb *reorderBuffer, p int) []byte {
 252  	for n := rb.nsrc; p < n; {
 253  		p = decomposeSegment(rb, p, true)
 254  		p = appendQuick(rb, p)
 255  	}
 256  	return rb.out
 257  }
 258  
 259  // AppendString returns f(append(out, []byte(s))).
 260  // The buffer out must be nil, empty, or equal to f(out).
 261  func (f Form) AppendString(out []byte, src string) []byte {
 262  	return f.doAppend(out, inputString(src), len(src))
 263  }
 264  
 265  // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
 266  // It is not guaranteed to return the largest such n.
 267  func (f Form) QuickSpan(b []byte) int {
 268  	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
 269  	return n
 270  }
 271  
 272  // Span implements transform.SpanningTransformer. It returns a boundary n such
 273  // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
 274  func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
 275  	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
 276  	if n < len(b) {
 277  		if !ok {
 278  			err = transform.ErrEndOfSpan
 279  		} else {
 280  			err = transform.ErrShortSrc
 281  		}
 282  	}
 283  	return n, err
 284  }
 285  
 286  // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
 287  // It is not guaranteed to return the largest such n.
 288  func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
 289  	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
 290  	if n < len(s) {
 291  		if !ok {
 292  			err = transform.ErrEndOfSpan
 293  		} else {
 294  			err = transform.ErrShortSrc
 295  		}
 296  	}
 297  	return n, err
 298  }
 299  
 300  // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
 301  // whether any non-normalized parts were found. If atEOF is false, n will
 302  // not point past the last segment if this segment might be become
 303  // non-normalized by appending other runes.
 304  func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
 305  	var lastCC uint8
 306  	ss := streamSafe(0)
 307  	lastSegStart := i
 308  	for n = end; i < n; {
 309  		if j := src.skipASCII(i, n); i != j {
 310  			i = j
 311  			lastSegStart = i - 1
 312  			lastCC = 0
 313  			ss = 0
 314  			continue
 315  		}
 316  		info := f.info(src, i)
 317  		if info.size == 0 {
 318  			if atEOF {
 319  				// include incomplete runes
 320  				return n, true
 321  			}
 322  			return lastSegStart, true
 323  		}
 324  		// This block needs to be before the next, because it is possible to
 325  		// have an overflow for runes that are starters (e.g. with U+FF9E).
 326  		switch ss.next(info) {
 327  		case ssStarter:
 328  			lastSegStart = i
 329  		case ssOverflow:
 330  			return lastSegStart, false
 331  		case ssSuccess:
 332  			if lastCC > info.ccc {
 333  				return lastSegStart, false
 334  			}
 335  		}
 336  		if f.composing {
 337  			if !info.isYesC() {
 338  				break
 339  			}
 340  		} else {
 341  			if !info.isYesD() {
 342  				break
 343  			}
 344  		}
 345  		lastCC = info.ccc
 346  		i += int(info.size)
 347  	}
 348  	if i == n {
 349  		if !atEOF {
 350  			n = lastSegStart
 351  		}
 352  		return n, true
 353  	}
 354  	return lastSegStart, false
 355  }
 356  
 357  // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
 358  // It is not guaranteed to return the largest such n.
 359  func (f Form) QuickSpanString(s string) int {
 360  	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
 361  	return n
 362  }
 363  
 364  // FirstBoundary returns the position i of the first boundary in b
 365  // or -1 if b contains no boundary.
 366  func (f Form) FirstBoundary(b []byte) int {
 367  	return f.firstBoundary(inputBytes(b), len(b))
 368  }
 369  
 370  func (f Form) firstBoundary(src input, nsrc int) int {
 371  	i := src.skipContinuationBytes(0)
 372  	if i >= nsrc {
 373  		return -1
 374  	}
 375  	fd := formTable[f]
 376  	ss := streamSafe(0)
 377  	// We should call ss.first here, but we can't as the first rune is
 378  	// skipped already. This means FirstBoundary can't really determine
 379  	// CGJ insertion points correctly. Luckily it doesn't have to.
 380  	for {
 381  		info := fd.info(src, i)
 382  		if info.size == 0 {
 383  			return -1
 384  		}
 385  		if s := ss.next(info); s != ssSuccess {
 386  			return i
 387  		}
 388  		i += int(info.size)
 389  		if i >= nsrc {
 390  			if !info.BoundaryAfter() && !ss.isMax() {
 391  				return -1
 392  			}
 393  			return nsrc
 394  		}
 395  	}
 396  }
 397  
 398  // FirstBoundaryInString returns the position i of the first boundary in s
 399  // or -1 if s contains no boundary.
 400  func (f Form) FirstBoundaryInString(s string) int {
 401  	return f.firstBoundary(inputString(s), len(s))
 402  }
 403  
 404  // NextBoundary reports the index of the boundary between the first and next
 405  // segment in b or -1 if atEOF is false and there are not enough bytes to
 406  // determine this boundary.
 407  func (f Form) NextBoundary(b []byte, atEOF bool) int {
 408  	return f.nextBoundary(inputBytes(b), len(b), atEOF)
 409  }
 410  
 411  // NextBoundaryInString reports the index of the boundary between the first and
 412  // next segment in b or -1 if atEOF is false and there are not enough bytes to
 413  // determine this boundary.
 414  func (f Form) NextBoundaryInString(s string, atEOF bool) int {
 415  	return f.nextBoundary(inputString(s), len(s), atEOF)
 416  }
 417  
 418  func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
 419  	if nsrc == 0 {
 420  		if atEOF {
 421  			return 0
 422  		}
 423  		return -1
 424  	}
 425  	fd := formTable[f]
 426  	info := fd.info(src, 0)
 427  	if info.size == 0 {
 428  		if atEOF {
 429  			return 1
 430  		}
 431  		return -1
 432  	}
 433  	ss := streamSafe(0)
 434  	ss.first(info)
 435  
 436  	for i := int(info.size); i < nsrc; i += int(info.size) {
 437  		info = fd.info(src, i)
 438  		if info.size == 0 {
 439  			if atEOF {
 440  				return i
 441  			}
 442  			return -1
 443  		}
 444  		// TODO: Using streamSafe to determine the boundary isn't the same as
 445  		// using BoundaryBefore. Determine which should be used.
 446  		if s := ss.next(info); s != ssSuccess {
 447  			return i
 448  		}
 449  	}
 450  	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
 451  		return -1
 452  	}
 453  	return nsrc
 454  }
 455  
 456  // LastBoundary returns the position i of the last boundary in b
 457  // or -1 if b contains no boundary.
 458  func (f Form) LastBoundary(b []byte) int {
 459  	return lastBoundary(formTable[f], b)
 460  }
 461  
 462  func lastBoundary(fd *formInfo, b []byte) int {
 463  	i := len(b)
 464  	info, p := lastRuneStart(fd, b)
 465  	if p == -1 {
 466  		return -1
 467  	}
 468  	if info.size == 0 { // ends with incomplete rune
 469  		if p == 0 { // starts with incomplete rune
 470  			return -1
 471  		}
 472  		i = p
 473  		info, p = lastRuneStart(fd, b[:i])
 474  		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
 475  			return i
 476  		}
 477  	}
 478  	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
 479  		return i
 480  	}
 481  	if info.BoundaryAfter() {
 482  		return i
 483  	}
 484  	ss := streamSafe(0)
 485  	v := ss.backwards(info)
 486  	for i = p; i >= 0 && v != ssStarter; i = p {
 487  		info, p = lastRuneStart(fd, b[:i])
 488  		if v = ss.backwards(info); v == ssOverflow {
 489  			break
 490  		}
 491  		if p+int(info.size) != i {
 492  			if p == -1 { // no boundary found
 493  				return -1
 494  			}
 495  			return i // boundary after an illegal UTF-8 encoding
 496  		}
 497  	}
 498  	return i
 499  }
 500  
 501  // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
 502  // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
 503  // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
 504  func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
 505  	// Force one character to be consumed.
 506  	info := rb.f.info(rb.src, sp)
 507  	if info.size == 0 {
 508  		return 0
 509  	}
 510  	if s := rb.ss.next(info); s == ssStarter {
 511  		// TODO: this could be removed if we don't support merging.
 512  		if rb.nrune > 0 {
 513  			goto end
 514  		}
 515  	} else if s == ssOverflow {
 516  		rb.insertCGJ()
 517  		goto end
 518  	}
 519  	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
 520  		return int(err)
 521  	}
 522  	for {
 523  		sp += int(info.size)
 524  		if sp >= rb.nsrc {
 525  			if !atEOF && !info.BoundaryAfter() {
 526  				return int(iShortSrc)
 527  			}
 528  			break
 529  		}
 530  		info = rb.f.info(rb.src, sp)
 531  		if info.size == 0 {
 532  			if !atEOF {
 533  				return int(iShortSrc)
 534  			}
 535  			break
 536  		}
 537  		if s := rb.ss.next(info); s == ssStarter {
 538  			break
 539  		} else if s == ssOverflow {
 540  			rb.insertCGJ()
 541  			break
 542  		}
 543  		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
 544  			return int(err)
 545  		}
 546  	}
 547  end:
 548  	if !rb.doFlush() {
 549  		return int(iShortDst)
 550  	}
 551  	return sp
 552  }
 553  
 554  // lastRuneStart returns the runeInfo and position of the last
 555  // rune in buf or the zero runeInfo and -1 if no rune was found.
 556  func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
 557  	p := len(buf) - 1
 558  	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
 559  	}
 560  	if p < 0 {
 561  		return Properties{}, -1
 562  	}
 563  	return fd.info(inputBytes(buf), p), p
 564  }
 565  
 566  // decomposeToLastBoundary finds an open segment at the end of the buffer
 567  // and scans it into rb. Returns the buffer minus the last segment.
 568  func decomposeToLastBoundary(rb *reorderBuffer) {
 569  	fd := &rb.f
 570  	info, i := lastRuneStart(fd, rb.out)
 571  	if int(info.size) != len(rb.out)-i {
 572  		// illegal trailing continuation bytes
 573  		return
 574  	}
 575  	if info.BoundaryAfter() {
 576  		return
 577  	}
 578  	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
 579  	padd := 0
 580  	ss := streamSafe(0)
 581  	p := len(rb.out)
 582  	for {
 583  		add[padd] = info
 584  		v := ss.backwards(info)
 585  		if v == ssOverflow {
 586  			// Note that if we have an overflow, it the string we are appending to
 587  			// is not correctly normalized. In this case the behavior is undefined.
 588  			break
 589  		}
 590  		padd++
 591  		p -= int(info.size)
 592  		if v == ssStarter || p < 0 {
 593  			break
 594  		}
 595  		info, i = lastRuneStart(fd, rb.out[:p])
 596  		if int(info.size) != p-i {
 597  			break
 598  		}
 599  	}
 600  	rb.ss = ss
 601  	// Copy bytes for insertion as we may need to overwrite rb.out.
 602  	var buf [maxBufferSize * utf8.UTFMax]byte
 603  	cp := buf[:copy(buf[:], rb.out[p:])]
 604  	rb.out = rb.out[:p]
 605  	for padd--; padd >= 0; padd-- {
 606  		info = add[padd]
 607  		rb.insertUnsafe(inputBytes(cp), 0, info)
 608  		cp = cp[info.size:]
 609  	}
 610  }
 611