map.go raw

   1  // Copyright 2014 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package cases
   6  
   7  // This file contains the definitions of case mappings for all supported
   8  // languages. The rules for the language-specific tailorings were taken and
   9  // modified from the CLDR transform definitions in common/transforms.
  10  
  11  import (
  12  	"strings"
  13  	"unicode"
  14  	"unicode/utf8"
  15  
  16  	"golang.org/x/text/internal"
  17  	"golang.org/x/text/language"
  18  	"golang.org/x/text/transform"
  19  	"golang.org/x/text/unicode/norm"
  20  )
  21  
  22  // A mapFunc takes a context set to the current rune and writes the mapped
  23  // version to the same context. It may advance the context to the next rune. It
  24  // returns whether a checkpoint is possible: whether the pDst bytes written to
  25  // dst so far won't need changing as we see more source bytes.
  26  type mapFunc func(*context) bool
  27  
  28  // A spanFunc takes a context set to the current rune and returns whether this
  29  // rune would be altered when written to the output. It may advance the context
  30  // to the next rune. It returns whether a checkpoint is possible.
  31  type spanFunc func(*context) bool
  32  
  33  // maxIgnorable defines the maximum number of ignorables to consider for
  34  // lookahead operations.
  35  const maxIgnorable = 30
  36  
  37  // supported lists the language tags for which we have tailorings.
  38  const supported = "und af az el lt nl tr"
  39  
  40  func init() {
  41  	tags := []language.Tag{}
  42  	for _, s := range strings.Split(supported, " ") {
  43  		tags = append(tags, language.MustParse(s))
  44  	}
  45  	matcher = internal.NewInheritanceMatcher(tags)
  46  	Supported = language.NewCoverage(tags)
  47  }
  48  
var (
	// matcher resolves a language tag to an index into the per-language
	// tables below. It is initialized in init from the supported list.
	matcher *internal.InheritanceMatcher

	// Supported is the coverage of the languages for which this package has
	// tailorings. It is initialized in init from the supported list.
	Supported language.Coverage

	// We keep the following lists separate, instead of having a single per-
	// language struct, to give the compiler a chance to remove unused code.
	//
	// Each list has one entry per language in supported, in the same order,
	// and is indexed with the index returned by matcher.Match.

	// Some uppercase mappers are stateless, so we can precompute the
	// Transformers and save a bit on runtime allocations.
	//
	// upperFunc holds the upper-casing map function and its matching span
	// function for each language. A nil upper entry makes makeUpper return
	// the shared undUpper transformer instead.
	upperFunc = []struct {
		upper mapFunc
		span  spanFunc
	}{
		{nil, nil},                  // und
		{nil, nil},                  // af
		{aztrUpper(upper), isUpper}, // az
		{elUpper, noSpan},           // el
		{ltUpper(upper), noSpan},    // lt
		{nil, nil},                  // nl
		{aztrUpper(upper), isUpper}, // tr
	}

	// undUpper, undLower and undLowerIgnoreSigma are the transformers for the
	// root locale. They are created once here and returned as-is by makeUpper
	// and makeLower for languages without a tailoring.
	undUpper            transform.SpanningTransformer = &undUpperCaser{}
	undLower            transform.SpanningTransformer = &undLowerCaser{}
	undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}

	// lowerFunc holds the lower-casing map function for each language. A nil
	// entry makes makeLower return one of the shared root-locale transformers.
	lowerFunc = []mapFunc{
		nil,       // und
		nil,       // af
		aztrLower, // az
		nil,       // el
		ltLower,   // lt
		nil,       // nl
		aztrLower, // tr
	}

	// titleInfos holds, for each language, the functions makeTitle uses to
	// build a titleCaser: the mapping for the first cased rune of a word
	// (title), the mapping for the remaining cased runes (lower), the span
	// counterpart of title (titleSpan), and an optional hook (rewrite) that
	// titleCaser calls on every rune before mapping it. A nil rewrite means
	// no hook.
	titleInfos = []struct {
		title     mapFunc
		lower     mapFunc
		titleSpan spanFunc
		rewrite   func(*context)
	}{
		{title, lower, isTitle, nil},                // und
		{title, lower, isTitle, afnlRewrite},        // af
		{aztrUpper(title), aztrLower, isTitle, nil}, // az
		{title, lower, isTitle, nil},                // el
		{ltUpper(title), ltLower, noSpan, nil},      // lt
		{nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
		{aztrUpper(title), aztrLower, isTitle, nil}, // tr
	}
)
 101  
 102  func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
 103  	_, i, _ := matcher.Match(t)
 104  	f := upperFunc[i].upper
 105  	if f == nil {
 106  		return undUpper
 107  	}
 108  	return &simpleCaser{f: f, span: upperFunc[i].span}
 109  }
 110  
 111  func makeLower(t language.Tag, o options) transform.SpanningTransformer {
 112  	_, i, _ := matcher.Match(t)
 113  	f := lowerFunc[i]
 114  	if f == nil {
 115  		if o.ignoreFinalSigma {
 116  			return undLowerIgnoreSigma
 117  		}
 118  		return undLower
 119  	}
 120  	if o.ignoreFinalSigma {
 121  		return &simpleCaser{f: f, span: isLower}
 122  	}
 123  	return &lowerCaser{
 124  		first:   f,
 125  		midWord: finalSigma(f),
 126  	}
 127  }
 128  
 129  func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
 130  	_, i, _ := matcher.Match(t)
 131  	x := &titleInfos[i]
 132  	lower := x.lower
 133  	if o.noLower {
 134  		lower = (*context).copy
 135  	} else if !o.ignoreFinalSigma {
 136  		lower = finalSigma(lower)
 137  	}
 138  	return &titleCaser{
 139  		title:     x.title,
 140  		lower:     lower,
 141  		titleSpan: x.titleSpan,
 142  		rewrite:   x.rewrite,
 143  	}
 144  }
 145  
// noSpan is the spanFunc used for mappings that have no dedicated span
// implementation. It sets the context error to transform.ErrEndOfSpan and
// reports that no checkpoint is possible.
func noSpan(c *context) bool {
	c.err = transform.ErrEndOfSpan
	return false
}
 150  
 151  // TODO: consider a similar special case for the fast majority lower case. This
 152  // is a bit more involved so will require some more precise benchmarking to
 153  // justify it.
 154  
 155  type undUpperCaser struct{ transform.NopResetter }
 156  
 157  // undUpperCaser implements the Transformer interface for doing an upper case
 158  // mapping for the root locale (und). It eliminates the need for an allocation
 159  // as it prevents escaping by not using function pointers.
 160  func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 161  	c := context{dst: dst, src: src, atEOF: atEOF}
 162  	for c.next() {
 163  		upper(&c)
 164  		c.checkpoint()
 165  	}
 166  	return c.ret()
 167  }
 168  
 169  func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
 170  	c := context{src: src, atEOF: atEOF}
 171  	for c.next() && isUpper(&c) {
 172  		c.checkpoint()
 173  	}
 174  	return c.retSpan()
 175  }
 176  
 177  // undLowerIgnoreSigmaCaser implements the Transformer interface for doing
 178  // a lower case mapping for the root locale (und) ignoring final sigma
 179  // handling. This casing algorithm is used in some performance-critical packages
 180  // like secure/precis and x/net/http/idna, which warrants its special-casing.
 181  type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
 182  
 183  func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 184  	c := context{dst: dst, src: src, atEOF: atEOF}
 185  	for c.next() && lower(&c) {
 186  		c.checkpoint()
 187  	}
 188  	return c.ret()
 189  
 190  }
 191  
 192  // Span implements a generic lower-casing. This is possible as isLower works
 193  // for all lowercasing variants. All lowercase variants only vary in how they
 194  // transform a non-lowercase letter. They will never change an already lowercase
 195  // letter. In addition, there is no state.
 196  func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
 197  	c := context{src: src, atEOF: atEOF}
 198  	for c.next() && isLower(&c) {
 199  		c.checkpoint()
 200  	}
 201  	return c.retSpan()
 202  }
 203  
 204  type simpleCaser struct {
 205  	context
 206  	f    mapFunc
 207  	span spanFunc
 208  }
 209  
 210  // simpleCaser implements the Transformer interface for doing a case operation
 211  // on a rune-by-rune basis.
 212  func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 213  	c := context{dst: dst, src: src, atEOF: atEOF}
 214  	for c.next() && t.f(&c) {
 215  		c.checkpoint()
 216  	}
 217  	return c.ret()
 218  }
 219  
 220  func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
 221  	c := context{src: src, atEOF: atEOF}
 222  	for c.next() && t.span(&c) {
 223  		c.checkpoint()
 224  	}
 225  	return c.retSpan()
 226  }
 227  
 228  // undLowerCaser implements the Transformer interface for doing a lower case
 229  // mapping for the root locale (und) ignoring final sigma handling. This casing
 230  // algorithm is used in some performance-critical packages like secure/precis
 231  // and x/net/http/idna, which warrants its special-casing.
 232  type undLowerCaser struct{ transform.NopResetter }
 233  
 234  func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 235  	c := context{dst: dst, src: src, atEOF: atEOF}
 236  
 237  	for isInterWord := true; c.next(); {
 238  		if isInterWord {
 239  			if c.info.isCased() {
 240  				if !lower(&c) {
 241  					break
 242  				}
 243  				isInterWord = false
 244  			} else if !c.copy() {
 245  				break
 246  			}
 247  		} else {
 248  			if c.info.isNotCasedAndNotCaseIgnorable() {
 249  				if !c.copy() {
 250  					break
 251  				}
 252  				isInterWord = true
 253  			} else if !c.hasPrefix("Σ") {
 254  				if !lower(&c) {
 255  					break
 256  				}
 257  			} else if !finalSigmaBody(&c) {
 258  				break
 259  			}
 260  		}
 261  		c.checkpoint()
 262  	}
 263  	return c.ret()
 264  }
 265  
 266  func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
 267  	c := context{src: src, atEOF: atEOF}
 268  	for c.next() && isLower(&c) {
 269  		c.checkpoint()
 270  	}
 271  	return c.retSpan()
 272  }
 273  
 274  // lowerCaser implements the Transformer interface. The default Unicode lower
 275  // casing requires different treatment for the first and subsequent characters
 276  // of a word, most notably to handle the Greek final Sigma.
 277  type lowerCaser struct {
 278  	undLowerIgnoreSigmaCaser
 279  
 280  	context
 281  
 282  	first, midWord mapFunc
 283  }
 284  
 285  func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 286  	t.context = context{dst: dst, src: src, atEOF: atEOF}
 287  	c := &t.context
 288  
 289  	for isInterWord := true; c.next(); {
 290  		if isInterWord {
 291  			if c.info.isCased() {
 292  				if !t.first(c) {
 293  					break
 294  				}
 295  				isInterWord = false
 296  			} else if !c.copy() {
 297  				break
 298  			}
 299  		} else {
 300  			if c.info.isNotCasedAndNotCaseIgnorable() {
 301  				if !c.copy() {
 302  					break
 303  				}
 304  				isInterWord = true
 305  			} else if !t.midWord(c) {
 306  				break
 307  			}
 308  		}
 309  		c.checkpoint()
 310  	}
 311  	return c.ret()
 312  }
 313  
// titleCaser implements the Transformer interface. Title casing algorithms
// distinguish between the first letter of a word and subsequent letters of the
// same word. It uses state to avoid requiring a potentially infinite lookahead.
type titleCaser struct {
	// context holds the transformation state. Its isMidWord flag is carried
	// over from one Transform or Span call to the next.
	context

	// rune mappings used by the actual casing algorithms.
	title     mapFunc  // applied to a cased rune when not mid-word
	lower     mapFunc  // applied to a cased rune when mid-word
	titleSpan spanFunc // span counterpart of title

	// rewrite, if non-nil, is called on every rune before it is mapped.
	rewrite func(*context)
}

// Transform implements the standard Unicode title case algorithm as defined in
// Chapter 3 of The Unicode Standard:
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// Start from a fresh context, keeping only isMidWord from the previous
	// call.
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.ret()
	}

	for {
		// Capture the info of the current rune up front: the mapping
		// functions below may advance the context to a later rune.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		} else if p.isBreak() {
			c.isMidWord = false
		}

		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful write.
		// The checkpoint is skipped while we are mid-word on a mid rune, as
		// the next rune may still clear isMidWord (see below).
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row: we are no longer mid-word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.ret()
}
 382  
// Span mirrors Transform without writing any output: a cased rune that is not
// mid-word is checked with t.titleSpan, a cased rune that is mid-word is
// checked with isLower, and all other runes are accepted. The loop stops at
// the first rune that fails its check.
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	// Start from a fresh context, keeping only isMidWord from the previous
	// call.
	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.retSpan()
	}

	for {
		// Capture the info of the current rune up front: the span functions
		// below may advance the context to a later rune.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				if !t.titleSpan(c) {
					break
				}
				c.isMidWord = true
			} else if !isLower(c) {
				break
			}
		} else if p.isBreak() {
			c.isMidWord = false
		}
		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successfully checked rune. The checkpoint is
		// skipped while we are mid-word on a mid rune, as the next rune may
		// still clear isMidWord (see below).
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row: we are no longer mid-word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.retSpan()
}
 427  
 428  // finalSigma adds Greek final Sigma handing to another casing function. It
 429  // determines whether a lowercased sigma should be σ or ς, by looking ahead for
 430  // case-ignorables and a cased letters.
 431  func finalSigma(f mapFunc) mapFunc {
 432  	return func(c *context) bool {
 433  		if !c.hasPrefix("Σ") {
 434  			return f(c)
 435  		}
 436  		return finalSigmaBody(c)
 437  	}
 438  }
 439  
// finalSigmaBody lower-cases a Σ that occurs after the start of a word. It
// first writes the final form ς and then looks ahead, copying case-ignorable
// runes as it goes. If the first rune that is not case-ignorable is cased, the
// written ς is changed in place to σ. That rune is unread so the caller
// processes it. It returns false when the source runs out during lookahead.
func finalSigmaBody(c *context) bool {
	// Current rune must be Σ.

	// ::NFD();
	// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
	// Σ } [:case-ignorable:]* [:cased:] → σ;
	// [:cased:] [:case-ignorable:]* { Σ → ς;
	// ::Any-Lower;
	// ::NFC();

	// Remember where ς is written so it can be patched to σ later.
	p := c.pDst
	c.writeString("ς")

	// TODO: we should do this here, but right now this will never have an
	// effect as this is called when the prefix is Sigma, whereas Dutch and
	// Afrikaans only test for an apostrophe.
	//
	// if t.rewrite != nil {
	// 	t.rewrite(c)
	// }

	// We need to do one more iteration after maxIgnorable, as a cased
	// letter is not an ignorable and may modify the result.
	wasMid := false
	for i := 0; i < maxIgnorable+1; i++ {
		if !c.next() {
			return false
		}
		if !c.info.isCaseIgnorable() {
			// All Midword runes are also case ignorable, so we are
			// guaranteed to have a letter or word break here. As we are
			// unreading the rune, there is no need to unset c.isMidWord;
			// the title caser will handle this.
			if c.info.isCased() {
				// p+1 is guaranteed to be in bounds: if writing ς was
				// successful, p+1 will contain the second byte of ς. If not,
				// this function will have returned after c.next returned false.
				c.dst[p+1]++ // ς → σ
			}
			c.unreadRune()
			return true
		}
		// A case ignorable may also introduce a word break, so we may need
		// to continue searching even after detecting a break.
		isMid := c.info.isMid()
		if (wasMid && isMid) || c.info.isBreak() {
			c.isMidWord = false
		}
		wasMid = isMid
		c.copy()
	}
	// More than maxIgnorable case-ignorables followed Σ: keep ς.
	return true
}
 493  
 494  // finalSigmaSpan would be the same as isLower.
 495  
// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
//
// Runes that are not in the Greek script are upper-cased normally. For a Greek
// rune, the upper-cased output is replaced by the first rune of its canonical
// decomposition (if it has one), and up to maxIgnorable following modifiers
// are examined: the listed Greek accents are dropped, other modifiers are
// copied. It returns whether a checkpoint is possible.
func elUpper(c *context) bool {
	// From CLDR:
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;

	// Decode the source rune before upper may advance the context.
	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	if !unicode.Is(unicode.Greek, r) {
		return true
	}
	// i counts the modifiers considered so far, including those dropped from
	// the decomposition below.
	i := 0
	// Take the properties of the uppercased rune that is already written to the
	// destination. This saves us the trouble of having to uppercase the
	// decomposed rune again.
	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
		// Restore the destination position and process the decomposed rune.
		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}
		c.pDst = oldPDst
		// Insert the first rune and ignore the modifiers. See A.6.2.
		c.writeBytes(b[:sz])
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
		// Above and Iota Subscript
		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
			// No-op. Gobble the modifier.

		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				// Not a modifier: leave it for the caller to process.
				c.unreadRune()
				return true

			// We don't need to test for IotaSubscript as the only rune that
			// qualifies (U+0345) was already excluded in the switch statement
			// above. See A.4.

			case cccAbove:
				return c.copy()
			default:
				// Some other modifier. We're still allowed to gobble Greek
				// modifiers after this.
				c.copy()
			}
		}
	}
	// The loop ended either because maxIgnorable modifiers were seen or
	// because c.next returned false; only the former allows a checkpoint.
	return i == maxIgnorable
}
 564  
 565  // TODO: implement elUpperSpan (low-priority: complex and infrequent).
 566  
// ltLower implements Lithuanian lower casing. When a capital I or J (or a
// precomposed rune whose decomposition starts with one) is followed by a
// modifier of combining class Above, a U+0307 COMBINING DOT ABOVE is inserted
// before that modifier. All other runes are lower-cased with lower. It
// returns whether a checkpoint is possible.
func ltLower(c *context) bool {
	// From CLDR:
	// # Introduce an explicit dot above when lowercasing capital I's and J's
	// # whenever there are more accents above.
	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
	// ::NFD();
	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
	// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
	// I \u0300 (Ì) → i \u0307 \u0300;
	// I \u0301 (Í) → i \u0307 \u0301;
	// I \u0303 (Ĩ) → i \u0307 \u0303;
	// ::Any-Lower();
	// ::NFC();

	// i counts the modifiers considered so far.
	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		// ASCII: only I and J need the lookahead below.
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])
		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
			// UTF-8 optimization: the decomposition will only have an above
			// modifier if the last rune of the decomposition is in [U+300-U+311].
			// In all other cases, a decomposition starting with I is always
			// an I followed by modifiers that are not cased themselves. See A.2.
			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				c.dst[c.pDst-1] += 'a' - 'A' // lower

				// Assumption: modifier never changes on lowercase. See A.1.
				// Assumption: all modifiers added have CCC = Above. See A.2.3.
				return c.writeString("\u0307") && c.writeBytes(d[1:])
			}
			// In all other cases the additional modifiers will have a CCC
			// that is less than 230 (Above). We will insert the U+0307, if
			// needed, after these modifiers so that a string in FCD form
			// will remain so. See A.2.2.
			lower(c)
			i = 1
		} else {
			return lower(c)
		}
	}

	// Look ahead for a modifier of class Above, copying other modifiers.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			// Not a modifier: leave it for the caller to process.
			c.unreadRune()
			return true
		case cccAbove:
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	// The loop ended either because maxIgnorable modifiers were seen or
	// because c.next returned false; only the former allows a checkpoint.
	return i == maxIgnorable
}
 635  
 636  // ltLowerSpan would be the same as isLower.
 637  
// ltUpper wraps the casing function f with the Lithuanian rule that removes a
// U+0307 COMBINING DOT ABOVE following a soft-dotted rune, with possible
// intervening modifiers that are not of combining class Above. For I followed
// by U+0307 and one of a few common accents, the result is written as a single
// precomposed rune. The returned function reports whether a checkpoint is
// possible.
func ltUpper(f mapFunc) mapFunc {
	return func(c *context) bool {
		// Unicode:
		// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
		//
		// From CLDR:
		// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
		// # intervening non-230 marks.
		// ::NFD();
		// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
		// ::Any-Upper();
		// ::NFC();

		// TODO: See A.5. A soft-dotted rune never has an exception. This would
		// allow us to overload the exception bit and encode this property in
		// info. Need to measure performance impact of this.

		// Decode the source rune before f may advance the context.
		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
		oldPDst := c.pDst
		if !f(c) {
			return false
		}
		if !unicode.Is(unicode.Soft_Dotted, r) {
			return true
		}

		// We don't need to do an NFD normalization, as a soft-dotted rune never
		// contains U+0307. See A.3.

		i := 0
		for ; i < maxIgnorable && c.next(); i++ {
			switch c.info.cccType() {
			case cccZero:
				// Not a modifier: leave it for the caller to process.
				c.unreadRune()
				return true
			case cccAbove:
				if c.hasPrefix("\u0307") {
					// We don't do a full NFC, but rather combine runes for
					// some of the common cases. (Returning NFC or
					// preserving normal form is neither a requirement nor
					// a possibility anyway).
					// Skip the U+0307 by moving on to the rune after it.
					if !c.next() {
						return false
					}
					// Only combine when the output so far is exactly "I" and
					// the next rune starts with lead byte 0xCC.
					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
						s := ""
						switch c.src[c.pSrc+1] {
						case 0x80: // U+0300 COMBINING GRAVE ACCENT
							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
						case 0x81: // U+0301 COMBINING ACUTE ACCENT
							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
						case 0x83: // U+0303 COMBINING TILDE
							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
						case 0x88: // U+0308 COMBINING DIAERESIS
							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
						default:
						}
						if s != "" {
							// Overwrite the already written "I" with the
							// precomposed rune.
							c.pDst = oldPDst
							return c.writeString(s)
						}
					}
				}
				return c.copy()
			default:
				c.copy()
			}
		}
		// The loop ended either because maxIgnorable modifiers were seen or
		// because c.next returned false; only the former allows a checkpoint.
		return i == maxIgnorable
	}
}
 708  
 709  // TODO: implement ltUpperSpan (low priority: complex and infrequent).
 710  
 711  func aztrUpper(f mapFunc) mapFunc {
 712  	return func(c *context) bool {
 713  		// i→İ;
 714  		if c.src[c.pSrc] == 'i' {
 715  			return c.writeString("İ")
 716  		}
 717  		return f(c)
 718  	}
 719  }
 720  
// aztrLower implements Turkish and Azeri lower casing: İ maps to i, an I that
// is followed (after optional non-Above modifiers) by U+0307 maps to i with
// the U+0307 removed, and any other I maps to dotless ı. All other runes are
// lower-cased with lower. The result done reports whether a checkpoint is
// possible.
func aztrLower(c *context) (done bool) {
	// From CLDR:
	// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
	// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	// İ→i;
	// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
	// # This matches the behavior of the canonically equivalent I-dot_above
	// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
	// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
	// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
	// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
	// I→ı ;
	// ::Any-Lower();
	if c.hasPrefix("\u0130") { // İ
		return c.writeString("i")
	}
	if c.src[c.pSrc] != 'I' {
		return lower(c)
	}

	// We ignore the lower-case I for now, but insert it later when we know
	// which form we need.
	// start is the source offset just past the I; the modifiers scanned
	// below are written from there once the starter is known.
	start := c.pSrc + c.sz

	i := 0
Loop:
	// We check for up to n ignorables before \u0307. As \u0307 is an
	// ignorable as well, n is maxIgnorable-1.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccAbove:
			if c.hasPrefix("\u0307") {
				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
			}
			// Some other Above modifier: no U+0307 can follow.
			done = true
			break Loop
		case cccZero:
			// Not a modifier: leave it for the caller to process.
			c.unreadRune()
			done = true
			break Loop
		default:
			// We'll write this rune after we know which starter to use.
		}
	}
	if i == maxIgnorable {
		done = true
	}
	// No U+0307 was found: write dotless ı followed by the scanned modifiers.
	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
}
 770  
 771  // aztrLowerSpan would be the same as isLower.
 772  
 773  func nlTitle(c *context) bool {
 774  	// From CLDR:
 775  	// # Special titlecasing for Dutch initial "ij".
 776  	// ::Any-Title();
 777  	// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
 778  	// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
 779  	if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
 780  		return title(c)
 781  	}
 782  
 783  	if !c.writeString("I") || !c.next() {
 784  		return false
 785  	}
 786  	if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
 787  		return c.writeString("J")
 788  	}
 789  	c.unreadRune()
 790  	return true
 791  }
 792  
 793  func nlTitleSpan(c *context) bool {
 794  	// From CLDR:
 795  	// # Special titlecasing for Dutch initial "ij".
 796  	// ::Any-Title();
 797  	// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
 798  	// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
 799  	if c.src[c.pSrc] != 'I' {
 800  		return isTitle(c)
 801  	}
 802  	if !c.next() || c.src[c.pSrc] == 'j' {
 803  		return false
 804  	}
 805  	if c.src[c.pSrc] != 'J' {
 806  		c.unreadRune()
 807  	}
 808  	return true
 809  }
 810  
 811  // Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
 812  func afnlRewrite(c *context) {
 813  	if c.hasPrefix("'") || c.hasPrefix("’") {
 814  		c.isMidWord = true
 815  	}
 816  }
 817