info.go raw

   1  // Copyright 2015 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package cases
   6  
   7  func (c info) cccVal() info {
   8  	if c&exceptionBit != 0 {
   9  		return info(exceptions[c>>exceptionShift]) & cccMask
  10  	}
  11  	return c & cccMask
  12  }
  13  
  14  func (c info) cccType() info {
  15  	ccc := c.cccVal()
  16  	if ccc <= cccZero {
  17  		return cccZero
  18  	}
  19  	return ccc
  20  }
  21  
  22  // TODO: Implement full Unicode breaking algorithm:
  23  // 1) Implement breaking in separate package.
  24  // 2) Use the breaker here.
  25  // 3) Compare table size and performance of using the more generic breaker.
  26  //
  27  // Note that we can extend the current algorithm to be much more accurate. This
  28  // only makes sense, though, if the performance and/or space penalty of using
  29  // the generic breaker is big. Extra data will only be needed for non-cased
  30  // runes, which means there are sufficient bits left in the caseType.
  31  // ICU prohibits breaking in such cases as well.
  32  
  33  // For the purpose of title casing we use an approximation of the Unicode Word
  34  // Breaking algorithm defined in Annex #29:
  35  // https://www.unicode.org/reports/tr29/#Default_Word_Boundaries.
  36  //
  37  // For our approximation, we group the Word Break types into the following
  38  // categories, with associated rules:
  39  //
  40  // 1) Letter:
  41  //    ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ.
  42  //    Rule: Never break between consecutive runes of this category.
  43  //
  44  // 2) Mid:
  45  //    MidLetter, MidNumLet, Single_Quote.
  46  //    (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn,
  47  //    Me, Cf, Lm or Sk).
  48  //    Rule: Don't break between Letter and Mid, but break between two Mids.
  49  //
  50  // 3) Break:
  51  //    Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and
  52  //    Other.
  53  //    These categories should always result in a break between two cased letters.
  54  //    Rule: Always break.
  55  //
  56  // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
  57  // preventing a break between two cased letters. For now we will ignore this
  58  // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
  59  // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
  60  //
  61  // Note 2: the rule for Mid is very approximate, but works in most cases. To
  62  // improve, we could store the categories in the trie value and use a FA to
  63  // manage breaks. See TODO comment above.
  64  //
  65  // Note 3: according to the spec, it is possible for the Extend category to
  66  // introduce breaks between other categories grouped in Letter. However, this
  67  // is undesirable for our purposes. ICU prevents breaks in such cases as well.
  68  
  69  // isBreak returns whether this rune should introduce a break.
  70  func (c info) isBreak() bool {
  71  	return c.cccVal() == cccBreak
  72  }
  73  
  74  // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
  75  // Numeric, ExtendNumLet, or Extend.
  76  func (c info) isLetter() bool {
  77  	ccc := c.cccVal()
  78  	if ccc == cccZero {
  79  		return !c.isCaseIgnorable()
  80  	}
  81  	return ccc != cccBreak
  82  }
  83