bracket.mx raw

   1  // Copyright 2015 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package bidi
   6  
   7  import (
   8  	"container/list"
   9  	"fmt"
  10  	"sort"
  11  )
  12  
  13  // This file contains a port of the reference implementation of the
  14  // Bidi Parentheses Algorithm:
  15  // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
  16  //
  17  // The implementation in this file covers definitions BD14-BD16 and rule N0
  18  // of UAX#9.
  19  //
  20  // Some preprocessing is done for each rune before data is passed to this
  21  // algorithm:
  22  //  - opening and closing brackets are identified
  23  //  - a bracket pair type, like '(' and ')' is assigned a unique identifier that
  24  //    is identical for the opening and closing bracket. It is left to do these
  25  //    mappings.
  26  //  - The BPA algorithm requires that bracket characters that are canonical
  27  //    equivalents of each other be able to be substituted for each other.
  28  //    It is the responsibility of the caller to do this canonicalization.
  29  //
  30  // In implementing BD16, this implementation departs slightly from the "logical"
  31  // algorithm defined in UAX#9. In particular, the stack referenced there
  32  // supports operations that go beyond a "basic" stack. An equivalent
  33  // implementation based on a linked list is used here.
  34  
  35  // Bidi_Paired_Bracket_Type
  36  // BD14. An opening paired bracket is a character whose
  37  // Bidi_Paired_Bracket_Type property value is Open.
  38  //
  39  // BD15. A closing paired bracket is a character whose
  40  // Bidi_Paired_Bracket_Type property value is Close.
  41  type bracketType byte
  42  
  43  const (
  44  	bpNone bracketType = iota
  45  	bpOpen
  46  	bpClose
  47  )
  48  
  49  // bracketPair holds a pair of index values for opening and closing bracket
  50  // location of a bracket pair.
  51  type bracketPair struct {
  52  	opener int
  53  	closer int
  54  }
  55  
  56  func (b *bracketPair) String() string {
  57  	return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
  58  }
  59  
  60  // bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
  61  type bracketPairs []bracketPair
  62  
  63  func (b bracketPairs) Len() int           { return len(b) }
  64  func (b bracketPairs) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
  65  func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
  66  
  67  // resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
  68  //
  69  // For each rune, it takes the indexes into the original string, the class the
  70  // bracket type (in pairTypes) and the bracket identifier (pairValues). It also
  71  // takes the direction type for the start-of-sentence and the embedding level.
  72  //
  73  // The identifiers for bracket types are the rune of the canonicalized opening
  74  // bracket for brackets (open or close) or 0 for runes that are not brackets.
  75  func resolvePairedBrackets(s *isolatingRunSequence) {
  76  	p := bracketPairer{
  77  		sos:              s.sos,
  78  		openers:          list.New(),
  79  		codesIsolatedRun: s.types,
  80  		indexes:          s.indexes,
  81  	}
  82  	dirEmbed := L
  83  	if s.level&1 != 0 {
  84  		dirEmbed = R
  85  	}
  86  	p.locateBrackets(s.p.pairTypes, s.p.pairValues)
  87  	p.resolveBrackets(dirEmbed, s.p.initialTypes)
  88  }
  89  
  90  type bracketPairer struct {
  91  	sos Class // direction corresponding to start of sequence
  92  
  93  	// The following is a restatement of BD 16 using non-algorithmic language.
  94  	//
  95  	// A bracket pair is a pair of characters consisting of an opening
  96  	// paired bracket and a closing paired bracket such that the
  97  	// Bidi_Paired_Bracket property value of the former equals the latter,
  98  	// subject to the following constraints.
  99  	// - both characters of a pair occur in the same isolating run sequence
 100  	// - the closing character of a pair follows the opening character
 101  	// - any bracket character can belong at most to one pair, the earliest possible one
 102  	// - any bracket character not part of a pair is treated like an ordinary character
 103  	// - pairs may nest properly, but their spans may not overlap otherwise
 104  
 105  	// Bracket characters with canonical decompositions are supposed to be
 106  	// treated as if they had been normalized, to allow normalized and non-
 107  	// normalized text to give the same result. In this implementation that step
 108  	// is pushed out to the caller. The caller has to ensure that the pairValue
 109  	// slices contain the rune of the opening bracket after normalization for
 110  	// any opening or closing bracket.
 111  
 112  	openers *list.List // list of positions for opening brackets
 113  
 114  	// bracket pair positions sorted by location of opening bracket
 115  	pairPositions bracketPairs
 116  
 117  	codesIsolatedRun []Class // directional bidi codes for an isolated run
 118  	indexes          []int   // array of index values into the original string
 119  
 120  }
 121  
 122  // matchOpener reports whether characters at given positions form a matching
 123  // bracket pair.
 124  func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
 125  	return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
 126  }
 127  
 128  const maxPairingDepth = 63
 129  
 130  // locateBrackets locates matching bracket pairs according to BD16.
 131  //
 132  // This implementation uses a linked list instead of a stack, because, while
 133  // elements are added at the front (like a push) they are not generally removed
 134  // in atomic 'pop' operations, reducing the benefit of the stack archetype.
 135  func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
 136  	// traverse the run
 137  	// do that explicitly (not in a for-each) so we can record position
 138  	for i, index := range p.indexes {
 139  
 140  		// look at the bracket type for each character
 141  		if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
 142  			// continue scanning
 143  			continue
 144  		}
 145  		switch pairTypes[index] {
 146  		case bpOpen:
 147  			// check if maximum pairing depth reached
 148  			if p.openers.Len() == maxPairingDepth {
 149  				p.openers.Init()
 150  				return
 151  			}
 152  			// remember opener location, most recent first
 153  			p.openers.PushFront(i)
 154  
 155  		case bpClose:
 156  			// see if there is a match
 157  			count := 0
 158  			for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
 159  				count++
 160  				opener := elem.Value.(int)
 161  				if p.matchOpener(pairValues, opener, i) {
 162  					// if the opener matches, add nested pair to the ordered list
 163  					p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
 164  					// remove up to and including matched opener
 165  					for ; count > 0; count-- {
 166  						p.openers.Remove(p.openers.Front())
 167  					}
 168  					break
 169  				}
 170  			}
 171  			sort.Sort(p.pairPositions)
 172  			// if we get here, the closing bracket matched no openers
 173  			// and gets ignored
 174  		}
 175  	}
 176  }
 177  
 178  // Bracket pairs within an isolating run sequence are processed as units so
 179  // that both the opening and the closing paired bracket in a pair resolve to
 180  // the same direction.
 181  //
 182  // N0. Process bracket pairs in an isolating run sequence sequentially in
 183  // the logical order of the text positions of the opening paired brackets
 184  // using the logic given below. Within this scope, bidirectional types EN
 185  // and AN are treated as R.
 186  //
 187  // Identify the bracket pairs in the current isolating run sequence
 188  // according to BD16. For each bracket-pair element in the list of pairs of
 189  // text positions:
 190  //
 191  // a Inspect the bidirectional types of the characters enclosed within the
 192  // bracket pair.
 193  //
 194  // b If any strong type (either L or R) matching the embedding direction is
 195  // found, set the type for both brackets in the pair to match the embedding
 196  // direction.
 197  //
 198  // o [ e ] o -> o e e e o
 199  //
 200  // o [ o e ] -> o e o e e
 201  //
 202  // o [ NI e ] -> o e NI e e
 203  //
 204  // c Otherwise, if a strong type (opposite the embedding direction) is
 205  // found, test for adjacent strong types as follows: 1 First, check
 206  // backwards before the opening paired bracket until the first strong type
 207  // (L, R, or sos) is found. If that first preceding strong type is opposite
 208  // the embedding direction, then set the type for both brackets in the pair
 209  // to that type. 2 Otherwise, set the type for both brackets in the pair to
 210  // the embedding direction.
 211  //
 212  // o [ o ] e -> o o o o e
 213  //
 214  // o [ o NI ] o -> o o o NI o o
 215  //
 216  // e [ o ] o -> e e o e o
 217  //
 218  // e [ o ] e -> e e o e e
 219  //
 220  // e ( o [ o ] NI ) e -> e e o o o o NI e e
 221  //
 222  // d Otherwise, do not set the type for the current bracket pair. Note that
 223  // if the enclosed text contains no strong types the paired brackets will
 224  // both resolve to the same level when resolved individually using rules N1
 225  // and N2.
 226  //
 227  // e ( NI ) o -> e ( NI ) o
 228  
 229  // getStrongTypeN0 maps character's directional code to strong type as required
 230  // by rule N0.
 231  //
 232  // TODO: have separate type for "strong" directionality.
 233  func (p *bracketPairer) getStrongTypeN0(index int) Class {
 234  	switch p.codesIsolatedRun[index] {
 235  	// in the scope of N0, number types are treated as R
 236  	case EN, AN, AL, R:
 237  		return R
 238  	case L:
 239  		return L
 240  	default:
 241  		return ON
 242  	}
 243  }
 244  
 245  // classifyPairContent reports the strong types contained inside a Bracket Pair,
 246  // assuming the given embedding direction.
 247  //
 248  // It returns ON if no strong type is found. If a single strong type is found,
 249  // it returns this type. Otherwise it returns the embedding direction.
 250  //
 251  // TODO: use separate type for "strong" directionality.
 252  func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
 253  	dirOpposite := ON
 254  	for i := loc.opener + 1; i < loc.closer; i++ {
 255  		dir := p.getStrongTypeN0(i)
 256  		if dir == ON {
 257  			continue
 258  		}
 259  		if dir == dirEmbed {
 260  			return dir // type matching embedding direction found
 261  		}
 262  		dirOpposite = dir
 263  	}
 264  	// return ON if no strong type found, or class opposite to dirEmbed
 265  	return dirOpposite
 266  }
 267  
 268  // classBeforePair determines which strong types are present before a Bracket
 269  // Pair. Return R or L if strong type found, otherwise ON.
 270  func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
 271  	for i := loc.opener - 1; i >= 0; i-- {
 272  		if dir := p.getStrongTypeN0(i); dir != ON {
 273  			return dir
 274  		}
 275  	}
 276  	// no strong types found, return sos
 277  	return p.sos
 278  }
 279  
 280  // assignBracketType implements rule N0 for a single bracket pair.
 281  func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
 282  	// rule "N0, a", inspect contents of pair
 283  	dirPair := p.classifyPairContent(loc, dirEmbed)
 284  
 285  	// dirPair is now L, R, or N (no strong type found)
 286  
 287  	// the following logical tests are performed out of order compared to
 288  	// the statement of the rules but yield the same results
 289  	if dirPair == ON {
 290  		return // case "d" - nothing to do
 291  	}
 292  
 293  	if dirPair != dirEmbed {
 294  		// case "c": strong type found, opposite - check before (c.1)
 295  		dirPair = p.classBeforePair(loc)
 296  		if dirPair == dirEmbed || dirPair == ON {
 297  			// no strong opposite type found before - use embedding (c.2)
 298  			dirPair = dirEmbed
 299  		}
 300  	}
 301  	// else: case "b", strong type found matching embedding,
 302  	// no explicit action needed, as dirPair is already set to embedding
 303  	// direction
 304  
 305  	// set the bracket types to the type found
 306  	p.setBracketsToType(loc, dirPair, initialTypes)
 307  }
 308  
 309  func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
 310  	p.codesIsolatedRun[loc.opener] = dirPair
 311  	p.codesIsolatedRun[loc.closer] = dirPair
 312  
 313  	for i := loc.opener + 1; i < loc.closer; i++ {
 314  		index := p.indexes[i]
 315  		if initialTypes[index] != NSM {
 316  			break
 317  		}
 318  		p.codesIsolatedRun[i] = dirPair
 319  	}
 320  
 321  	for i := loc.closer + 1; i < len(p.indexes); i++ {
 322  		index := p.indexes[i]
 323  		if initialTypes[index] != NSM {
 324  			break
 325  		}
 326  		p.codesIsolatedRun[i] = dirPair
 327  	}
 328  }
 329  
 330  // resolveBrackets implements rule N0 for a list of pairs.
 331  func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
 332  	for _, loc := range p.pairPositions {
 333  		p.assignBracketType(loc, dirEmbed, initialTypes)
 334  	}
 335  }
 336