replace.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package bytes
   6  
   7  import (
   8  	"io"
   9  	"sync"
  10  )
  11  
  12  // Replacer replaces a list of strings with replacements.
  13  // It is safe for concurrent use by multiple goroutines.
  14  type Replacer struct {
  15  	once   sync.Once // guards buildOnce method
  16  	r      replacer
  17  	oldnew [][]byte
  18  }
  19  
  20  // replacer is the interface that a replacement algorithm needs to implement.
  21  type replacer interface {
  22  	Replace(s []byte) []byte
  23  	WriteString(w io.Writer, s []byte) (n int, err error)
  24  }
  25  
  26  // NewReplacer returns a new [Replacer] from a list of old, new string
  27  // pairs. Replacements are performed in the order they appear in the
  28  // target string, without overlapping matches. The old string
  29  // comparisons are done in argument order.
  30  //
  31  // NewReplacer panics if given an odd number of arguments.
  32  func NewReplacer(oldnew ...[]byte) *Replacer {
  33  	if len(oldnew)%2 == 1 {
  34  		panic("bytes.NewReplacer: odd argument count")
  35  	}
  36  	return &Replacer{oldnew: append([][]byte(nil), oldnew...)}
  37  }
  38  
  39  func (r *Replacer) buildOnce() {
  40  	r.r = r.build()
  41  	r.oldnew = nil
  42  }
  43  
  44  func (b *Replacer) build() replacer {
  45  	oldnew := b.oldnew
  46  	if len(oldnew) == 2 && len(oldnew[0]) > 1 {
  47  		return makeSingleStringReplacer(oldnew[0], oldnew[1])
  48  	}
  49  
  50  	allNewBytes := true
  51  	for i := 0; i < len(oldnew); i += 2 {
  52  		if len(oldnew[i]) != 1 {
  53  			return makeGenericReplacer(oldnew)
  54  		}
  55  		if len(oldnew[i+1]) != 1 {
  56  			allNewBytes = false
  57  		}
  58  	}
  59  
  60  	if allNewBytes {
  61  		r := byteReplacer{}
  62  		for i := range r {
  63  			r[i] = byte(i)
  64  		}
  65  		// The first occurrence of old->new map takes precedence
  66  		// over the others with the same old string.
  67  		for i := len(oldnew) - 2; i >= 0; i -= 2 {
  68  			o := oldnew[i][0]
  69  			n := oldnew[i+1][0]
  70  			r[o] = n
  71  		}
  72  		return &r
  73  	}
  74  
  75  	r := byteStringReplacer{toReplace: [][]byte{:0:len(oldnew)/2}}
  76  	// The first occurrence of old->new map takes precedence
  77  	// over the others with the same old string.
  78  	for i := len(oldnew) - 2; i >= 0; i -= 2 {
  79  		o := oldnew[i][0]
  80  		n := oldnew[i+1]
  81  		// To avoid counting repetitions multiple times.
  82  		if r.replacements[o] == nil {
  83  			// We need to use string([]byte{o}) instead of string(o),
  84  			// to avoid utf8 encoding of o.
  85  			// E. g. byte(150) produces string of length 2.
  86  			r.toReplace = append(r.toReplace, []byte([]byte{o}))
  87  		}
  88  		r.replacements[o] = []byte(n)
  89  
  90  	}
  91  	return &r
  92  }
  93  
  94  // Replace returns a copy of s with all replacements performed.
  95  func (r *Replacer) Replace(s []byte) []byte {
  96  	r.once.Do(r.buildOnce)
  97  	return r.r.Replace(s)
  98  }
  99  
 100  // WriteString writes s to w with all replacements performed.
 101  func (r *Replacer) WriteString(w io.Writer, s []byte) (n int, err error) {
 102  	r.once.Do(r.buildOnce)
 103  	return r.r.WriteString(w, s)
 104  }
 105  
 106  // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
 107  // and values may be empty. For example, the trie containing keys "ax", "ay",
 108  // "bcbc", "x" and "xy" could have eight nodes:
 109  //
 110  //	n0  -
 111  //	n1  a-
 112  //	n2  .x+
 113  //	n3  .y+
 114  //	n4  b-
 115  //	n5  .cbc+
 116  //	n6  x+
 117  //	n7  .y+
 118  //
 119  // n0 is the root node, and its children are n1, n4 and n6; n1's children are
 120  // n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
 121  // with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
 122  // (marked with a trailing "+") are complete keys.
 123  type trieNode struct {
 124  	// value is the value of the trie node's key/value pair. It is empty if
 125  	// this node is not a complete key.
 126  	value []byte
 127  	// priority is the priority (higher is more important) of the trie node's
 128  	// key/value pair; keys are not necessarily matched shortest- or longest-
 129  	// first. Priority is positive if this node is a complete key, and zero
 130  	// otherwise. In the example above, positive/zero priorities are marked
 131  	// with a trailing "+" or "-".
 132  	priority int
 133  
 134  	// A trie node may have zero, one or more child nodes:
 135  	//  * if the remaining fields are zero, there are no children.
 136  	//  * if prefix and next are non-zero, there is one child in next.
 137  	//  * if table is non-zero, it defines all the children.
 138  	//
 139  	// Prefixes are preferred over tables when there is one child, but the
 140  	// root node always uses a table for lookup efficiency.
 141  
 142  	// prefix is the difference in keys between this trie node and the next.
 143  	// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
 144  	// Node n5 has no children and so has zero prefix, next and table fields.
 145  	prefix []byte
 146  	next   *trieNode
 147  
 148  	// table is a lookup table indexed by the next byte in the key, after
 149  	// remapping that byte through genericReplacer.mapping to create a dense
 150  	// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
 151  	// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
 152  	// genericReplacer.tableSize will be 5. Node n0's table will be
 153  	// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
 154  	// 'a', 'b' and 'x'.
 155  	table []*trieNode
 156  }
 157  
 158  func (t *trieNode) add(key, val []byte, priority int, r *genericReplacer) {
 159  	if key == "" {
 160  		if t.priority == 0 {
 161  			t.value = val
 162  			t.priority = priority
 163  		}
 164  		return
 165  	}
 166  
 167  	if t.prefix != "" {
 168  		// Need to split the prefix among multiple nodes.
 169  		var n int // length of the longest common prefix
 170  		for ; n < len(t.prefix) && n < len(key); n++ {
 171  			if t.prefix[n] != key[n] {
 172  				break
 173  			}
 174  		}
 175  		if n == len(t.prefix) {
 176  			t.next.add(key[n:], val, priority, r)
 177  		} else if n == 0 {
 178  			// First byte differs, start a new lookup table here. Looking up
 179  			// what is currently t.prefix[0] will lead to prefixNode, and
 180  			// looking up key[0] will lead to keyNode.
 181  			var prefixNode *trieNode
 182  			if len(t.prefix) == 1 {
 183  				prefixNode = t.next
 184  			} else {
 185  				prefixNode = &trieNode{
 186  					prefix: t.prefix[1:],
 187  					next:   t.next,
 188  				}
 189  			}
 190  			keyNode := &trieNode{}
 191  			t.table = []*trieNode{:r.tableSize}
 192  			t.table[r.mapping[t.prefix[0]]] = prefixNode
 193  			t.table[r.mapping[key[0]]] = keyNode
 194  			t.prefix = ""
 195  			t.next = nil
 196  			keyNode.add(key[1:], val, priority, r)
 197  		} else {
 198  			// Insert new node after the common section of the prefix.
 199  			next := &trieNode{
 200  				prefix: t.prefix[n:],
 201  				next:   t.next,
 202  			}
 203  			t.prefix = t.prefix[:n]
 204  			t.next = next
 205  			next.add(key[n:], val, priority, r)
 206  		}
 207  	} else if t.table != nil {
 208  		// Insert into existing table.
 209  		m := r.mapping[key[0]]
 210  		if t.table[m] == nil {
 211  			t.table[m] = &trieNode{}
 212  		}
 213  		t.table[m].add(key[1:], val, priority, r)
 214  	} else {
 215  		t.prefix = key
 216  		t.next = &trieNode{}
 217  		t.next.add("", val, priority, r)
 218  	}
 219  }
 220  
 221  func (r *genericReplacer) lookup(s []byte, ignoreRoot bool) (val []byte, keylen int, found bool) {
 222  	// Iterate down the trie to the end, and grab the value and keylen with
 223  	// the highest priority.
 224  	bestPriority := 0
 225  	node := &r.root
 226  	n := 0
 227  	for node != nil {
 228  		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
 229  			bestPriority = node.priority
 230  			val = node.value
 231  			keylen = n
 232  			found = true
 233  		}
 234  
 235  		if s == "" {
 236  			break
 237  		}
 238  		if node.table != nil {
 239  			index := r.mapping[s[0]]
 240  			if int(index) == r.tableSize {
 241  				break
 242  			}
 243  			node = node.table[index]
 244  			s = s[1:]
 245  			n++
 246  		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
 247  			n += len(node.prefix)
 248  			s = s[len(node.prefix):]
 249  			node = node.next
 250  		} else {
 251  			break
 252  		}
 253  	}
 254  	return
 255  }
 256  
 257  // genericReplacer is the fully generic algorithm.
 258  // It's used as a fallback when nothing faster can be used.
 259  type genericReplacer struct {
 260  	root trieNode
 261  	// tableSize is the size of a trie node's lookup table. It is the number
 262  	// of unique key bytes.
 263  	tableSize int
 264  	// mapping maps from key bytes to a dense index for trieNode.table.
 265  	mapping [256]byte
 266  }
 267  
 268  func makeGenericReplacer(oldnew [][]byte) *genericReplacer {
 269  	r := &genericReplacer{}
 270  	// Find each byte used, then assign them each an index.
 271  	for i := 0; i < len(oldnew); i += 2 {
 272  		key := oldnew[i]
 273  		for j := 0; j < len(key); j++ {
 274  			r.mapping[key[j]] = 1
 275  		}
 276  	}
 277  
 278  	for _, b := range r.mapping {
 279  		r.tableSize += int(b)
 280  	}
 281  
 282  	var index byte
 283  	for i, b := range r.mapping {
 284  		if b == 0 {
 285  			r.mapping[i] = byte(r.tableSize)
 286  		} else {
 287  			r.mapping[i] = index
 288  			index++
 289  		}
 290  	}
 291  	// Ensure root node uses a lookup table (for performance).
 292  	r.root.table = []*trieNode{:r.tableSize}
 293  
 294  	for i := 0; i < len(oldnew); i += 2 {
 295  		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
 296  	}
 297  	return r
 298  }
 299  
 300  type appendSliceWriter []byte
 301  
 302  // Write writes to the buffer to satisfy [io.Writer].
 303  func (w *appendSliceWriter) Write(p []byte) (int, error) {
 304  	*w = append(*w, p...)
 305  	return len(p), nil
 306  }
 307  
 308  // WriteString writes to the buffer without string->[]byte->string allocations.
 309  func (w *appendSliceWriter) WriteString(s []byte) (int, error) {
 310  	*w = append(*w, s...)
 311  	return len(s), nil
 312  }
 313  
 314  type stringWriter struct {
 315  	w io.Writer
 316  }
 317  
 318  func (w stringWriter) WriteString(s []byte) (int, error) {
 319  	return w.w.Write([]byte(s))
 320  }
 321  
 322  func getStringWriter(w io.Writer) io.StringWriter {
 323  	sw, ok := w.(io.StringWriter)
 324  	if !ok {
 325  		sw = stringWriter{w}
 326  	}
 327  	return sw
 328  }
 329  
 330  func (r *genericReplacer) Replace(s []byte) []byte {
 331  	buf := make(appendSliceWriter, 0, len(s))
 332  	r.WriteString(&buf, s)
 333  	return []byte(buf)
 334  }
 335  
 336  func (r *genericReplacer) WriteString(w io.Writer, s []byte) (n int, err error) {
 337  	sw := getStringWriter(w)
 338  	var last, wn int
 339  	var prevMatchEmpty bool
 340  	for i := 0; i <= len(s); {
 341  		// Fast path: s[i] is not a prefix of any pattern.
 342  		if i != len(s) && r.root.priority == 0 {
 343  			index := int(r.mapping[s[i]])
 344  			if index == r.tableSize || r.root.table[index] == nil {
 345  				i++
 346  				continue
 347  			}
 348  		}
 349  
 350  		// Ignore the empty match iff the previous loop found the empty match.
 351  		val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
 352  		prevMatchEmpty = match && keylen == 0
 353  		if match {
 354  			wn, err = sw.WriteString(s[last:i])
 355  			n += wn
 356  			if err != nil {
 357  				return
 358  			}
 359  			wn, err = sw.WriteString(val)
 360  			n += wn
 361  			if err != nil {
 362  				return
 363  			}
 364  			i += keylen
 365  			last = i
 366  			continue
 367  		}
 368  		i++
 369  	}
 370  	if last != len(s) {
 371  		wn, err = sw.WriteString(s[last:])
 372  		n += wn
 373  	}
 374  	return
 375  }
 376  
 377  // singleStringReplacer is the implementation that's used when there is only
 378  // one string to replace (and that string has more than one byte).
 379  type singleStringReplacer struct {
 380  	finder *stringFinder
 381  	// value is the new string that replaces that pattern when it's found.
 382  	value []byte
 383  }
 384  
 385  func makeSingleStringReplacer(pattern []byte, value []byte) *singleStringReplacer {
 386  	return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
 387  }
 388  
 389  func (r *singleStringReplacer) Replace(s []byte) []byte {
 390  	var buf Buffer
 391  	i, matched := 0, false
 392  	for {
 393  		match := r.finder.next(s[i:])
 394  		if match == -1 {
 395  			break
 396  		}
 397  		matched = true
 398  		buf.Grow(match + len(r.value))
 399  		buf.WriteString(s[i : i+match])
 400  		buf.WriteString(r.value)
 401  		i += match + len(r.finder.pattern)
 402  	}
 403  	if !matched {
 404  		return s
 405  	}
 406  	buf.WriteString(s[i:])
 407  	return buf.String()
 408  }
 409  
 410  func (r *singleStringReplacer) WriteString(w io.Writer, s []byte) (n int, err error) {
 411  	sw := getStringWriter(w)
 412  	var i, wn int
 413  	for {
 414  		match := r.finder.next(s[i:])
 415  		if match == -1 {
 416  			break
 417  		}
 418  		wn, err = sw.WriteString(s[i : i+match])
 419  		n += wn
 420  		if err != nil {
 421  			return
 422  		}
 423  		wn, err = sw.WriteString(r.value)
 424  		n += wn
 425  		if err != nil {
 426  			return
 427  		}
 428  		i += match + len(r.finder.pattern)
 429  	}
 430  	wn, err = sw.WriteString(s[i:])
 431  	n += wn
 432  	return
 433  }
 434  
 435  // byteReplacer is the implementation that's used when all the "old"
 436  // and "new" values are single ASCII bytes.
 437  // The array contains replacement bytes indexed by old byte.
 438  type byteReplacer [256]byte
 439  
 440  func (r *byteReplacer) Replace(s []byte) []byte {
 441  	var buf []byte // lazily allocated
 442  	for i := 0; i < len(s); i++ {
 443  		b := s[i]
 444  		if r[b] != b {
 445  			if buf == nil {
 446  				buf = []byte(s)
 447  			}
 448  			buf[i] = r[b]
 449  		}
 450  	}
 451  	if buf == nil {
 452  		return s
 453  	}
 454  	return []byte(buf)
 455  }
 456  
 457  func (r *byteReplacer) WriteString(w io.Writer, s []byte) (n int, err error) {
 458  	sw := getStringWriter(w)
 459  	last := 0
 460  	for i := 0; i < len(s); i++ {
 461  		b := s[i]
 462  		if r[b] == b {
 463  			continue
 464  		}
 465  		if last != i {
 466  			wn, err := sw.WriteString(s[last:i])
 467  			n += wn
 468  			if err != nil {
 469  				return n, err
 470  			}
 471  		}
 472  		last = i + 1
 473  		nw, err := w.Write(r[b : int(b)+1])
 474  		n += nw
 475  		if err != nil {
 476  			return n, err
 477  		}
 478  	}
 479  	if last != len(s) {
 480  		nw, err := sw.WriteString(s[last:])
 481  		n += nw
 482  		if err != nil {
 483  			return n, err
 484  		}
 485  	}
 486  	return n, nil
 487  }
 488  
 489  // byteStringReplacer is the implementation that's used when all the
 490  // "old" values are single ASCII bytes but the "new" values vary in size.
 491  type byteStringReplacer struct {
 492  	// replacements contains replacement byte slices indexed by old byte.
 493  	// A nil []byte means that the old byte should not be replaced.
 494  	replacements [256][]byte
 495  	// toReplace keeps a list of bytes to replace. Depending on length of toReplace
 496  	// and length of target string it may be faster to use Count, or a plain loop.
 497  	// We store single byte as a string, because Count takes a string.
 498  	toReplace [][]byte
 499  }
 500  
 501  // countCutOff controls the ratio of a string length to a number of replacements
 502  // at which (*byteStringReplacer).Replace switches algorithms.
 503  // For strings with higher ration of length to replacements than that value,
 504  // we call Count, for each replacement from toReplace.
 505  // For strings, with a lower ratio we use simple loop, because of Count overhead.
 506  // countCutOff is an empirically determined overhead multiplier.
 507  // TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
 508  const countCutOff = 8
 509  
 510  func (r *byteStringReplacer) Replace(s []byte) []byte {
 511  	newSize := len(s)
 512  	anyChanges := false
 513  	// Is it faster to use Count?
 514  	if len(r.toReplace)*countCutOff <= len(s) {
 515  		for _, x := range r.toReplace {
 516  			if c := Count(s, x); c != 0 {
 517  				// The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
 518  				newSize += c * (len(r.replacements[x[0]]) - 1)
 519  				anyChanges = true
 520  			}
 521  
 522  		}
 523  	} else {
 524  		for i := 0; i < len(s); i++ {
 525  			b := s[i]
 526  			if r.replacements[b] != nil {
 527  				// See above for explanation of -1
 528  				newSize += len(r.replacements[b]) - 1
 529  				anyChanges = true
 530  			}
 531  		}
 532  	}
 533  	if !anyChanges {
 534  		return s
 535  	}
 536  	buf := []byte{:newSize}
 537  	j := 0
 538  	for i := 0; i < len(s); i++ {
 539  		b := s[i]
 540  		if r.replacements[b] != nil {
 541  			j += copy(buf[j:], r.replacements[b])
 542  		} else {
 543  			buf[j] = b
 544  			j++
 545  		}
 546  	}
 547  	return []byte(buf)
 548  }
 549  
 550  func (r *byteStringReplacer) WriteString(w io.Writer, s []byte) (n int, err error) {
 551  	sw := getStringWriter(w)
 552  	last := 0
 553  	for i := 0; i < len(s); i++ {
 554  		b := s[i]
 555  		if r.replacements[b] == nil {
 556  			continue
 557  		}
 558  		if last != i {
 559  			nw, err := sw.WriteString(s[last:i])
 560  			n += nw
 561  			if err != nil {
 562  				return n, err
 563  			}
 564  		}
 565  		last = i + 1
 566  		nw, err := w.Write(r.replacements[b])
 567  		n += nw
 568  		if err != nil {
 569  			return n, err
 570  		}
 571  	}
 572  	if last != len(s) {
 573  		var nw int
 574  		nw, err = sw.WriteString(s[last:])
 575  		n += nw
 576  	}
 577  	return
 578  }
 579