runes.go raw

   1  // Copyright 2014 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package runes provides transforms for UTF-8 encoded text.
   6  package runes // import "golang.org/x/text/runes"
   7  
   8  import (
   9  	"unicode"
  10  	"unicode/utf8"
  11  
  12  	"golang.org/x/text/transform"
  13  )
  14  
// A Set is a collection of runes.
//
// Membership testing is the only operation a Set exposes.
type Set interface {
	// Contains reports whether r is a member of the set.
	Contains(r rune) bool
}
  20  
  21  type setFunc func(rune) bool
  22  
  23  func (s setFunc) Contains(r rune) bool {
  24  	return s(r)
  25  }
  26  
  27  // Note: using funcs here instead of wrapping types results in cleaner
  28  // documentation and a smaller API.
  29  
  30  // In creates a Set with a Contains method that returns true for all runes in
  31  // the given RangeTable.
  32  func In(rt *unicode.RangeTable) Set {
  33  	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
  34  }
  35  
  36  // NotIn creates a Set with a Contains method that returns true for all runes not
  37  // in the given RangeTable.
  38  func NotIn(rt *unicode.RangeTable) Set {
  39  	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
  40  }
  41  
  42  // Predicate creates a Set with a Contains method that returns f(r).
  43  func Predicate(f func(rune) bool) Set {
  44  	return setFunc(f)
  45  }
  46  
// Transformer implements the transform.Transformer interface.
//
// It wraps a transform.SpanningTransformer; its Transform, Span and Reset
// methods forward to that wrapped value.
type Transformer struct {
	// t is the wrapped transformer that the methods delegate to.
	t transform.SpanningTransformer
}
  51  
  52  func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  53  	return t.t.Transform(dst, src, atEOF)
  54  }
  55  
  56  func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
  57  	return t.t.Span(b, atEOF)
  58  }
  59  
  60  func (t Transformer) Reset() { t.t.Reset() }
  61  
  62  // Bytes returns a new byte slice with the result of converting b using t.  It
  63  // calls Reset on t. It returns nil if any error was found. This can only happen
  64  // if an error-producing Transformer is passed to If.
  65  func (t Transformer) Bytes(b []byte) []byte {
  66  	b, _, err := transform.Bytes(t, b)
  67  	if err != nil {
  68  		return nil
  69  	}
  70  	return b
  71  }
  72  
  73  // String returns a string with the result of converting s using t. It calls
  74  // Reset on t. It returns the empty string if any error was found. This can only
  75  // happen if an error-producing Transformer is passed to If.
  76  func (t Transformer) String(s string) string {
  77  	s, _, err := transform.String(t, s)
  78  	if err != nil {
  79  		return ""
  80  	}
  81  	return s
  82  }
  83  
  84  // TODO:
  85  // - Copy: copying strings and bytes in whole-rune units.
  86  // - Validation (maybe)
  87  // - Well-formed-ness (maybe)
  88  
// runeErrorString is utf8.RuneError converted to a string, i.e. its UTF-8
// encoding. The Transform methods in this file write its first three bytes
// to the destination when they emit a replacement character.
const runeErrorString = string(utf8.RuneError)
  90  
  91  // Remove returns a Transformer that removes runes r for which s.Contains(r).
  92  // Illegal input bytes are replaced by RuneError before being passed to f.
  93  func Remove(s Set) Transformer {
  94  	if f, ok := s.(setFunc); ok {
  95  		// This little trick cuts the running time of BenchmarkRemove for sets
  96  		// created by Predicate roughly in half.
  97  		// TODO: special-case RangeTables as well.
  98  		return Transformer{remove(f)}
  99  	}
 100  	return Transformer{remove(s.Contains)}
 101  }
 102  
 103  // TODO: remove transform.RemoveFunc.
 104  
// remove is the transformer behind Remove. The func value is the removal
// predicate: runes for which it returns true are dropped from the output.
type remove func(r rune) bool

// Reset implements transform.Transformer. It does nothing.
func (remove) Reset() {}
 108  
 109  // Span implements transform.Spanner.
 110  func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
 111  	for r, size := rune(0), 0; n < len(src); {
 112  		if r = rune(src[n]); r < utf8.RuneSelf {
 113  			size = 1
 114  		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
 115  			// Invalid rune.
 116  			if !atEOF && !utf8.FullRune(src[n:]) {
 117  				err = transform.ErrShortSrc
 118  			} else {
 119  				err = transform.ErrEndOfSpan
 120  			}
 121  			break
 122  		}
 123  		if t(r) {
 124  			err = transform.ErrEndOfSpan
 125  			break
 126  		}
 127  		n += size
 128  	}
 129  	return
 130  }
 131  
 132  // Transform implements transform.Transformer.
 133  func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 134  	for r, size := rune(0), 0; nSrc < len(src); {
 135  		if r = rune(src[nSrc]); r < utf8.RuneSelf {
 136  			size = 1
 137  		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
 138  			// Invalid rune.
 139  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
 140  				err = transform.ErrShortSrc
 141  				break
 142  			}
 143  			// We replace illegal bytes with RuneError. Not doing so might
 144  			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
 145  			// The resulting byte sequence may subsequently contain runes
 146  			// for which t(r) is true that were passed unnoticed.
 147  			if !t(utf8.RuneError) {
 148  				if nDst+3 > len(dst) {
 149  					err = transform.ErrShortDst
 150  					break
 151  				}
 152  				dst[nDst+0] = runeErrorString[0]
 153  				dst[nDst+1] = runeErrorString[1]
 154  				dst[nDst+2] = runeErrorString[2]
 155  				nDst += 3
 156  			}
 157  			nSrc++
 158  			continue
 159  		}
 160  		if t(r) {
 161  			nSrc += size
 162  			continue
 163  		}
 164  		if nDst+size > len(dst) {
 165  			err = transform.ErrShortDst
 166  			break
 167  		}
 168  		for i := 0; i < size; i++ {
 169  			dst[nDst] = src[nSrc]
 170  			nDst++
 171  			nSrc++
 172  		}
 173  	}
 174  	return
 175  }
 176  
 177  // Map returns a Transformer that maps the runes in the input using the given
 178  // mapping. Illegal bytes in the input are converted to utf8.RuneError before
 179  // being passed to the mapping func.
 180  func Map(mapping func(rune) rune) Transformer {
 181  	return Transformer{mapper(mapping)}
 182  }
 183  
// mapper is the transformer behind Map. The func value is the mapping that
// is applied to every rune of the input.
type mapper func(rune) rune

// Reset implements transform.Transformer. It does nothing.
func (mapper) Reset() {}
 187  
 188  // Span implements transform.Spanner.
 189  func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
 190  	for r, size := rune(0), 0; n < len(src); n += size {
 191  		if r = rune(src[n]); r < utf8.RuneSelf {
 192  			size = 1
 193  		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
 194  			// Invalid rune.
 195  			if !atEOF && !utf8.FullRune(src[n:]) {
 196  				err = transform.ErrShortSrc
 197  			} else {
 198  				err = transform.ErrEndOfSpan
 199  			}
 200  			break
 201  		}
 202  		if t(r) != r {
 203  			err = transform.ErrEndOfSpan
 204  			break
 205  		}
 206  	}
 207  	return n, err
 208  }
 209  
 210  // Transform implements transform.Transformer.
 211  func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 212  	var replacement rune
 213  	var b [utf8.UTFMax]byte
 214  
 215  	for r, size := rune(0), 0; nSrc < len(src); {
 216  		if r = rune(src[nSrc]); r < utf8.RuneSelf {
 217  			if replacement = t(r); replacement < utf8.RuneSelf {
 218  				if nDst == len(dst) {
 219  					err = transform.ErrShortDst
 220  					break
 221  				}
 222  				dst[nDst] = byte(replacement)
 223  				nDst++
 224  				nSrc++
 225  				continue
 226  			}
 227  			size = 1
 228  		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
 229  			// Invalid rune.
 230  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
 231  				err = transform.ErrShortSrc
 232  				break
 233  			}
 234  
 235  			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
 236  				if nDst+3 > len(dst) {
 237  					err = transform.ErrShortDst
 238  					break
 239  				}
 240  				dst[nDst+0] = runeErrorString[0]
 241  				dst[nDst+1] = runeErrorString[1]
 242  				dst[nDst+2] = runeErrorString[2]
 243  				nDst += 3
 244  				nSrc++
 245  				continue
 246  			}
 247  		} else if replacement = t(r); replacement == r {
 248  			if nDst+size > len(dst) {
 249  				err = transform.ErrShortDst
 250  				break
 251  			}
 252  			for i := 0; i < size; i++ {
 253  				dst[nDst] = src[nSrc]
 254  				nDst++
 255  				nSrc++
 256  			}
 257  			continue
 258  		}
 259  
 260  		n := utf8.EncodeRune(b[:], replacement)
 261  
 262  		if nDst+n > len(dst) {
 263  			err = transform.ErrShortDst
 264  			break
 265  		}
 266  		for i := 0; i < n; i++ {
 267  			dst[nDst] = b[i]
 268  			nDst++
 269  		}
 270  		nSrc += size
 271  	}
 272  	return
 273  }
 274  
 275  // ReplaceIllFormed returns a transformer that replaces all input bytes that are
 276  // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
 277  func ReplaceIllFormed() Transformer {
 278  	return Transformer{&replaceIllFormed{}}
 279  }
 280  
// replaceIllFormed is the transformer behind ReplaceIllFormed. It has no
// fields of its own; the embedded transform.NopResetter supplies Reset.
type replaceIllFormed struct{ transform.NopResetter }
 282  
 283  func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
 284  	for n < len(src) {
 285  		// ASCII fast path.
 286  		if src[n] < utf8.RuneSelf {
 287  			n++
 288  			continue
 289  		}
 290  
 291  		r, size := utf8.DecodeRune(src[n:])
 292  
 293  		// Look for a valid non-ASCII rune.
 294  		if r != utf8.RuneError || size != 1 {
 295  			n += size
 296  			continue
 297  		}
 298  
 299  		// Look for short source data.
 300  		if !atEOF && !utf8.FullRune(src[n:]) {
 301  			err = transform.ErrShortSrc
 302  			break
 303  		}
 304  
 305  		// We have an invalid rune.
 306  		err = transform.ErrEndOfSpan
 307  		break
 308  	}
 309  	return n, err
 310  }
 311  
 312  func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 313  	for nSrc < len(src) {
 314  		// ASCII fast path.
 315  		if r := src[nSrc]; r < utf8.RuneSelf {
 316  			if nDst == len(dst) {
 317  				err = transform.ErrShortDst
 318  				break
 319  			}
 320  			dst[nDst] = r
 321  			nDst++
 322  			nSrc++
 323  			continue
 324  		}
 325  
 326  		// Look for a valid non-ASCII rune.
 327  		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
 328  			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
 329  				err = transform.ErrShortDst
 330  				break
 331  			}
 332  			nDst += size
 333  			nSrc += size
 334  			continue
 335  		}
 336  
 337  		// Look for short source data.
 338  		if !atEOF && !utf8.FullRune(src[nSrc:]) {
 339  			err = transform.ErrShortSrc
 340  			break
 341  		}
 342  
 343  		// We have an invalid rune.
 344  		if nDst+3 > len(dst) {
 345  			err = transform.ErrShortDst
 346  			break
 347  		}
 348  		dst[nDst+0] = runeErrorString[0]
 349  		dst[nDst+1] = runeErrorString[1]
 350  		dst[nDst+2] = runeErrorString[2]
 351  		nDst += 3
 352  		nSrc++
 353  	}
 354  	return nDst, nSrc, err
 355  }
 356