escape.go raw

   1  // Copyright 2010 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package html
   6  
   7  import (
   8  	"bytes"
   9  	"strings"
  10  	"unicode/utf8"
  11  )
  12  
  13  // These replacements permit compatibility with old numeric entities that
  14  // assumed Windows-1252 encoding.
  15  // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
  16  var replacementTable = [...]rune{
  17  	'\u20AC', // First entry is what 0x80 should be replaced with.
  18  	'\u0081',
  19  	'\u201A',
  20  	'\u0192',
  21  	'\u201E',
  22  	'\u2026',
  23  	'\u2020',
  24  	'\u2021',
  25  	'\u02C6',
  26  	'\u2030',
  27  	'\u0160',
  28  	'\u2039',
  29  	'\u0152',
  30  	'\u008D',
  31  	'\u017D',
  32  	'\u008F',
  33  	'\u0090',
  34  	'\u2018',
  35  	'\u2019',
  36  	'\u201C',
  37  	'\u201D',
  38  	'\u2022',
  39  	'\u2013',
  40  	'\u2014',
  41  	'\u02DC',
  42  	'\u2122',
  43  	'\u0161',
  44  	'\u203A',
  45  	'\u0153',
  46  	'\u009D',
  47  	'\u017E',
  48  	'\u0178', // Last entry is 0x9F.
  49  	// 0x00->'\uFFFD' is handled programmatically.
  50  	// 0x0D->'\u000D' is a no-op.
  51  }
  52  
  53  // unescapeEntity reads an entity like "<" from b[src:] and writes the
  54  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
  55  // Precondition: b[src] == '&' && dst <= src.
  56  // attribute should be true if parsing an attribute value.
  57  func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
  58  	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
  59  
  60  	// i starts at 1 because we already know that s[0] == '&'.
  61  	i, s := 1, b[src:]
  62  
  63  	if len(s) <= 1 {
  64  		b[dst] = b[src]
  65  		return dst + 1, src + 1
  66  	}
  67  
  68  	if s[i] == '#' {
  69  		if len(s) <= 3 { // We need to have at least "&#.".
  70  			b[dst] = b[src]
  71  			return dst + 1, src + 1
  72  		}
  73  		i++
  74  		c := s[i]
  75  		hex := false
  76  		if c == 'x' || c == 'X' {
  77  			hex = true
  78  			i++
  79  		}
  80  
  81  		x := '\x00'
  82  		for i < len(s) {
  83  			c = s[i]
  84  			i++
  85  			if hex {
  86  				if '0' <= c && c <= '9' {
  87  					x = 16*x + rune(c) - '0'
  88  					continue
  89  				} else if 'a' <= c && c <= 'f' {
  90  					x = 16*x + rune(c) - 'a' + 10
  91  					continue
  92  				} else if 'A' <= c && c <= 'F' {
  93  					x = 16*x + rune(c) - 'A' + 10
  94  					continue
  95  				}
  96  			} else if '0' <= c && c <= '9' {
  97  				x = 10*x + rune(c) - '0'
  98  				continue
  99  			}
 100  			if c != ';' {
 101  				i--
 102  			}
 103  			break
 104  		}
 105  
 106  		if i <= 3 { // No characters matched.
 107  			b[dst] = b[src]
 108  			return dst + 1, src + 1
 109  		}
 110  
 111  		if 0x80 <= x && x <= 0x9F {
 112  			// Replace characters from Windows-1252 with UTF-8 equivalents.
 113  			x = replacementTable[x-0x80]
 114  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 115  			// Replace invalid characters with the replacement character.
 116  			x = '\uFFFD'
 117  		}
 118  
 119  		return dst + utf8.EncodeRune(b[dst:], x), src + i
 120  	}
 121  
 122  	// Consume the maximum number of characters possible, with the
 123  	// consumed characters matching one of the named references.
 124  
 125  	for i < len(s) {
 126  		c := s[i]
 127  		i++
 128  		// Lower-cased characters are more common in entities, so we check for them first.
 129  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
 130  			continue
 131  		}
 132  		if c != ';' {
 133  			i--
 134  		}
 135  		break
 136  	}
 137  
 138  	entityName := string(s[1:i])
 139  	if entityName == "" {
 140  		// No-op.
 141  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 142  		// No-op.
 143  	} else if x := entity[entityName]; x != 0 {
 144  		return dst + utf8.EncodeRune(b[dst:], x), src + i
 145  	} else if x := entity2[entityName]; x[0] != 0 {
 146  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 147  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 148  	} else if !attribute {
 149  		maxLen := len(entityName) - 1
 150  		if maxLen > longestEntityWithoutSemicolon {
 151  			maxLen = longestEntityWithoutSemicolon
 152  		}
 153  		for j := maxLen; j > 1; j-- {
 154  			if x := entity[entityName[:j]]; x != 0 {
 155  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
 156  			}
 157  		}
 158  	}
 159  
 160  	dst1, src1 = dst+i, src+i
 161  	copy(b[dst:dst1], b[src:src1])
 162  	return dst1, src1
 163  }
 164  
 165  // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 166  // attribute should be true if parsing an attribute value.
 167  func unescape(b []byte, attribute bool) []byte {
 168  	for i, c := range b {
 169  		if c == '&' {
 170  			dst, src := unescapeEntity(b, i, i, attribute)
 171  			for src < len(b) {
 172  				c := b[src]
 173  				if c == '&' {
 174  					dst, src = unescapeEntity(b, dst, src, attribute)
 175  				} else {
 176  					b[dst] = c
 177  					dst, src = dst+1, src+1
 178  				}
 179  			}
 180  			return b[0:dst]
 181  		}
 182  	}
 183  	return b
 184  }
 185  
 186  // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
 187  func lower(b []byte) []byte {
 188  	for i, c := range b {
 189  		if 'A' <= c && c <= 'Z' {
 190  			b[i] = c + 'a' - 'A'
 191  		}
 192  	}
 193  	return b
 194  }
 195  
 196  // escapeComment is like func escape but escapes its input bytes less often.
 197  // Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
 198  // meaningful and (2) contain angle brackets that we'd like to avoid escaping
 199  // unless we have to.
 200  //
 201  // "We have to" includes the '&' byte, since that introduces other escapes.
 202  //
 203  // It also includes those bytes (not including EOF) that would otherwise end
 204  // the comment. Per the summary table at the bottom of comment_test.go, this is
 205  // the '>' byte that, per above, we'd like to avoid escaping unless we have to.
 206  //
 207  // Studying the summary table (and T actions in its '>' column) closely, we
 208  // only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
 209  // start of the comment data. State 52 is after a '!'. The other three states
 210  // are after a '-'.
 211  //
 212  // Our algorithm is thus to escape every '&' and to escape '>' if and only if:
 213  //   - The '>' is after a '!' or '-' (in the unescaped data) or
 214  //   - The '>' is at the start of the comment data (after the opening "<!--").
 215  func escapeComment(w writer, s string) error {
 216  	// When modifying this function, consider manually increasing the
 217  	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
 218  	// That increase should only be temporary, not committed, as it
 219  	// exponentially affects the test running time.
 220  
 221  	if len(s) == 0 {
 222  		return nil
 223  	}
 224  
 225  	// Loop:
 226  	//   - Grow j such that s[i:j] does not need escaping.
 227  	//   - If s[j] does need escaping, output s[i:j] and an escaped s[j],
 228  	//     resetting i and j to point past that s[j] byte.
 229  	i := 0
 230  	for j := 0; j < len(s); j++ {
 231  		escaped := ""
 232  		switch s[j] {
 233  		case '&':
 234  			escaped = "&amp;"
 235  
 236  		case '>':
 237  			if j > 0 {
 238  				if prev := s[j-1]; (prev != '!') && (prev != '-') {
 239  					continue
 240  				}
 241  			}
 242  			escaped = "&gt;"
 243  
 244  		default:
 245  			continue
 246  		}
 247  
 248  		if i < j {
 249  			if _, err := w.WriteString(s[i:j]); err != nil {
 250  				return err
 251  			}
 252  		}
 253  		if _, err := w.WriteString(escaped); err != nil {
 254  			return err
 255  		}
 256  		i = j + 1
 257  	}
 258  
 259  	if i < len(s) {
 260  		if _, err := w.WriteString(s[i:]); err != nil {
 261  			return err
 262  		}
 263  	}
 264  	return nil
 265  }
 266  
 267  // escapeCommentString is to EscapeString as escapeComment is to escape.
 268  func escapeCommentString(s string) string {
 269  	if strings.IndexAny(s, "&>") == -1 {
 270  		return s
 271  	}
 272  	var buf bytes.Buffer
 273  	escapeComment(&buf, s)
 274  	return buf.String()
 275  }
 276  
 277  const escapedChars = "&'<>\"\r"
 278  
 279  func escape(w writer, s string) error {
 280  	i := strings.IndexAny(s, escapedChars)
 281  	for i != -1 {
 282  		if _, err := w.WriteString(s[:i]); err != nil {
 283  			return err
 284  		}
 285  		var esc string
 286  		switch s[i] {
 287  		case '&':
 288  			esc = "&amp;"
 289  		case '\'':
 290  			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
 291  			esc = "&#39;"
 292  		case '<':
 293  			esc = "&lt;"
 294  		case '>':
 295  			esc = "&gt;"
 296  		case '"':
 297  			// "&#34;" is shorter than "&quot;".
 298  			esc = "&#34;"
 299  		case '\r':
 300  			esc = "&#13;"
 301  		default:
 302  			panic("html: unrecognized escape character")
 303  		}
 304  		s = s[i+1:]
 305  		if _, err := w.WriteString(esc); err != nil {
 306  			return err
 307  		}
 308  		i = strings.IndexAny(s, escapedChars)
 309  	}
 310  	_, err := w.WriteString(s)
 311  	return err
 312  }
 313  
 314  // EscapeString escapes special characters like "<" to become "&lt;". It
 315  // escapes only five such characters: <, >, &, ' and ".
 316  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 317  // always true.
 318  func EscapeString(s string) string {
 319  	if strings.IndexAny(s, escapedChars) == -1 {
 320  		return s
 321  	}
 322  	var buf bytes.Buffer
 323  	escape(&buf, s)
 324  	return buf.String()
 325  }
 326  
 327  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 328  // larger range of entities than EscapeString escapes. For example, "&aacute;"
 329  // unescapes to "รก", as does "&#225;" and "&xE1;".
 330  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 331  // always true.
 332  func UnescapeString(s string) string {
 333  	for _, c := range s {
 334  		if c == '&' {
 335  			return string(unescape([]byte(s), false))
 336  		}
 337  	}
 338  	return s
 339  }
 340