sniff.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package http
   6  
   7  import (
   8  	"bytes"
   9  	"encoding/binary"
  10  )
  11  
  12  // The algorithm uses at most sniffLen bytes to make its decision.
  13  const sniffLen = 512
  14  
  15  // DetectContentType implements the algorithm described
  16  // at https://mimesniff.spec.whatwg.org/ to determine the
  17  // Content-Type of the given data. It considers at most the
  18  // first 512 bytes of data. DetectContentType always returns
  19  // a valid MIME type: if it cannot determine a more specific one, it
  20  // returns "application/octet-stream".
  21  func DetectContentType(data []byte) []byte {
  22  	if len(data) > sniffLen {
  23  		data = data[:sniffLen]
  24  	}
  25  
  26  	// Index of the first non-whitespace byte in data.
  27  	firstNonWS := 0
  28  	for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
  29  	}
  30  
  31  	for _, sig := range sniffSignatures {
  32  		if ct := sig.match(data, firstNonWS); ct != "" {
  33  			return ct
  34  		}
  35  	}
  36  
  37  	return "application/octet-stream" // fallback
  38  }
  39  
  40  // isWS reports whether the provided byte is a whitespace byte (0xWS)
  41  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
  42  func isWS(b byte) bool {
  43  	switch b {
  44  	case '\t', '\n', '\x0c', '\r', ' ':
  45  		return true
  46  	}
  47  	return false
  48  }
  49  
  50  // isTT reports whether the provided byte is a tag-terminating byte (0xTT)
  51  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
  52  func isTT(b byte) bool {
  53  	switch b {
  54  	case ' ', '>':
  55  		return true
  56  	}
  57  	return false
  58  }
  59  
  60  type sniffSig interface {
  61  	// match returns the MIME type of the data, or "" if unknown.
  62  	match(data []byte, firstNonWS int) []byte
  63  }
  64  
  65  // Data matching the table in section 6.
  66  var sniffSignatures = []sniffSig{
  67  	htmlSig("<!DOCTYPE HTML"),
  68  	htmlSig("<HTML"),
  69  	htmlSig("<HEAD"),
  70  	htmlSig("<SCRIPT"),
  71  	htmlSig("<IFRAME"),
  72  	htmlSig("<H1"),
  73  	htmlSig("<DIV"),
  74  	htmlSig("<FONT"),
  75  	htmlSig("<TABLE"),
  76  	htmlSig("<A"),
  77  	htmlSig("<STYLE"),
  78  	htmlSig("<TITLE"),
  79  	htmlSig("<B"),
  80  	htmlSig("<BODY"),
  81  	htmlSig("<BR"),
  82  	htmlSig("<P"),
  83  	htmlSig("<!--"),
  84  	&maskedSig{
  85  		mask:   []byte("\xFF\xFF\xFF\xFF\xFF"),
  86  		pat:    []byte("<?xml"),
  87  		skipWS: true,
  88  		ct:     "text/xml; charset=utf-8"},
  89  	&exactSig{[]byte("%PDF-"), "application/pdf"},
  90  	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},
  91  
  92  	// UTF BOMs.
  93  	&maskedSig{
  94  		mask: []byte("\xFF\xFF\x00\x00"),
  95  		pat:  []byte("\xFE\xFF\x00\x00"),
  96  		ct:   "text/plain; charset=utf-16be",
  97  	},
  98  	&maskedSig{
  99  		mask: []byte("\xFF\xFF\x00\x00"),
 100  		pat:  []byte("\xFF\xFE\x00\x00"),
 101  		ct:   "text/plain; charset=utf-16le",
 102  	},
 103  	&maskedSig{
 104  		mask: []byte("\xFF\xFF\xFF\x00"),
 105  		pat:  []byte("\xEF\xBB\xBF\x00"),
 106  		ct:   "text/plain; charset=utf-8",
 107  	},
 108  
 109  	// Image types
 110  	// For posterity, we originally returned "image/vnd.microsoft.icon" from
 111  	// https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7
 112  	// https://codereview.appspot.com/4746042
 113  	// but that has since been replaced with "image/x-icon" in Section 6.2
 114  	// of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
 115  	&exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"},
 116  	&exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"},
 117  	&exactSig{[]byte("BM"), "image/bmp"},
 118  	&exactSig{[]byte("GIF87a"), "image/gif"},
 119  	&exactSig{[]byte("GIF89a"), "image/gif"},
 120  	&maskedSig{
 121  		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
 122  		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
 123  		ct:   "image/webp",
 124  	},
 125  	&exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"},
 126  	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},
 127  
 128  	// Audio and Video types
 129  	// Enforce the pattern match ordering as prescribed in
 130  	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
 131  	&maskedSig{
 132  		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
 133  		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
 134  		ct:   "audio/aiff",
 135  	},
 136  	&maskedSig{
 137  		mask: []byte("\xFF\xFF\xFF"),
 138  		pat:  []byte("ID3"),
 139  		ct:   "audio/mpeg",
 140  	},
 141  	&maskedSig{
 142  		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
 143  		pat:  []byte("OggS\x00"),
 144  		ct:   "application/ogg",
 145  	},
 146  	&maskedSig{
 147  		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
 148  		pat:  []byte("MThd\x00\x00\x00\x06"),
 149  		ct:   "audio/midi",
 150  	},
 151  	&maskedSig{
 152  		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
 153  		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
 154  		ct:   "video/avi",
 155  	},
 156  	&maskedSig{
 157  		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
 158  		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
 159  		ct:   "audio/wave",
 160  	},
 161  	// 6.2.0.2. video/mp4
 162  	mp4Sig{},
 163  	// 6.2.0.3. video/webm
 164  	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},
 165  
 166  	// Font types
 167  	&maskedSig{
 168  		// 34 NULL bytes followed by the string "LP"
 169  		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"),
 170  		// 34 NULL bytes followed by \xF\xF
 171  		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
 172  		ct:   "application/vnd.ms-fontobject",
 173  	},
 174  	&exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"},
 175  	&exactSig{[]byte("OTTO"), "font/otf"},
 176  	&exactSig{[]byte("ttcf"), "font/collection"},
 177  	&exactSig{[]byte("wOFF"), "font/woff"},
 178  	&exactSig{[]byte("wOF2"), "font/woff2"},
 179  
 180  	// Archive types
 181  	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
 182  	&exactSig{[]byte("PK\x03\x04"), "application/zip"},
 183  	// RAR's signatures are incorrectly defined by the MIME spec as per
 184  	//    https://github.com/whatwg/mimesniff/issues/63
 185  	// However, RAR Labs correctly defines it at:
 186  	//    https://www.rarlab.com/technote.htm#rarsign
 187  	// so we use the definition from RAR Labs.
 188  	// TODO: do whatever the spec ends up doing.
 189  	&exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"},     // RAR v1.5-v4.0
 190  	&exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+
 191  
 192  	&exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"},
 193  
 194  	textSig{}, // should be last
 195  }
 196  
 197  type exactSig struct {
 198  	sig []byte
 199  	ct  []byte
 200  }
 201  
 202  func (e *exactSig) match(data []byte, firstNonWS int) []byte {
 203  	if bytes.HasPrefix(data, e.sig) {
 204  		return e.ct
 205  	}
 206  	return ""
 207  }
 208  
 209  type maskedSig struct {
 210  	mask, pat []byte
 211  	skipWS    bool
 212  	ct        []byte
 213  }
 214  
 215  func (m *maskedSig) match(data []byte, firstNonWS int) []byte {
 216  	// pattern matching algorithm section 6
 217  	// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
 218  
 219  	if m.skipWS {
 220  		data = data[firstNonWS:]
 221  	}
 222  	if len(m.pat) != len(m.mask) {
 223  		return ""
 224  	}
 225  	if len(data) < len(m.pat) {
 226  		return ""
 227  	}
 228  	for i, pb := range m.pat {
 229  		maskedData := data[i] & m.mask[i]
 230  		if maskedData != pb {
 231  			return ""
 232  		}
 233  	}
 234  	return m.ct
 235  }
 236  
 237  type htmlSig []byte
 238  
 239  func (h htmlSig) match(data []byte, firstNonWS int) []byte {
 240  	data = data[firstNonWS:]
 241  	if len(data) < len(h)+1 {
 242  		return ""
 243  	}
 244  	for i, b := range h {
 245  		db := data[i]
 246  		if 'A' <= b && b <= 'Z' {
 247  			db &= 0xDF
 248  		}
 249  		if b != db {
 250  			return ""
 251  		}
 252  	}
 253  	// Next byte must be a tag-terminating byte(0xTT).
 254  	if !isTT(data[len(h)]) {
 255  		return ""
 256  	}
 257  	return "text/html; charset=utf-8"
 258  }
 259  
 260  var mp4ftype = []byte("ftyp")
 261  var mp4 = []byte("mp4")
 262  
 263  type mp4Sig struct{}
 264  
 265  func (mp4Sig) match(data []byte, firstNonWS int) []byte {
 266  	// https://mimesniff.spec.whatwg.org/#signature-for-mp4
 267  	// c.f. section 6.2.1
 268  	if len(data) < 12 {
 269  		return ""
 270  	}
 271  	boxSize := int(binary.BigEndian.Uint32(data[:4]))
 272  	if len(data) < boxSize || boxSize%4 != 0 {
 273  		return ""
 274  	}
 275  	if !bytes.Equal(data[4:8], mp4ftype) {
 276  		return ""
 277  	}
 278  	for st := 8; st < boxSize; st += 4 {
 279  		if st == 12 {
 280  			// Ignores the four bytes that correspond to the version number of the "major brand".
 281  			continue
 282  		}
 283  		if bytes.Equal(data[st:st+3], mp4) {
 284  			return "video/mp4"
 285  		}
 286  	}
 287  	return ""
 288  }
 289  
 290  type textSig struct{}
 291  
 292  func (textSig) match(data []byte, firstNonWS int) []byte {
 293  	// c.f. section 5, step 4.
 294  	for _, b := range data[firstNonWS:] {
 295  		switch {
 296  		case b <= 0x08,
 297  			b == 0x0B,
 298  			0x0E <= b && b <= 0x1A,
 299  			0x1C <= b && b <= 0x1F:
 300  			return ""
 301  		}
 302  	}
 303  	return "text/plain; charset=utf-8"
 304  }
 305