list.go raw

   1  // Copyright 2012 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:generate go run gen.go
   6  
   7  // Package publicsuffix provides a public suffix list based on data from
   8  // https://publicsuffix.org/
   9  //
  10  // A public suffix is one under which Internet users can directly register
  11  // names. It is related to, but different from, a TLD (top level domain).
  12  //
  13  // "com" is a TLD (top level domain). Top level means it has no dots.
  14  //
  15  // "com" is also a public suffix. Amazon and Google have registered different
  16  // siblings under that domain: "amazon.com" and "google.com".
  17  //
  18  // "au" is another TLD, again because it has no dots. But it's not "amazon.au".
  19  // Instead, it's "amazon.com.au".
  20  //
  21  // "com.au" isn't an actual TLD, because it's not at the top level (it has
  22  // dots). But it is an eTLD (effective TLD), because that's the branching point
  23  // for domain name registrars.
  24  //
  25  // Another name for "an eTLD" is "a public suffix". Often, what's more of
  26  // interest is the eTLD+1, or one more label than the public suffix. For
  27  // example, browsers partition read/write access to HTTP cookies according to
  28  // the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
  29  // "google.com.au", but web pages served from "maps.google.com" can share
  30  // cookies from "www.google.com", so you don't have to sign into Google Maps
  31  // separately from signing into Google Web Search. Note that all four of those
  32  // domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
  33  // the last two are not (but share the same eTLD+1: "google.com").
  34  //
  35  // All of these domains have the same eTLD+1:
  36  //   - "www.books.amazon.co.uk"
  37  //   - "books.amazon.co.uk"
  38  //   - "amazon.co.uk"
  39  //
  40  // Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
  41  //
  42  // There is no closed form algorithm to calculate the eTLD of a domain.
  43  // Instead, the calculation is data driven. This package provides a
  44  // pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
  45  // https://publicsuffix.org/
  46  package publicsuffix // import "golang.org/x/net/publicsuffix"
  47  
  48  // TODO: specify case sensitivity and leading/trailing dot behavior for
  49  // func PublicSuffix and func EffectiveTLDPlusOne.
  50  
  51  import (
  52  	"fmt"
  53  	"net/http/cookiejar"
  54  	"net/netip"
  55  	"strings"
  56  )
  57  
  58  // List implements the cookiejar.PublicSuffixList interface by calling the
  59  // PublicSuffix function.
  60  var List cookiejar.PublicSuffixList = list{}
  61  
  62  type list struct{}
  63  
  64  func (list) PublicSuffix(domain string) string {
  65  	ps, _ := PublicSuffix(domain)
  66  	return ps
  67  }
  68  
  69  func (list) String() string {
  70  	return version
  71  }
  72  
  73  // PublicSuffix returns the public suffix of the domain using a copy of the
  74  // publicsuffix.org database compiled into the library.
  75  //
  76  // icann is whether the public suffix is managed by the Internet Corporation
  77  // for Assigned Names and Numbers. If not, the public suffix is either a
  78  // privately managed domain (and in practice, not a top level domain) or an
  79  // unmanaged top level domain (and not explicitly mentioned in the
  80  // publicsuffix.org list). For example, "foo.org" and "foo.co.uk" are ICANN
  81  // domains, "foo.dyndns.org" is a private domain and
  82  // "cromulent" is an unmanaged top level domain.
  83  //
  84  // Use cases for distinguishing ICANN domains like "foo.com" from private
  85  // domains like "foo.appspot.com" can be found at
  86  // https://wiki.mozilla.org/Public_Suffix_List/Use_Cases
  87  func PublicSuffix(domain string) (publicSuffix string, icann bool) {
  88  	if _, err := netip.ParseAddr(domain); err == nil {
  89  		return domain, false
  90  	}
  91  
  92  	lo, hi := uint32(0), uint32(numTLD)
  93  	s, suffix, icannNode, wildcard := domain, len(domain), false, false
  94  loop:
  95  	for {
  96  		dot := strings.LastIndexByte(s, '.')
  97  		if wildcard {
  98  			icann = icannNode
  99  			suffix = 1 + dot
 100  		}
 101  		if lo == hi {
 102  			break
 103  		}
 104  		f := find(s[1+dot:], lo, hi)
 105  		if f == notFound {
 106  			break
 107  		}
 108  
 109  		u := uint32(nodes.get(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
 110  		icannNode = u&(1<<nodesBitsICANN-1) != 0
 111  		u >>= nodesBitsICANN
 112  		u = children.get(u & (1<<nodesBitsChildren - 1))
 113  		lo = u & (1<<childrenBitsLo - 1)
 114  		u >>= childrenBitsLo
 115  		hi = u & (1<<childrenBitsHi - 1)
 116  		u >>= childrenBitsHi
 117  		switch u & (1<<childrenBitsNodeType - 1) {
 118  		case nodeTypeNormal:
 119  			suffix = 1 + dot
 120  		case nodeTypeException:
 121  			suffix = 1 + len(s)
 122  			break loop
 123  		}
 124  		u >>= childrenBitsNodeType
 125  		wildcard = u&(1<<childrenBitsWildcard-1) != 0
 126  		if !wildcard {
 127  			icann = icannNode
 128  		}
 129  
 130  		if dot == -1 {
 131  			break
 132  		}
 133  		s = s[:dot]
 134  	}
 135  	if suffix == len(domain) {
 136  		// If no rules match, the prevailing rule is "*".
 137  		return domain[1+strings.LastIndexByte(domain, '.'):], icann
 138  	}
 139  	return domain[suffix:], icann
 140  }
 141  
 142  const notFound uint32 = 1<<32 - 1
 143  
 144  // find returns the index of the node in the range [lo, hi) whose label equals
 145  // label, or notFound if there is no such node. The range is assumed to be in
 146  // strictly increasing node label order.
 147  func find(label string, lo, hi uint32) uint32 {
 148  	for lo < hi {
 149  		mid := lo + (hi-lo)/2
 150  		s := nodeLabel(mid)
 151  		if s < label {
 152  			lo = mid + 1
 153  		} else if s == label {
 154  			return mid
 155  		} else {
 156  			hi = mid
 157  		}
 158  	}
 159  	return notFound
 160  }
 161  
 162  // nodeLabel returns the label for the i'th node.
 163  func nodeLabel(i uint32) string {
 164  	x := nodes.get(i)
 165  	length := x & (1<<nodesBitsTextLength - 1)
 166  	x >>= nodesBitsTextLength
 167  	offset := x & (1<<nodesBitsTextOffset - 1)
 168  	return text[offset : offset+length]
 169  }
 170  
 171  // EffectiveTLDPlusOne returns the effective top level domain plus one more
 172  // label. For example, the eTLD+1 for "foo.bar.golang.org" is "golang.org".
 173  func EffectiveTLDPlusOne(domain string) (string, error) {
 174  	if strings.HasPrefix(domain, ".") || strings.HasSuffix(domain, ".") || strings.Contains(domain, "..") {
 175  		return "", fmt.Errorf("publicsuffix: empty label in domain %q", domain)
 176  	}
 177  
 178  	suffix, _ := PublicSuffix(domain)
 179  	if len(domain) <= len(suffix) {
 180  		return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
 181  	}
 182  	i := len(domain) - len(suffix) - 1
 183  	if domain[i] != '.' {
 184  		return "", fmt.Errorf("publicsuffix: invalid public suffix %q for domain %q", suffix, domain)
 185  	}
 186  	return domain[1+strings.LastIndexByte(domain[:i], '.'):], nil
 187  }
 188  
 189  type uint32String string
 190  
 191  func (u uint32String) get(i uint32) uint32 {
 192  	off := i * 4
 193  	u = u[off:] // help the compiler reduce bounds checks
 194  	return uint32(u[3]) |
 195  		uint32(u[2])<<8 |
 196  		uint32(u[1])<<16 |
 197  		uint32(u[0])<<24
 198  }
 199  
 200  type uint40String string
 201  
 202  func (u uint40String) get(i uint32) uint64 {
 203  	off := uint64(i * (nodesBits / 8))
 204  	u = u[off:] // help the compiler reduce bounds checks
 205  	return uint64(u[4]) |
 206  		uint64(u[3])<<8 |
 207  		uint64(u[2])<<16 |
 208  		uint64(u[1])<<24 |
 209  		uint64(u[0])<<32
 210  }
 211