cache.go raw

   1  // Copyright 2017 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package cache implements a build artifact cache.
   6  //
   7  // This package is a slightly modified fork of Go's
   8  // cmd/go/internal/cache package.
   9  package cache
  10  
  11  import (
  12  	"bytes"
  13  	"crypto/sha256"
  14  	"encoding/hex"
  15  	"errors"
  16  	"fmt"
  17  	"io"
  18  	"os"
  19  	"path/filepath"
  20  	"strconv"
  21  	"strings"
  22  	"time"
  23  
  24  	"honnef.co/go/tools/internal/renameio"
  25  )
  26  
  27  // An ActionID is a cache action key, the hash of a complete description of a
  28  // repeatable computation (command line, environment variables,
  29  // input file contents, executable contents).
  30  type ActionID [HashSize]byte
  31  
  32  // An OutputID is a cache output key, the hash of an output of a computation.
  33  type OutputID [HashSize]byte
  34  
  35  // A Cache is a package cache, backed by a file system directory tree.
  36  type Cache struct {
  37  	dir  string
  38  	now  func() time.Time
  39  	salt []byte
  40  }
  41  
  42  // Open opens and returns the cache in the given directory.
  43  //
  44  // It is safe for multiple processes on a single machine to use the
  45  // same cache directory in a local file system simultaneously.
  46  // They will coordinate using operating system file locks and may
  47  // duplicate effort but will not corrupt the cache.
  48  //
  49  // However, it is NOT safe for multiple processes on different machines
  50  // to share a cache directory (for example, if the directory were stored
  51  // in a network file system). File locking is notoriously unreliable in
  52  // network file systems and may not suffice to protect the cache.
  53  func Open(dir string) (*Cache, error) {
  54  	info, err := os.Stat(dir)
  55  	if err != nil {
  56  		return nil, err
  57  	}
  58  	if !info.IsDir() {
  59  		return nil, &os.PathError{Op: "open", Path: dir, Err: fmt.Errorf("not a directory")}
  60  	}
  61  	for i := 0; i < 256; i++ {
  62  		name := filepath.Join(dir, fmt.Sprintf("%02x", i))
  63  		if err := os.MkdirAll(name, 0777); err != nil {
  64  			return nil, err
  65  		}
  66  	}
  67  	c := &Cache{
  68  		dir: dir,
  69  		now: time.Now,
  70  	}
  71  	return c, nil
  72  }
  73  
  74  func (c *Cache) SetSalt(b []byte) {
  75  	c.salt = b
  76  }
  77  
  78  // fileName returns the name of the file corresponding to the given id.
  79  func (c *Cache) fileName(id [HashSize]byte, key string) string {
  80  	return filepath.Join(c.dir, fmt.Sprintf("%02x", id[0]), fmt.Sprintf("%x", id)+"-"+key)
  81  }
  82  
  83  // An entryNotFoundError indicates that a cache entry was not found, with an
  84  // optional underlying reason.
  85  type entryNotFoundError struct {
  86  	Err error
  87  }
  88  
  89  func (e *entryNotFoundError) Error() string {
  90  	if e.Err == nil {
  91  		return "cache entry not found"
  92  	}
  93  	return fmt.Sprintf("cache entry not found: %v", e.Err)
  94  }
  95  
  96  func (e *entryNotFoundError) Unwrap() error {
  97  	return e.Err
  98  }
  99  
 100  const (
 101  	// action entry file is "v1 <hex id> <hex out> <decimal size space-padded to 20 bytes> <unixnano space-padded to 20 bytes>\n"
 102  	hexSize   = HashSize * 2
 103  	entrySize = 2 + 1 + hexSize + 1 + hexSize + 1 + 20 + 1 + 20 + 1
 104  )
 105  
 106  // verify controls whether to run the cache in verify mode.
 107  // In verify mode, the cache always returns errMissing from Get
 108  // but then double-checks in Put that the data being written
 109  // exactly matches any existing entry. This provides an easy
 110  // way to detect program behavior that would have been different
 111  // had the cache entry been returned from Get.
 112  //
 113  // verify is enabled by setting the environment variable
 114  // GODEBUG=gocacheverify=1.
 115  var verify = false
 116  
 117  var errVerifyMode = errors.New("gocacheverify=1")
 118  
 119  // DebugTest is set when GODEBUG=gocachetest=1 is in the environment.
 120  var DebugTest = false
 121  
 122  func init() { initEnv() }
 123  
 124  func initEnv() {
 125  	verify = false
 126  	debugHash = false
 127  	debug := strings.Split(os.Getenv("GODEBUG"), ",")
 128  	for _, f := range debug {
 129  		if f == "gocacheverify=1" {
 130  			verify = true
 131  		}
 132  		if f == "gocachehash=1" {
 133  			debugHash = true
 134  		}
 135  		if f == "gocachetest=1" {
 136  			DebugTest = true
 137  		}
 138  	}
 139  }
 140  
 141  // Get looks up the action ID in the cache,
 142  // returning the corresponding output ID and file size, if any.
 143  // Note that finding an output ID does not guarantee that the
 144  // saved file for that output ID is still available.
 145  func (c *Cache) Get(id ActionID) (Entry, error) {
 146  	if verify {
 147  		return Entry{}, &entryNotFoundError{Err: errVerifyMode}
 148  	}
 149  	return c.get(id)
 150  }
 151  
 152  type Entry struct {
 153  	OutputID OutputID
 154  	Size     int64
 155  	Time     time.Time
 156  }
 157  
 158  // get is Get but does not respect verify mode, so that Put can use it.
 159  func (c *Cache) get(id ActionID) (Entry, error) {
 160  	missing := func(reason error) (Entry, error) {
 161  		return Entry{}, &entryNotFoundError{Err: reason}
 162  	}
 163  	f, err := os.Open(c.fileName(id, "a"))
 164  	if err != nil {
 165  		return missing(err)
 166  	}
 167  	defer f.Close()
 168  	entry := make([]byte, entrySize+1) // +1 to detect whether f is too long
 169  	if n, err := io.ReadFull(f, entry); n > entrySize {
 170  		return missing(errors.New("too long"))
 171  	} else if err != io.ErrUnexpectedEOF {
 172  		if err == io.EOF {
 173  			return missing(errors.New("file is empty"))
 174  		}
 175  		return missing(err)
 176  	} else if n < entrySize {
 177  		return missing(errors.New("entry file incomplete"))
 178  	}
 179  	if entry[0] != 'v' || entry[1] != '1' || entry[2] != ' ' || entry[3+hexSize] != ' ' || entry[3+hexSize+1+hexSize] != ' ' || entry[3+hexSize+1+hexSize+1+20] != ' ' || entry[entrySize-1] != '\n' {
 180  		return missing(errors.New("invalid header"))
 181  	}
 182  	eid, entry := entry[3:3+hexSize], entry[3+hexSize:]
 183  	eout, entry := entry[1:1+hexSize], entry[1+hexSize:]
 184  	esize, entry := entry[1:1+20], entry[1+20:]
 185  	//lint:ignore SA4006 See https://github.com/dominikh/go-tools/issues/465
 186  	etime, entry := entry[1:1+20], entry[1+20:]
 187  	var buf [HashSize]byte
 188  	if _, err := hex.Decode(buf[:], eid); err != nil {
 189  		return missing(fmt.Errorf("decoding ID: %v", err))
 190  	} else if buf != id {
 191  		return missing(errors.New("mismatched ID"))
 192  	}
 193  	if _, err := hex.Decode(buf[:], eout); err != nil {
 194  		return missing(fmt.Errorf("decoding output ID: %v", err))
 195  	}
 196  	i := 0
 197  	for i < len(esize) && esize[i] == ' ' {
 198  		i++
 199  	}
 200  	size, err := strconv.ParseInt(string(esize[i:]), 10, 64)
 201  	if err != nil {
 202  		return missing(fmt.Errorf("parsing size: %v", err))
 203  	} else if size < 0 {
 204  		return missing(errors.New("negative size"))
 205  	}
 206  	i = 0
 207  	for i < len(etime) && etime[i] == ' ' {
 208  		i++
 209  	}
 210  	tm, err := strconv.ParseInt(string(etime[i:]), 10, 64)
 211  	if err != nil {
 212  		return missing(fmt.Errorf("parsing timestamp: %v", err))
 213  	} else if tm < 0 {
 214  		return missing(errors.New("negative timestamp"))
 215  	}
 216  
 217  	c.used(c.fileName(id, "a"))
 218  
 219  	return Entry{buf, size, time.Unix(0, tm)}, nil
 220  }
 221  
 222  // GetFile looks up the action ID in the cache and returns
 223  // the name of the corresponding data file.
 224  func (c *Cache) GetFile(id ActionID) (file string, entry Entry, err error) {
 225  	entry, err = c.Get(id)
 226  	if err != nil {
 227  		return "", Entry{}, err
 228  	}
 229  	file = c.OutputFile(entry.OutputID)
 230  	info, err := os.Stat(file)
 231  	if err != nil {
 232  		return "", Entry{}, &entryNotFoundError{Err: err}
 233  	}
 234  	if info.Size() != entry.Size {
 235  		return "", Entry{}, &entryNotFoundError{Err: errors.New("file incomplete")}
 236  	}
 237  	return file, entry, nil
 238  }
 239  
 240  // GetBytes looks up the action ID in the cache and returns
 241  // the corresponding output bytes.
 242  // GetBytes should only be used for data that can be expected to fit in memory.
 243  func (c *Cache) GetBytes(id ActionID) ([]byte, Entry, error) {
 244  	entry, err := c.Get(id)
 245  	if err != nil {
 246  		return nil, entry, err
 247  	}
 248  	data, _ := os.ReadFile(c.OutputFile(entry.OutputID))
 249  	if sha256.Sum256(data) != entry.OutputID {
 250  		return nil, entry, &entryNotFoundError{Err: errors.New("bad checksum")}
 251  	}
 252  	return data, entry, nil
 253  }
 254  
 255  // OutputFile returns the name of the cache file storing output with the given OutputID.
 256  func (c *Cache) OutputFile(out OutputID) string {
 257  	file := c.fileName(out, "d")
 258  	c.used(file)
 259  	return file
 260  }
 261  
 262  // Time constants for cache expiration.
 263  //
 264  // We set the mtime on a cache file on each use, but at most one per mtimeInterval (1 hour),
 265  // to avoid causing many unnecessary inode updates. The mtimes therefore
 266  // roughly reflect "time of last use" but may in fact be older by at most an hour.
 267  //
 268  // We scan the cache for entries to delete at most once per trimInterval (1 day).
 269  //
 270  // When we do scan the cache, we delete entries that have not been used for
 271  // at least trimLimit (5 days). Statistics gathered from a month of usage by
 272  // Go developers found that essentially all reuse of cached entries happened
 273  // within 5 days of the previous reuse. See golang.org/issue/22990.
 274  const (
 275  	mtimeInterval = 1 * time.Hour
 276  	trimInterval  = 24 * time.Hour
 277  	trimLimit     = 5 * 24 * time.Hour
 278  )
 279  
 280  // used makes a best-effort attempt to update mtime on file,
 281  // so that mtime reflects cache access time.
 282  //
 283  // Because the reflection only needs to be approximate,
 284  // and to reduce the amount of disk activity caused by using
 285  // cache entries, used only updates the mtime if the current
 286  // mtime is more than an hour old. This heuristic eliminates
 287  // nearly all of the mtime updates that would otherwise happen,
 288  // while still keeping the mtimes useful for cache trimming.
 289  func (c *Cache) used(file string) {
 290  	info, err := os.Stat(file)
 291  	if err == nil && c.now().Sub(info.ModTime()) < mtimeInterval {
 292  		return
 293  	}
 294  	os.Chtimes(file, c.now(), c.now())
 295  }
 296  
 297  // Trim removes old cache entries that are likely not to be reused.
 298  func (c *Cache) Trim() {
 299  	now := c.now()
 300  
 301  	// We maintain in dir/trim.txt the time of the last completed cache trim.
 302  	// If the cache has been trimmed recently enough, do nothing.
 303  	// This is the common case.
 304  	data, _ := renameio.ReadFile(filepath.Join(c.dir, "trim.txt"))
 305  	t, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
 306  	if err == nil && now.Sub(time.Unix(t, 0)) < trimInterval {
 307  		return
 308  	}
 309  
 310  	// Trim each of the 256 subdirectories.
 311  	// We subtract an additional mtimeInterval
 312  	// to account for the imprecision of our "last used" mtimes.
 313  	cutoff := now.Add(-trimLimit - mtimeInterval)
 314  	for i := 0; i < 256; i++ {
 315  		subdir := filepath.Join(c.dir, fmt.Sprintf("%02x", i))
 316  		c.trimSubdir(subdir, cutoff)
 317  	}
 318  
 319  	// Ignore errors from here: if we don't write the complete timestamp, the
 320  	// cache will appear older than it is, and we'll trim it again next time.
 321  	renameio.WriteFile(filepath.Join(c.dir, "trim.txt"), []byte(fmt.Sprintf("%d", now.Unix())), 0666)
 322  }
 323  
 324  // trimSubdir trims a single cache subdirectory.
 325  func (c *Cache) trimSubdir(subdir string, cutoff time.Time) {
 326  	// Read all directory entries from subdir before removing
 327  	// any files, in case removing files invalidates the file offset
 328  	// in the directory scan. Also, ignore error from f.Readdirnames,
 329  	// because we don't care about reporting the error and we still
 330  	// want to process any entries found before the error.
 331  	f, err := os.Open(subdir)
 332  	if err != nil {
 333  		return
 334  	}
 335  	names, _ := f.Readdirnames(-1)
 336  	f.Close()
 337  
 338  	for _, name := range names {
 339  		// Remove only cache entries (xxxx-a and xxxx-d).
 340  		if !strings.HasSuffix(name, "-a") && !strings.HasSuffix(name, "-d") {
 341  			continue
 342  		}
 343  		entry := filepath.Join(subdir, name)
 344  		info, err := os.Stat(entry)
 345  		if err == nil && info.ModTime().Before(cutoff) {
 346  			os.Remove(entry)
 347  		}
 348  	}
 349  }
 350  
 351  // putIndexEntry adds an entry to the cache recording that executing the action
 352  // with the given id produces an output with the given output id (hash) and size.
 353  func (c *Cache) putIndexEntry(id ActionID, out OutputID, size int64, allowVerify bool) error {
 354  	// Note: We expect that for one reason or another it may happen
 355  	// that repeating an action produces a different output hash
 356  	// (for example, if the output contains a time stamp or temp dir name).
 357  	// While not ideal, this is also not a correctness problem, so we
 358  	// don't make a big deal about it. In particular, we leave the action
 359  	// cache entries writable specifically so that they can be overwritten.
 360  	//
 361  	// Setting GODEBUG=gocacheverify=1 does make a big deal:
 362  	// in verify mode we are double-checking that the cache entries
 363  	// are entirely reproducible. As just noted, this may be unrealistic
 364  	// in some cases but the check is also useful for shaking out real bugs.
 365  	entry := fmt.Sprintf("v1 %x %x %20d %20d\n", id, out, size, time.Now().UnixNano())
 366  	if verify && allowVerify {
 367  		old, err := c.get(id)
 368  		if err == nil && (old.OutputID != out || old.Size != size) {
 369  			// panic to show stack trace, so we can see what code is generating this cache entry.
 370  			msg := fmt.Sprintf("go: internal cache error: cache verify failed: id=%x changed:<<<\n%s\n>>>\nold: %x %d\nnew: %x %d", id, reverseHash(id), out, size, old.OutputID, old.Size)
 371  			panic(msg)
 372  		}
 373  	}
 374  	file := c.fileName(id, "a")
 375  
 376  	// Copy file to cache directory.
 377  	mode := os.O_WRONLY | os.O_CREATE
 378  	f, err := os.OpenFile(file, mode, 0666)
 379  	if err != nil {
 380  		return err
 381  	}
 382  	_, err = f.WriteString(entry)
 383  	if err == nil {
 384  		// Truncate the file only *after* writing it.
 385  		// (This should be a no-op, but truncate just in case of previous corruption.)
 386  		//
 387  		// This differs from ioutil.WriteFile, which truncates to 0 *before* writing
 388  		// via os.O_TRUNC. Truncating only after writing ensures that a second write
 389  		// of the same content to the same file is idempotent, and does not — even
 390  		// temporarily! — undo the effect of the first write.
 391  		err = f.Truncate(int64(len(entry)))
 392  	}
 393  	if closeErr := f.Close(); err == nil {
 394  		err = closeErr
 395  	}
 396  	if err != nil {
 397  		// TODO(bcmills): This Remove potentially races with another go command writing to file.
 398  		// Can we eliminate it?
 399  		os.Remove(file)
 400  		return err
 401  	}
 402  	os.Chtimes(file, c.now(), c.now()) // mainly for tests
 403  
 404  	return nil
 405  }
 406  
 407  // Put stores the given output in the cache as the output for the action ID.
 408  // It may read file twice. The content of file must not change between the two passes.
 409  func (c *Cache) Put(id ActionID, file io.ReadSeeker) (OutputID, int64, error) {
 410  	return c.put(id, file, true)
 411  }
 412  
 413  // PutNoVerify is like Put but disables the verify check
 414  // when GODEBUG=goverifycache=1 is set.
 415  // It is meant for data that is OK to cache but that we expect to vary slightly from run to run,
 416  // like test output containing times and the like.
 417  func (c *Cache) PutNoVerify(id ActionID, file io.ReadSeeker) (OutputID, int64, error) {
 418  	return c.put(id, file, false)
 419  }
 420  
 421  func (c *Cache) put(id ActionID, file io.ReadSeeker, allowVerify bool) (OutputID, int64, error) {
 422  	// Compute output ID.
 423  	h := sha256.New()
 424  	if _, err := file.Seek(0, 0); err != nil {
 425  		return OutputID{}, 0, err
 426  	}
 427  	size, err := io.Copy(h, file)
 428  	if err != nil {
 429  		return OutputID{}, 0, err
 430  	}
 431  	var out OutputID
 432  	h.Sum(out[:0])
 433  
 434  	// Copy to cached output file (if not already present).
 435  	if err := c.copyFile(file, out, size); err != nil {
 436  		return out, size, err
 437  	}
 438  
 439  	// Add to cache index.
 440  	return out, size, c.putIndexEntry(id, out, size, allowVerify)
 441  }
 442  
 443  // PutBytes stores the given bytes in the cache as the output for the action ID.
 444  func (c *Cache) PutBytes(id ActionID, data []byte) error {
 445  	_, _, err := c.Put(id, bytes.NewReader(data))
 446  	return err
 447  }
 448  
 449  // copyFile copies file into the cache, expecting it to have the given
 450  // output ID and size, if that file is not present already.
 451  func (c *Cache) copyFile(file io.ReadSeeker, out OutputID, size int64) error {
 452  	name := c.fileName(out, "d")
 453  	info, err := os.Stat(name)
 454  	if err == nil && info.Size() == size {
 455  		// Check hash.
 456  		if f, err := os.Open(name); err == nil {
 457  			h := sha256.New()
 458  			io.Copy(h, f)
 459  			f.Close()
 460  			var out2 OutputID
 461  			h.Sum(out2[:0])
 462  			if out == out2 {
 463  				return nil
 464  			}
 465  		}
 466  		// Hash did not match. Fall through and rewrite file.
 467  	}
 468  
 469  	// Copy file to cache directory.
 470  	mode := os.O_RDWR | os.O_CREATE
 471  	if err == nil && info.Size() > size { // shouldn't happen but fix in case
 472  		mode |= os.O_TRUNC
 473  	}
 474  	f, err := os.OpenFile(name, mode, 0666)
 475  	if err != nil {
 476  		return err
 477  	}
 478  	defer f.Close()
 479  	if size == 0 {
 480  		// File now exists with correct size.
 481  		// Only one possible zero-length file, so contents are OK too.
 482  		// Early return here makes sure there's a "last byte" for code below.
 483  		return nil
 484  	}
 485  
 486  	// From here on, if any of the I/O writing the file fails,
 487  	// we make a best-effort attempt to truncate the file f
 488  	// before returning, to avoid leaving bad bytes in the file.
 489  
 490  	// Copy file to f, but also into h to double-check hash.
 491  	if _, err := file.Seek(0, 0); err != nil {
 492  		f.Truncate(0)
 493  		return err
 494  	}
 495  	h := sha256.New()
 496  	w := io.MultiWriter(f, h)
 497  	if _, err := io.CopyN(w, file, size-1); err != nil {
 498  		f.Truncate(0)
 499  		return err
 500  	}
 501  	// Check last byte before writing it; writing it will make the size match
 502  	// what other processes expect to find and might cause them to start
 503  	// using the file.
 504  	buf := make([]byte, 1)
 505  	if _, err := file.Read(buf); err != nil {
 506  		f.Truncate(0)
 507  		return err
 508  	}
 509  	h.Write(buf)
 510  	sum := h.Sum(nil)
 511  	if !bytes.Equal(sum, out[:]) {
 512  		f.Truncate(0)
 513  		return fmt.Errorf("file content changed underfoot")
 514  	}
 515  
 516  	// Commit cache file entry.
 517  	if _, err := f.Write(buf); err != nil {
 518  		f.Truncate(0)
 519  		return err
 520  	}
 521  	if err := f.Close(); err != nil {
 522  		// Data might not have been written,
 523  		// but file may look like it is the right size.
 524  		// To be extra careful, remove cached file.
 525  		os.Remove(name)
 526  		return err
 527  	}
 528  	os.Chtimes(name, c.now(), c.now()) // mainly for tests
 529  
 530  	return nil
 531  }
 532