exec_linux.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build linux
   6  
   7  package syscall
   8  
   9  import (
  10  	errpkg "errors"
  11  	"internal/itoa"
  12  	"runtime"
  13  	"unsafe"
  14  )
  15  
  16  // Linux unshare/clone/clone2/clone3 flags, architecture-independent,
  17  // copied from linux/sched.h.
  18  const (
  19  	CLONE_VM             = 0x00000100 // set if VM shared between processes
  20  	CLONE_FS             = 0x00000200 // set if fs info shared between processes
  21  	CLONE_FILES          = 0x00000400 // set if open files shared between processes
  22  	CLONE_SIGHAND        = 0x00000800 // set if signal handlers and blocked signals shared
  23  	CLONE_PIDFD          = 0x00001000 // set if a pidfd should be placed in parent
  24  	CLONE_PTRACE         = 0x00002000 // set if we want to let tracing continue on the child too
  25  	CLONE_VFORK          = 0x00004000 // set if the parent wants the child to wake it up on mm_release
  26  	CLONE_PARENT         = 0x00008000 // set if we want to have the same parent as the cloner
  27  	CLONE_THREAD         = 0x00010000 // Same thread group?
  28  	CLONE_NEWNS          = 0x00020000 // New mount namespace group
  29  	CLONE_SYSVSEM        = 0x00040000 // share system V SEM_UNDO semantics
  30  	CLONE_SETTLS         = 0x00080000 // create a new TLS for the child
  31  	CLONE_PARENT_SETTID  = 0x00100000 // set the TID in the parent
  32  	CLONE_CHILD_CLEARTID = 0x00200000 // clear the TID in the child
  33  	CLONE_DETACHED       = 0x00400000 // Unused, ignored
  34  	CLONE_UNTRACED       = 0x00800000 // set if the tracing process can't force CLONE_PTRACE on this clone
  35  	CLONE_CHILD_SETTID   = 0x01000000 // set the TID in the child
  36  	CLONE_NEWCGROUP      = 0x02000000 // New cgroup namespace
  37  	CLONE_NEWUTS         = 0x04000000 // New utsname namespace
  38  	CLONE_NEWIPC         = 0x08000000 // New ipc namespace
  39  	CLONE_NEWUSER        = 0x10000000 // New user namespace
  40  	CLONE_NEWPID         = 0x20000000 // New pid namespace
  41  	CLONE_NEWNET         = 0x40000000 // New network namespace
  42  	CLONE_IO             = 0x80000000 // Clone io context
  43  
  44  	// Flags for the clone3() syscall.
  45  
  46  	CLONE_CLEAR_SIGHAND = 0x100000000 // Clear any signal handler and reset to SIG_DFL.
  47  	CLONE_INTO_CGROUP   = 0x200000000 // Clone into a specific cgroup given the right permissions.
  48  
  49  	// Cloning flags intersect with CSIGNAL so can be used with unshare and clone3
  50  	// syscalls only:
  51  
  52  	CLONE_NEWTIME = 0x00000080 // New time namespace
  53  )
  54  
// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
// See user_namespaces(7).
//
// formatIDMappings renders each SysProcIDMap as one
// "ContainerID HostID Size" line, and the result is written to the
// child's uid_map or gid_map file under /proc.
//
// Note that User Namespaces are not available on a number of popular Linux
// versions (due to security issues), or are available but subject to AppArmor
// restrictions like in Ubuntu 24.04.
type SysProcIDMap struct {
	ContainerID int // Container ID.
	HostID      int // Host ID.
	Size        int // Size.
}
  66  
// SysProcAttr holds the Linux-specific attributes that forkAndExecInChild
// applies to the child process between fork and exec.
type SysProcAttr struct {
	Chroot     string      // Chroot.
	Credential *Credential // Credential.
	// Ptrace tells the child to call ptrace(PTRACE_TRACEME).
	// Call runtime.LockOSThread before starting a process with this set,
	// and don't call UnlockOSThread until done with PtraceSyscall calls.
	Ptrace bool
	Setsid bool // Create session.
	// Setpgid sets the process group ID of the child to Pgid,
	// or, if Pgid == 0, to the new child's process ID.
	Setpgid bool
	// Setctty sets the controlling terminal of the child to
	// file descriptor Ctty. Ctty must be a descriptor number
	// in the child process: an index into ProcAttr.Files.
	// This is only meaningful if Setsid is true.
	Setctty bool
	Noctty  bool // Detach fd 0 from controlling terminal.
	Ctty    int  // Controlling TTY fd.
	// Foreground places the child process group in the foreground.
	// This implies Setpgid. The Ctty field must be set to
	// the descriptor of the controlling TTY.
	// Unlike Setctty, in this case Ctty must be a descriptor
	// number in the parent process.
	Foreground bool
	Pgid       int // Child's process group ID if Setpgid.
	// Pdeathsig, if non-zero, is a signal that the kernel will send to
	// the child process when the creating thread dies. Note that the signal
	// is sent on thread termination, which may happen before process termination.
	// There are more details at https://go.dev/issue/27505.
	Pdeathsig    Signal
	Cloneflags   uintptr        // Flags for clone calls.
	Unshareflags uintptr        // Flags for unshare calls; unshare is called in the child only if non-zero.
	UidMappings  []SysProcIDMap // User ID mappings for user namespaces.
	GidMappings  []SysProcIDMap // Group ID mappings for user namespaces.
	// GidMappingsEnableSetgroups enables the setgroups syscall.
	// If false, then the setgroups syscall will be disabled for the child process.
	// This parameter is a no-op if GidMappings == nil. Otherwise, for unprivileged
	// users this should be set to false for the mappings to work.
	GidMappingsEnableSetgroups bool
	AmbientCaps                []uintptr // Ambient capabilities.
	UseCgroupFD                bool      // Whether to make use of the CgroupFD field. Setting it selects the clone3 code path.
	CgroupFD                   int       // File descriptor of a cgroup to put the new process into.
	// PidFD, if not nil, is used to store the pidfd of a child, if the
	// functionality is supported by the kernel, or -1. Note *PidFD is
	// changed only if the process starts successfully.
	PidFD *int
}
 114  
var (
	// none and slash are NUL-terminated byte strings passed as the first
	// and second arguments of the mount syscall that marks "/" as
	// MS_REC|MS_PRIVATE after unsharing the mount namespace. They are
	// package-level so the child does not have to allocate them.
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 take the clone3 path even when
	// neither UseCgroupFD nor CLONE_NEWTIME asks for it.
	forceClone3 = false // Used by unit tests only.
)
 121  
// Implemented in runtime package.
//
// runtime_BeforeFork is called immediately before the clone/clone3 call.
// runtime_AfterFork is called in the parent once forkAndExecInChild1
// returns with locked set. runtime_AfterForkInChild is called in the
// child, where it restores the signal mask.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
 126  
// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
// If a dup or exec fails, write the errno error to pipe.
// (Pipe is close-on-exec so if exec succeeds, it will be closed.)
// In the child, this function must not acquire any locks, because
// they might have been locked at the time of the fork. This means
// no rescheduling, no malloc calls, and no new stack segments.
// For the same reason compiler does not race instrument it.
// The calls to RawSyscall are okay because they are assembly
// functions that do not grow the stack.
//
// The fork itself and all child-side work happen in
// forkAndExecInChild1; this function only runs the parent's post-fork
// steps. It returns the child's PID and a zero Errno on success, or
// 0 and the Errno reported by forkAndExecInChild1 on failure.
//
//go:norace
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
	// Set up and fork. This returns immediately in the parent or
	// if there's an error.
	upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
	// locked reports that runtime_BeforeFork was called, so the matching
	// runtime_AfterFork must run even if the clone itself failed.
	if locked {
		runtime_AfterFork()
	}
	if err != 0 {
		return 0, err
	}

	// parent; return PID
	pid = int(upid)
	if sys.PidFD != nil {
		*sys.PidFD = int(pidfd)
	}

	if sys.UidMappings != nil || sys.GidMappings != nil {
		// The parent only writes to the mapping pipe; close the read end.
		Close(mapPipe[0])
		var err2 Errno
		// uid/gid mappings will be written after fork and unshare(2) for user
		// namespaces.
		if sys.Unshareflags&CLONE_NEWUSER == 0 {
			if err := writeUidGidMappings(pid, sys); err != nil {
				err2 = err.(Errno)
			}
		}
		// Tell the child the outcome (zero on success). The child blocks
		// reading this value before it continues.
		RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		Close(mapPipe[1])
	}

	return pid, 0
}
 171  
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version
// before the capget and capset syscalls are made in the child.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

// capHeader is the header argument passed to the capget and capset syscalls.
type capHeader struct {
	version uint32
	pid     int32
}

// capData is one element of the data argument passed to the capget and
// capset syscalls. Each field is a 32-bit capability mask.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}

// caps bundles the capget/capset header with its two data elements.
// The element holding a given capability is selected with capToIndex and
// the bit within it with capToMask.
type caps struct {
	hdr  capHeader
	data [2]capData
}
 188  
 189  // See CAP_TO_INDEX in linux/capability.h:
 190  func capToIndex(cap uintptr) uintptr { return cap >> 5 }
 191  
 192  // See CAP_TO_MASK in linux/capability.h:
 193  func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
 194  
// cloneArgs holds arguments for clone3 Linux syscall.
//
// forkAndExecInChild1 passes a pointer to this struct, together with its
// size, to clone3. It fills in only flags, exitSignal, cgroup and pidFD;
// the remaining fields are left zero.
type cloneArgs struct {
	flags      uint64 // Flags bit mask
	pidFD      uint64 // Where to store PID file descriptor (int *)
	childTID   uint64 // Where to store child TID, in child's memory (pid_t *)
	parentTID  uint64 // Where to store child TID, in parent's memory (pid_t *)
	exitSignal uint64 // Signal to deliver to parent on child termination
	stack      uint64 // Pointer to lowest byte of stack
	stackSize  uint64 // Size of stack
	tls        uint64 // Location of new TLS
	setTID     uint64 // Pointer to a pid_t array (since Linux 5.5)
	setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
	cgroup     uint64 // File descriptor for target cgroup of child (since Linux 5.7)
}
 209  
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
// vfork.
//
// This is go:noinline because the point is to keep the stack frames
// of this and forkAndExecInChild separate.
//
// In the parent it returns the result of the clone call in pid and err1,
// the pidfd slot (initialized to -1) in pidfd, the pipe created for
// synchronizing the UID/GID mapping writes in mapPipe (only when mappings
// were requested), and locked, which is true once runtime_BeforeFork has
// been called so the caller knows to call runtime_AfterFork. In the child
// it never returns: it either execs or reports an errno on pipe and exits.
//
//go:noinline
//go:norace
//go:nocheckptr
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
	// Defined in linux/prctl.h starting with Linux 4.3.
	const (
		PR_CAP_AMBIENT       = 0x2f
		PR_CAP_AMBIENT_RAISE = 0x2
	)

	// vfork requires that the child not touch any of the parent's
	// active stack frames. Hence, the child does all post-fork
	// processing in this stack frame and never returns, while the
	// parent returns immediately from this frame and does all
	// post-fork processing in the outer frame.
	//
	// Declare all variables at top in case any
	// declarations require heap allocation (e.g., err2).
	// ":=" should not be used to declare any variable after
	// the call to runtime_BeforeFork.
	//
	// NOTE(bcmills): The allocation behavior described in the above comment
	// seems to lack a corresponding test, and it may be rendered invalid
	// by an otherwise-correct change in the compiler.
	var (
		err2                      Errno
		nextfd                    int
		i                         int
		caps                      caps
		fd1, flags                uintptr
		puid, psetgroups, pgid    []byte
		uidmap, setgroups, gidmap []byte
		clone3                    *cloneArgs
		pgrp                      int32
		dirfd                     int
		cred                      *Credential
		ngroups, groups           uintptr
		c                         uintptr
		rlim                      *Rlimit
		lim                       Rlimit
	)
	// Report "no pidfd" unless the clone call stores one.
	pidfd = -1

	rlim = origRlimitNofile.Load()

	// Build every path and payload the child may need now, because the
	// child must not allocate after the fork.
	if sys.UidMappings != nil {
		puid = []byte("/proc/self/uid_map\000")
		uidmap = formatIDMappings(sys.UidMappings)
	}

	if sys.GidMappings != nil {
		psetgroups = []byte("/proc/self/setgroups\000")
		pgid = []byte("/proc/self/gid_map\000")

		if sys.GidMappingsEnableSetgroups {
			setgroups = []byte("allow\000")
		} else {
			setgroups = []byte("deny\000")
		}
		gidmap = formatIDMappings(sys.GidMappings)
	}

	// Record parent PID so child can test if it has died.
	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)

	// Guard against side effects of shuffling fds below.
	// Make sure that nextfd is beyond any currently open files so
	// that we can't run the risk of overwriting any of them.
	fd := make([]int, len(attr.Files))
	nextfd = len(attr.Files)
	for i, ufd := range attr.Files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++

	// Allocate another pipe for parent to child communication for
	// synchronizing writing of User ID/Group ID mappings.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if err := forkExecPipe(mapPipe[:]); err != nil {
			err1 = err.(Errno)
			return
		}
	}

	flags = sys.Cloneflags
	// CLONE_VFORK|CLONE_VM is added only when no new user namespace is
	// requested through either Cloneflags or Unshareflags.
	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
		flags |= CLONE_VFORK | CLONE_VM
	}
	if sys.PidFD != nil {
		flags |= CLONE_PIDFD
	}
	// Whether to use clone3.
	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
		clone3 = &cloneArgs{
			flags:      uint64(flags),
			exitSignal: uint64(SIGCHLD),
		}
		if sys.UseCgroupFD {
			clone3.flags |= CLONE_INTO_CGROUP
			clone3.cgroup = uint64(sys.CgroupFD)
		}
		if sys.PidFD != nil {
			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
		}
	}

	// About to call fork.
	// No more allocation or calls of non-assembly functions.
	runtime_BeforeFork()
	locked = true
	if clone3 != nil {
		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
	} else {
		// N.B. Keep in sync with doCheckClonePidfd.
		flags |= uintptr(SIGCHLD)
		if runtime.GOARCH == "s390x" {
			// On Linux/s390, the first two arguments of clone(2) are swapped.
			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
		} else {
			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
		}
	}
	if err1 != 0 || pid != 0 {
		// If we're in the parent, we must return immediately
		// so we're not in the same stack frame as the child.
		// This can at most use the return PC, which the child
		// will not modify, and the results of
		// rawVforkSyscall, which must have been written after
		// the child was replaced.
		return
	}

	// Fork succeeded, now in child.

	// Enable the "keep capabilities" flag to set ambient capabilities later.
	if len(sys.AmbientCaps) > 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Wait for User ID/Group ID mappings to be written.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
			goto childerror
		}
		// pid is reused as scratch space for the byte count returned by
		// read, since no new variables may be declared after the fork.
		pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		if err1 != 0 {
			goto childerror
		}
		// A short read means the parent did not send a complete Errno.
		if pid != unsafe.Sizeof(err2) {
			err1 = EINVAL
			goto childerror
		}
		// A non-zero value is the error the parent hit while writing
		// the mappings.
		if err2 != 0 {
			err1 = err2
			goto childerror
		}
	}

	// Session ID
	if sys.Setsid {
		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set process group
	if sys.Setpgid || sys.Foreground {
		// Place child in process group.
		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if sys.Foreground {
		pgrp = int32(sys.Pgid)
		// Pgid == 0 means the child's own process ID is the group ID.
		if pgrp == 0 {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)

			pgrp = int32(pid)
		}

		// Place process group in foreground.
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore the signal mask. We do this after TIOCSPGRP to avoid
	// having the kernel send a SIGTTOU signal to the process group.
	runtime_AfterForkInChild()

	// Unshare
	if sys.Unshareflags != 0 {
		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// When the user namespace is created by unshare rather than by
		// clone, the child writes its own setgroups and ID map files.
		//
		// NOTE(review): formatIDMappings returns a nil slice for a non-nil
		// but empty mapping list, in which case &gidmap[0] / &uidmap[0]
		// below would index out of range. Confirm callers never pass an
		// empty, non-nil GidMappings or UidMappings together with
		// CLONE_NEWUSER in Unshareflags.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}

			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// The unshare system call in Linux doesn't unshare mount points
		// mounted with --shared. Systemd mounts / with --shared. For a
		// long discussion of the pros and cons of this see debian bug 739593.
		// The Go model of unsharing is more like Plan 9, where you ask
		// to unshare and the namespaces are unconditionally unshared.
		// To make this model work we must further mark / as MS_PRIVATE.
		// This is what the standard unshare command does.
		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chroot
	if chroot != nil {
		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// User and groups
	if cred = sys.Credential; cred != nil {
		ngroups = uintptr(len(cred.Groups))
		groups = uintptr(0)
		if ngroups > 0 {
			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
		}
		// Skip setgroups when the credential asks for that (NoSetGroups),
		// or when GID mappings are in use with setgroups disabled and
		// there are no groups to set.
		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
			if err1 != 0 {
				goto childerror
			}
		}
		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if len(sys.AmbientCaps) != 0 {
		// Ambient capabilities were added in the 4.3 kernel,
		// so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3

		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			// Add the c capability to the permitted and inheritable capability mask,
			// otherwise we will not be able to add it to the ambient capability mask.
			caps.data[capToIndex(c)].permitted |= capToMask(c)
			caps.data[capToIndex(c)].inheritable |= capToMask(c)
		}

		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chdir
	if dir != nil {
		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Parent death signal
	if sys.Pdeathsig != 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// Signal self if parent is already dead. This might cause a
		// duplicate signal in rare cases, but it won't matter when
		// using SIGKILL.
		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
		if pid != ppid {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Pass 1: look for fd[i] < i and move those up above len(fd)
	// so that pass 2 won't stomp on an fd it needs later.
	if pipe < nextfd {
		// The error pipe itself sits in the range about to be reshuffled;
		// move it up first, keeping it close-on-exec.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
		if err1 != 0 {
			goto childerror
		}
		pipe = nextfd
		nextfd++
	}
	for i = 0; i < len(fd); i++ {
		if fd[i] >= 0 && fd[i] < i {
			if nextfd == pipe { // don't stomp on pipe
				nextfd++
			}
			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
			if err1 != 0 {
				goto childerror
			}
			fd[i] = nextfd
			nextfd++
		}
	}

	// Pass 2: dup fd[i] down onto i.
	for i = 0; i < len(fd); i++ {
		// An entry of -1 means descriptor i should be closed in the child.
		if fd[i] == -1 {
			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == i {
			// dup2(i, i) won't clear close-on-exec flag on Linux,
			// probably not elsewhere either.
			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}
		// The new fd is created NOT close-on-exec,
		// which is exactly what we want.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// By convention, we don't close-on-exec the fds we are
	// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
	// Programs that know they inherit fds >= 3 will need
	// to set them close-on-exec.
	for i = len(fd); i < 3; i++ {
		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
	}

	// Detach fd 0 from tty
	if sys.Noctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set the controlling TTY to Ctty
	if sys.Setctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore original rlimit.
	if rlim != nil {
		// Some other process may have changed our rlimit by
		// calling prlimit. We can check for that case because
		// our current rlimit will not be the value we set when
		// caching the rlimit in the init function in rlimit.go.
		//
		// Note that this test is imperfect, since it won't catch
		// the case in which some other process used prlimit to
		// set our rlimits to max-1/max. In that case we will fall
		// back to the original cur/max when starting the child.
		// We hope that setting to max-1/max is unlikely.
		_, _, err1 = RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, 0, uintptr(unsafe.Pointer(&lim)), 0, 0)
		if err1 != 0 || (lim.Cur == rlim.Max-1 && lim.Max == rlim.Max) {
			// Best effort: the result of restoring the limit is ignored.
			RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, uintptr(unsafe.Pointer(rlim)), 0, 0, 0)
		}
	}

	// Enable tracing if requested.
	// Do this right before exec so that we don't unnecessarily trace the runtime
	// setting up after the fork. See issue #21428.
	if sys.Ptrace {
		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Time to exec.
	_, _, err1 = RawSyscall(SYS_EXECVE,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&envv[0])))

	// Control falls through to childerror when execve returns, carrying
	// its errno in err1.
childerror:
	// send error code on pipe
	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	// Exit with status 253; the loop retries should the exit syscall
	// return, so the child never leaves this frame.
	for {
		RawSyscall(SYS_EXIT, 253, 0, 0)
	}
}
 678  
 679  func formatIDMappings(idMap []SysProcIDMap) []byte {
 680  	var data []byte
 681  	for _, im := range idMap {
 682  		data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
 683  	}
 684  	return data
 685  }
 686  
 687  // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
 688  func writeIDMappings(path string, idMap []SysProcIDMap) error {
 689  	fd, err := Open(path, O_RDWR, 0)
 690  	if err != nil {
 691  		return err
 692  	}
 693  
 694  	if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
 695  		Close(fd)
 696  		return err
 697  	}
 698  
 699  	if err := Close(fd); err != nil {
 700  		return err
 701  	}
 702  
 703  	return nil
 704  }
 705  
 706  // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
 707  // and "allow" if enable is true.
 708  // This is needed since kernel 3.19, because you can't write gid_map without
 709  // disabling setgroups() system call.
 710  func writeSetgroups(pid int, enable bool) error {
 711  	sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
 712  	fd, err := Open(sgf, O_RDWR, 0)
 713  	if err != nil {
 714  		return err
 715  	}
 716  
 717  	var data []byte
 718  	if enable {
 719  		data = []byte("allow")
 720  	} else {
 721  		data = []byte("deny")
 722  	}
 723  
 724  	if _, err := Write(fd, data); err != nil {
 725  		Close(fd)
 726  		return err
 727  	}
 728  
 729  	return Close(fd)
 730  }
 731  
 732  // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
 733  // for a process and it is called from the parent process.
 734  func writeUidGidMappings(pid int, sys *SysProcAttr) error {
 735  	if sys.UidMappings != nil {
 736  		uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
 737  		if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
 738  			return err
 739  		}
 740  	}
 741  
 742  	if sys.GidMappings != nil {
 743  		// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
 744  		if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
 745  			return err
 746  		}
 747  		gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
 748  		if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
 749  			return err
 750  		}
 751  	}
 752  
 753  	return nil
 754  }
 755  
 756  // forkAndExecFailureCleanup cleans up after an exec failure.
 757  func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
 758  	if sys.PidFD != nil && *sys.PidFD != -1 {
 759  		Close(*sys.PidFD)
 760  		*sys.PidFD = -1
 761  	}
 762  }
 763  
// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a
// clone.
//
// It returns nil when the clone produced a pidfd and the child was reaped
// through it. Otherwise it returns the errno from the clone or wait, or a
// descriptive error when the clone succeeded without providing a pidfd.
//
//go:linkname os_checkClonePidfd os.checkClonePidfd
func os_checkClonePidfd() error {
	// Start from -1 so that an untouched value shows no pidfd was stored.
	pidfd := int32(-1)
	pid, errno := doCheckClonePidfd(&pidfd)
	if errno != 0 {
		return errno
	}

	if pidfd == -1 {
		// Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process
		// before returning.

		var err error
		// Retry the wait for as long as it is interrupted (EINTR).
		for {
			var status WaitStatus
			// WCLONE is an untyped constant that sets bit 31, so
			// it cannot convert directly to int on 32-bit
			// GOARCHes. We must convert through another type
			// first.
			flags := uint(WCLONE)
			_, err = Wait4(int(pid), &status, int(flags), nil)
			if err != EINTR {
				break
			}
		}
		if err != nil {
			return err
		}

		return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
	}

	// Good: CLONE_PIDFD provided a pidfd. Reap the process and close the
	// pidfd.
	defer Close(int(pidfd))

	// TODO(roland): this is necessary to prevent valgrind from complaining
	// about passing 0x0 to waitid, which it doesn't like. This is clearly not
	// ideal. The structures are copied (mostly) verbatim from syscall/unix,
	// which we obviously cannot import because of an import loop.

	const is64bit = ^uint(0) >> 63 // 0 for 32-bit hosts, 1 for 64-bit ones.
	type sigInfo struct {
		Signo int32
		_     struct {
			Errno int32
			Code  int32
		} // Two int32 fields, swapped on MIPS.
		_ [is64bit]int32 // Extra padding for 64-bit hosts only.

		// End of common part. Beginning of signal-specific part.

		Pid    int32
		Uid    uint32
		Status int32

		// Pad to 128 bytes.
		_ [128 - (6+is64bit)*4]byte
	}

	// Wait on the pidfd, retrying for as long as the call is interrupted
	// (EINTR).
	for {
		const _P_PIDFD = 3
		var info sigInfo
		_, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), uintptr(unsafe.Pointer(&info)), WEXITED|WCLONE, 0, 0)
		if errno != EINTR {
			break
		}
	}
	if errno != 0 {
		return errno
	}

	return nil
}
 841  
// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and
// child execution. This is a separate function so we can separate the child's
// and parent's stack frames if we're using vfork.
//
// This is go:noinline because the point is to keep the stack frames of this
// and os_checkClonePidfd separate.
//
// The address in pidfd is passed as the third clone argument. In the
// parent the function returns the clone result (pid, errno); in the child
// it exits the process immediately with status 0 and never returns.
//
//go:noinline
func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
	flags := uintptr(CLONE_VFORK | CLONE_VM | CLONE_PIDFD)
	if runtime.GOARCH == "s390x" {
		// On Linux/s390, the first two arguments of clone(2) are swapped.
		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
	} else {
		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
	}
	if errno != 0 || pid != 0 {
		// If we're in the parent, we must return immediately
		// so we're not in the same stack frame as the child.
		// This can at most use the return PC, which the child
		// will not modify, and the results of
		// rawVforkSyscall, which must have been written after
		// the child was replaced.
		return
	}

	// Child: exit right away. The loop retries should the exit syscall
	// return, so the child never leaves this frame.
	for {
		RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
	}
}
 872