1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 //go:build linux
6 7 package syscall
8 9 import (
10 errpkg "errors"
11 "internal/itoa"
12 "runtime"
13 "unsafe"
14 )
// Flags accepted by the Linux unshare, clone, clone2 and clone3 calls.
// They are architecture-independent and their values mirror linux/sched.h.
// Every flag occupies exactly one bit, so each is written as a shift.
const (
	CLONE_VM             = 1 << 8  // the VM is shared between processes
	CLONE_FS             = 1 << 9  // fs info is shared between processes
	CLONE_FILES          = 1 << 10 // open files are shared between processes
	CLONE_SIGHAND        = 1 << 11 // signal handlers and blocked signals are shared
	CLONE_PIDFD          = 1 << 12 // a pidfd should be placed in the parent
	CLONE_PTRACE         = 1 << 13 // tracing should continue on the child too
	CLONE_VFORK          = 1 << 14 // the parent wants the child to wake it up on mm_release
	CLONE_PARENT         = 1 << 15 // the child gets the same parent as the cloner
	CLONE_THREAD         = 1 << 16 // same thread group?
	CLONE_NEWNS          = 1 << 17 // new mount namespace group
	CLONE_SYSVSEM        = 1 << 18 // System V SEM_UNDO semantics are shared
	CLONE_SETTLS         = 1 << 19 // a new TLS is created for the child
	CLONE_PARENT_SETTID  = 1 << 20 // the TID is set in the parent
	CLONE_CHILD_CLEARTID = 1 << 21 // the TID is cleared in the child
	CLONE_DETACHED       = 1 << 22 // unused, ignored
	CLONE_UNTRACED       = 1 << 23 // the tracing process can't force CLONE_PTRACE on this clone
	CLONE_CHILD_SETTID   = 1 << 24 // the TID is set in the child
	CLONE_NEWCGROUP      = 1 << 25 // new cgroup namespace
	CLONE_NEWUTS         = 1 << 26 // new utsname namespace
	CLONE_NEWIPC         = 1 << 27 // new ipc namespace
	CLONE_NEWUSER        = 1 << 28 // new user namespace
	CLONE_NEWPID         = 1 << 29 // new pid namespace
	CLONE_NEWNET         = 1 << 30 // new network namespace
	CLONE_IO             = 1 << 31 // clone the io context

	// Flags understood only by the clone3() syscall.

	CLONE_CLEAR_SIGHAND = 1 << 32 // clear any signal handler and reset to SIG_DFL
	CLONE_INTO_CGROUP   = 1 << 33 // clone into a specific cgroup given the right permissions

	// Flags whose bit intersects with CSIGNAL, and which can therefore be
	// used with the unshare and clone3 syscalls only.

	CLONE_NEWTIME = 1 << 7 // new time namespace
)
// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
// See user_namespaces(7).
//
// Each value describes one contiguous ID range. formatIDMappings renders it
// as a single "ContainerID HostID Size" line, which is the text written to
// the uid_map and gid_map files under /proc.
//
// Note that User Namespaces are not available on a number of popular Linux
// versions (due to security issues), or are available but subject to AppArmor
// restrictions like in Ubuntu 24.04.
type SysProcIDMap struct {
	ContainerID int // First ID of the range as seen inside the user namespace (per user_namespaces(7)).
	HostID      int // First ID of the range as seen outside the user namespace (per user_namespaces(7)).
	Size        int // Number of consecutive IDs covered by the range.
}
// SysProcAttr holds the Linux-specific attributes that forkAndExecInChild
// applies to a child process between the clone and the exec.
type SysProcAttr struct {
	Chroot     string      // Chroot.
	Credential *Credential // Credential.
	// Ptrace tells the child to call ptrace(PTRACE_TRACEME).
	// Call runtime.LockOSThread before starting a process with this set,
	// and don't call UnlockOSThread until done with PtraceSyscall calls.
	Ptrace bool
	Setsid bool // Create session.
	// Setpgid sets the process group ID of the child to Pgid,
	// or, if Pgid == 0, to the new child's process ID.
	Setpgid bool
	// Setctty sets the controlling terminal of the child to
	// file descriptor Ctty. Ctty must be a descriptor number
	// in the child process: an index into ProcAttr.Files.
	// This is only meaningful if Setsid is true.
	Setctty bool
	Noctty  bool // Detach fd 0 from controlling terminal.
	Ctty    int  // Controlling TTY fd.
	// Foreground places the child process group in the foreground.
	// This implies Setpgid. The Ctty field must be set to
	// the descriptor of the controlling TTY.
	// Unlike Setctty, in this case Ctty must be a descriptor
	// number in the parent process.
	Foreground bool
	Pgid       int // Child's process group ID if Setpgid.
	// Pdeathsig, if non-zero, is a signal that the kernel will send to
	// the child process when the creating thread dies. Note that the signal
	// is sent on thread termination, which may happen before process termination.
	// There are more details at https://go.dev/issue/27505.
	Pdeathsig    Signal
	Cloneflags   uintptr        // Flags for clone calls.
	Unshareflags uintptr        // Flags for unshare calls.
	UidMappings  []SysProcIDMap // User ID mappings for user namespaces.
	GidMappings  []SysProcIDMap // Group ID mappings for user namespaces.
	// GidMappingsEnableSetgroups enables the setgroups syscall.
	// If false, then the setgroups syscall will be disabled for the child process.
	// This parameter is a no-op if GidMappings == nil. Otherwise, for unprivileged
	// users this should be set to false for the mappings to work.
	GidMappingsEnableSetgroups bool
	AmbientCaps                []uintptr // Ambient capabilities.
	UseCgroupFD                bool      // Whether to make use of the CgroupFD field.
	CgroupFD                   int       // File descriptor of a cgroup to put the new process into.
	// PidFD, if not nil, is used to store the pidfd of a child, if the
	// functionality is supported by the kernel, or -1. Note *PidFD is
	// changed only if the process starts successfully.
	PidFD *int
}
var (
	// none and slash are the NUL-terminated byte strings "none" and "/".
	// forkAndExecInChild1 passes their addresses to mount(2) as the source
	// and target when it marks / as MS_PRIVATE after unsharing the mount
	// namespace. They are package-level so the child needs no allocation.
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 use clone3 even when neither a
	// cgroup FD nor CLONE_NEWTIME requires it.
	forceClone3 = false // Used by unit tests only.
)
// Implemented in runtime package.
//
// forkAndExecInChild1 calls runtime_BeforeFork immediately before the clone.
// forkAndExecInChild calls runtime_AfterFork in the parent whenever
// runtime_BeforeFork was reached. The child calls runtime_AfterForkInChild
// to restore its signal mask.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
126 127 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
128 // If a dup or exec fails, write the errno error to pipe.
129 // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
130 // In the child, this function must not acquire any locks, because
131 // they might have been locked at the time of the fork. This means
132 // no rescheduling, no malloc calls, and no new stack segments.
133 // For the same reason compiler does not race instrument it.
134 // The calls to RawSyscall are okay because they are assembly
135 // functions that do not grow the stack.
136 //
137 //go:norace
138 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
139 // Set up and fork. This returns immediately in the parent or
140 // if there's an error.
141 upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
142 if locked {
143 runtime_AfterFork()
144 }
145 if err != 0 {
146 return 0, err
147 }
148 149 // parent; return PID
150 pid = int(upid)
151 if sys.PidFD != nil {
152 *sys.PidFD = int(pidfd)
153 }
154 155 if sys.UidMappings != nil || sys.GidMappings != nil {
156 Close(mapPipe[0])
157 var err2 Errno
158 // uid/gid mappings will be written after fork and unshare(2) for user
159 // namespaces.
160 if sys.Unshareflags&CLONE_NEWUSER == 0 {
161 if err := writeUidGidMappings(pid, sys); err != nil {
162 err2 = err.(Errno)
163 }
164 }
165 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
166 Close(mapPipe[1])
167 }
168 169 return pid, 0
170 }
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version
// before the capget/capset calls in forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
// capHeader is the header argument of the capget/capset syscalls.
// NOTE(review): presumably mirrors the kernel's capability header struct in
// linux/capability.h, so the field order and sizes must not change — confirm.
type capHeader struct {
	version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by forkAndExecInChild1.
	pid     int32  // Left at zero by forkAndExecInChild1.
}
// capData is one 32-bit slice of the capability sets exchanged with the
// capget/capset syscalls. capToIndex selects which capData element holds a
// capability and capToMask selects the bit within it.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two data elements that forkAndExecInChild1
// passes to capget/capset when raising ambient capabilities.
type caps struct {
	hdr  capHeader
	data [2]capData
}
// capToIndex reports which 32-bit capability word holds capability cap.
// See CAP_TO_INDEX in linux/capability.h.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
// capToMask returns the bit that represents capability cap inside its
// 32-bit capability word. See CAP_TO_MASK in linux/capability.h.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	return uint32(1) << (cap % bitsPerWord)
}
// cloneArgs holds arguments for clone3 Linux syscall.
//
// forkAndExecInChild1 passes a pointer to this struct together with
// unsafe.Sizeof of it to the syscall, so the field order and sizes are
// part of the kernel interface and must not be rearranged.
type cloneArgs struct {
	flags      uint64 // Flags bit mask
	pidFD      uint64 // Where to store PID file descriptor (int *)
	childTID   uint64 // Where to store child TID, in child's memory (pid_t *)
	parentTID  uint64 // Where to store child TID, in parent's memory (pid_t *)
	exitSignal uint64 // Signal to deliver to parent on child termination
	stack      uint64 // Pointer to lowest byte of stack
	stackSize  uint64 // Size of stack
	tls        uint64 // Location of new TLS
	setTID     uint64 // Pointer to a pid_t array (since Linux 5.5)
	setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
	cgroup     uint64 // File descriptor for target cgroup of child (since Linux 5.7)
}
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
// vfork.
//
// This is go:noinline because the point is to keep the stack frames
// of this and forkAndExecInChild separate.
//
// Results, as seen by the parent:
//   - pid is the value returned by the clone call (the child's PID on success).
//   - pidfd is -1 unless the kernel stored a pidfd through the pointer
//     handed to clone/clone3.
//   - err1 is the errno of the failing setup step or of the clone itself.
//   - mapPipe is the pipe used to synchronize the uid/gid mapping writes;
//     it is only created when sys.UidMappings or sys.GidMappings is set.
//   - locked reports whether runtime_BeforeFork was called, in which case
//     the caller must call runtime_AfterFork.
//
// In the child this function never returns: it either execs or writes
// err1 to pipe and exits.
//
//go:noinline
//go:norace
//go:nocheckptr
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
	// Defined in linux/prctl.h starting with Linux 4.3.
	const (
		PR_CAP_AMBIENT       = 0x2f
		PR_CAP_AMBIENT_RAISE = 0x2
	)

	// vfork requires that the child not touch any of the parent's
	// active stack frames. Hence, the child does all post-fork
	// processing in this stack frame and never returns, while the
	// parent returns immediately from this frame and does all
	// post-fork processing in the outer frame.
	//
	// Declare all variables at top in case any
	// declarations require heap allocation (e.g., err2).
	// ":=" should not be used to declare any variable after
	// the call to runtime_BeforeFork.
	//
	// NOTE(bcmills): The allocation behavior described in the above comment
	// seems to lack a corresponding test, and it may be rendered invalid
	// by an otherwise-correct change in the compiler.
	var (
		err2                      Errno
		nextfd                    int
		i                         int
		caps                      caps
		fd1, flags                uintptr
		puid, psetgroups, pgid    []byte
		uidmap, setgroups, gidmap []byte
		clone3                    *cloneArgs
		pgrp                      int32
		dirfd                     int
		cred                      *Credential
		ngroups, groups           uintptr
		c                         uintptr
		rlim                      *Rlimit
		lim                       Rlimit
	)
	// Start from "no pidfd"; the clone call below is given &pidfd whenever
	// a pidfd was requested.
	pidfd = -1

	rlim = origRlimitNofile.Load()

	// Build every path and payload the child may need now, while
	// allocation is still permitted.
	if sys.UidMappings != nil {
		puid = []byte("/proc/self/uid_map\000")
		uidmap = formatIDMappings(sys.UidMappings)
	}

	if sys.GidMappings != nil {
		psetgroups = []byte("/proc/self/setgroups\000")
		pgid = []byte("/proc/self/gid_map\000")

		if sys.GidMappingsEnableSetgroups {
			setgroups = []byte("allow\000")
		} else {
			setgroups = []byte("deny\000")
		}
		gidmap = formatIDMappings(sys.GidMappings)
	}

	// Record parent PID so child can test if it has died.
	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)

	// Guard against side effects of shuffling fds below.
	// Make sure that nextfd is beyond any currently open files so
	// that we can't run the risk of overwriting any of them.
	fd := make([]int, len(attr.Files))
	nextfd = len(attr.Files)
	for i, ufd := range attr.Files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++

	// Allocate another pipe for parent to child communication for
	// synchronizing writing of User ID/Group ID mappings.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if err := forkExecPipe(mapPipe[:]); err != nil {
			err1 = err.(Errno)
			return
		}
	}

	// vfork-style flags are added only when no new user namespace is
	// requested, either through clone or through unshare.
	flags = sys.Cloneflags
	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
		flags |= CLONE_VFORK | CLONE_VM
	}
	if sys.PidFD != nil {
		flags |= CLONE_PIDFD
	}
	// Whether to use clone3.
	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
		clone3 = &cloneArgs{
			flags:      uint64(flags),
			exitSignal: uint64(SIGCHLD),
		}
		if sys.UseCgroupFD {
			clone3.flags |= CLONE_INTO_CGROUP
			clone3.cgroup = uint64(sys.CgroupFD)
		}
		if sys.PidFD != nil {
			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
		}
	}

	// About to call fork.
	// No more allocation or calls of non-assembly functions.
	runtime_BeforeFork()
	locked = true
	if clone3 != nil {
		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
	} else {
		// N.B. Keep in sync with doCheckClonePidfd.
		flags |= uintptr(SIGCHLD)
		if runtime.GOARCH == "s390x" {
			// On Linux/s390, the first two arguments of clone(2) are swapped.
			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
		} else {
			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
		}
	}
	if err1 != 0 || pid != 0 {
		// If we're in the parent, we must return immediately
		// so we're not in the same stack frame as the child.
		// This can at most use the return PC, which the child
		// will not modify, and the results of
		// rawVforkSyscall, which must have been written after
		// the child was replaced.
		return
	}

	// Fork succeeded, now in child.
	// From here on, pid is reused as scratch space for syscall results.

	// Enable the "keep capabilities" flag to set ambient capabilities later.
	if len(sys.AmbientCaps) > 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Wait for User ID/Group ID mappings to be written.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
			goto childerror
		}
		// The parent writes exactly one Errno; pid holds the byte count read.
		pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		if err1 != 0 {
			goto childerror
		}
		if pid != unsafe.Sizeof(err2) {
			err1 = EINVAL
			goto childerror
		}
		if err2 != 0 {
			err1 = err2
			goto childerror
		}
	}

	// Session ID
	if sys.Setsid {
		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set process group
	if sys.Setpgid || sys.Foreground {
		// Place child in process group.
		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if sys.Foreground {
		pgrp = int32(sys.Pgid)
		if pgrp == 0 {
			// Pgid == 0 means "the child's own PID" (see the setpgid call above).
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)

			pgrp = int32(pid)
		}

		// Place process group in foreground.
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore the signal mask. We do this after TIOCSPGRP to avoid
	// having the kernel send a SIGTTOU signal to the process group.
	runtime_AfterForkInChild()

	// Unshare
	if sys.Unshareflags != 0 {
		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// When the user namespace was created by the unshare above, the
		// child writes its own mappings through /proc/self; the parent
		// skips writeUidGidMappings in that case.
		//
		// NOTE(review): formatIDMappings returns a nil slice for an empty
		// mapping list, so a non-nil but empty sys.GidMappings or
		// sys.UidMappings would make &gidmap[0] / &uidmap[0] below index out
		// of range — confirm callers never pass an empty non-nil slice.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}

			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// The unshare system call in Linux doesn't unshare mount points
		// mounted with --shared. Systemd mounts / with --shared. For a
		// long discussion of the pros and cons of this see debian bug 739593.
		// The Go model of unsharing is more like Plan 9, where you ask
		// to unshare and the namespaces are unconditionally unshared.
		// To make this model work we must further mark / as MS_PRIVATE.
		// This is what the standard unshare command does.
		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chroot
	if chroot != nil {
		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// User and groups
	if cred = sys.Credential; cred != nil {
		ngroups = uintptr(len(cred.Groups))
		groups = uintptr(0)
		if ngroups > 0 {
			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
		}
		// Skip setgroups when the caller opted out via NoSetGroups, or when
		// setgroups was disabled for the gid mapping and there are no groups
		// to set anyway.
		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
			if err1 != 0 {
				goto childerror
			}
		}
		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if len(sys.AmbientCaps) != 0 {
		// Ambient capabilities were added in the 4.3 kernel,
		// so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3

		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			// Add the c capability to the permitted and inheritable capability mask,
			// otherwise we will not be able to add it to the ambient capability mask.
			caps.data[capToIndex(c)].permitted |= capToMask(c)
			caps.data[capToIndex(c)].inheritable |= capToMask(c)
		}

		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chdir
	if dir != nil {
		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Parent death signal
	if sys.Pdeathsig != 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// Signal self if parent is already dead. This might cause a
		// duplicate signal in rare cases, but it won't matter when
		// using SIGKILL.
		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
		if pid != ppid {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Pass 1: look for fd[i] < i and move those up above len(fd)
	// so that pass 2 won't stomp on an fd it needs later.
	// The error pipe itself is moved first if it sits in the range that
	// the shuffling below may overwrite.
	if pipe < nextfd {
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
		if err1 != 0 {
			goto childerror
		}
		pipe = nextfd
		nextfd++
	}
	for i = 0; i < len(fd); i++ {
		if fd[i] >= 0 && fd[i] < i {
			if nextfd == pipe { // don't stomp on pipe
				nextfd++
			}
			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
			if err1 != 0 {
				goto childerror
			}
			fd[i] = nextfd
			nextfd++
		}
	}

	// Pass 2: dup fd[i] down onto i.
	for i = 0; i < len(fd); i++ {
		if fd[i] == -1 {
			// -1 means "no file for this slot": make sure i is closed.
			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == i {
			// dup2(i, i) won't clear close-on-exec flag on Linux,
			// probably not elsewhere either.
			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}
		// The new fd is created NOT close-on-exec,
		// which is exactly what we want.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// By convention, we don't close-on-exec the fds we are
	// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
	// Programs that know they inherit fds >= 3 will need
	// to set them close-on-exec.
	for i = len(fd); i < 3; i++ {
		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
	}

	// Detach fd 0 from tty
	if sys.Noctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set the controlling TTY to Ctty
	if sys.Setctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore original rlimit.
	if rlim != nil {
		// Some other process may have changed our rlimit by
		// calling prlimit. We can check for that case because
		// our current rlimit will not be the value we set when
		// caching the rlimit in the init function in rlimit.go.
		//
		// Note that this test is imperfect, since it won't catch
		// the case in which some other process used prlimit to
		// set our rlimits to max-1/max. In that case we will fall
		// back to the original cur/max when starting the child.
		// We hope that setting to max-1/max is unlikely.
		_, _, err1 = RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, 0, uintptr(unsafe.Pointer(&lim)), 0, 0)
		if err1 != 0 || (lim.Cur == rlim.Max-1 && lim.Max == rlim.Max) {
			// Best effort: the result of restoring the limit is ignored.
			RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, uintptr(unsafe.Pointer(rlim)), 0, 0, 0)
		}
	}

	// Enable tracing if requested.
	// Do this right before exec so that we don't unnecessarily trace the runtime
	// setting up after the fork. See issue #21428.
	if sys.Ptrace {
		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Time to exec.
	// On success execve does not return; falling through to childerror
	// therefore means the exec itself failed and err1 holds its errno.
	_, _, err1 = RawSyscall(SYS_EXECVE,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&envv[0])))

childerror:
	// send error code on pipe
	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	// Exit with status 253; loop in case the exit syscall ever returns.
	for {
		RawSyscall(SYS_EXIT, 253, 0, 0)
	}
}
678 679 func formatIDMappings(idMap []SysProcIDMap) []byte {
680 var data []byte
681 for _, im := range idMap {
682 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
683 }
684 return data
685 }
686 687 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
688 func writeIDMappings(path string, idMap []SysProcIDMap) error {
689 fd, err := Open(path, O_RDWR, 0)
690 if err != nil {
691 return err
692 }
693 694 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
695 Close(fd)
696 return err
697 }
698 699 if err := Close(fd); err != nil {
700 return err
701 }
702 703 return nil
704 }
705 706 // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
707 // and "allow" if enable is true.
708 // This is needed since kernel 3.19, because you can't write gid_map without
709 // disabling setgroups() system call.
710 func writeSetgroups(pid int, enable bool) error {
711 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
712 fd, err := Open(sgf, O_RDWR, 0)
713 if err != nil {
714 return err
715 }
716 717 var data []byte
718 if enable {
719 data = []byte("allow")
720 } else {
721 data = []byte("deny")
722 }
723 724 if _, err := Write(fd, data); err != nil {
725 Close(fd)
726 return err
727 }
728 729 return Close(fd)
730 }
731 732 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
733 // for a process and it is called from the parent process.
734 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
735 if sys.UidMappings != nil {
736 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
737 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
738 return err
739 }
740 }
741 742 if sys.GidMappings != nil {
743 // If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
744 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
745 return err
746 }
747 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
748 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
749 return err
750 }
751 }
752 753 return nil
754 }
755 756 // forkAndExecFailureCleanup cleans up after an exec failure.
757 func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
758 if sys.PidFD != nil && *sys.PidFD != -1 {
759 Close(*sys.PidFD)
760 *sys.PidFD = -1
761 }
762 }
763 764 // checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a
765 // clone.
766 //
767 //go:linkname os_checkClonePidfd os.checkClonePidfd
768 func os_checkClonePidfd() error {
769 pidfd := int32(-1)
770 pid, errno := doCheckClonePidfd(&pidfd)
771 if errno != 0 {
772 return errno
773 }
774 775 if pidfd == -1 {
776 // Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process
777 // before returning.
778 779 var err error
780 for {
781 var status WaitStatus
782 // WCLONE is an untyped constant that sets bit 31, so
783 // it cannot convert directly to int on 32-bit
784 // GOARCHes. We must convert through another type
785 // first.
786 flags := uint(WCLONE)
787 _, err = Wait4(int(pid), &status, int(flags), nil)
788 if err != EINTR {
789 break
790 }
791 }
792 if err != nil {
793 return err
794 }
795 796 return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
797 }
798 799 // Good: CLONE_PIDFD provided a pidfd. Reap the process and close the
800 // pidfd.
801 defer Close(int(pidfd))
802 803 // TODO(roland): this is necessary to prevent valgrind from complaining
804 // about passing 0x0 to waitid, which is doesn't like. This is clearly not
805 // ideal. The structures are copied (mostly) verbatim from syscall/unix,
806 // which we obviously cannot import because of an import loop.
807 808 const is64bit = ^uint(0) >> 63 // 0 for 32-bit hosts, 1 for 64-bit ones.
809 type sigInfo struct {
810 Signo int32
811 _ struct {
812 Errno int32
813 Code int32
814 } // Two int32 fields, swapped on MIPS.
815 _ [is64bit]int32 // Extra padding for 64-bit hosts only.
816 817 // End of common part. Beginning of signal-specific part.
818 819 Pid int32
820 Uid uint32
821 Status int32
822 823 // Pad to 128 bytes.
824 _ [128 - (6+is64bit)*4]byte
825 }
826 827 for {
828 const _P_PIDFD = 3
829 var info sigInfo
830 _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), uintptr(unsafe.Pointer(&info)), WEXITED|WCLONE, 0, 0)
831 if errno != EINTR {
832 break
833 }
834 }
835 if errno != 0 {
836 return errno
837 }
838 839 return nil
840 }
// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and
// child execution. This is a separate function so we can separate the child's
// and parent's stack frames if we're using vfork.
//
// This is go:noinline because the point is to keep the stack frames of this
// and os_checkClonePidfd separate.
//
// In the parent it returns the result of the clone call: the child's PID, or
// a non-zero errno on failure. The address in pidfd is handed to clone so the
// kernel can store the pidfd there. The child never returns; it exits
// immediately with status 0.
//
//go:noinline
func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
	// No exit signal is OR'ed into flags here (forkAndExecInChild1 adds
	// SIGCHLD); os_checkClonePidfd passes WCLONE when it reaps this child.
	flags := uintptr(CLONE_VFORK | CLONE_VM | CLONE_PIDFD)
	if runtime.GOARCH == "s390x" {
		// On Linux/s390, the first two arguments of clone(2) are swapped.
		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
	} else {
		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
	}
	if errno != 0 || pid != 0 {
		// If we're in the parent, we must return immediately
		// so we're not in the same stack frame as the child.
		// This can at most use the return PC, which the child
		// will not modify, and the results of
		// rawVforkSyscall, which must have been written after
		// the child was replaced.
		return
	}

	// In the child: exit right away. The loop guards against the exit
	// syscall ever returning.
	for {
		RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
	}
}
872