• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5//go:build linux
6
7package syscall
8
9import (
10	errpkg "errors"
11	"internal/itoa"
12	"runtime"
13	"unsafe"
14)
15
// Linux unshare/clone/clone2/clone3 flags, architecture-independent,
// copied from linux/sched.h.
const (
	CLONE_VM             = 0x00000100 // set if VM shared between processes
	CLONE_FS             = 0x00000200 // set if fs info shared between processes
	CLONE_FILES          = 0x00000400 // set if open files shared between processes
	CLONE_SIGHAND        = 0x00000800 // set if signal handlers and blocked signals shared
	CLONE_PIDFD          = 0x00001000 // set if a pidfd should be placed in parent
	CLONE_PTRACE         = 0x00002000 // set if we want to let tracing continue on the child too
	CLONE_VFORK          = 0x00004000 // set if the parent wants the child to wake it up on mm_release
	CLONE_PARENT         = 0x00008000 // set if we want to have the same parent as the cloner
	CLONE_THREAD         = 0x00010000 // Same thread group?
	CLONE_NEWNS          = 0x00020000 // New mount namespace group
	CLONE_SYSVSEM        = 0x00040000 // share system V SEM_UNDO semantics
	CLONE_SETTLS         = 0x00080000 // create a new TLS for the child
	CLONE_PARENT_SETTID  = 0x00100000 // set the TID in the parent
	CLONE_CHILD_CLEARTID = 0x00200000 // clear the TID in the child
	CLONE_DETACHED       = 0x00400000 // Unused, ignored
	CLONE_UNTRACED       = 0x00800000 // set if the tracing process can't force CLONE_PTRACE on this clone
	CLONE_CHILD_SETTID   = 0x01000000 // set the TID in the child
	CLONE_NEWCGROUP      = 0x02000000 // New cgroup namespace
	CLONE_NEWUTS         = 0x04000000 // New utsname namespace
	CLONE_NEWIPC         = 0x08000000 // New ipc namespace
	CLONE_NEWUSER        = 0x10000000 // New user namespace
	CLONE_NEWPID         = 0x20000000 // New pid namespace
	CLONE_NEWNET         = 0x40000000 // New network namespace
	CLONE_IO             = 0x80000000 // Clone io context

	// Flags for the clone3() syscall. These do not fit in 32 bits.

	CLONE_CLEAR_SIGHAND = 0x100000000 // Clear any signal handler and reset to SIG_DFL.
	CLONE_INTO_CGROUP   = 0x200000000 // Clone into a specific cgroup given the right permissions.

	// Cloning flags intersect with CSIGNAL so can be used with unshare and clone3
	// syscalls only:

	CLONE_NEWTIME = 0x00000080 // New time namespace
)
54
// SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
// See user_namespaces(7).
//
// Each entry is rendered by formatIDMappings as one
// "ContainerID HostID Size" line.
//
// Note that User Namespaces are not available on a number of popular Linux
// versions (due to security issues), or are available but subject to AppArmor
// restrictions like in Ubuntu 24.04.
type SysProcIDMap struct {
	ContainerID int // Container ID.
	HostID      int // Host ID.
	Size        int // Size.
}
66
67type SysProcAttr struct {
68	Chroot     string      // Chroot.
69	Credential *Credential // Credential.
70	// Ptrace tells the child to call ptrace(PTRACE_TRACEME).
71	// Call runtime.LockOSThread before starting a process with this set,
72	// and don't call UnlockOSThread until done with PtraceSyscall calls.
73	Ptrace bool
74	Setsid bool // Create session.
75	// Setpgid sets the process group ID of the child to Pgid,
76	// or, if Pgid == 0, to the new child's process ID.
77	Setpgid bool
78	// Setctty sets the controlling terminal of the child to
79	// file descriptor Ctty. Ctty must be a descriptor number
80	// in the child process: an index into ProcAttr.Files.
81	// This is only meaningful if Setsid is true.
82	Setctty bool
83	Noctty  bool // Detach fd 0 from controlling terminal.
84	Ctty    int  // Controlling TTY fd.
85	// Foreground places the child process group in the foreground.
86	// This implies Setpgid. The Ctty field must be set to
87	// the descriptor of the controlling TTY.
88	// Unlike Setctty, in this case Ctty must be a descriptor
89	// number in the parent process.
90	Foreground bool
91	Pgid       int // Child's process group ID if Setpgid.
92	// Pdeathsig, if non-zero, is a signal that the kernel will send to
93	// the child process when the creating thread dies. Note that the signal
94	// is sent on thread termination, which may happen before process termination.
95	// There are more details at https://go.dev/issue/27505.
96	Pdeathsig    Signal
97	Cloneflags   uintptr        // Flags for clone calls.
98	Unshareflags uintptr        // Flags for unshare calls.
99	UidMappings  []SysProcIDMap // User ID mappings for user namespaces.
100	GidMappings  []SysProcIDMap // Group ID mappings for user namespaces.
101	// GidMappingsEnableSetgroups enabling setgroups syscall.
102	// If false, then setgroups syscall will be disabled for the child process.
103	// This parameter is no-op if GidMappings == nil. Otherwise for unprivileged
104	// users this should be set to false for mappings work.
105	GidMappingsEnableSetgroups bool
106	AmbientCaps                []uintptr // Ambient capabilities.
107	UseCgroupFD                bool      // Whether to make use of the CgroupFD field.
108	CgroupFD                   int       // File descriptor of a cgroup to put the new process into.
109	// PidFD, if not nil, is used to store the pidfd of a child, if the
110	// functionality is supported by the kernel, or -1. Note *PidFD is
111	// changed only if the process starts successfully.
112	PidFD *int
113}
114
var (
	// none and slash are NUL-terminated byte strings handed to the
	// mount(2) call in forkAndExecInChild1 as its first two arguments.
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 take the clone3 path even when
	// no clone3-only feature is requested.
	forceClone3 = false // Used by unit tests only.
)
121
// Implemented in runtime package.
//
// These are body-less declarations. In this file runtime_BeforeFork is
// called immediately before the clone syscall, runtime_AfterFork in the
// parent once forkAndExecInChild1 has returned with locked set, and
// runtime_AfterForkInChild in the child before it continues its setup.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
126
127// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
128// If a dup or exec fails, write the errno error to pipe.
129// (Pipe is close-on-exec so if exec succeeds, it will be closed.)
130// In the child, this function must not acquire any locks, because
131// they might have been locked at the time of the fork. This means
132// no rescheduling, no malloc calls, and no new stack segments.
133// For the same reason compiler does not race instrument it.
134// The calls to RawSyscall are okay because they are assembly
135// functions that do not grow the stack.
136//
137//go:norace
138func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
139	// Set up and fork. This returns immediately in the parent or
140	// if there's an error.
141	upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
142	if locked {
143		runtime_AfterFork()
144	}
145	if err != 0 {
146		return 0, err
147	}
148
149	// parent; return PID
150	pid = int(upid)
151	if sys.PidFD != nil {
152		*sys.PidFD = int(pidfd)
153	}
154
155	if sys.UidMappings != nil || sys.GidMappings != nil {
156		Close(mapPipe[0])
157		var err2 Errno
158		// uid/gid mappings will be written after fork and unshare(2) for user
159		// namespaces.
160		if sys.Unshareflags&CLONE_NEWUSER == 0 {
161			if err := writeUidGidMappings(pid, sys); err != nil {
162				err2 = err.(Errno)
163			}
164		}
165		RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
166		Close(mapPipe[1])
167	}
168
169	return pid, 0
170}
171
// _LINUX_CAPABILITY_VERSION_3 is the version value stored in
// capHeader.version before the capget/capset calls in forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
173
// capHeader is the header argument passed to the capget/capset syscalls.
type capHeader struct {
	version uint32
	pid     int32
}
178
// capData is one 32-bit slice of the effective, permitted and inheritable
// capability sets exchanged with the capget/capset syscalls.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
184type caps struct {
185	hdr  capHeader
186	data [2]capData
187}
188
// capToIndex returns the index of the 32-bit capability word that holds
// capability number cap (cap / 32).
// See CAP_TO_INDEX in linux/capability.h:
func capToIndex(cap uintptr) uintptr { return cap >> 5 }
191
// capToMask returns the bit mask for capability number cap within its
// 32-bit capability word (bit cap % 32).
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
194
// cloneArgs holds arguments for clone3 Linux syscall.
// forkAndExecInChild1 passes a pointer to it together with its size.
type cloneArgs struct {
	flags      uint64 // Flags bit mask
	pidFD      uint64 // Where to store PID file descriptor (int *)
	childTID   uint64 // Where to store child TID, in child's memory (pid_t *)
	parentTID  uint64 // Where to store child TID, in parent's memory (pid_t *)
	exitSignal uint64 // Signal to deliver to parent on child termination
	stack      uint64 // Pointer to lowest byte of stack
	stackSize  uint64 // Size of stack
	tls        uint64 // Location of new TLS
	setTID     uint64 // Pointer to a pid_t array (since Linux 5.5)
	setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
	cgroup     uint64 // File descriptor for target cgroup of child (since Linux 5.7)
}
209
210// forkAndExecInChild1 implements the body of forkAndExecInChild up to
211// the parent's post-fork path. This is a separate function so we can
212// separate the child's and parent's stack frames if we're using
213// vfork.
214//
215// This is go:noinline because the point is to keep the stack frames
216// of this and forkAndExecInChild separate.
217//
218//go:noinline
219//go:norace
220//go:nocheckptr
221func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
222	// Defined in linux/prctl.h starting with Linux 4.3.
223	const (
224		PR_CAP_AMBIENT       = 0x2f
225		PR_CAP_AMBIENT_RAISE = 0x2
226	)
227
228	// vfork requires that the child not touch any of the parent's
229	// active stack frames. Hence, the child does all post-fork
230	// processing in this stack frame and never returns, while the
231	// parent returns immediately from this frame and does all
232	// post-fork processing in the outer frame.
233	//
234	// Declare all variables at top in case any
235	// declarations require heap allocation (e.g., err2).
236	// ":=" should not be used to declare any variable after
237	// the call to runtime_BeforeFork.
238	//
239	// NOTE(bcmills): The allocation behavior described in the above comment
240	// seems to lack a corresponding test, and it may be rendered invalid
241	// by an otherwise-correct change in the compiler.
242	var (
243		err2                      Errno
244		nextfd                    int
245		i                         int
246		caps                      caps
247		fd1, flags                uintptr
248		puid, psetgroups, pgid    []byte
249		uidmap, setgroups, gidmap []byte
250		clone3                    *cloneArgs
251		pgrp                      int32
252		dirfd                     int
253		cred                      *Credential
254		ngroups, groups           uintptr
255		c                         uintptr
256	)
257	pidfd = -1
258
259	rlim := origRlimitNofile.Load()
260
261	if sys.UidMappings != nil {
262		puid = []byte("/proc/self/uid_map\000")
263		uidmap = formatIDMappings(sys.UidMappings)
264	}
265
266	if sys.GidMappings != nil {
267		psetgroups = []byte("/proc/self/setgroups\000")
268		pgid = []byte("/proc/self/gid_map\000")
269
270		if sys.GidMappingsEnableSetgroups {
271			setgroups = []byte("allow\000")
272		} else {
273			setgroups = []byte("deny\000")
274		}
275		gidmap = formatIDMappings(sys.GidMappings)
276	}
277
278	// Record parent PID so child can test if it has died.
279	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
280
281	// Guard against side effects of shuffling fds below.
282	// Make sure that nextfd is beyond any currently open files so
283	// that we can't run the risk of overwriting any of them.
284	fd := make([]int, len(attr.Files))
285	nextfd = len(attr.Files)
286	for i, ufd := range attr.Files {
287		if nextfd < int(ufd) {
288			nextfd = int(ufd)
289		}
290		fd[i] = int(ufd)
291	}
292	nextfd++
293
294	// Allocate another pipe for parent to child communication for
295	// synchronizing writing of User ID/Group ID mappings.
296	if sys.UidMappings != nil || sys.GidMappings != nil {
297		if err := forkExecPipe(mapPipe[:]); err != nil {
298			err1 = err.(Errno)
299			return
300		}
301	}
302
303	flags = sys.Cloneflags
304	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
305		flags |= CLONE_VFORK | CLONE_VM
306	}
307	if sys.PidFD != nil {
308		flags |= CLONE_PIDFD
309	}
310	// Whether to use clone3.
311	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
312		clone3 = &cloneArgs{
313			flags:      uint64(flags),
314			exitSignal: uint64(SIGCHLD),
315		}
316		if sys.UseCgroupFD {
317			clone3.flags |= CLONE_INTO_CGROUP
318			clone3.cgroup = uint64(sys.CgroupFD)
319		}
320		if sys.PidFD != nil {
321			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
322		}
323	}
324
325	// About to call fork.
326	// No more allocation or calls of non-assembly functions.
327	runtime_BeforeFork()
328	locked = true
329	if clone3 != nil {
330		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
331	} else {
332		// N.B. Keep in sync with doCheckClonePidfd.
333		flags |= uintptr(SIGCHLD)
334		if runtime.GOARCH == "s390x" {
335			// On Linux/s390, the first two arguments of clone(2) are swapped.
336			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
337		} else {
338			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
339		}
340	}
341	if err1 != 0 || pid != 0 {
342		// If we're in the parent, we must return immediately
343		// so we're not in the same stack frame as the child.
344		// This can at most use the return PC, which the child
345		// will not modify, and the results of
346		// rawVforkSyscall, which must have been written after
347		// the child was replaced.
348		return
349	}
350
351	// Fork succeeded, now in child.
352
353	// Enable the "keep capabilities" flag to set ambient capabilities later.
354	if len(sys.AmbientCaps) > 0 {
355		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
356		if err1 != 0 {
357			goto childerror
358		}
359	}
360
361	// Wait for User ID/Group ID mappings to be written.
362	if sys.UidMappings != nil || sys.GidMappings != nil {
363		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
364			goto childerror
365		}
366		pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
367		if err1 != 0 {
368			goto childerror
369		}
370		if pid != unsafe.Sizeof(err2) {
371			err1 = EINVAL
372			goto childerror
373		}
374		if err2 != 0 {
375			err1 = err2
376			goto childerror
377		}
378	}
379
380	// Session ID
381	if sys.Setsid {
382		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
383		if err1 != 0 {
384			goto childerror
385		}
386	}
387
388	// Set process group
389	if sys.Setpgid || sys.Foreground {
390		// Place child in process group.
391		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
392		if err1 != 0 {
393			goto childerror
394		}
395	}
396
397	if sys.Foreground {
398		pgrp = int32(sys.Pgid)
399		if pgrp == 0 {
400			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
401
402			pgrp = int32(pid)
403		}
404
405		// Place process group in foreground.
406		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
407		if err1 != 0 {
408			goto childerror
409		}
410	}
411
412	// Restore the signal mask. We do this after TIOCSPGRP to avoid
413	// having the kernel send a SIGTTOU signal to the process group.
414	runtime_AfterForkInChild()
415
416	// Unshare
417	if sys.Unshareflags != 0 {
418		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
419		if err1 != 0 {
420			goto childerror
421		}
422
423		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
424			dirfd = int(_AT_FDCWD)
425			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
426				goto childerror
427			}
428			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
429			if err1 != 0 {
430				goto childerror
431			}
432			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
433				goto childerror
434			}
435
436			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
437				goto childerror
438			}
439			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
440			if err1 != 0 {
441				goto childerror
442			}
443			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
444				goto childerror
445			}
446		}
447
448		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
449			dirfd = int(_AT_FDCWD)
450			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
451				goto childerror
452			}
453			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
454			if err1 != 0 {
455				goto childerror
456			}
457			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
458				goto childerror
459			}
460		}
461
462		// The unshare system call in Linux doesn't unshare mount points
463		// mounted with --shared. Systemd mounts / with --shared. For a
464		// long discussion of the pros and cons of this see debian bug 739593.
465		// The Go model of unsharing is more like Plan 9, where you ask
466		// to unshare and the namespaces are unconditionally unshared.
467		// To make this model work we must further mark / as MS_PRIVATE.
468		// This is what the standard unshare command does.
469		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
470			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
471			if err1 != 0 {
472				goto childerror
473			}
474		}
475	}
476
477	// Chroot
478	if chroot != nil {
479		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
480		if err1 != 0 {
481			goto childerror
482		}
483	}
484
485	// User and groups
486	if cred = sys.Credential; cred != nil {
487		ngroups = uintptr(len(cred.Groups))
488		groups = uintptr(0)
489		if ngroups > 0 {
490			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
491		}
492		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
493			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
494			if err1 != 0 {
495				goto childerror
496			}
497		}
498		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
499		if err1 != 0 {
500			goto childerror
501		}
502		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
503		if err1 != 0 {
504			goto childerror
505		}
506	}
507
508	if len(sys.AmbientCaps) != 0 {
509		// Ambient capabilities were added in the 4.3 kernel,
510		// so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
511		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
512
513		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
514			goto childerror
515		}
516
517		for _, c = range sys.AmbientCaps {
518			// Add the c capability to the permitted and inheritable capability mask,
519			// otherwise we will not be able to add it to the ambient capability mask.
520			caps.data[capToIndex(c)].permitted |= capToMask(c)
521			caps.data[capToIndex(c)].inheritable |= capToMask(c)
522		}
523
524		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
525			goto childerror
526		}
527
528		for _, c = range sys.AmbientCaps {
529			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
530			if err1 != 0 {
531				goto childerror
532			}
533		}
534	}
535
536	// Chdir
537	if dir != nil {
538		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
539		if err1 != 0 {
540			goto childerror
541		}
542	}
543
544	// Parent death signal
545	if sys.Pdeathsig != 0 {
546		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
547		if err1 != 0 {
548			goto childerror
549		}
550
551		// Signal self if parent is already dead. This might cause a
552		// duplicate signal in rare cases, but it won't matter when
553		// using SIGKILL.
554		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
555		if pid != ppid {
556			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
557			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
558			if err1 != 0 {
559				goto childerror
560			}
561		}
562	}
563
564	// Pass 1: look for fd[i] < i and move those up above len(fd)
565	// so that pass 2 won't stomp on an fd it needs later.
566	if pipe < nextfd {
567		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
568		if err1 != 0 {
569			goto childerror
570		}
571		pipe = nextfd
572		nextfd++
573	}
574	for i = 0; i < len(fd); i++ {
575		if fd[i] >= 0 && fd[i] < i {
576			if nextfd == pipe { // don't stomp on pipe
577				nextfd++
578			}
579			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
580			if err1 != 0 {
581				goto childerror
582			}
583			fd[i] = nextfd
584			nextfd++
585		}
586	}
587
588	// Pass 2: dup fd[i] down onto i.
589	for i = 0; i < len(fd); i++ {
590		if fd[i] == -1 {
591			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
592			continue
593		}
594		if fd[i] == i {
595			// dup2(i, i) won't clear close-on-exec flag on Linux,
596			// probably not elsewhere either.
597			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
598			if err1 != 0 {
599				goto childerror
600			}
601			continue
602		}
603		// The new fd is created NOT close-on-exec,
604		// which is exactly what we want.
605		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
606		if err1 != 0 {
607			goto childerror
608		}
609	}
610
611	// By convention, we don't close-on-exec the fds we are
612	// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
613	// Programs that know they inherit fds >= 3 will need
614	// to set them close-on-exec.
615	for i = len(fd); i < 3; i++ {
616		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
617	}
618
619	// Detach fd 0 from tty
620	if sys.Noctty {
621		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
622		if err1 != 0 {
623			goto childerror
624		}
625	}
626
627	// Set the controlling TTY to Ctty
628	if sys.Setctty {
629		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
630		if err1 != 0 {
631			goto childerror
632		}
633	}
634
635	// Restore original rlimit.
636	if rlim != nil {
637		rawSetrlimit(RLIMIT_NOFILE, rlim)
638	}
639
640	// Enable tracing if requested.
641	// Do this right before exec so that we don't unnecessarily trace the runtime
642	// setting up after the fork. See issue #21428.
643	if sys.Ptrace {
644		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
645		if err1 != 0 {
646			goto childerror
647		}
648	}
649
650	// Time to exec.
651	_, _, err1 = RawSyscall(SYS_EXECVE,
652		uintptr(unsafe.Pointer(argv0)),
653		uintptr(unsafe.Pointer(&argv[0])),
654		uintptr(unsafe.Pointer(&envv[0])))
655
656childerror:
657	// send error code on pipe
658	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
659	for {
660		RawSyscall(SYS_EXIT, 253, 0, 0)
661	}
662}
663
664func formatIDMappings(idMap []SysProcIDMap) []byte {
665	var data []byte
666	for _, im := range idMap {
667		data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
668	}
669	return data
670}
671
672// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
673func writeIDMappings(path string, idMap []SysProcIDMap) error {
674	fd, err := Open(path, O_RDWR, 0)
675	if err != nil {
676		return err
677	}
678
679	if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
680		Close(fd)
681		return err
682	}
683
684	if err := Close(fd); err != nil {
685		return err
686	}
687
688	return nil
689}
690
691// writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
692// and "allow" if enable is true.
693// This is needed since kernel 3.19, because you can't write gid_map without
694// disabling setgroups() system call.
695func writeSetgroups(pid int, enable bool) error {
696	sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
697	fd, err := Open(sgf, O_RDWR, 0)
698	if err != nil {
699		return err
700	}
701
702	var data []byte
703	if enable {
704		data = []byte("allow")
705	} else {
706		data = []byte("deny")
707	}
708
709	if _, err := Write(fd, data); err != nil {
710		Close(fd)
711		return err
712	}
713
714	return Close(fd)
715}
716
717// writeUidGidMappings writes User ID and Group ID mappings for user namespaces
718// for a process and it is called from the parent process.
719func writeUidGidMappings(pid int, sys *SysProcAttr) error {
720	if sys.UidMappings != nil {
721		uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
722		if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
723			return err
724		}
725	}
726
727	if sys.GidMappings != nil {
728		// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
729		if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
730			return err
731		}
732		gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
733		if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
734			return err
735		}
736	}
737
738	return nil
739}
740
741// forkAndExecFailureCleanup cleans up after an exec failure.
742func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
743	if sys.PidFD != nil && *sys.PidFD != -1 {
744		Close(*sys.PidFD)
745		*sys.PidFD = -1
746	}
747}
748
749// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a
750// clone.
751//
752//go:linkname os_checkClonePidfd os.checkClonePidfd
753func os_checkClonePidfd() error {
754	pidfd := int32(-1)
755	pid, errno := doCheckClonePidfd(&pidfd)
756	if errno != 0 {
757		return errno
758	}
759
760	if pidfd == -1 {
761		// Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process
762		// before returning.
763
764		var err error
765		for {
766			var status WaitStatus
767			_, err = Wait4(int(pid), &status, 0, nil)
768			if err != EINTR {
769				break
770			}
771		}
772		if err != nil {
773			return err
774		}
775
776		return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
777	}
778
779	// Good: CLONE_PIDFD provided a pidfd. Reap the process and close the
780	// pidfd.
781	defer Close(int(pidfd))
782
783	for {
784		const _P_PIDFD = 3
785		_, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0)
786		if errno != EINTR {
787			break
788		}
789	}
790	if errno != 0 {
791		return errno
792	}
793
794	return nil
795}
796
797// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and
798// child execution. This is a separate function so we can separate the child's
799// and parent's stack frames if we're using vfork.
800//
801// This is go:noinline because the point is to keep the stack frames of this
802// and os_checkClonePidfd separate.
803//
804//go:noinline
805func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
806	flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD)
807	if runtime.GOARCH == "s390x" {
808		// On Linux/s390, the first two arguments of clone(2) are swapped.
809		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
810	} else {
811		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
812	}
813	if errno != 0 || pid != 0 {
814		// If we're in the parent, we must return immediately
815		// so we're not in the same stack frame as the child.
816		// This can at most use the return PC, which the child
817		// will not modify, and the results of
818		// rawVforkSyscall, which must have been written after
819		// the child was replaced.
820		return
821	}
822
823	for {
824		RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
825	}
826}
827