• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Implementation of the sandbox2::ForkServer class.
16 
17 #include "sandboxed_api/sandbox2/forkserver.h"
18 
19 #include <fcntl.h>
20 #include <linux/filter.h>
21 #include <linux/seccomp.h>
22 #include <sched.h>
23 #include <sys/prctl.h>
24 #include <sys/resource.h>
25 #include <sys/socket.h>
26 #include <sys/uio.h>
27 #include <sys/wait.h>
28 #include <syscall.h>
29 #include <unistd.h>
30 
31 #include <cerrno>
32 #include <csignal>
33 #include <cstdint>
34 #include <cstdlib>
35 #include <cstring>
36 #include <fstream>
37 #include <initializer_list>
38 #include <string>
39 #include <utility>
40 #include <vector>
41 
42 #include "absl/base/attributes.h"
43 #include "absl/container/flat_hash_map.h"
44 #include "absl/container/flat_hash_set.h"
45 #include "absl/status/status.h"
46 #include "absl/status/statusor.h"
47 #include "absl/strings/match.h"
48 #include "absl/strings/str_cat.h"
49 #include "absl/strings/str_join.h"
50 #include "absl/strings/str_split.h"
51 #include "absl/strings/string_view.h"
52 #include "sys/capability.h" // AOSP: match libcap exported includes
53 #include "sandboxed_api/sandbox2/client.h"
54 #include "sandboxed_api/sandbox2/comms.h"
55 #include "sandboxed_api/sandbox2/fork_client.h"
56 #include "sandboxed_api/sandbox2/forkserver.pb.h"
57 #include "sandboxed_api/sandbox2/namespace.h"
58 #include "sandboxed_api/sandbox2/policy.h"
59 #include "sandboxed_api/sandbox2/sanitizer.h"
60 #include "sandboxed_api/sandbox2/syscall.h"
61 #include "sandboxed_api/sandbox2/util.h"
62 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
63 #include "sandboxed_api/util/fileops.h"
64 #include "sandboxed_api/util/raw_logging.h"
65 #include "sandboxed_api/util/strerror.h"
66 
67 namespace sandbox2 {
68 namespace {
69 
70 using ::sapi::StrError;
71 using ::sapi::file_util::fileops::FDCloser;
72 
73 // "Moves" FDs in move_fds from current to target FD number while keeping FDs
74 // in keep_fds open - potentially moving them to another FD number as well in
75 // case of colisions.
76 // Ignores invalid (-1) fds.
MoveFDs(std::initializer_list<std::pair<int *,int>> move_fds,std::initializer_list<int * > keep_fds)77 void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
78              std::initializer_list<int*> keep_fds) {
79   absl::flat_hash_map<int, int*> fd_map;
80   for (int* fd : keep_fds) {
81     if (*fd != -1) {
82       fd_map.emplace(*fd, fd);
83     }
84   }
85 
86   for (auto [old_fd, new_fd] : move_fds) {
87     if (*old_fd != -1) {
88       fd_map.emplace(*old_fd, old_fd);
89     }
90   }
91 
92   for (auto [old_fd, new_fd] : move_fds) {
93     if (*old_fd == -1 || *old_fd == new_fd) {
94       continue;
95     }
96 
97     // Make sure we won't override another fd
98     if (auto it = fd_map.find(new_fd); it != fd_map.end()) {
99       int fd = dup(new_fd);
100       SAPI_RAW_CHECK(fd != -1, "Duplicating an FD failed.");
101       *it->second = fd;
102       fd_map.emplace(fd, it->second);
103       fd_map.erase(it);
104     }
105 
106     if (dup2(*old_fd, new_fd) == -1) {
107       SAPI_RAW_PLOG(FATAL, "Moving temporary to proper FD failed.");
108     }
109 
110     close(*old_fd);
111     fd_map.erase(*old_fd);
112     *old_fd = new_fd;
113   }
114 }
115 
116 struct Pipe {
117   FDCloser read;
118   FDCloser write;
119 };
120 
CreatePipe()121 Pipe CreatePipe() {
122   int pfds[2];
123   SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating pipe");
124   return {FDCloser(pfds[0]), FDCloser(pfds[1])};
125 }
126 
RunInitProcess(pid_t main_pid,FDCloser pipe_fd)127 ABSL_ATTRIBUTE_NORETURN void RunInitProcess(pid_t main_pid, FDCloser pipe_fd) {
128   if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
129     SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
130   }
131 
132   // Clear SA_NOCLDWAIT.
133   struct sigaction sa;
134   sa.sa_handler = SIG_DFL;
135   sa.sa_flags = 0;
136   sigemptyset(&sa.sa_mask);
137   SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
138                  "clearing SA_NOCLDWAIT");
139 
140   // Apply seccomp.
141   std::vector<sock_filter> code = {
142       LOAD_ARCH,
143       JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),
144 
145       LOAD_SYSCALL_NR,
146       SYSCALL(__NR_waitid, ALLOW),
147       SYSCALL(__NR_exit, ALLOW),
148   };
149   if (pipe_fd.get() >= 0) {
150     code.insert(code.end(),
151                 {SYSCALL(__NR_getrusage, ALLOW), SYSCALL(__NR_write, ALLOW)});
152   }
153   code.push_back(DENY);
154 
155   struct sock_fprog prog{
156       .len = static_cast<uint16_t>(code.size()),
157       .filter = code.data(),
158   };
159 
160   SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
161                  "Denying new privs");
162   SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0, "Dropping caps");
163   SAPI_RAW_CHECK(
164       syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
165               reinterpret_cast<uintptr_t>(&prog)) == 0,
166       "Enabling seccomp filter");
167 
168   siginfo_t info;
169   // Reap children.
170   for (;;) {
171     int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
172     if (rv != 0) {
173       _exit(1);
174     }
175 
176     if (info.si_pid == main_pid) {
177       if (pipe_fd.get() >= 0) {
178         (void)write(pipe_fd.get(), &info.si_code, sizeof(info.si_code));
179         (void)write(pipe_fd.get(), &info.si_status, sizeof(info.si_status));
180 
181         rusage usage{};
182         getrusage(RUSAGE_CHILDREN, &usage);
183         (void)write(pipe_fd.get(), &usage, sizeof(usage));
184       }
185       _exit(0);
186     }
187   }
188 }
189 
SendPid(int signaling_fd)190 absl::Status SendPid(int signaling_fd) {
191   // Send our PID (the actual sandboxee process) via SCM_CREDENTIALS.
192   // The ancillary message will be attached to the message as SO_PASSCRED is set
193   // on the socket.
194   char dummy = ' ';
195   if (TEMP_FAILURE_RETRY(send(signaling_fd, &dummy, 1, 0)) != 1) {
196     return absl::ErrnoToStatus(errno, "Sending PID: send()");
197   }
198   return absl::OkStatus();
199 }
200 
ReceivePid(int signaling_fd)201 absl::StatusOr<pid_t> ReceivePid(int signaling_fd) {
202   union {
203     struct cmsghdr cmh;
204     char ctrl[CMSG_SPACE(sizeof(struct ucred))];
205   } ucred_msg{};
206 
207   struct msghdr msgh{};
208   struct iovec iov{};
209 
210   msgh.msg_iov = &iov;
211   msgh.msg_iovlen = 1;
212   msgh.msg_control = ucred_msg.ctrl;
213   msgh.msg_controllen = sizeof(ucred_msg);
214 
215   char dummy;
216   iov.iov_base = &dummy;
217   iov.iov_len = sizeof(char);
218 
219   if (TEMP_FAILURE_RETRY(recvmsg(signaling_fd, &msgh, MSG_WAITALL)) != 1) {
220     return absl::ErrnoToStatus(errno, "Receiving pid failed: recvmsg");
221   }
222   struct cmsghdr* cmsgp = CMSG_FIRSTHDR(&msgh);
223   if (cmsgp->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
224       cmsgp->cmsg_level != SOL_SOCKET || cmsgp->cmsg_type != SCM_CREDENTIALS) {
225     return absl::InternalError("Receiving pid failed");
226   }
227   auto* ucredp = reinterpret_cast<struct ucred*>(CMSG_DATA(cmsgp));
228   return ucredp->pid;
229 }
230 
GetRootMountId(const std::string & proc_id)231 absl::StatusOr<std::string> GetRootMountId(const std::string& proc_id) {
232   std::ifstream mounts(absl::StrCat("/proc/", proc_id, "/mountinfo"));
233   if (!mounts.good()) {
234     return absl::InternalError("Failed to open mountinfo");
235   }
236   std::string line;
237   while (std::getline(mounts, line)) {
238     std::vector<absl::string_view> parts =
239         absl::StrSplit(line, absl::MaxSplits(' ', 4));
240     if (parts.size() >= 4 && parts[3] == "/") {
241       return std::string(parts[0]);
242     }
243   }
244   return absl::NotFoundError("Root entry not found in mountinfo");
245 }
246 
IsLikelyChrooted()247 bool IsLikelyChrooted() {
248   absl::StatusOr<std::string> self_root_id = GetRootMountId("self");
249   if (!self_root_id.ok()) {
250     return absl::IsNotFound(self_root_id.status());
251   }
252   absl::StatusOr<std::string> init_root_id = GetRootMountId("1");
253   if (!init_root_id.ok()) {
254     return false;
255   }
256   return *self_root_id != *init_root_id;
257 }
258 
259 }  // namespace
260 
PrepareExecveArgs(const ForkRequest & request,std::vector<std::string> * args,std::vector<std::string> * envp)261 void ForkServer::PrepareExecveArgs(const ForkRequest& request,
262                                    std::vector<std::string>* args,
263                                    std::vector<std::string>* envp) {
264   // Prepare arguments for execve.
265   for (const auto& arg : request.args()) {
266     args->push_back(arg);
267   }
268 
269   // Prepare environment variables for execve.
270   for (const auto& env : request.envs()) {
271     envp->push_back(env);
272   }
273 
274   // The child process should not start any fork-servers.
275   envp->push_back(absl::StrCat(kForkServerDisableEnv, "=1"));
276 
277   constexpr char kSapiVlogLevel[] = "SAPI_VLOG_LEVEL";
278   char* sapi_vlog = getenv(kSapiVlogLevel);
279   if (sapi_vlog && strlen(sapi_vlog) > 0) {
280     envp->push_back(absl::StrCat(kSapiVlogLevel, "=", sapi_vlog));
281   }
282 
283   SAPI_RAW_VLOG(1, "Will execute args:['%s'], environment:['%s']",
284                 absl::StrJoin(*args, "', '").c_str(),
285                 absl::StrJoin(*envp, "', '").c_str());
286 }
287 
LaunchChild(const ForkRequest & request,int execve_fd,uid_t uid,gid_t gid,FDCloser signaling_fd,FDCloser status_fd,bool avoid_pivot_root) const288 void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
289                              uid_t uid, gid_t gid, FDCloser signaling_fd,
290                              FDCloser status_fd, bool avoid_pivot_root) const {
291   SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
292                  "Forkserver mode is unspecified");
293 
294   const bool will_execve = execve_fd != -1;
295   const bool should_sandbox = request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX;
296 
297   absl::StatusOr<absl::flat_hash_set<int>> open_fds = sanitizer::GetListOfFDs();
298   if (!open_fds.ok()) {
299     SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs: %s",
300                  std::string(open_fds.status().message()).c_str());
301     open_fds = absl::flat_hash_set<int>();
302   }
303   SanitizeEnvironment();
304 
305   InitializeNamespaces(request, uid, gid, avoid_pivot_root);
306 
307   auto caps = cap_init();
308   SAPI_RAW_CHECK(cap_set_proc(caps) == 0, "while dropping capabilities");
309   cap_free(caps);
310 
311   // A custom init process is only needed if a new PID NS is created.
312   if (request.clone_flags() & CLONE_NEWPID) {
313     // Spawn a child process
314     pid_t child = util::ForkWithFlags(SIGCHLD);
315     if (child < 0) {
316       SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
317     }
318     if (child != 0) {
319       if (status_fd.get() >= 0) {
320         open_fds->erase(status_fd.get());
321       }
322       // Close all open fds (equals to CloseAllFDsExcept but does not require
323       // /proc to be available).
324       for (const auto& fd : *open_fds) {
325         close(fd);
326       }
327       RunInitProcess(child, std::move(status_fd));
328     }
329     // Send sandboxee pid
330     auto status = SendPid(signaling_fd.get());
331     SAPI_RAW_CHECK(status.ok(),
332                    absl::StrCat("sending pid: ", status.message()).c_str());
333   }
334   signaling_fd.Close();
335   status_fd.Close();
336 
337   Client client(comms_);
338   client.allow_speculation_ = request.allow_speculation();
339 
340   // Prepare the arguments before sandboxing (if needed), as doing it after
341   // sandoxing can cause syscall violations (e.g. related to memory management).
342   std::vector<std::string> args;
343   std::vector<std::string> envs;
344   if (will_execve) {
345     PrepareExecveArgs(request, &args, &envs);
346   }
347 
348   // Sandboxing can be enabled either here - just before execve, or somewhere
349   // inside the executed binary (e.g. after basic structures have been
350   // initialized, and resources acquired). In the latter case, it's up to the
351   // sandboxed binary to establish proper Comms channel (using
352   // Comms::kSandbox2ClientCommsFD) and call sandbox2::Client::SandboxMeHere()
353   if (should_sandbox) {
354     // The following client calls are basically SandboxMeHere. We split it so
355     // that we can set up the envp after we received the file descriptors but
356     // before we enable the syscall filter.
357     client.PrepareEnvironment(&execve_fd);
358     if (comms_->GetConnectionFD() != Comms::kSandbox2ClientCommsFD) {
359       envs.push_back(absl::StrCat(Comms::kSandbox2CommsFDEnvVar, "=",
360                                   comms_->GetConnectionFD()));
361     }
362     envs.push_back(client.GetFdMapEnvVar());
363   }
364 
365   // Convert args and envs before enabling sandbox (it'll allocate which might
366   // be blocked).
367   util::CharPtrArray argv = util::CharPtrArray::FromStringVector(args);
368   util::CharPtrArray envp = util::CharPtrArray::FromStringVector(envs);
369 
370   if (should_sandbox) {
371     client.EnableSandbox();
372   }
373 
374   if (will_execve) {
375     ExecuteProcess(execve_fd, argv.data(), envp.data());
376   }
377 }
378 
ServeRequest()379 pid_t ForkServer::ServeRequest() {
380   ForkRequest fork_request;
381   if (!comms_->RecvProtoBuf(&fork_request)) {
382     if (comms_->IsTerminated()) {
383       return -1;
384     }
385     SAPI_RAW_LOG(FATAL, "Failed to receive ForkServer request");
386   }
387   int comms_fd;
388   SAPI_RAW_CHECK(comms_->RecvFD(&comms_fd), "Failed to receive Comms FD");
389 
390   SAPI_RAW_CHECK(fork_request.mode() != FORKSERVER_FORK_UNSPECIFIED,
391                  "Forkserver mode is unspecified");
392 
393   int exec_fd = -1;
394   if (fork_request.mode() == FORKSERVER_FORK_EXECVE ||
395       fork_request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX) {
396     SAPI_RAW_CHECK(comms_->RecvFD(&exec_fd), "Failed to receive Exec FD");
397   }
398 
399   // Make the kernel notify us with SIGCHLD when the process terminates.
400   // We use sigaction(SIGCHLD, flags=SA_NOCLDWAIT) in combination with
401   // this to make sure the zombie process is reaped immediately.
402   int clone_flags = fork_request.clone_flags() | SIGCHLD;
403 
404   // Store uid and gid since they will change if CLONE_NEWUSER is set.
405   uid_t uid = getuid();
406   uid_t gid = getgid();
407 
408   Pipe pipe_fds;
409   if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
410     pipe_fds = CreatePipe();
411   }
412 
413   int socketpair_fds[2];
414   SAPI_RAW_PCHECK(
415       socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
416       "creating signaling socketpair");
417   for (int i = 0; i < 2; ++i) {
418     int val = 1;
419     SAPI_RAW_PCHECK(setsockopt(socketpair_fds[i], SOL_SOCKET, SO_PASSCRED, &val,
420                                sizeof(val)) == 0,
421                     "setsockopt failed");
422   }
423 
424   FDCloser signaling_fds[] = {FDCloser(socketpair_fds[0]),
425                               FDCloser(socketpair_fds[1])};
426 
427   // Note: init_pid will be overwritten with the actual init pid if the init
428   //       process was started or stays at 0 if that is not needed - no pidns.
429   pid_t init_pid = 0;
430   pid_t sandboxee_pid = -1;
431   bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
432   if (avoid_pivot_root) {
433     // Create initial namespaces only when they're first needed.
434     // This allows sandbox2 to be still used without any namespaces support
435     if (initial_mntns_fd_ == -1) {
436       CreateInitialNamespaces();
437     }
438     if (fork_request.netns_mode() == NETNS_MODE_SHARED_PER_FORKSERVER &&
439         initial_netns_fd_ == -1) {
440       CreateForkserverSharedNetworkNamespace();
441     }
442     // We first just fork a child, which will join the initial namespaces
443     // Note: Not a regular fork() as one really needs to be single-threaded to
444     //       setns and this is not the case with TSAN.
445     pid_t pid = util::ForkWithFlags(SIGCHLD);
446     SAPI_RAW_PCHECK(pid != -1, "fork failed");
447     if (pid == 0) {
448       SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
449                       "joining initial user namespace");
450       SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
451                       "joining initial mnt namespace");
452       if (fork_request.netns_mode() == NETNS_MODE_SHARED_PER_FORKSERVER) {
453         SAPI_RAW_PCHECK(setns(initial_netns_fd_, CLONE_NEWNET) != -1,
454                         "joining initial net namespace");
455         close(initial_netns_fd_);
456       }
457       close(initial_userns_fd_);
458       close(initial_mntns_fd_);
459       // Do not create new userns it will be unshared later
460       sandboxee_pid =
461           util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
462       if (sandboxee_pid == -1) {
463         SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
464       }
465       if (sandboxee_pid != 0) {
466         _exit(0);
467       }
468       // Send sandboxee pid
469       absl::Status status = SendPid(signaling_fds[1].get());
470       SAPI_RAW_CHECK(status.ok(),
471                      absl::StrCat("sending pid: ", status.message()).c_str());
472     }
473   } else {
474     sandboxee_pid = util::ForkWithFlags(clone_flags);
475     if (sandboxee_pid == -1) {
476       SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
477     }
478     if (sandboxee_pid == 0) {
479       close(initial_userns_fd_);
480       close(initial_mntns_fd_);
481     }
482   }
483 
484   // Child.
485   if (sandboxee_pid == 0) {
486     signaling_fds[0].Close();
487     pipe_fds.read.Close();
488     // Make sure we override the forkserver's comms fd
489     comms_->Terminate();
490     if (exec_fd != -1) {
491       int signaling_fd = signaling_fds[1].Release();
492       int pipe_fd = pipe_fds.write.Release();
493       MoveFDs({{&exec_fd, Comms::kSandbox2TargetExecFD},
494                {&comms_fd, Comms::kSandbox2ClientCommsFD}},
495               {&signaling_fd, &pipe_fd});
496       signaling_fds[1] = FDCloser(signaling_fd);
497       pipe_fds.write = FDCloser(pipe_fd);
498     }
499     *comms_ = Comms(comms_fd);
500     LaunchChild(fork_request, exec_fd, uid, gid, std::move(signaling_fds[1]),
501                 std::move(pipe_fds.write), avoid_pivot_root);
502     return sandboxee_pid;
503   }
504 
505   signaling_fds[1].Close();
506 
507   if (avoid_pivot_root) {
508     if (auto pid = ReceivePid(signaling_fds[0].get()); !pid.ok()) {
509       SAPI_RAW_LOG(ERROR, "%s", std::string(pid.status().message()).c_str());
510     } else {
511       sandboxee_pid = pid.value();
512     }
513   }
514 
515   if (fork_request.clone_flags() & CLONE_NEWPID) {
516     // The pid of the init process is equal to the child process that we've
517     // previously forked.
518     init_pid = sandboxee_pid;
519     sandboxee_pid = -1;
520     // And the actual sandboxee is forked from the init process, so we need to
521     // receive the actual PID.
522     if (auto pid_or = ReceivePid(signaling_fds[0].get()); !pid_or.ok()) {
523       SAPI_RAW_LOG(ERROR, "%s", std::string(pid_or.status().message()).c_str());
524       if (init_pid != -1) {
525         kill(init_pid, SIGKILL);
526       }
527       init_pid = -1;
528     } else {
529       sandboxee_pid = pid_or.value();
530     }
531   }
532 
533   // Parent.
534   pipe_fds.write.Close();
535   close(comms_fd);
536   if (exec_fd >= 0) {
537     close(exec_fd);
538   }
539   SAPI_RAW_CHECK(comms_->SendInt32(init_pid),
540                  absl::StrCat("Failed to send init PID: ", init_pid).c_str());
541   SAPI_RAW_CHECK(
542       comms_->SendInt32(sandboxee_pid),
543       absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());
544 
545   if (pipe_fds.read.get() >= 0) {
546     SAPI_RAW_CHECK(comms_->SendFD(pipe_fds.read.get()),
547                    "Failed to send status pipe");
548   }
549   return sandboxee_pid;
550 }
551 
IsTerminated() const552 bool ForkServer::IsTerminated() const { return comms_->IsTerminated(); }
553 
Initialize()554 bool ForkServer::Initialize() {
555   // For safety drop as many capabilities as possible.
556   // Note that cap_t is actually a pointer.
557   cap_t have_caps = cap_get_proc();  // caps we currently have
558   SAPI_RAW_CHECK(have_caps, "failed to cap_get_proc()");
559   cap_t wanted_caps = cap_init();  // starts as empty set, ie. no caps
560   SAPI_RAW_CHECK(wanted_caps, "failed to cap_init()");
561 
562   // CAP_SYS_PTRACE appears to be needed for apparmor (or possibly yama)
563   // CAP_SETFCAP is needed on newer kernels (5.10 needs it, 4.15 does not)
564   for (cap_value_t cap : {CAP_SYS_PTRACE, CAP_SETFCAP}) {
565     for (cap_flag_t flag : {CAP_EFFECTIVE, CAP_PERMITTED}) {
566       cap_flag_value_t value;
567       int rc = cap_get_flag(have_caps, cap, flag, &value);
568       SAPI_RAW_CHECK(!rc, "cap_get_flag");
569       if (value == CAP_SET) {
570         cap_value_t caps_to_set[1] = {
571             cap,
572         };
573         rc = cap_set_flag(wanted_caps, flag, 1, caps_to_set, CAP_SET);
574         SAPI_RAW_CHECK(!rc, "cap_set_flag");
575       }
576     }
577   }
578 
579   SAPI_RAW_CHECK(!cap_set_proc(wanted_caps), "while dropping capabilities");
580   SAPI_RAW_CHECK(!cap_free(wanted_caps), "while freeing wanted_caps");
581   SAPI_RAW_CHECK(!cap_free(have_caps), "while freeing have_caps");
582 
583   // All processes spawned by the fork'd/execute'd process will see this process
584   // as /sbin/init. Therefore it will receive (and ignore) their final status
585   // (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
586   // kernel version 3.4, so don't panic if it fails.
587   if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
588     SAPI_RAW_VLOG(3, "prctl(PR_SET_CHILD_SUBREAPER, 1): %s [%d]",
589                   StrError(errno).c_str(), errno);
590   }
591 
592   // Don't convert terminated child processes into zombies. It's up to the
593   // sandbox (Monitor) to track them and receive/report their final status.
594   struct sigaction sa;
595   sa.sa_handler = SIG_DFL;
596   sa.sa_flags = SA_NOCLDWAIT;
597   sigemptyset(&sa.sa_mask);
598   if (sigaction(SIGCHLD, &sa, nullptr) == -1) {
599     SAPI_RAW_PLOG(ERROR, "sigaction(SIGCHLD, flags=SA_NOCLDWAIT)");
600     return false;
601   }
602   return true;
603 }
604 
CreateInitialNamespaces()605 void ForkServer::CreateInitialNamespaces() {
606   // Spawn a new process to create initial user and mount namespaces to be used
607   // as a base for each namespaced sandboxee.
608 
609   // Store uid and gid to create mappings after CLONE_NEWUSER
610   uid_t uid = getuid();
611   gid_t gid = getgid();
612 
613   // Socket to synchronize so that we open ns fds before process dies
614   Pipe create_pipe = CreatePipe();
615   Pipe open_pipe = CreatePipe();
616   pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
617   if (pid == -1 && errno == EPERM && IsLikelyChrooted()) {
618     SAPI_RAW_LOG(FATAL,
619                  "failed to fork initial namespaces process: parent process is "
620                  "likely chrooted");
621   }
622   SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
623   char value = ' ';
624   if (pid == 0) {
625     create_pipe.read.Close();
626     open_pipe.write.Close();
627     Namespace::InitializeInitialNamespaces(uid, gid);
628     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_pipe.write.get(), &value,
629                                              sizeof(value))) == sizeof(value),
630                     "synchronizing initial namespaces creation");
631     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_pipe.read.get(), &value,
632                                             sizeof(value))) == sizeof(value),
633                     "synchronizing initial namespaces creation");
634     SAPI_RAW_PCHECK(chroot("/realroot") == 0,
635                     "chrooting prior to dumping coverage");
636     util::DumpCoverageData();
637     _exit(0);
638   }
639   open_pipe.read.Close();
640   create_pipe.write.Close();
641   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_pipe.read.get(), &value,
642                                           sizeof(value))) == sizeof(value),
643                   "synchronizing initial namespaces creation");
644   initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
645                             O_RDONLY | O_CLOEXEC);
646   SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
647   initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
648                            O_RDONLY | O_CLOEXEC);
649   SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
650   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_pipe.write.get(), &value,
651                                            sizeof(value))) == sizeof(value),
652                   "synchronizing initial namespaces creation");
653 }
654 
CreateForkserverSharedNetworkNamespace()655 void ForkServer::CreateForkserverSharedNetworkNamespace() {
656   Pipe create_pipe = CreatePipe();
657   Pipe open_pipe = CreatePipe();
658   pid_t pid = util::ForkWithFlags(SIGCHLD);
659   SAPI_RAW_PCHECK(pid != -1, "failed to fork shared netns process");
660   char value = ' ';
661   if (pid == 0) {
662     create_pipe.read.Close();
663     open_pipe.write.Close();
664     SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) == 0,
665                     "joining initial user namespace");
666     SAPI_RAW_PCHECK(unshare(CLONE_NEWNET) == 0, "unsharing netns");
667     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_pipe.write.get(), &value,
668                                              sizeof(value))) == sizeof(value),
669                     "synchronizing shared netns creation");
670     SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_pipe.read.get(), &value,
671                                             sizeof(value))) == sizeof(value),
672                     "synchronizing shared netns creation");
673     util::DumpCoverageData();
674     _exit(0);
675   }
676   open_pipe.read.Close();
677   create_pipe.write.Close();
678   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_pipe.read.get(), &value,
679                                           sizeof(value))) == sizeof(value),
680                   "synchronizing shared netns creation");
681   initial_netns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/net").c_str(),
682                            O_RDONLY | O_CLOEXEC);
683   SAPI_RAW_PCHECK(initial_netns_fd_ != -1, "getting initial netns fd");
684   SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_pipe.write.get(), &value,
685                                            sizeof(value))) == sizeof(value),
686                   "synchronizing initial namespaces creation");
687 }
688 
SanitizeEnvironment() const689 void ForkServer::SanitizeEnvironment() const {
690   // Mark all file descriptors, except the standard ones (needed
691   // for proper sandboxed process operations), as close-on-exec.
692   absl::Status status = sanitizer::SanitizeCurrentProcess(
693       {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO, comms_->GetConnectionFD()},
694       /* close_fds = */ false);
695   SAPI_RAW_CHECK(
696       status.ok(),
697       absl::StrCat("while sanitizing process: ", status.message()).c_str());
698 }
699 
ExecuteProcess(int execve_fd,const char * const * argv,const char * const * envp)700 void ForkServer::ExecuteProcess(int execve_fd, const char* const* argv,
701                                 const char* const* envp) {
702   // Do not add any code before execve(), as it's subject to seccomp policies.
703   // Indicate that it's a special execve(), by setting 4th, 5th and 6th syscall
704   // argument to magic values.
705   util::Execveat(execve_fd, "", argv, envp, AT_EMPTY_PATH,
706                  internal::kExecveMagic);
707 
708   int saved_errno = errno;
709   SAPI_RAW_PLOG(ERROR, "execveat failed");
710   if (argv[0]) {
711     SAPI_RAW_LOG(ERROR, "argv[0]=%s", argv[0]);
712   }
713 
714   if (saved_errno == ENOSYS) {
715     SAPI_RAW_LOG(ERROR,
716                  "This is likely caused by running on a kernel that is too old."
717     );
718   } else if (saved_errno == ENOENT && execve_fd >= 0) {
719     // Since we know the file exists, it must be that the file is dynamically
720     // linked and the ELF interpreter is what's actually missing.
721     SAPI_RAW_LOG(
722         ERROR,
723         "This is likely caused by running dynamically-linked sandboxee without "
724         "calling .AddLibrariesForBinary() on the policy builder.");
725   }
726 
727   util::Syscall(__NR_exit_group, EXIT_FAILURE);
728   abort();
729 }
730 
InitializeNamespaces(const ForkRequest & request,uid_t uid,gid_t gid,bool avoid_pivot_root)731 void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
732                                       gid_t gid, bool avoid_pivot_root) {
733   if (!request.has_mount_tree()) {
734     return;
735   }
736   Namespace::InitializeNamespaces(
737       uid, gid, request.clone_flags(), Mounts(request.mount_tree()),
738       request.hostname(), avoid_pivot_root, request.allow_mount_propagation());
739 }
740 
741 }  // namespace sandbox2
742