1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Implementation of the sandbox2::ForkServer class.
16
17 #include "sandboxed_api/sandbox2/forkserver.h"
18
19 #include <fcntl.h>
20 #include <linux/filter.h>
21 #include <linux/seccomp.h>
22 #include <sched.h>
23 #include <sys/prctl.h>
24 #include <sys/resource.h>
25 #include <sys/socket.h>
26 #include <sys/uio.h>
27 #include <sys/wait.h>
28 #include <syscall.h>
29 #include <unistd.h>
30
31 #include <cerrno>
32 #include <csignal>
33 #include <cstdint>
34 #include <cstdlib>
35 #include <cstring>
36 #include <fstream>
37 #include <initializer_list>
38 #include <string>
39 #include <utility>
40 #include <vector>
41
42 #include "absl/base/attributes.h"
43 #include "absl/container/flat_hash_map.h"
44 #include "absl/container/flat_hash_set.h"
45 #include "absl/status/status.h"
46 #include "absl/status/statusor.h"
47 #include "absl/strings/match.h"
48 #include "absl/strings/str_cat.h"
49 #include "absl/strings/str_join.h"
50 #include "absl/strings/str_split.h"
51 #include "absl/strings/string_view.h"
52 #include "sys/capability.h" // AOSP: match libcap exported includes
53 #include "sandboxed_api/sandbox2/client.h"
54 #include "sandboxed_api/sandbox2/comms.h"
55 #include "sandboxed_api/sandbox2/fork_client.h"
56 #include "sandboxed_api/sandbox2/forkserver.pb.h"
57 #include "sandboxed_api/sandbox2/namespace.h"
58 #include "sandboxed_api/sandbox2/policy.h"
59 #include "sandboxed_api/sandbox2/sanitizer.h"
60 #include "sandboxed_api/sandbox2/syscall.h"
61 #include "sandboxed_api/sandbox2/util.h"
62 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
63 #include "sandboxed_api/util/fileops.h"
64 #include "sandboxed_api/util/raw_logging.h"
65 #include "sandboxed_api/util/strerror.h"
66
67 namespace sandbox2 {
68 namespace {
69
70 using ::sapi::StrError;
71 using ::sapi::file_util::fileops::FDCloser;
72
73 // "Moves" FDs in move_fds from current to target FD number while keeping FDs
74 // in keep_fds open - potentially moving them to another FD number as well in
75 // case of colisions.
76 // Ignores invalid (-1) fds.
MoveFDs(std::initializer_list<std::pair<int *,int>> move_fds,std::initializer_list<int * > keep_fds)77 void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
78 std::initializer_list<int*> keep_fds) {
79 absl::flat_hash_map<int, int*> fd_map;
80 for (int* fd : keep_fds) {
81 if (*fd != -1) {
82 fd_map.emplace(*fd, fd);
83 }
84 }
85
86 for (auto [old_fd, new_fd] : move_fds) {
87 if (*old_fd != -1) {
88 fd_map.emplace(*old_fd, old_fd);
89 }
90 }
91
92 for (auto [old_fd, new_fd] : move_fds) {
93 if (*old_fd == -1 || *old_fd == new_fd) {
94 continue;
95 }
96
97 // Make sure we won't override another fd
98 if (auto it = fd_map.find(new_fd); it != fd_map.end()) {
99 int fd = dup(new_fd);
100 SAPI_RAW_CHECK(fd != -1, "Duplicating an FD failed.");
101 *it->second = fd;
102 fd_map.emplace(fd, it->second);
103 fd_map.erase(it);
104 }
105
106 if (dup2(*old_fd, new_fd) == -1) {
107 SAPI_RAW_PLOG(FATAL, "Moving temporary to proper FD failed.");
108 }
109
110 close(*old_fd);
111 fd_map.erase(*old_fd);
112 *old_fd = new_fd;
113 }
114 }
115
// Owns both ends of a pipe. Member order matters: CreatePipe() aggregate-
// initializes this struct with pfds[0] first and pfds[1] second.
struct Pipe {
  FDCloser read;   // Initialized from pfds[0] by CreatePipe().
  FDCloser write;  // Initialized from pfds[1] by CreatePipe().
};
120
CreatePipe()121 Pipe CreatePipe() {
122 int pfds[2];
123 SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating pipe");
124 return {FDCloser(pfds[0]), FDCloser(pfds[1])};
125 }
126
RunInitProcess(pid_t main_pid,FDCloser pipe_fd)127 ABSL_ATTRIBUTE_NORETURN void RunInitProcess(pid_t main_pid, FDCloser pipe_fd) {
128 if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
129 SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
130 }
131
132 // Clear SA_NOCLDWAIT.
133 struct sigaction sa;
134 sa.sa_handler = SIG_DFL;
135 sa.sa_flags = 0;
136 sigemptyset(&sa.sa_mask);
137 SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
138 "clearing SA_NOCLDWAIT");
139
140 // Apply seccomp.
141 std::vector<sock_filter> code = {
142 LOAD_ARCH,
143 JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),
144
145 LOAD_SYSCALL_NR,
146 SYSCALL(__NR_waitid, ALLOW),
147 SYSCALL(__NR_exit, ALLOW),
148 };
149 if (pipe_fd.get() >= 0) {
150 code.insert(code.end(),
151 {SYSCALL(__NR_getrusage, ALLOW), SYSCALL(__NR_write, ALLOW)});
152 }
153 code.push_back(DENY);
154
155 struct sock_fprog prog{
156 .len = static_cast<uint16_t>(code.size()),
157 .filter = code.data(),
158 };
159
160 SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
161 "Denying new privs");
162 SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0, "Dropping caps");
163 SAPI_RAW_CHECK(
164 syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
165 reinterpret_cast<uintptr_t>(&prog)) == 0,
166 "Enabling seccomp filter");
167
168 siginfo_t info;
169 // Reap children.
170 for (;;) {
171 int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
172 if (rv != 0) {
173 _exit(1);
174 }
175
176 if (info.si_pid == main_pid) {
177 if (pipe_fd.get() >= 0) {
178 (void)write(pipe_fd.get(), &info.si_code, sizeof(info.si_code));
179 (void)write(pipe_fd.get(), &info.si_status, sizeof(info.si_status));
180
181 rusage usage{};
182 getrusage(RUSAGE_CHILDREN, &usage);
183 (void)write(pipe_fd.get(), &usage, sizeof(usage));
184 }
185 _exit(0);
186 }
187 }
188 }
189
SendPid(int signaling_fd)190 absl::Status SendPid(int signaling_fd) {
191 // Send our PID (the actual sandboxee process) via SCM_CREDENTIALS.
192 // The ancillary message will be attached to the message as SO_PASSCRED is set
193 // on the socket.
194 char dummy = ' ';
195 if (TEMP_FAILURE_RETRY(send(signaling_fd, &dummy, 1, 0)) != 1) {
196 return absl::ErrnoToStatus(errno, "Sending PID: send()");
197 }
198 return absl::OkStatus();
199 }
200
ReceivePid(int signaling_fd)201 absl::StatusOr<pid_t> ReceivePid(int signaling_fd) {
202 union {
203 struct cmsghdr cmh;
204 char ctrl[CMSG_SPACE(sizeof(struct ucred))];
205 } ucred_msg{};
206
207 struct msghdr msgh{};
208 struct iovec iov{};
209
210 msgh.msg_iov = &iov;
211 msgh.msg_iovlen = 1;
212 msgh.msg_control = ucred_msg.ctrl;
213 msgh.msg_controllen = sizeof(ucred_msg);
214
215 char dummy;
216 iov.iov_base = &dummy;
217 iov.iov_len = sizeof(char);
218
219 if (TEMP_FAILURE_RETRY(recvmsg(signaling_fd, &msgh, MSG_WAITALL)) != 1) {
220 return absl::ErrnoToStatus(errno, "Receiving pid failed: recvmsg");
221 }
222 struct cmsghdr* cmsgp = CMSG_FIRSTHDR(&msgh);
223 if (cmsgp->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
224 cmsgp->cmsg_level != SOL_SOCKET || cmsgp->cmsg_type != SCM_CREDENTIALS) {
225 return absl::InternalError("Receiving pid failed");
226 }
227 auto* ucredp = reinterpret_cast<struct ucred*>(CMSG_DATA(cmsgp));
228 return ucredp->pid;
229 }
230
GetRootMountId(const std::string & proc_id)231 absl::StatusOr<std::string> GetRootMountId(const std::string& proc_id) {
232 std::ifstream mounts(absl::StrCat("/proc/", proc_id, "/mountinfo"));
233 if (!mounts.good()) {
234 return absl::InternalError("Failed to open mountinfo");
235 }
236 std::string line;
237 while (std::getline(mounts, line)) {
238 std::vector<absl::string_view> parts =
239 absl::StrSplit(line, absl::MaxSplits(' ', 4));
240 if (parts.size() >= 4 && parts[3] == "/") {
241 return std::string(parts[0]);
242 }
243 }
244 return absl::NotFoundError("Root entry not found in mountinfo");
245 }
246
IsLikelyChrooted()247 bool IsLikelyChrooted() {
248 absl::StatusOr<std::string> self_root_id = GetRootMountId("self");
249 if (!self_root_id.ok()) {
250 return absl::IsNotFound(self_root_id.status());
251 }
252 absl::StatusOr<std::string> init_root_id = GetRootMountId("1");
253 if (!init_root_id.ok()) {
254 return false;
255 }
256 return *self_root_id != *init_root_id;
257 }
258
259 } // namespace
260
PrepareExecveArgs(const ForkRequest & request,std::vector<std::string> * args,std::vector<std::string> * envp)261 void ForkServer::PrepareExecveArgs(const ForkRequest& request,
262 std::vector<std::string>* args,
263 std::vector<std::string>* envp) {
264 // Prepare arguments for execve.
265 for (const auto& arg : request.args()) {
266 args->push_back(arg);
267 }
268
269 // Prepare environment variables for execve.
270 for (const auto& env : request.envs()) {
271 envp->push_back(env);
272 }
273
274 // The child process should not start any fork-servers.
275 envp->push_back(absl::StrCat(kForkServerDisableEnv, "=1"));
276
277 constexpr char kSapiVlogLevel[] = "SAPI_VLOG_LEVEL";
278 char* sapi_vlog = getenv(kSapiVlogLevel);
279 if (sapi_vlog && strlen(sapi_vlog) > 0) {
280 envp->push_back(absl::StrCat(kSapiVlogLevel, "=", sapi_vlog));
281 }
282
283 SAPI_RAW_VLOG(1, "Will execute args:['%s'], environment:['%s']",
284 absl::StrJoin(*args, "', '").c_str(),
285 absl::StrJoin(*envp, "', '").c_str());
286 }
287
// Runs in the freshly forked child and turns it into the sandboxee.
//
// request:          the fork request received by the forkserver.
// execve_fd:        FD of the binary to execute, or -1 if no execve is wanted.
// uid, gid:         IDs forwarded to InitializeNamespaces().
// signaling_fd:     socket used to send the sandboxee PID (see SendPid()) when
//                   an init process is spawned.
// status_fd:        FD handed to the init process for status reporting; may be
//                   invalid.
// avoid_pivot_root: forwarded to InitializeNamespaces().
//
// If request.clone_flags() contains CLONE_NEWPID, the calling process forks
// once more: the parent becomes the init process (RunInitProcess() never
// returns) and the new child continues as the sandboxee. When execve_fd is
// valid, the function ends in ExecuteProcess().
void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
                             uid_t uid, gid_t gid, FDCloser signaling_fd,
                             FDCloser status_fd, bool avoid_pivot_root) const {
  SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
                 "Forkserver mode is unspecified");

  const bool will_execve = execve_fd != -1;
  const bool should_sandbox = request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX;

  // Snapshot the open FDs now; the list is used below to close them in the
  // init process. On failure continue with an empty list.
  absl::StatusOr<absl::flat_hash_set<int>> open_fds = sanitizer::GetListOfFDs();
  if (!open_fds.ok()) {
    SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs: %s",
                 std::string(open_fds.status().message()).c_str());
    open_fds = absl::flat_hash_set<int>();
  }
  SanitizeEnvironment();

  InitializeNamespaces(request, uid, gid, avoid_pivot_root);

  // cap_init() yields an empty capability set; applying it drops all
  // capabilities of this process.
  auto caps = cap_init();
  SAPI_RAW_CHECK(cap_set_proc(caps) == 0, "while dropping capabilities");
  cap_free(caps);

  // A custom init process is only needed if a new PID NS is created.
  if (request.clone_flags() & CLONE_NEWPID) {
    // Spawn a child process
    pid_t child = util::ForkWithFlags(SIGCHLD);
    if (child < 0) {
      SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
    }
    if (child != 0) {
      // Parent: becomes the init process. Keep only the status FD open.
      if (status_fd.get() >= 0) {
        open_fds->erase(status_fd.get());
      }
      // Close all open fds (equals to CloseAllFDsExcept but does not require
      // /proc to be available).
      for (const auto& fd : *open_fds) {
        close(fd);
      }
      RunInitProcess(child, std::move(status_fd));
    }
    // Send sandboxee pid
    auto status = SendPid(signaling_fd.get());
    SAPI_RAW_CHECK(status.ok(),
                   absl::StrCat("sending pid: ", status.message()).c_str());
  }
  // Neither FD is needed by the sandboxee beyond this point.
  signaling_fd.Close();
  status_fd.Close();

  Client client(comms_);
  client.allow_speculation_ = request.allow_speculation();

  // Prepare the arguments before sandboxing (if needed), as doing it after
  // sandboxing can cause syscall violations (e.g. related to memory
  // management).
  std::vector<std::string> args;
  std::vector<std::string> envs;
  if (will_execve) {
    PrepareExecveArgs(request, &args, &envs);
  }

  // Sandboxing can be enabled either here - just before execve, or somewhere
  // inside the executed binary (e.g. after basic structures have been
  // initialized, and resources acquired). In the latter case, it's up to the
  // sandboxed binary to establish proper Comms channel (using
  // Comms::kSandbox2ClientCommsFD) and call sandbox2::Client::SandboxMeHere()
  if (should_sandbox) {
    // The following client calls are basically SandboxMeHere. We split it so
    // that we can set up the envp after we received the file descriptors but
    // before we enable the syscall filter.
    client.PrepareEnvironment(&execve_fd);
    // Tell the sandboxee which FD carries comms if it is not the default one.
    if (comms_->GetConnectionFD() != Comms::kSandbox2ClientCommsFD) {
      envs.push_back(absl::StrCat(Comms::kSandbox2CommsFDEnvVar, "=",
                                  comms_->GetConnectionFD()));
    }
    envs.push_back(client.GetFdMapEnvVar());
  }

  // Convert args and envs before enabling sandbox (it'll allocate which might
  // be blocked).
  util::CharPtrArray argv = util::CharPtrArray::FromStringVector(args);
  util::CharPtrArray envp = util::CharPtrArray::FromStringVector(envs);

  if (should_sandbox) {
    client.EnableSandbox();
  }

  if (will_execve) {
    ExecuteProcess(execve_fd, argv.data(), envp.data());
  }
}
378
ServeRequest()379 pid_t ForkServer::ServeRequest() {
380 ForkRequest fork_request;
381 if (!comms_->RecvProtoBuf(&fork_request)) {
382 if (comms_->IsTerminated()) {
383 return -1;
384 }
385 SAPI_RAW_LOG(FATAL, "Failed to receive ForkServer request");
386 }
387 int comms_fd;
388 SAPI_RAW_CHECK(comms_->RecvFD(&comms_fd), "Failed to receive Comms FD");
389
390 SAPI_RAW_CHECK(fork_request.mode() != FORKSERVER_FORK_UNSPECIFIED,
391 "Forkserver mode is unspecified");
392
393 int exec_fd = -1;
394 if (fork_request.mode() == FORKSERVER_FORK_EXECVE ||
395 fork_request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX) {
396 SAPI_RAW_CHECK(comms_->RecvFD(&exec_fd), "Failed to receive Exec FD");
397 }
398
399 // Make the kernel notify us with SIGCHLD when the process terminates.
400 // We use sigaction(SIGCHLD, flags=SA_NOCLDWAIT) in combination with
401 // this to make sure the zombie process is reaped immediately.
402 int clone_flags = fork_request.clone_flags() | SIGCHLD;
403
404 // Store uid and gid since they will change if CLONE_NEWUSER is set.
405 uid_t uid = getuid();
406 uid_t gid = getgid();
407
408 Pipe pipe_fds;
409 if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
410 pipe_fds = CreatePipe();
411 }
412
413 int socketpair_fds[2];
414 SAPI_RAW_PCHECK(
415 socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
416 "creating signaling socketpair");
417 for (int i = 0; i < 2; ++i) {
418 int val = 1;
419 SAPI_RAW_PCHECK(setsockopt(socketpair_fds[i], SOL_SOCKET, SO_PASSCRED, &val,
420 sizeof(val)) == 0,
421 "setsockopt failed");
422 }
423
424 FDCloser signaling_fds[] = {FDCloser(socketpair_fds[0]),
425 FDCloser(socketpair_fds[1])};
426
427 // Note: init_pid will be overwritten with the actual init pid if the init
428 // process was started or stays at 0 if that is not needed - no pidns.
429 pid_t init_pid = 0;
430 pid_t sandboxee_pid = -1;
431 bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
432 if (avoid_pivot_root) {
433 // Create initial namespaces only when they're first needed.
434 // This allows sandbox2 to be still used without any namespaces support
435 if (initial_mntns_fd_ == -1) {
436 CreateInitialNamespaces();
437 }
438 if (fork_request.netns_mode() == NETNS_MODE_SHARED_PER_FORKSERVER &&
439 initial_netns_fd_ == -1) {
440 CreateForkserverSharedNetworkNamespace();
441 }
442 // We first just fork a child, which will join the initial namespaces
443 // Note: Not a regular fork() as one really needs to be single-threaded to
444 // setns and this is not the case with TSAN.
445 pid_t pid = util::ForkWithFlags(SIGCHLD);
446 SAPI_RAW_PCHECK(pid != -1, "fork failed");
447 if (pid == 0) {
448 SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
449 "joining initial user namespace");
450 SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
451 "joining initial mnt namespace");
452 if (fork_request.netns_mode() == NETNS_MODE_SHARED_PER_FORKSERVER) {
453 SAPI_RAW_PCHECK(setns(initial_netns_fd_, CLONE_NEWNET) != -1,
454 "joining initial net namespace");
455 close(initial_netns_fd_);
456 }
457 close(initial_userns_fd_);
458 close(initial_mntns_fd_);
459 // Do not create new userns it will be unshared later
460 sandboxee_pid =
461 util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
462 if (sandboxee_pid == -1) {
463 SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
464 }
465 if (sandboxee_pid != 0) {
466 _exit(0);
467 }
468 // Send sandboxee pid
469 absl::Status status = SendPid(signaling_fds[1].get());
470 SAPI_RAW_CHECK(status.ok(),
471 absl::StrCat("sending pid: ", status.message()).c_str());
472 }
473 } else {
474 sandboxee_pid = util::ForkWithFlags(clone_flags);
475 if (sandboxee_pid == -1) {
476 SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
477 }
478 if (sandboxee_pid == 0) {
479 close(initial_userns_fd_);
480 close(initial_mntns_fd_);
481 }
482 }
483
484 // Child.
485 if (sandboxee_pid == 0) {
486 signaling_fds[0].Close();
487 pipe_fds.read.Close();
488 // Make sure we override the forkserver's comms fd
489 comms_->Terminate();
490 if (exec_fd != -1) {
491 int signaling_fd = signaling_fds[1].Release();
492 int pipe_fd = pipe_fds.write.Release();
493 MoveFDs({{&exec_fd, Comms::kSandbox2TargetExecFD},
494 {&comms_fd, Comms::kSandbox2ClientCommsFD}},
495 {&signaling_fd, &pipe_fd});
496 signaling_fds[1] = FDCloser(signaling_fd);
497 pipe_fds.write = FDCloser(pipe_fd);
498 }
499 *comms_ = Comms(comms_fd);
500 LaunchChild(fork_request, exec_fd, uid, gid, std::move(signaling_fds[1]),
501 std::move(pipe_fds.write), avoid_pivot_root);
502 return sandboxee_pid;
503 }
504
505 signaling_fds[1].Close();
506
507 if (avoid_pivot_root) {
508 if (auto pid = ReceivePid(signaling_fds[0].get()); !pid.ok()) {
509 SAPI_RAW_LOG(ERROR, "%s", std::string(pid.status().message()).c_str());
510 } else {
511 sandboxee_pid = pid.value();
512 }
513 }
514
515 if (fork_request.clone_flags() & CLONE_NEWPID) {
516 // The pid of the init process is equal to the child process that we've
517 // previously forked.
518 init_pid = sandboxee_pid;
519 sandboxee_pid = -1;
520 // And the actual sandboxee is forked from the init process, so we need to
521 // receive the actual PID.
522 if (auto pid_or = ReceivePid(signaling_fds[0].get()); !pid_or.ok()) {
523 SAPI_RAW_LOG(ERROR, "%s", std::string(pid_or.status().message()).c_str());
524 if (init_pid != -1) {
525 kill(init_pid, SIGKILL);
526 }
527 init_pid = -1;
528 } else {
529 sandboxee_pid = pid_or.value();
530 }
531 }
532
533 // Parent.
534 pipe_fds.write.Close();
535 close(comms_fd);
536 if (exec_fd >= 0) {
537 close(exec_fd);
538 }
539 SAPI_RAW_CHECK(comms_->SendInt32(init_pid),
540 absl::StrCat("Failed to send init PID: ", init_pid).c_str());
541 SAPI_RAW_CHECK(
542 comms_->SendInt32(sandboxee_pid),
543 absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());
544
545 if (pipe_fds.read.get() >= 0) {
546 SAPI_RAW_CHECK(comms_->SendFD(pipe_fds.read.get()),
547 "Failed to send status pipe");
548 }
549 return sandboxee_pid;
550 }
551
IsTerminated() const552 bool ForkServer::IsTerminated() const { return comms_->IsTerminated(); }
553
Initialize()554 bool ForkServer::Initialize() {
555 // For safety drop as many capabilities as possible.
556 // Note that cap_t is actually a pointer.
557 cap_t have_caps = cap_get_proc(); // caps we currently have
558 SAPI_RAW_CHECK(have_caps, "failed to cap_get_proc()");
559 cap_t wanted_caps = cap_init(); // starts as empty set, ie. no caps
560 SAPI_RAW_CHECK(wanted_caps, "failed to cap_init()");
561
562 // CAP_SYS_PTRACE appears to be needed for apparmor (or possibly yama)
563 // CAP_SETFCAP is needed on newer kernels (5.10 needs it, 4.15 does not)
564 for (cap_value_t cap : {CAP_SYS_PTRACE, CAP_SETFCAP}) {
565 for (cap_flag_t flag : {CAP_EFFECTIVE, CAP_PERMITTED}) {
566 cap_flag_value_t value;
567 int rc = cap_get_flag(have_caps, cap, flag, &value);
568 SAPI_RAW_CHECK(!rc, "cap_get_flag");
569 if (value == CAP_SET) {
570 cap_value_t caps_to_set[1] = {
571 cap,
572 };
573 rc = cap_set_flag(wanted_caps, flag, 1, caps_to_set, CAP_SET);
574 SAPI_RAW_CHECK(!rc, "cap_set_flag");
575 }
576 }
577 }
578
579 SAPI_RAW_CHECK(!cap_set_proc(wanted_caps), "while dropping capabilities");
580 SAPI_RAW_CHECK(!cap_free(wanted_caps), "while freeing wanted_caps");
581 SAPI_RAW_CHECK(!cap_free(have_caps), "while freeing have_caps");
582
583 // All processes spawned by the fork'd/execute'd process will see this process
584 // as /sbin/init. Therefore it will receive (and ignore) their final status
585 // (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
586 // kernel version 3.4, so don't panic if it fails.
587 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
588 SAPI_RAW_VLOG(3, "prctl(PR_SET_CHILD_SUBREAPER, 1): %s [%d]",
589 StrError(errno).c_str(), errno);
590 }
591
592 // Don't convert terminated child processes into zombies. It's up to the
593 // sandbox (Monitor) to track them and receive/report their final status.
594 struct sigaction sa;
595 sa.sa_handler = SIG_DFL;
596 sa.sa_flags = SA_NOCLDWAIT;
597 sigemptyset(&sa.sa_mask);
598 if (sigaction(SIGCHLD, &sa, nullptr) == -1) {
599 SAPI_RAW_PLOG(ERROR, "sigaction(SIGCHLD, flags=SA_NOCLDWAIT)");
600 return false;
601 }
602 return true;
603 }
604
CreateInitialNamespaces()605 void ForkServer::CreateInitialNamespaces() {
606 // Spawn a new process to create initial user and mount namespaces to be used
607 // as a base for each namespaced sandboxee.
608
609 // Store uid and gid to create mappings after CLONE_NEWUSER
610 uid_t uid = getuid();
611 gid_t gid = getgid();
612
613 // Socket to synchronize so that we open ns fds before process dies
614 Pipe create_pipe = CreatePipe();
615 Pipe open_pipe = CreatePipe();
616 pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
617 if (pid == -1 && errno == EPERM && IsLikelyChrooted()) {
618 SAPI_RAW_LOG(FATAL,
619 "failed to fork initial namespaces process: parent process is "
620 "likely chrooted");
621 }
622 SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
623 char value = ' ';
624 if (pid == 0) {
625 create_pipe.read.Close();
626 open_pipe.write.Close();
627 Namespace::InitializeInitialNamespaces(uid, gid);
628 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_pipe.write.get(), &value,
629 sizeof(value))) == sizeof(value),
630 "synchronizing initial namespaces creation");
631 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_pipe.read.get(), &value,
632 sizeof(value))) == sizeof(value),
633 "synchronizing initial namespaces creation");
634 SAPI_RAW_PCHECK(chroot("/realroot") == 0,
635 "chrooting prior to dumping coverage");
636 util::DumpCoverageData();
637 _exit(0);
638 }
639 open_pipe.read.Close();
640 create_pipe.write.Close();
641 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_pipe.read.get(), &value,
642 sizeof(value))) == sizeof(value),
643 "synchronizing initial namespaces creation");
644 initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
645 O_RDONLY | O_CLOEXEC);
646 SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
647 initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
648 O_RDONLY | O_CLOEXEC);
649 SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
650 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_pipe.write.get(), &value,
651 sizeof(value))) == sizeof(value),
652 "synchronizing initial namespaces creation");
653 }
654
CreateForkserverSharedNetworkNamespace()655 void ForkServer::CreateForkserverSharedNetworkNamespace() {
656 Pipe create_pipe = CreatePipe();
657 Pipe open_pipe = CreatePipe();
658 pid_t pid = util::ForkWithFlags(SIGCHLD);
659 SAPI_RAW_PCHECK(pid != -1, "failed to fork shared netns process");
660 char value = ' ';
661 if (pid == 0) {
662 create_pipe.read.Close();
663 open_pipe.write.Close();
664 SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) == 0,
665 "joining initial user namespace");
666 SAPI_RAW_PCHECK(unshare(CLONE_NEWNET) == 0, "unsharing netns");
667 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_pipe.write.get(), &value,
668 sizeof(value))) == sizeof(value),
669 "synchronizing shared netns creation");
670 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_pipe.read.get(), &value,
671 sizeof(value))) == sizeof(value),
672 "synchronizing shared netns creation");
673 util::DumpCoverageData();
674 _exit(0);
675 }
676 open_pipe.read.Close();
677 create_pipe.write.Close();
678 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_pipe.read.get(), &value,
679 sizeof(value))) == sizeof(value),
680 "synchronizing shared netns creation");
681 initial_netns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/net").c_str(),
682 O_RDONLY | O_CLOEXEC);
683 SAPI_RAW_PCHECK(initial_netns_fd_ != -1, "getting initial netns fd");
684 SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_pipe.write.get(), &value,
685 sizeof(value))) == sizeof(value),
686 "synchronizing initial namespaces creation");
687 }
688
SanitizeEnvironment() const689 void ForkServer::SanitizeEnvironment() const {
690 // Mark all file descriptors, except the standard ones (needed
691 // for proper sandboxed process operations), as close-on-exec.
692 absl::Status status = sanitizer::SanitizeCurrentProcess(
693 {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO, comms_->GetConnectionFD()},
694 /* close_fds = */ false);
695 SAPI_RAW_CHECK(
696 status.ok(),
697 absl::StrCat("while sanitizing process: ", status.message()).c_str());
698 }
699
ExecuteProcess(int execve_fd,const char * const * argv,const char * const * envp)700 void ForkServer::ExecuteProcess(int execve_fd, const char* const* argv,
701 const char* const* envp) {
702 // Do not add any code before execve(), as it's subject to seccomp policies.
703 // Indicate that it's a special execve(), by setting 4th, 5th and 6th syscall
704 // argument to magic values.
705 util::Execveat(execve_fd, "", argv, envp, AT_EMPTY_PATH,
706 internal::kExecveMagic);
707
708 int saved_errno = errno;
709 SAPI_RAW_PLOG(ERROR, "execveat failed");
710 if (argv[0]) {
711 SAPI_RAW_LOG(ERROR, "argv[0]=%s", argv[0]);
712 }
713
714 if (saved_errno == ENOSYS) {
715 SAPI_RAW_LOG(ERROR,
716 "This is likely caused by running on a kernel that is too old."
717 );
718 } else if (saved_errno == ENOENT && execve_fd >= 0) {
719 // Since we know the file exists, it must be that the file is dynamically
720 // linked and the ELF interpreter is what's actually missing.
721 SAPI_RAW_LOG(
722 ERROR,
723 "This is likely caused by running dynamically-linked sandboxee without "
724 "calling .AddLibrariesForBinary() on the policy builder.");
725 }
726
727 util::Syscall(__NR_exit_group, EXIT_FAILURE);
728 abort();
729 }
730
InitializeNamespaces(const ForkRequest & request,uid_t uid,gid_t gid,bool avoid_pivot_root)731 void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
732 gid_t gid, bool avoid_pivot_root) {
733 if (!request.has_mount_tree()) {
734 return;
735 }
736 Namespace::InitializeNamespaces(
737 uid, gid, request.clone_flags(), Mounts(request.mount_tree()),
738 request.hostname(), avoid_pivot_root, request.allow_mount_propagation());
739 }
740
741 } // namespace sandbox2
742