1 #include "sandboxed_api/sandbox2/monitor_unotify.h"
2
3 #include <linux/audit.h>
4 #include <linux/seccomp.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/ptrace.h>
9 #include <sys/resource.h>
10 #include <sys/sysinfo.h>
11 #include <sys/uio.h>
12 #include <sys/wait.h>
13 #include <syscall.h>
14 #include <unistd.h>
15
16 #include <algorithm>
17 #include <atomic>
18 #include <cerrno>
19 #include <cstdint>
20 #include <cstdlib>
21 #include <cstring>
22 #include <memory>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27 #include "absl/base/macros.h"
28 #include "absl/cleanup/cleanup.h"
29 #include "absl/log/check.h"
30 #include "absl/log/log.h"
31 #include "absl/status/status.h"
32 #include "absl/status/statusor.h"
33 #include "absl/strings/str_cat.h"
34 #include "absl/synchronization/mutex.h"
35 #include "absl/synchronization/notification.h"
36 #include "absl/time/clock.h"
37 #include "absl/time/time.h"
38 #include "absl/types/span.h"
39 #include "sandboxed_api/config.h"
40 #include "sandboxed_api/sandbox2/bpf_evaluator.h"
41 #include "sandboxed_api/sandbox2/client.h"
42 #include "sandboxed_api/sandbox2/executor.h"
43 #include "sandboxed_api/sandbox2/forkserver.pb.h"
44 #include "sandboxed_api/sandbox2/monitor_base.h"
45 #include "sandboxed_api/sandbox2/notify.h"
46 #include "sandboxed_api/sandbox2/policy.h"
47 #include "sandboxed_api/sandbox2/result.h"
48 #include "sandboxed_api/util/fileops.h"
49 #include "sandboxed_api/util/status_macros.h"
50 #include "sandboxed_api/util/thread.h"
51
// Compatibility definitions for the seccomp user-notification API. Each one is
// only supplied when the system headers included above did not provide it.

// Filter return value that hands the syscall over to a userspace listener.
#if !defined(SECCOMP_RET_USER_NOTIF)
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
#endif

// Response flag set by AllowSyscallViaUnotify() when a notified syscall is
// allowed to proceed.
#if !defined(SECCOMP_USER_NOTIF_FLAG_CONTINUE)
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 1
#endif

// BPF return instruction that yields SECCOMP_RET_USER_NOTIF.
#define DO_USER_NOTIF BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)

#if !defined(SECCOMP_GET_NOTIF_SIZES)
#define SECCOMP_GET_NOTIF_SIZES 3

// Structure filled in by the SECCOMP_GET_NOTIF_SIZES operation; see
// InitSetupUnotify(), which uses it to size the request/response buffers.
struct seccomp_notif_sizes {
  __u16 seccomp_notif;
  __u16 seccomp_notif_resp;
  __u16 seccomp_data;
};
#endif

#if !defined(SECCOMP_IOCTL_NOTIF_RECV)
#if !defined(SECCOMP_IOWR)
#define SECCOMP_IOC_MAGIC '!'
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
#endif

// ioctl request codes used on the seccomp notification fd.
#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, struct seccomp_notif_resp)
#endif
83
84 namespace sandbox2 {
85
86 namespace {
87
88 using ::sapi::file_util::fileops::FDCloser;
89
// Invokes the raw seccomp system call through syscall(), since no libc wrapper
// is used here. Returns whatever syscall() returns, narrowed to int (-1 with
// errno set on failure).
int seccomp(unsigned int operation, unsigned int flags, void* args) {
  const long rc = syscall(SYS_seccomp, operation, flags, args);
  return static_cast<int>(rc);
}
93
AuditArchToCPUArch(uint32_t arch)94 sapi::cpu::Architecture AuditArchToCPUArch(uint32_t arch) {
95 switch (arch) {
96 case AUDIT_ARCH_AARCH64:
97 return sapi::cpu::Architecture::kArm64;
98 case AUDIT_ARCH_ARM:
99 return sapi::cpu::Architecture::kArm;
100 case AUDIT_ARCH_X86_64:
101 return sapi::cpu::Architecture::kX8664;
102 case AUDIT_ARCH_I386:
103 return sapi::cpu::Architecture::kX86;
104 case AUDIT_ARCH_PPC64LE:
105 return sapi::cpu::Architecture::kPPC64LE;
106 default:
107 return sapi::cpu::Architecture::kUnknown;
108 }
109 }
110
WaitForFdReadable(int fd,absl::Time deadline)111 absl::Status WaitForFdReadable(int fd, absl::Time deadline) {
112 pollfd pfds[] = {
113 {.fd = fd, .events = POLLIN},
114 };
115 for (absl::Duration remaining = deadline - absl::Now();
116 remaining > absl::ZeroDuration(); remaining = deadline - absl::Now()) {
117 int ret = poll(pfds, ABSL_ARRAYSIZE(pfds),
118 static_cast<int>(absl::ToInt64Milliseconds(remaining)));
119 if (ret > 0) {
120 if (pfds[0].revents & POLLIN) {
121 return absl::OkStatus();
122 }
123 if (pfds[0].revents & POLLHUP) {
124 return absl::UnavailableError("hangup");
125 }
126 return absl::InternalError("poll");
127 }
128 if (ret == -1 && errno != EINTR) {
129 return absl::ErrnoToStatus(errno, "poll");
130 }
131 }
132 return absl::DeadlineExceededError("waiting for fd");
133 }
134
ReadWholeWithDeadline(int fd,std::vector<iovec> vecs_vec,absl::Time deadline)135 absl::Status ReadWholeWithDeadline(int fd, std::vector<iovec> vecs_vec,
136 absl::Time deadline) {
137 absl::Span<iovec> vecs = absl::MakeSpan(vecs_vec);
138 while (!vecs.empty()) {
139 SAPI_RETURN_IF_ERROR(WaitForFdReadable(fd, deadline));
140 ssize_t r = readv(fd, vecs.data(), vecs.size());
141 if (r < 0 && errno != EINTR) {
142 return absl::ErrnoToStatus(errno, "readv");
143 }
144 while (r > 0) {
145 if (vecs.empty()) {
146 return absl::InternalError("readv return value too big");
147 }
148 iovec& vec = vecs.front();
149 if (r < vec.iov_len) {
150 vec.iov_len -= r;
151 vec.iov_base = reinterpret_cast<char*>(vec.iov_base) + r;
152 break;
153 }
154 r -= vec.iov_len;
155 vecs.remove_prefix(1);
156 }
157 }
158 return absl::OkStatus();
159 }
160
161 } // namespace
162
UnotifyMonitor(Executor * executor,Policy * policy,Notify * notify)163 UnotifyMonitor::UnotifyMonitor(Executor* executor, Policy* policy,
164 Notify* notify)
165 : MonitorBase(executor, policy, notify) {
166 type_ = FORKSERVER_MONITOR_UNOTIFY;
167 if (executor_->limits()->wall_time_limit() != absl::ZeroDuration()) {
168 auto deadline = absl::Now() + executor_->limits()->wall_time_limit();
169 deadline_millis_.store(absl::ToUnixMillis(deadline),
170 std::memory_order_relaxed);
171 }
172 external_kill_request_flag_.test_and_set(std::memory_order_relaxed);
173 dump_stack_request_flag_.test_and_set(std::memory_order_relaxed);
174 }
175
RunInternal()176 void UnotifyMonitor::RunInternal() {
177 thread_ = sapi::Thread(this, &UnotifyMonitor::Run, "sandbox2-Monitor");
178
179 // Wait for the Monitor to set-up the sandboxee correctly (or fail while
180 // doing that). From here on, it is safe to use the IPC object for
181 // non-sandbox-related data exchange.
182 setup_notification_.WaitForNotification();
183 }
184
SendPolicy(const std::vector<sock_filter> & policy)185 absl::Status UnotifyMonitor::SendPolicy(
186 const std::vector<sock_filter>& policy) {
187 original_policy_ = policy;
188 std::vector<sock_filter> modified_policy = policy;
189 const sock_filter trace_action = SANDBOX2_TRACE;
190 for (sock_filter& filter : modified_policy) {
191 if ((filter.code == BPF_RET + BPF_K && filter.k == SECCOMP_RET_KILL) ||
192 (filter.code == trace_action.code && filter.k == trace_action.k)) {
193 filter = DO_USER_NOTIF;
194 }
195 }
196 return MonitorBase::SendPolicy(modified_policy);
197 }
198
HandleViolation(const Syscall & syscall)199 void UnotifyMonitor::HandleViolation(const Syscall& syscall) {
200 ViolationType violation_type = syscall.arch() == Syscall::GetHostArch()
201 ? ViolationType::kSyscall
202 : ViolationType::kArchitectureSwitch;
203 LogSyscallViolation(syscall);
204 notify_->EventSyscallViolation(syscall, violation_type);
205 MaybeGetStackTrace(req_->pid, Result::VIOLATION);
206 SetExitStatusCode(Result::VIOLATION, syscall.nr());
207 notify_->EventSyscallViolation(syscall, violation_type);
208 result_.SetSyscall(std::make_unique<Syscall>(syscall));
209 KillSandboxee();
210 }
211
AllowSyscallViaUnotify()212 void UnotifyMonitor::AllowSyscallViaUnotify() {
213 memset(resp_.get(), 0, resp_size_);
214 resp_->id = req_->id;
215 resp_->val = 0;
216 resp_->error = 0;
217 resp_->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
218 if (ioctl(seccomp_notify_fd_.get(), SECCOMP_IOCTL_NOTIF_SEND, resp_.get()) !=
219 0) {
220 if (errno == ENOENT) {
221 VLOG(1) << "Unotify send failed with ENOENT";
222 } else {
223 LOG_IF(ERROR, errno == EINVAL)
224 << "Unotify send failed with EINVAL. Likely "
225 "SECCOMP_USER_NOTIF_FLAG_CONTINUE unsupported by the kernel.";
226 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_NOTIFY);
227 }
228 }
229 }
230
HandleUnotify()231 void UnotifyMonitor::HandleUnotify() {
232 memset(req_.get(), 0, req_size_);
233 if (ioctl(seccomp_notify_fd_.get(), SECCOMP_IOCTL_NOTIF_RECV, req_.get()) !=
234 0) {
235 if (errno == ENOENT) {
236 VLOG(1) << "Unotify recv failed with ENOENT";
237 } else {
238 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_NOTIFY);
239 }
240 return;
241 }
242 Syscall syscall(AuditArchToCPUArch(req_->data.arch), req_->data.nr,
243 {req_->data.args[0], req_->data.args[1], req_->data.args[2],
244 req_->data.args[3], req_->data.args[4], req_->data.args[5]},
245 req_->pid, 0, req_->data.instruction_pointer);
246 absl::StatusOr<uint32_t> policy_ret =
247 bpf::Evaluate(original_policy_, req_->data);
248 if (!policy_ret.ok()) {
249 LOG(ERROR) << "Failed to evaluate policy: " << policy_ret.status();
250 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_NOTIFY);
251 }
252 const sock_filter trace_action = SANDBOX2_TRACE;
253 bool should_trace = *policy_ret == trace_action.k;
254 Notify::TraceAction trace_response = Notify::TraceAction::kDeny;
255 if (should_trace) {
256 trace_response = notify_->EventSyscallTrace(syscall);
257 }
258 switch (trace_response) {
259 case Notify::TraceAction::kAllow:
260 AllowSyscallViaUnotify();
261 return;
262 case Notify::TraceAction::kDeny:
263 HandleViolation(syscall);
264 return;
265 case Notify::TraceAction::kInspectAfterReturn:
266 LOG(FATAL) << "TraceAction::kInspectAfterReturn not supported by unotify "
267 "monitor";
268 default:
269 LOG(FATAL) << "Unknown TraceAction: " << static_cast<int>(trace_response);
270 }
271 }
272
Run()273 void UnotifyMonitor::Run() {
274 absl::Cleanup monitor_done = [this] {
275 getrusage(RUSAGE_THREAD, result_.GetRUsageMonitor());
276 OnDone();
277 };
278
279 absl::Cleanup setup_notify = [this] { setup_notification_.Notify(); };
280 if (!InitSetupUnotify()) {
281 SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
282 return;
283 }
284 if (!InitSetupNotifyEventFd()) {
285 SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
286 return;
287 }
288
289 std::move(setup_notify).Invoke();
290
291 pollfd pfds[] = {
292 {.fd = process_.status_fd.get(), .events = POLLIN},
293 {.fd = seccomp_notify_fd_.get(), .events = POLLIN},
294 {.fd = monitor_notify_fd_.get(), .events = POLLIN},
295 };
296 while (result_.final_status() == Result::UNSET) {
297 int64_t deadline = deadline_millis_.load(std::memory_order_relaxed);
298 absl::Duration remaining = absl::FromUnixMillis(deadline) - absl::Now();
299 if (deadline != 0 && remaining <= absl::ZeroDuration()) {
300 VLOG(1) << "Sandbox process hit timeout due to the walltime timer";
301 timed_out_ = true;
302 MaybeGetStackTrace(process_.main_pid, Result::TIMEOUT);
303 KillSandboxee();
304 SetExitStatusFromStatusPipe();
305 break;
306 }
307
308 if (!external_kill_request_flag_.test_and_set(std::memory_order_relaxed)) {
309 external_kill_ = true;
310 MaybeGetStackTrace(process_.main_pid, Result::EXTERNAL_KILL);
311 KillSandboxee();
312 SetExitStatusFromStatusPipe();
313 break;
314 }
315
316 if (network_proxy_server_ &&
317 network_proxy_server_->violation_occurred_.load(
318 std::memory_order_acquire) &&
319 !network_violation_) {
320 network_violation_ = true;
321 MaybeGetStackTrace(process_.main_pid, Result::VIOLATION);
322 KillSandboxee();
323 SetExitStatusFromStatusPipe();
324 break;
325 }
326 constexpr int64_t kMinWakeupMsec = 30000;
327 int timeout_msec = kMinWakeupMsec;
328 if (remaining > absl::ZeroDuration()) {
329 timeout_msec = static_cast<int>(
330 std::min(kMinWakeupMsec, absl::ToInt64Milliseconds(remaining)));
331 }
332 int ret = poll(pfds, ABSL_ARRAYSIZE(pfds), timeout_msec);
333 if (ret == 0 || (ret == -1 && errno == EINTR)) {
334 continue;
335 }
336 if (ret == -1) {
337 PLOG(ERROR) << "waiting for action failed";
338 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
339 break;
340 }
341 if (pfds[2].revents & POLLIN) {
342 uint64_t value = 0;
343 (void)read(monitor_notify_fd_.get(), &value, sizeof(value));
344 continue;
345 }
346 if (pfds[0].revents & POLLIN) {
347 SetExitStatusFromStatusPipe();
348 break;
349 }
350 if (pfds[0].revents & POLLHUP) {
351 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
352 break;
353 }
354 if (pfds[1].revents & POLLIN) {
355 HandleUnotify();
356 }
357 }
358 KillInit();
359 }
360
SetExitStatusFromStatusPipe()361 void UnotifyMonitor::SetExitStatusFromStatusPipe() {
362 int code, status;
363 rusage usage;
364
365 std::vector<iovec> iov = {
366 {.iov_base = &code, .iov_len = sizeof(code)},
367 {.iov_base = &status, .iov_len = sizeof(status)},
368 {.iov_base = &usage, .iov_len = sizeof(usage)},
369 };
370
371 if (absl::Status status = ReadWholeWithDeadline(
372 process_.status_fd.get(), iov, absl::Now() + absl::Seconds(1));
373 !status.ok()) {
374 PLOG(ERROR) << "reading status pipe failed " << status;
375 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
376 return;
377 }
378
379 result_.SetRUsageSandboxee(usage);
380 if (code == CLD_EXITED) {
381 SetExitStatusCode(Result::OK, status);
382 } else if (code == CLD_KILLED || code == CLD_DUMPED) {
383 if (network_violation_) {
384 SetExitStatusCode(Result::VIOLATION, Result::VIOLATION_NETWORK);
385 result_.SetNetworkViolation(network_proxy_server_->violation_msg_);
386 } else if (external_kill_) {
387 SetExitStatusCode(Result::EXTERNAL_KILL, 0);
388 } else if (timed_out_) {
389 SetExitStatusCode(Result::TIMEOUT, 0);
390 } else {
391 SetExitStatusCode(Result::SIGNALED, status);
392 }
393 } else {
394 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
395 }
396 }
397
InitSetupUnotify()398 bool UnotifyMonitor::InitSetupUnotify() {
399 if (!comms_->SendUint32(Client::kSandbox2ClientUnotify)) {
400 LOG(ERROR) << "Couldn't send Client::kSandbox2ClientUnotify message";
401 return false;
402 }
403 int fd;
404 if (!comms_->RecvFD(&fd)) {
405 LOG(ERROR) << "Couldn't recv unotify fd";
406 return false;
407 }
408 seccomp_notify_fd_ = FDCloser(fd);
409 struct seccomp_notif_sizes sizes = {};
410 if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) {
411 LOG(ERROR) << "Couldn't get seccomp_notif_sizes";
412 return false;
413 }
414 req_size_ = sizes.seccomp_notif;
415 req_.reset(static_cast<seccomp_notif*>(malloc(req_size_)));
416 resp_size_ = sizes.seccomp_notif_resp;
417 resp_.reset(static_cast<seccomp_notif_resp*>(malloc(resp_size_)));
418 return true;
419 }
420
InitSetupNotifyEventFd()421 bool UnotifyMonitor::InitSetupNotifyEventFd() {
422 int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
423 if (fd == -1) {
424 PLOG(ERROR) << "failed creating monitor pipe";
425 return false;
426 }
427 monitor_notify_fd_ = FDCloser(fd);
428 return true;
429 }
430
NotifyMonitor()431 void UnotifyMonitor::NotifyMonitor() {
432 absl::ReaderMutexLock lock(¬ify_mutex_);
433 if (monitor_notify_fd_.get() < 0) {
434 return;
435 }
436 uint64_t value = 1;
437 PCHECK(write(monitor_notify_fd_.get(), &value, sizeof(value)) ==
438 sizeof(value));
439 }
440
KillSandboxee()441 bool UnotifyMonitor::KillSandboxee() {
442 VLOG(1) << "Sending SIGKILL to the PID: " << process_.main_pid;
443 if (kill(process_.main_pid, SIGKILL) != 0) {
444 PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.main_pid;
445 return false;
446 }
447 return true;
448 }
449
KillInit()450 void UnotifyMonitor::KillInit() {
451 VLOG(1) << "Sending SIGKILL to the PID: " << process_.init_pid;
452 if (kill(process_.init_pid, SIGKILL) != 0) {
453 PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.init_pid;
454 }
455 }
456
Join()457 void UnotifyMonitor::Join() {
458 absl::MutexLock lock(¬ify_mutex_);
459 if (thread_.IsJoinable()) {
460 thread_.Join();
461 CHECK(IsDone()) << "Monitor did not terminate";
462 VLOG(1) << "Final execution status: " << result_.ToString();
463 CHECK(result_.final_status() != Result::UNSET);
464 monitor_notify_fd_.Close();
465 }
466 }
467
MaybeGetStackTrace(pid_t pid,Result::StatusEnum status)468 void UnotifyMonitor::MaybeGetStackTrace(pid_t pid, Result::StatusEnum status) {
469 if (ShouldCollectStackTrace(status)) {
470 auto stack = GetStackTrace(pid);
471 if (stack.ok()) {
472 result_.set_stack_trace(*stack);
473 } else {
474 LOG(ERROR) << "Getting stack trace: " << stack.status();
475 }
476 }
477 }
478
GetStackTrace(pid_t pid)479 absl::StatusOr<std::vector<std::string>> UnotifyMonitor::GetStackTrace(
480 pid_t pid) {
481 if (ptrace(PTRACE_ATTACH, pid, 0, 0) != 0) {
482 return absl::ErrnoToStatus(errno,
483 absl::StrCat("could not attach to pid = ", pid));
484 }
485 int wstatus = 0;
486 while (!WIFSTOPPED(wstatus)) {
487 pid_t ret =
488 waitpid(pid, &wstatus, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
489 if (ret == -1) {
490 return absl::ErrnoToStatus(errno,
491 absl::StrCat("waiting for stop, pid = ", pid));
492 }
493 }
494 absl::Cleanup cleanup = [pid] {
495 if (ptrace(PTRACE_DETACH, pid, 0, 0) != 0) {
496 LOG(ERROR) << "Could not detach after obtaining stack trace from pid = "
497 << pid;
498 }
499 };
500 Regs regs(pid);
501 absl::Status status = regs.Fetch();
502 if (!status.ok()) {
503 if (absl::IsNotFound(status)) {
504 LOG(WARNING) << "failed to fetch regs: " << status;
505 return status;
506 }
507 SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_FETCH);
508 return status;
509 }
510 return GetAndLogStackTrace(®s);
511 }
512
513 } // namespace sandbox2
514