• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Implementation of the sandbox2::Policy class.
16 
17 #include "sandboxed_api/sandbox2/policy.h"
18 
19 #include <fcntl.h>
20 #include <linux/audit.h>
21 #include <linux/bpf_common.h>
22 #include <linux/filter.h>
23 #include <linux/seccomp.h>
24 #include <sched.h>
25 #include <sys/mman.h>
26 #include <syscall.h>
27 
28 #include <cerrno>
29 #include <cstdint>
30 #include <limits>
31 #include <optional>
32 #include <string>
33 #include <vector>
34 
35 #include "absl/flags/flag.h"
36 #include "absl/log/log.h"
37 #include "absl/strings/string_view.h"
38 #include "sandboxed_api/config.h"
39 #include "sandboxed_api/sandbox2/bpfdisassembler.h"
40 #include "sandboxed_api/sandbox2/syscall.h"
41 #include "sandboxed_api/sandbox2/util.h"
42 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
43 
// Fallback for builds where the included headers do not define
// SECCOMP_FILTER_FLAG_NEW_LISTENER. The constant is used further down in this
// file to deny seccomp() calls that pass this flag.
44 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
45 #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
46 #endif
47 
// Testing-only switches. When sandbox2_danger_danger_permit_all is true, or
// sandbox2_danger_danger_permit_all_and_log is non-empty, Policy::GetPolicy()
// returns the tracking policy instead of the default + user policy.
48 ABSL_FLAG(bool, sandbox2_danger_danger_permit_all, false,
49           "Allow all syscalls, useful for testing");
50 ABSL_FLAG(std::string, sandbox2_danger_danger_permit_all_and_log, "",
51           "Allow all syscalls and log them into specified file");
52 
53 namespace sandbox2 {
54 
55 // The final policy is the concatenation of:
56 //   1. default policy (GetDefaultPolicy, private),
57 //   2. user policy (user_policy_, public),
58 //   3. default KILL action (avoid failing open if user policy did not do it).
GetPolicy(bool user_notif) const59 std::vector<sock_filter> Policy::GetPolicy(bool user_notif) const {
60   if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all) ||
61       !absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log).empty()) {
62     return GetTrackingPolicy();
63   }
64 
65   // Now we can start building the policy.
66   // 1. Start with the default policy (e.g. syscall architecture checks).
67   auto policy = GetDefaultPolicy(user_notif);
68   VLOG(3) << "Default policy:\n" << bpf::Disasm(policy);
69 
70   // 2. Append user policy.
71   VLOG(3) << "User policy:\n" << bpf::Disasm(user_policy_);
72   // Add default syscall_nr loading in case the user forgets.
73   policy.push_back(LOAD_SYSCALL_NR);
74   policy.insert(policy.end(), user_policy_.begin(), user_policy_.end());
75 
76   // 3. Finish with default KILL action.
77   policy.push_back(KILL);
78 
79   VLOG(2) << "Final policy:\n" << bpf::Disasm(policy);
80   return policy;
81 }
82 
83 // If you modify this function, you should also modify.
84 // Monitor::LogAccessViolation to keep them in sync.
85 //
86 // Produces a policy which returns SECCOMP_RET_TRACE instead of SECCOMP_RET_KILL
87 // for the __NR_execve syscall, so the tracer can make a decision to allow or
88 // disallow it depending on which occurrence of __NR_execve it was.
GetDefaultPolicy(bool user_notif) const89 std::vector<sock_filter> Policy::GetDefaultPolicy(bool user_notif) const {
90   bpf_labels l = {0};
91 
92   std::vector<sock_filter> policy;
93   if (user_notif) {
94     policy = {
95         // If compiled arch is different from the runtime one, inform the
96         // Monitor.
97         LOAD_ARCH,
98         JNE32(Syscall::GetHostAuditArch(), DENY),
99         LOAD_SYSCALL_NR,
100         JNE32(__NR_seccomp, JUMP(&l, past_seccomp_l)),
101         ARG_32(3),
102         JNE32(internal::kExecveMagic, JUMP(&l, past_seccomp_l)),
103         ALLOW,
104         LABEL(&l, past_seccomp_l),
105         LOAD_SYSCALL_NR,
106         JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
107         ARG_32(4),
108         JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
109         ARG_32(5),
110         JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
111         ALLOW,
112         LABEL(&l, past_execveat_l),
113 
114         LOAD_SYSCALL_NR,
115     };
116   } else {
117     policy = {
118         // If compiled arch is different from the runtime one, inform the
119         // Monitor.
120         LOAD_ARCH,
121         JEQ32(Syscall::GetHostAuditArch(), JUMP(&l, past_arch_check_l)),
122 #if defined(SAPI_X86_64)
123         JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)),  // 32-bit sandboxee
124 #endif
125         TRACE(sapi::cpu::kUnknown),
126         LABEL(&l, past_arch_check_l),
127 
128         // After the policy is uploaded, forkserver will execve the sandboxee.
129         // We need to allow this execve but not others. Since BPF does not have
130         // state, we need to inform the Monitor to decide, and for that we use a
131         // magic value in syscall args 5. Note that this value is not supposed
132         // to be secret, but just an optimization so that the monitor is not
133         // triggered on every call to execveat.
134         LOAD_SYSCALL_NR,
135         JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
136         ARG_32(4),
137         JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
138         ARG_32(5),
139         JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
140         SANDBOX2_TRACE,
141         LABEL(&l, past_execveat_l),
142 
143         LOAD_SYSCALL_NR,
144     };
145   }
146 
147   // Insert a custom syscall to signal the sandboxee it's running inside a
148   // sandbox.
149   // Executing a syscall with ID util::kMagicSyscallNo will return
150   // util::kMagicSyscallErr when the call by the sandboxee code is made inside
151   // the sandbox and ENOSYS when it is not inside the sandbox.
152   policy.insert(policy.end(), {SYSCALL(internal::kMagicSyscallNo,
153                                        ERRNO(internal::kMagicSyscallErr))});
154 
155   // Forbid ptrace because it's unsafe or too risky. The user policy can only
156   // block (i.e. return an error instead of killing the process) but not allow
157   // ptrace. This uses LOAD_SYSCALL_NR from above.
158   if (!user_policy_handles_ptrace_) {
159     policy.insert(policy.end(), {JEQ32(__NR_ptrace, DENY)});
160   }
161 
162   // If user policy doesn't mention it, then forbid bpf because it's unsafe or
163   // too risky. This uses LOAD_SYSCALL_NR from above.
164   if (!user_policy_handles_bpf_) {
165     policy.insert(policy.end(), {JEQ32(__NR_bpf, DENY)});
166   }
167 
168   if (!allow_map_exec_) {
169     policy.insert(
170         policy.end(),
171         {
172 #ifdef __NR_mmap
173             JNE32(__NR_mmap, JUMP(&l, past_map_exec_l)),
174 #endif
175 #ifdef __NR_mmap2  // Arm32
176             JNE32(__NR_mmap2, JUMP(&l, past_map_exec_l)),
177 #endif
178             JNE32(__NR_mprotect, JUMP(&l, past_map_exec_l)),
179 #ifdef __NR_pkey_mprotect
180             JNE32(__NR_pkey_mprotect, JUMP(&l, past_map_exec_l)),
181 #endif
182             // Load "prot" argument, which is the same for all four syscalls.
183             ARG_32(2),
184             // Deny executable mappings. This also disallows them for all PKEYS
185             // (not just the default one).
186             JA32(PROT_EXEC, DENY),
187 
188             LABEL(&l, past_map_exec_l),
189             LOAD_SYSCALL_NR,
190         });
191   }
192 
193 #ifndef CLONE_NEWCGROUP
194 #define CLONE_NEWCGROUP 0x02000000
195 #endif
196   constexpr uintptr_t kNewNamespacesFlags =
197       CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS |
198       CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWPID;
199   static_assert(kNewNamespacesFlags <= std::numeric_limits<uint32_t>::max());
200   constexpr uintptr_t kUnsafeCloneFlags = kNewNamespacesFlags | CLONE_UNTRACED;
201   static_assert(kUnsafeCloneFlags <= std::numeric_limits<uint32_t>::max());
202   policy.insert(policy.end(),
203                 {
204 #ifdef __NR_clone3
205                     // Disallow clone3. Errno instead of DENY so that libraries
206                     // can fallback to regular clone/clone2.
207                     JEQ32(__NR_clone3, ERRNO(ENOSYS)),
208 #endif
209                     // Disallow clone3 and clone with unsafe flags.  This uses
210                     // LOAD_SYSCALL_NR from above.
211                     JNE32(__NR_clone, JUMP(&l, past_clone_unsafe_l)),
212                     // Regardless of arch, we only care about the lower 32-bits
213                     // of the flags.
214                     ARG_32(0),
215                     JA32(kUnsafeCloneFlags, DENY),
216                     LABEL(&l, past_clone_unsafe_l),
217                     // Disallow unshare with unsafe flags.
218                     LOAD_SYSCALL_NR,
219                     JNE32(__NR_unshare, JUMP(&l, past_unshare_unsafe_l)),
220                     // Regardless of arch, we only care about the lower 32-bits
221                     // of the flags.
222                     ARG_32(0),
223                     JA32(kNewNamespacesFlags, DENY),
224                     LABEL(&l, past_unshare_unsafe_l),
225                     // Disallow seccomp with SECCOMP_FILTER_FLAG_NEW_LISTENER
226                     // flag.
227                     LOAD_SYSCALL_NR,
228                     JNE32(__NR_seccomp, JUMP(&l, past_seccomp_new_listener)),
229                     // Regardless of arch, we only care about the lower 32-bits
230                     // of the flags.
231                     ARG_32(1),
232                     JA32(SECCOMP_FILTER_FLAG_NEW_LISTENER, DENY),
233                     LABEL(&l, past_seccomp_new_listener),
234                 });
235 
236   if (bpf_resolve_jumps(&l, policy.data(), policy.size()) != 0) {
237     LOG(FATAL) << "Cannot resolve bpf jumps";
238   }
239 
240   return policy;
241 }
242 
GetTrackingPolicy() const243 std::vector<sock_filter> Policy::GetTrackingPolicy() const {
244   return {
245       LOAD_ARCH,
246 #if defined(SAPI_X86_64)
247       JEQ32(AUDIT_ARCH_X86_64, TRACE(sapi::cpu::kX8664)),
248       JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)),
249 #elif defined(SAPI_PPC64_LE)
250       JEQ32(AUDIT_ARCH_PPC64LE, TRACE(sapi::cpu::kPPC64LE)),
251 #elif defined(SAPI_ARM64)
252       JEQ32(AUDIT_ARCH_AARCH64, TRACE(sapi::cpu::kArm64)),
253 #elif defined(SAPI_ARM)
254       JEQ32(AUDIT_ARCH_ARM, TRACE(sapi::cpu::kArm)),
255 #endif
256       TRACE(sapi::cpu::kUnknown),
257   };
258 }
259 
260 }  // namespace sandbox2
261