1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Implementation of the sandbox2::Policy class.
16
17 #include "sandboxed_api/sandbox2/policy.h"
18
19 #include <fcntl.h>
20 #include <linux/audit.h>
21 #include <linux/bpf_common.h>
22 #include <linux/filter.h>
23 #include <linux/seccomp.h>
24 #include <sched.h>
25 #include <sys/mman.h>
26 #include <syscall.h>
27
28 #include <cerrno>
29 #include <cstdint>
30 #include <limits>
31 #include <optional>
32 #include <string>
33 #include <vector>
34
35 #include "absl/flags/flag.h"
36 #include "absl/log/log.h"
37 #include "absl/strings/string_view.h"
38 #include "sandboxed_api/config.h"
39 #include "sandboxed_api/sandbox2/bpfdisassembler.h"
40 #include "sandboxed_api/sandbox2/syscall.h"
41 #include "sandboxed_api/sandbox2/util.h"
42 #include "sandboxed_api/sandbox2/util/bpf_helper.h"
43
// Fallback definition for builds whose system headers do not provide this
// seccomp filter flag. It is only used below to deny seccomp calls that
// request a new listener.
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
47
// Debugging escape hatches. If either flag is set (the bool is true, or the
// string is non-empty), Policy::GetPolicy() returns the tracking policy
// instead of the default policy + user policy.
ABSL_FLAG(bool, sandbox2_danger_danger_permit_all, false,
          "Allow all syscalls, useful for testing");
ABSL_FLAG(std::string, sandbox2_danger_danger_permit_all_and_log, "",
          "Allow all syscalls and log them into specified file");
52
53 namespace sandbox2 {
54
55 // The final policy is the concatenation of:
56 // 1. default policy (GetDefaultPolicy, private),
57 // 2. user policy (user_policy_, public),
58 // 3. default KILL action (avoid failing open if user policy did not do it).
GetPolicy(bool user_notif) const59 std::vector<sock_filter> Policy::GetPolicy(bool user_notif) const {
60 if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all) ||
61 !absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log).empty()) {
62 return GetTrackingPolicy();
63 }
64
65 // Now we can start building the policy.
66 // 1. Start with the default policy (e.g. syscall architecture checks).
67 auto policy = GetDefaultPolicy(user_notif);
68 VLOG(3) << "Default policy:\n" << bpf::Disasm(policy);
69
70 // 2. Append user policy.
71 VLOG(3) << "User policy:\n" << bpf::Disasm(user_policy_);
72 // Add default syscall_nr loading in case the user forgets.
73 policy.push_back(LOAD_SYSCALL_NR);
74 policy.insert(policy.end(), user_policy_.begin(), user_policy_.end());
75
76 // 3. Finish with default KILL action.
77 policy.push_back(KILL);
78
79 VLOG(2) << "Final policy:\n" << bpf::Disasm(policy);
80 return policy;
81 }
82
83 // If you modify this function, you should also modify.
84 // Monitor::LogAccessViolation to keep them in sync.
85 //
86 // Produces a policy which returns SECCOMP_RET_TRACE instead of SECCOMP_RET_KILL
87 // for the __NR_execve syscall, so the tracer can make a decision to allow or
88 // disallow it depending on which occurrence of __NR_execve it was.
GetDefaultPolicy(bool user_notif) const89 std::vector<sock_filter> Policy::GetDefaultPolicy(bool user_notif) const {
90 bpf_labels l = {0};
91
92 std::vector<sock_filter> policy;
93 if (user_notif) {
94 policy = {
95 // If compiled arch is different from the runtime one, inform the
96 // Monitor.
97 LOAD_ARCH,
98 JNE32(Syscall::GetHostAuditArch(), DENY),
99 LOAD_SYSCALL_NR,
100 JNE32(__NR_seccomp, JUMP(&l, past_seccomp_l)),
101 ARG_32(3),
102 JNE32(internal::kExecveMagic, JUMP(&l, past_seccomp_l)),
103 ALLOW,
104 LABEL(&l, past_seccomp_l),
105 LOAD_SYSCALL_NR,
106 JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
107 ARG_32(4),
108 JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
109 ARG_32(5),
110 JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
111 ALLOW,
112 LABEL(&l, past_execveat_l),
113
114 LOAD_SYSCALL_NR,
115 };
116 } else {
117 policy = {
118 // If compiled arch is different from the runtime one, inform the
119 // Monitor.
120 LOAD_ARCH,
121 JEQ32(Syscall::GetHostAuditArch(), JUMP(&l, past_arch_check_l)),
122 #if defined(SAPI_X86_64)
123 JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)), // 32-bit sandboxee
124 #endif
125 TRACE(sapi::cpu::kUnknown),
126 LABEL(&l, past_arch_check_l),
127
128 // After the policy is uploaded, forkserver will execve the sandboxee.
129 // We need to allow this execve but not others. Since BPF does not have
130 // state, we need to inform the Monitor to decide, and for that we use a
131 // magic value in syscall args 5. Note that this value is not supposed
132 // to be secret, but just an optimization so that the monitor is not
133 // triggered on every call to execveat.
134 LOAD_SYSCALL_NR,
135 JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
136 ARG_32(4),
137 JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
138 ARG_32(5),
139 JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
140 SANDBOX2_TRACE,
141 LABEL(&l, past_execveat_l),
142
143 LOAD_SYSCALL_NR,
144 };
145 }
146
147 // Insert a custom syscall to signal the sandboxee it's running inside a
148 // sandbox.
149 // Executing a syscall with ID util::kMagicSyscallNo will return
150 // util::kMagicSyscallErr when the call by the sandboxee code is made inside
151 // the sandbox and ENOSYS when it is not inside the sandbox.
152 policy.insert(policy.end(), {SYSCALL(internal::kMagicSyscallNo,
153 ERRNO(internal::kMagicSyscallErr))});
154
155 // Forbid ptrace because it's unsafe or too risky. The user policy can only
156 // block (i.e. return an error instead of killing the process) but not allow
157 // ptrace. This uses LOAD_SYSCALL_NR from above.
158 if (!user_policy_handles_ptrace_) {
159 policy.insert(policy.end(), {JEQ32(__NR_ptrace, DENY)});
160 }
161
162 // If user policy doesn't mention it, then forbid bpf because it's unsafe or
163 // too risky. This uses LOAD_SYSCALL_NR from above.
164 if (!user_policy_handles_bpf_) {
165 policy.insert(policy.end(), {JEQ32(__NR_bpf, DENY)});
166 }
167
168 if (!allow_map_exec_) {
169 policy.insert(
170 policy.end(),
171 {
172 #ifdef __NR_mmap
173 JNE32(__NR_mmap, JUMP(&l, past_map_exec_l)),
174 #endif
175 #ifdef __NR_mmap2 // Arm32
176 JNE32(__NR_mmap2, JUMP(&l, past_map_exec_l)),
177 #endif
178 JNE32(__NR_mprotect, JUMP(&l, past_map_exec_l)),
179 #ifdef __NR_pkey_mprotect
180 JNE32(__NR_pkey_mprotect, JUMP(&l, past_map_exec_l)),
181 #endif
182 // Load "prot" argument, which is the same for all four syscalls.
183 ARG_32(2),
184 // Deny executable mappings. This also disallows them for all PKEYS
185 // (not just the default one).
186 JA32(PROT_EXEC, DENY),
187
188 LABEL(&l, past_map_exec_l),
189 LOAD_SYSCALL_NR,
190 });
191 }
192
193 #ifndef CLONE_NEWCGROUP
194 #define CLONE_NEWCGROUP 0x02000000
195 #endif
196 constexpr uintptr_t kNewNamespacesFlags =
197 CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS |
198 CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWPID;
199 static_assert(kNewNamespacesFlags <= std::numeric_limits<uint32_t>::max());
200 constexpr uintptr_t kUnsafeCloneFlags = kNewNamespacesFlags | CLONE_UNTRACED;
201 static_assert(kUnsafeCloneFlags <= std::numeric_limits<uint32_t>::max());
202 policy.insert(policy.end(),
203 {
204 #ifdef __NR_clone3
205 // Disallow clone3. Errno instead of DENY so that libraries
206 // can fallback to regular clone/clone2.
207 JEQ32(__NR_clone3, ERRNO(ENOSYS)),
208 #endif
209 // Disallow clone3 and clone with unsafe flags. This uses
210 // LOAD_SYSCALL_NR from above.
211 JNE32(__NR_clone, JUMP(&l, past_clone_unsafe_l)),
212 // Regardless of arch, we only care about the lower 32-bits
213 // of the flags.
214 ARG_32(0),
215 JA32(kUnsafeCloneFlags, DENY),
216 LABEL(&l, past_clone_unsafe_l),
217 // Disallow unshare with unsafe flags.
218 LOAD_SYSCALL_NR,
219 JNE32(__NR_unshare, JUMP(&l, past_unshare_unsafe_l)),
220 // Regardless of arch, we only care about the lower 32-bits
221 // of the flags.
222 ARG_32(0),
223 JA32(kNewNamespacesFlags, DENY),
224 LABEL(&l, past_unshare_unsafe_l),
225 // Disallow seccomp with SECCOMP_FILTER_FLAG_NEW_LISTENER
226 // flag.
227 LOAD_SYSCALL_NR,
228 JNE32(__NR_seccomp, JUMP(&l, past_seccomp_new_listener)),
229 // Regardless of arch, we only care about the lower 32-bits
230 // of the flags.
231 ARG_32(1),
232 JA32(SECCOMP_FILTER_FLAG_NEW_LISTENER, DENY),
233 LABEL(&l, past_seccomp_new_listener),
234 });
235
236 if (bpf_resolve_jumps(&l, policy.data(), policy.size()) != 0) {
237 LOG(FATAL) << "Cannot resolve bpf jumps";
238 }
239
240 return policy;
241 }
242
GetTrackingPolicy() const243 std::vector<sock_filter> Policy::GetTrackingPolicy() const {
244 return {
245 LOAD_ARCH,
246 #if defined(SAPI_X86_64)
247 JEQ32(AUDIT_ARCH_X86_64, TRACE(sapi::cpu::kX8664)),
248 JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)),
249 #elif defined(SAPI_PPC64_LE)
250 JEQ32(AUDIT_ARCH_PPC64LE, TRACE(sapi::cpu::kPPC64LE)),
251 #elif defined(SAPI_ARM64)
252 JEQ32(AUDIT_ARCH_AARCH64, TRACE(sapi::cpu::kArm64)),
253 #elif defined(SAPI_ARM)
254 JEQ32(AUDIT_ARCH_ARM, TRACE(sapi::cpu::kArm)),
255 #endif
256 TRACE(sapi::cpu::kUnknown),
257 };
258 }
259
260 } // namespace sandbox2
261