1 /* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #define _BSD_SOURCE
7 #define _GNU_SOURCE
8
9 #include <asm/unistd.h>
10 #include <ctype.h>
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <grp.h>
14 #include <inttypes.h>
15 #include <limits.h>
16 #include <linux/capability.h>
17 #include <pwd.h>
18 #include <sched.h>
19 #include <signal.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <syscall.h>
27 #include <sys/capability.h>
28 #include <sys/mount.h>
29 #include <sys/param.h>
30 #include <sys/prctl.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 #include <sys/user.h>
34 #include <sys/utsname.h>
35 #include <sys/wait.h>
36 #include <unistd.h>
37
38 #include "libminijail.h"
39 #include "libminijail-private.h"
40
41 #include "signal_handler.h"
42 #include "syscall_filter.h"
43 #include "util.h"
44
45 #ifdef HAVE_SECUREBITS_H
46 #include <linux/securebits.h>
47 #else
48 #define SECURE_ALL_BITS 0x15
49 #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
50 #endif
51
52 /* Until these are reliably available in linux/prctl.h */
53 #ifndef PR_SET_SECCOMP
54 # define PR_SET_SECCOMP 22
55 #endif
56
57 #ifndef PR_ALT_SYSCALL
58 # define PR_ALT_SYSCALL 0x43724f53
59 #endif
60
61 /* For seccomp_filter using BPF. */
62 #ifndef PR_SET_NO_NEW_PRIVS
63 # define PR_SET_NO_NEW_PRIVS 38
64 #endif
65 #ifndef SECCOMP_MODE_FILTER
66 # define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
67 #endif
68
69 #ifdef USE_SECCOMP_SOFTFAIL
70 # define SECCOMP_SOFTFAIL 1
71 #else
72 # define SECCOMP_SOFTFAIL 0
73 #endif
74
75 #define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
76
/*
 * One entry in a jail's singly-linked list of mounts, built by
 * minijail_mount(). The three strings are heap copies made with strdup(3).
 */
struct mountpoint {
	char *src;			/* Source argument for mount(2). */
	char *dest;			/* Target; minijail_mount() requires a leading '/'. */
	char *type;			/* Filesystem type; minijail_bind() passes "". */
	unsigned long flags;		/* Mount flags (MS_*). */
	struct mountpoint *next;	/* Next entry, or NULL at the tail. */
};
84
85 struct minijail {
86 /*
87 * WARNING: if you add a flag here you need to make sure it's
88 * accounted for in minijail_pre{enter|exec}() below.
89 */
90 struct {
91 int uid:1;
92 int gid:1;
93 int usergroups:1;
94 int suppl_gids:1;
95 int caps:1;
96 int vfs:1;
97 int enter_vfs:1;
98 int pids:1;
99 int ipc:1;
100 int net:1;
101 int enter_net:1;
102 int userns:1;
103 int seccomp:1;
104 int remount_proc_ro:1;
105 int no_new_privs:1;
106 int seccomp_filter:1;
107 int log_seccomp_filter:1;
108 int chroot:1;
109 int pivot_root:1;
110 int mount_tmp:1;
111 int do_init:1;
112 int pid_file:1;
113 int cgroups:1;
114 int alt_syscall:1;
115 int reset_signal_mask:1;
116 } flags;
117 uid_t uid;
118 gid_t gid;
119 gid_t usergid;
120 char *user;
121 size_t suppl_gid_count;
122 gid_t *suppl_gid_list;
123 uint64_t caps;
124 pid_t initpid;
125 int mountns_fd;
126 int netns_fd;
127 char *chrootdir;
128 char *pid_file_path;
129 char *uidmap;
130 char *gidmap;
131 size_t filter_len;
132 struct sock_fprog *filter_prog;
133 char *alt_syscall_table;
134 struct mountpoint *mounts_head;
135 struct mountpoint *mounts_tail;
136 size_t mounts_count;
137 char *cgroups[MAX_CGROUPS];
138 size_t cgroup_count;
139 };
140
141 /*
142 * Strip out flags meant for the parent.
143 * We keep things that are not inherited across execve(2) (e.g. capabilities),
144 * or are easier to set after execve(2) (e.g. seccomp filters).
145 */
minijail_preenter(struct minijail * j)146 void minijail_preenter(struct minijail *j)
147 {
148 j->flags.vfs = 0;
149 j->flags.enter_vfs = 0;
150 j->flags.remount_proc_ro = 0;
151 j->flags.pids = 0;
152 j->flags.do_init = 0;
153 j->flags.pid_file = 0;
154 j->flags.cgroups = 0;
155 }
156
157 /*
158 * Strip out flags meant for the child.
159 * We keep things that are inherited across execve(2).
160 */
minijail_preexec(struct minijail * j)161 void minijail_preexec(struct minijail *j)
162 {
163 int vfs = j->flags.vfs;
164 int enter_vfs = j->flags.enter_vfs;
165 int remount_proc_ro = j->flags.remount_proc_ro;
166 int userns = j->flags.userns;
167 if (j->user)
168 free(j->user);
169 j->user = NULL;
170 if (j->suppl_gid_list)
171 free(j->suppl_gid_list);
172 j->suppl_gid_list = NULL;
173 memset(&j->flags, 0, sizeof(j->flags));
174 /* Now restore anything we meant to keep. */
175 j->flags.vfs = vfs;
176 j->flags.enter_vfs = enter_vfs;
177 j->flags.remount_proc_ro = remount_proc_ro;
178 j->flags.userns = userns;
179 /* Note, |pids| will already have been used before this call. */
180 }
181
/*
 * Returns 1 if the running kernel's release string parses as a version
 * older than 3.8, and 0 otherwise (including when uname(2) fails or the
 * release string does not start with "<major>.<minor>").
 */
int seccomp_kernel_support_not_required()
{
	struct utsname uts;
	int major, minor;

	if (uname(&uts) == -1)
		return 0;
	if (sscanf(uts.release, "%d.%d", &major, &minor) != 2)
		return 0;

	/* Only for 3.x does the minor number decide. */
	if (major != 3)
		return major < 3;
	return minor < 8;
}
191
/*
 * Reports whether a missing seccomp implementation may be tolerated.
 * Without SECCOMP_SOFTFAIL this is always 0. With it, non-Android builds
 * always soft-fail, while Android soft-fails only on kernels older than 3.8.
 */
int can_softfail()
{
#if SECCOMP_SOFTFAIL
	if (!is_android())
		return 1;
	return seccomp_kernel_support_not_required() ? 1 : 0;
#endif
	return 0;
}
207
208 /* Minijail API. */
209
minijail_new(void)210 struct minijail API *minijail_new(void)
211 {
212 return calloc(1, sizeof(struct minijail));
213 }
214
minijail_change_uid(struct minijail * j,uid_t uid)215 void API minijail_change_uid(struct minijail *j, uid_t uid)
216 {
217 if (uid == 0)
218 die("useless change to uid 0");
219 j->uid = uid;
220 j->flags.uid = 1;
221 }
222
minijail_change_gid(struct minijail * j,gid_t gid)223 void API minijail_change_gid(struct minijail *j, gid_t gid)
224 {
225 if (gid == 0)
226 die("useless change to gid 0");
227 j->gid = gid;
228 j->flags.gid = 1;
229 }
230
minijail_set_supplementary_gids(struct minijail * j,size_t size,const gid_t * list)231 void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
232 const gid_t *list)
233 {
234 size_t i;
235
236 if (j->flags.usergroups)
237 die("cannot inherit *and* set supplementary groups");
238
239 if (size == 0) {
240 /* Clear supplementary groups. */
241 j->suppl_gid_list = NULL;
242 j->suppl_gid_count = 0;
243 j->flags.suppl_gids = 1;
244 return;
245 }
246
247 /* Copy the gid_t array. */
248 j->suppl_gid_list = calloc(size, sizeof(gid_t));
249 if (!j->suppl_gid_list) {
250 die("failed to allocate internal supplementary group array");
251 }
252 for (i = 0; i < size; i++) {
253 j->suppl_gid_list[i] = list[i];
254 }
255 j->suppl_gid_count = size;
256 j->flags.suppl_gids = 1;
257 }
258
minijail_change_user(struct minijail * j,const char * user)259 int API minijail_change_user(struct minijail *j, const char *user)
260 {
261 char *buf = NULL;
262 struct passwd pw;
263 struct passwd *ppw = NULL;
264 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
265 if (sz == -1)
266 sz = 65536; /* your guess is as good as mine... */
267
268 /*
269 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
270 * the maximum needed size of the buffer, so we don't have to search.
271 */
272 buf = malloc(sz);
273 if (!buf)
274 return -ENOMEM;
275 getpwnam_r(user, &pw, buf, sz, &ppw);
276 /*
277 * We're safe to free the buffer here. The strings inside |pw| point
278 * inside |buf|, but we don't use any of them; this leaves the pointers
279 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) succeeded.
280 */
281 free(buf);
282 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
283 if (!ppw)
284 return -1;
285 minijail_change_uid(j, ppw->pw_uid);
286 j->user = strdup(user);
287 if (!j->user)
288 return -ENOMEM;
289 j->usergid = ppw->pw_gid;
290 return 0;
291 }
292
minijail_change_group(struct minijail * j,const char * group)293 int API minijail_change_group(struct minijail *j, const char *group)
294 {
295 char *buf = NULL;
296 struct group gr;
297 struct group *pgr = NULL;
298 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
299 if (sz == -1)
300 sz = 65536; /* and mine is as good as yours, really */
301
302 /*
303 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
304 * the maximum needed size of the buffer, so we don't have to search.
305 */
306 buf = malloc(sz);
307 if (!buf)
308 return -ENOMEM;
309 getgrnam_r(group, &gr, buf, sz, &pgr);
310 /*
311 * We're safe to free the buffer here. The strings inside gr point
312 * inside buf, but we don't use any of them; this leaves the pointers
313 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
314 */
315 free(buf);
316 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
317 if (!pgr)
318 return -1;
319 minijail_change_gid(j, pgr->gr_gid);
320 return 0;
321 }
322
minijail_use_seccomp(struct minijail * j)323 void API minijail_use_seccomp(struct minijail *j)
324 {
325 j->flags.seccomp = 1;
326 }
327
minijail_no_new_privs(struct minijail * j)328 void API minijail_no_new_privs(struct minijail *j)
329 {
330 j->flags.no_new_privs = 1;
331 }
332
minijail_use_seccomp_filter(struct minijail * j)333 void API minijail_use_seccomp_filter(struct minijail *j)
334 {
335 j->flags.seccomp_filter = 1;
336 }
337
minijail_log_seccomp_filter_failures(struct minijail * j)338 void API minijail_log_seccomp_filter_failures(struct minijail *j)
339 {
340 j->flags.log_seccomp_filter = 1;
341 }
342
minijail_use_caps(struct minijail * j,uint64_t capmask)343 void API minijail_use_caps(struct minijail *j, uint64_t capmask)
344 {
345 j->caps = capmask;
346 j->flags.caps = 1;
347 }
348
minijail_reset_signal_mask(struct minijail * j)349 void API minijail_reset_signal_mask(struct minijail* j) {
350 j->flags.reset_signal_mask = 1;
351 }
352
minijail_namespace_vfs(struct minijail * j)353 void API minijail_namespace_vfs(struct minijail *j)
354 {
355 j->flags.vfs = 1;
356 }
357
minijail_namespace_enter_vfs(struct minijail * j,const char * ns_path)358 void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
359 {
360 int ns_fd = open(ns_path, O_RDONLY);
361 if (ns_fd < 0) {
362 pdie("failed to open namespace '%s'", ns_path);
363 }
364 j->mountns_fd = ns_fd;
365 j->flags.enter_vfs = 1;
366 }
367
minijail_namespace_pids(struct minijail * j)368 void API minijail_namespace_pids(struct minijail *j)
369 {
370 j->flags.vfs = 1;
371 j->flags.remount_proc_ro = 1;
372 j->flags.pids = 1;
373 j->flags.do_init = 1;
374 }
375
minijail_namespace_ipc(struct minijail * j)376 void API minijail_namespace_ipc(struct minijail *j)
377 {
378 j->flags.ipc = 1;
379 }
380
minijail_namespace_net(struct minijail * j)381 void API minijail_namespace_net(struct minijail *j)
382 {
383 j->flags.net = 1;
384 }
385
minijail_namespace_enter_net(struct minijail * j,const char * ns_path)386 void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
387 {
388 int ns_fd = open(ns_path, O_RDONLY);
389 if (ns_fd < 0) {
390 pdie("failed to open namespace '%s'", ns_path);
391 }
392 j->netns_fd = ns_fd;
393 j->flags.enter_net = 1;
394 }
395
minijail_remount_proc_readonly(struct minijail * j)396 void API minijail_remount_proc_readonly(struct minijail *j)
397 {
398 j->flags.vfs = 1;
399 j->flags.remount_proc_ro = 1;
400 }
401
minijail_namespace_user(struct minijail * j)402 void API minijail_namespace_user(struct minijail *j)
403 {
404 j->flags.userns = 1;
405 }
406
minijail_uidmap(struct minijail * j,const char * uidmap)407 int API minijail_uidmap(struct minijail *j, const char *uidmap)
408 {
409 j->uidmap = strdup(uidmap);
410 if (!j->uidmap)
411 return -ENOMEM;
412 char *ch;
413 for (ch = j->uidmap; *ch; ch++) {
414 if (*ch == ',')
415 *ch = '\n';
416 }
417 return 0;
418 }
419
minijail_gidmap(struct minijail * j,const char * gidmap)420 int API minijail_gidmap(struct minijail *j, const char *gidmap)
421 {
422 j->gidmap = strdup(gidmap);
423 if (!j->gidmap)
424 return -ENOMEM;
425 char *ch;
426 for (ch = j->gidmap; *ch; ch++) {
427 if (*ch == ',')
428 *ch = '\n';
429 }
430 return 0;
431 }
432
minijail_inherit_usergroups(struct minijail * j)433 void API minijail_inherit_usergroups(struct minijail *j)
434 {
435 j->flags.usergroups = 1;
436 }
437
minijail_run_as_init(struct minijail * j)438 void API minijail_run_as_init(struct minijail *j)
439 {
440 /*
441 * Since the jailed program will become 'init' in the new PID namespace,
442 * Minijail does not need to fork an 'init' process.
443 */
444 j->flags.do_init = 0;
445 }
446
minijail_enter_chroot(struct minijail * j,const char * dir)447 int API minijail_enter_chroot(struct minijail *j, const char *dir)
448 {
449 if (j->chrootdir)
450 return -EINVAL;
451 j->chrootdir = strdup(dir);
452 if (!j->chrootdir)
453 return -ENOMEM;
454 j->flags.chroot = 1;
455 return 0;
456 }
457
minijail_enter_pivot_root(struct minijail * j,const char * dir)458 int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
459 {
460 if (j->chrootdir)
461 return -EINVAL;
462 j->chrootdir = strdup(dir);
463 if (!j->chrootdir)
464 return -ENOMEM;
465 j->flags.pivot_root = 1;
466 return 0;
467 }
468
/*
 * Builds "<external_path>/<path_inside_chroot>" in a freshly allocated
 * buffer. No slashes are collapsed or trimmed.
 *
 * Returns the new string, which the caller must free(3), or NULL if the
 * allocation fails.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	/* One extra char for '/' and one for '\0', hence + 2. */
	size_t pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	char *path = malloc(pathlen);

	/* Writing through a NULL buffer would be undefined behavior. */
	if (!path)
		return NULL;

	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);
	return path;
}
482
minijail_get_original_path(struct minijail * j,const char * path_inside_chroot)483 char API *minijail_get_original_path(struct minijail *j,
484 const char *path_inside_chroot)
485 {
486 struct mountpoint *b;
487
488 b = j->mounts_head;
489 while (b) {
490 /*
491 * If |path_inside_chroot| is the exact destination of a
492 * mount, then the original path is exactly the source of
493 * the mount.
494 * for example: "-b /some/path/exe,/chroot/path/exe"
495 * mount source = /some/path/exe, mount dest =
496 * /chroot/path/exe Then when getting the original path of
497 * "/chroot/path/exe", the source of that mount,
498 * "/some/path/exe" is what should be returned.
499 */
500 if (!strcmp(b->dest, path_inside_chroot))
501 return strdup(b->src);
502
503 /*
504 * If |path_inside_chroot| is within the destination path of a
505 * mount, take the suffix of the chroot path relative to the
506 * mount destination path, and append it to the mount source
507 * path.
508 */
509 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
510 const char *relative_path =
511 path_inside_chroot + strlen(b->dest);
512 return append_external_path(b->src, relative_path);
513 }
514 b = b->next;
515 }
516
517 /* If there is a chroot path, append |path_inside_chroot| to that. */
518 if (j->chrootdir)
519 return append_external_path(j->chrootdir, path_inside_chroot);
520
521 /* No chroot, so the path outside is the same as it is inside. */
522 return strdup(path_inside_chroot);
523 }
524
minijail_mount_tmp(struct minijail * j)525 void API minijail_mount_tmp(struct minijail *j)
526 {
527 j->flags.mount_tmp = 1;
528 }
529
minijail_write_pid_file(struct minijail * j,const char * path)530 int API minijail_write_pid_file(struct minijail *j, const char *path)
531 {
532 j->pid_file_path = strdup(path);
533 if (!j->pid_file_path)
534 return -ENOMEM;
535 j->flags.pid_file = 1;
536 return 0;
537 }
538
minijail_add_to_cgroup(struct minijail * j,const char * path)539 int API minijail_add_to_cgroup(struct minijail *j, const char *path)
540 {
541 if (j->cgroup_count >= MAX_CGROUPS)
542 return -ENOMEM;
543 j->cgroups[j->cgroup_count] = strdup(path);
544 if (!j->cgroups[j->cgroup_count])
545 return -ENOMEM;
546 j->cgroup_count++;
547 j->flags.cgroups = 1;
548 return 0;
549 }
550
minijail_mount(struct minijail * j,const char * src,const char * dest,const char * type,unsigned long flags)551 int API minijail_mount(struct minijail *j, const char *src, const char *dest,
552 const char *type, unsigned long flags)
553 {
554 struct mountpoint *m;
555
556 if (*dest != '/')
557 return -EINVAL;
558 m = calloc(1, sizeof(*m));
559 if (!m)
560 return -ENOMEM;
561 m->dest = strdup(dest);
562 if (!m->dest)
563 goto error;
564 m->src = strdup(src);
565 if (!m->src)
566 goto error;
567 m->type = strdup(type);
568 if (!m->type)
569 goto error;
570 m->flags = flags;
571
572 info("mount %s -> %s type '%s'", src, dest, type);
573
574 /*
575 * Force vfs namespacing so the mounts don't leak out into the
576 * containing vfs namespace.
577 */
578 minijail_namespace_vfs(j);
579
580 if (j->mounts_tail)
581 j->mounts_tail->next = m;
582 else
583 j->mounts_head = m;
584 j->mounts_tail = m;
585 j->mounts_count++;
586
587 return 0;
588
589 error:
590 free(m->src);
591 free(m->dest);
592 free(m);
593 return -ENOMEM;
594 }
595
minijail_bind(struct minijail * j,const char * src,const char * dest,int writeable)596 int API minijail_bind(struct minijail *j, const char *src, const char *dest,
597 int writeable)
598 {
599 unsigned long flags = MS_BIND;
600
601 if (!writeable)
602 flags |= MS_RDONLY;
603
604 return minijail_mount(j, src, dest, "", flags);
605 }
606
minijail_parse_seccomp_filters(struct minijail * j,const char * path)607 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
608 {
609 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
610 if ((errno == EINVAL) && can_softfail()) {
611 warn("not loading seccomp filter,"
612 " seccomp not supported");
613 j->flags.seccomp_filter = 0;
614 j->flags.log_seccomp_filter = 0;
615 j->filter_len = 0;
616 j->filter_prog = NULL;
617 j->flags.no_new_privs = 0;
618 }
619 }
620 FILE *file = fopen(path, "r");
621 if (!file) {
622 pdie("failed to open seccomp filter file '%s'", path);
623 }
624
625 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
626 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
627 die("failed to compile seccomp filter BPF program in '%s'",
628 path);
629 }
630
631 j->filter_len = fprog->len;
632 j->filter_prog = fprog;
633
634 fclose(file);
635 }
636
minijail_use_alt_syscall(struct minijail * j,const char * table)637 int API minijail_use_alt_syscall(struct minijail *j, const char *table)
638 {
639 j->alt_syscall_table = strdup(table);
640 if (!j->alt_syscall_table)
641 return -ENOMEM;
642 j->flags.alt_syscall = 1;
643 return 0;
644 }
645
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* Bytes still free at |buf|. */
	size_t total;		/* Bytes requested so far, whether or not they fit. */
	char *buf;		/* Next write position. */
};
651
marshal_state_init(struct marshal_state * state,char * buf,size_t available)652 void marshal_state_init(struct marshal_state *state,
653 char *buf, size_t available)
654 {
655 state->available = available;
656 state->buf = buf;
657 state->total = 0;
658 }
659
marshal_append(struct marshal_state * state,void * src,size_t length)660 void marshal_append(struct marshal_state *state,
661 void *src, size_t length)
662 {
663 size_t copy_len = MIN(state->available, length);
664
665 /* Up to |available| will be written. */
666 if (copy_len) {
667 memcpy(state->buf, src, copy_len);
668 state->buf += copy_len;
669 state->available -= copy_len;
670 }
671 /* |total| will contain the expected length. */
672 state->total += length;
673 }
674
minijail_marshal_helper(struct marshal_state * state,const struct minijail * j)675 void minijail_marshal_helper(struct marshal_state *state,
676 const struct minijail *j)
677 {
678 struct mountpoint *m = NULL;
679 size_t i;
680
681 marshal_append(state, (char *)j, sizeof(*j));
682 if (j->user)
683 marshal_append(state, j->user, strlen(j->user) + 1);
684 if (j->suppl_gid_list) {
685 marshal_append(state, j->suppl_gid_list,
686 j->suppl_gid_count * sizeof(gid_t));
687 }
688 if (j->chrootdir)
689 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
690 if (j->alt_syscall_table) {
691 marshal_append(state, j->alt_syscall_table,
692 strlen(j->alt_syscall_table) + 1);
693 }
694 if (j->flags.seccomp_filter && j->filter_prog) {
695 struct sock_fprog *fp = j->filter_prog;
696 marshal_append(state, (char *)fp->filter,
697 fp->len * sizeof(struct sock_filter));
698 }
699 for (m = j->mounts_head; m; m = m->next) {
700 marshal_append(state, m->src, strlen(m->src) + 1);
701 marshal_append(state, m->dest, strlen(m->dest) + 1);
702 marshal_append(state, m->type, strlen(m->type) + 1);
703 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
704 }
705 for (i = 0; i < j->cgroup_count; ++i)
706 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
707 }
708
minijail_size(const struct minijail * j)709 size_t API minijail_size(const struct minijail *j)
710 {
711 struct marshal_state state;
712 marshal_state_init(&state, NULL, 0);
713 minijail_marshal_helper(&state, j);
714 return state.total;
715 }
716
minijail_marshal(const struct minijail * j,char * buf,size_t available)717 int minijail_marshal(const struct minijail *j, char *buf, size_t available)
718 {
719 struct marshal_state state;
720 marshal_state_init(&state, buf, available);
721 minijail_marshal_helper(&state, j);
722 return (state.total > available);
723 }
724
/*
 * consumebytes: takes @length bytes off the front of a buffer.
 * @length    Number of bytes to consume
 * @buf       In/out: current read position; advanced on success
 * @buflength In/out: bytes left at @buf; reduced on success
 *
 * Returns a pointer to the first consumed byte, or NULL (leaving @buf and
 * @buflength untouched) when fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
742
/*
 * consumestr: takes one NUL-terminated string off the front of a buffer.
 * @buf       In/out: current read position; advanced past the NUL on success
 * @buflength In/out: bytes left at @buf; reduced on success
 *
 * Returns a pointer to the start of the string, or NULL when no NUL byte
 * occurs within the remaining @buflength bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const char *terminator = memchr(*buf, '\0', *buflength);

	/* There's no null-terminator inside the buffer. */
	if (terminator == NULL)
		return NULL;

	/* Consume the string together with its terminator. */
	return consumebytes((size_t)(terminator - *buf) + 1, buf, buflength);
}
758
minijail_unmarshal(struct minijail * j,char * serialized,size_t length)759 int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
760 {
761 size_t i;
762 size_t count;
763 int ret = -EINVAL;
764
765 if (length < sizeof(*j))
766 goto out;
767 memcpy((void *)j, serialized, sizeof(*j));
768 serialized += sizeof(*j);
769 length -= sizeof(*j);
770
771 /* Potentially stale pointers not used as signals. */
772 j->mounts_head = NULL;
773 j->mounts_tail = NULL;
774 j->filter_prog = NULL;
775
776 if (j->user) { /* stale pointer */
777 char *user = consumestr(&serialized, &length);
778 if (!user)
779 goto clear_pointers;
780 j->user = strdup(user);
781 if (!j->user)
782 goto clear_pointers;
783 }
784
785 if (j->suppl_gid_list) { /* stale pointer */
786 if (j->suppl_gid_count > NGROUPS_MAX) {
787 goto bad_gid_list;
788 }
789 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
790 void *gid_list_bytes =
791 consumebytes(gid_list_size, &serialized, &length);
792 if (!gid_list_bytes)
793 goto bad_gid_list;
794
795 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
796 if (!j->suppl_gid_list)
797 goto bad_gid_list;
798
799 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
800 }
801
802 if (j->chrootdir) { /* stale pointer */
803 char *chrootdir = consumestr(&serialized, &length);
804 if (!chrootdir)
805 goto bad_chrootdir;
806 j->chrootdir = strdup(chrootdir);
807 if (!j->chrootdir)
808 goto bad_chrootdir;
809 }
810
811 if (j->alt_syscall_table) { /* stale pointer */
812 char *alt_syscall_table = consumestr(&serialized, &length);
813 if (!alt_syscall_table)
814 goto bad_syscall_table;
815 j->alt_syscall_table = strdup(alt_syscall_table);
816 if (!j->alt_syscall_table)
817 goto bad_syscall_table;
818 }
819
820 if (j->flags.seccomp_filter && j->filter_len > 0) {
821 size_t ninstrs = j->filter_len;
822 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
823 ninstrs > USHRT_MAX)
824 goto bad_filters;
825
826 size_t program_len = ninstrs * sizeof(struct sock_filter);
827 void *program = consumebytes(program_len, &serialized, &length);
828 if (!program)
829 goto bad_filters;
830
831 j->filter_prog = malloc(sizeof(struct sock_fprog));
832 if (!j->filter_prog)
833 goto bad_filters;
834
835 j->filter_prog->len = ninstrs;
836 j->filter_prog->filter = malloc(program_len);
837 if (!j->filter_prog->filter)
838 goto bad_filter_prog_instrs;
839
840 memcpy(j->filter_prog->filter, program, program_len);
841 }
842
843 count = j->mounts_count;
844 j->mounts_count = 0;
845 for (i = 0; i < count; ++i) {
846 unsigned long *flags;
847 const char *dest;
848 const char *type;
849 const char *src = consumestr(&serialized, &length);
850 if (!src)
851 goto bad_mounts;
852 dest = consumestr(&serialized, &length);
853 if (!dest)
854 goto bad_mounts;
855 type = consumestr(&serialized, &length);
856 if (!type)
857 goto bad_mounts;
858 flags = consumebytes(sizeof(*flags), &serialized, &length);
859 if (!flags)
860 goto bad_mounts;
861 if (minijail_mount(j, src, dest, type, *flags))
862 goto bad_mounts;
863 }
864
865 count = j->cgroup_count;
866 j->cgroup_count = 0;
867 for (i = 0; i < count; ++i) {
868 char *cgroup = consumestr(&serialized, &length);
869 if (!cgroup)
870 goto bad_cgroups;
871 j->cgroups[i] = strdup(cgroup);
872 if (!j->cgroups[i])
873 goto bad_cgroups;
874 ++j->cgroup_count;
875 }
876
877 return 0;
878
879 bad_cgroups:
880 while (j->mounts_head) {
881 struct mountpoint *m = j->mounts_head;
882 j->mounts_head = j->mounts_head->next;
883 free(m->type);
884 free(m->dest);
885 free(m->src);
886 free(m);
887 }
888 for (i = 0; i < j->cgroup_count; ++i)
889 free(j->cgroups[i]);
890 bad_mounts:
891 if (j->flags.seccomp_filter && j->filter_len > 0) {
892 free(j->filter_prog->filter);
893 free(j->filter_prog);
894 }
895 bad_filter_prog_instrs:
896 if (j->filter_prog)
897 free(j->filter_prog);
898 bad_filters:
899 if (j->alt_syscall_table)
900 free(j->alt_syscall_table);
901 bad_syscall_table:
902 if (j->chrootdir)
903 free(j->chrootdir);
904 bad_chrootdir:
905 if (j->suppl_gid_list)
906 free(j->suppl_gid_list);
907 bad_gid_list:
908 if (j->user)
909 free(j->user);
910 clear_pointers:
911 j->user = NULL;
912 j->suppl_gid_list = NULL;
913 j->chrootdir = NULL;
914 j->alt_syscall_table = NULL;
915 j->cgroup_count = 0;
916 out:
917 return ret;
918 }
919
write_ugid_mappings(const struct minijail * j)920 static void write_ugid_mappings(const struct minijail *j)
921 {
922 int fd, ret, len;
923 size_t sz;
924 char fname[32];
925
926 sz = sizeof(fname);
927 if (j->uidmap) {
928 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
929 if (ret < 0 || (size_t)ret >= sz)
930 die("failed to write file name of uid_map");
931 fd = open(fname, O_WRONLY);
932 if (fd < 0)
933 pdie("failed to open '%s'", fname);
934 len = strlen(j->uidmap);
935 if (write(fd, j->uidmap, len) < len)
936 die("failed to set uid_map");
937 close(fd);
938 }
939 if (j->gidmap) {
940 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
941 if (ret < 0 || (size_t)ret >= sz)
942 die("failed to write file name of gid_map");
943 fd = open(fname, O_WRONLY);
944 if (fd < 0)
945 pdie("failed to open '%s'", fname);
946 len = strlen(j->gidmap);
947 if (write(fd, j->gidmap, len) < len)
948 die("failed to set gid_map");
949 close(fd);
950 }
951 }
952
/*
 * Closes both ends of the sync pipe. wait_for_parent_setup() on the other
 * side is waiting for a zero-byte read on this pipe.
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	for (end = 0; end < 2; end++)
		close(pipe_fds[end]);
}
958
/*
 * wait_for_parent_setup: Called by the child process; blocks until the
 * parent has finished its side of the setup and closed the pipe. Dies if
 * the read returns anything other than end-of-file.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char scratch;
	ssize_t nread;

	/* Drop our own write end before reading. */
	close(pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	nread = read(pipe_fds[0], &scratch, 1);
	if (nread != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
974
enter_user_namespace(const struct minijail * j)975 static void enter_user_namespace(const struct minijail *j)
976 {
977 if (j->uidmap && setresuid(0, 0, 0))
978 pdie("setresuid");
979 if (j->gidmap && setresgid(0, 0, 0))
980 pdie("setresgid");
981 }
982
983 /*
984 * mount_one: Applies mounts from @m for @j, recursing as needed.
985 * @j Minijail these mounts are for
986 * @m Head of list of mounts
987 *
988 * Returns 0 for success.
989 */
mount_one(const struct minijail * j,struct mountpoint * m)990 static int mount_one(const struct minijail *j, struct mountpoint *m)
991 {
992 int ret;
993 char *dest;
994 int remount_ro = 0;
995
996 /* |dest| has a leading "/". */
997 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
998 return -ENOMEM;
999
1000 /*
1001 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1002 * can't both be specified in the original bind mount.
1003 * Remount R/O after the initial mount.
1004 */
1005 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1006 remount_ro = 1;
1007 m->flags &= ~MS_RDONLY;
1008 }
1009
1010 ret = mount(m->src, dest, m->type, m->flags, NULL);
1011 if (ret)
1012 pdie("mount: %s -> %s", m->src, dest);
1013
1014 if (remount_ro) {
1015 m->flags |= MS_RDONLY;
1016 ret = mount(m->src, dest, NULL,
1017 m->flags | MS_REMOUNT, NULL);
1018 if (ret)
1019 pdie("bind ro: %s -> %s", m->src, dest);
1020 }
1021
1022 free(dest);
1023 if (m->next)
1024 return mount_one(j, m->next);
1025 return ret;
1026 }
1027
enter_chroot(const struct minijail * j)1028 int enter_chroot(const struct minijail *j)
1029 {
1030 int ret;
1031
1032 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1033 return ret;
1034
1035 if (chroot(j->chrootdir))
1036 return -errno;
1037
1038 if (chdir("/"))
1039 return -errno;
1040
1041 return 0;
1042 }
1043
enter_pivot_root(const struct minijail * j)1044 int enter_pivot_root(const struct minijail *j)
1045 {
1046 int ret, oldroot, newroot;
1047
1048 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
1049 return ret;
1050
1051 /*
1052 * Keep the fd for both old and new root.
1053 * It will be used in fchdir later.
1054 */
1055 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1056 if (oldroot < 0)
1057 pdie("failed to open / for fchdir");
1058 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
1059 if (newroot < 0)
1060 pdie("failed to open %s for fchdir", j->chrootdir);
1061
1062 /*
1063 * To ensure chrootdir is the root of a file system,
1064 * do a self bind mount.
1065 */
1066 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1067 pdie("failed to bind mount '%s'", j->chrootdir);
1068 if (chdir(j->chrootdir))
1069 return -errno;
1070 if (syscall(SYS_pivot_root, ".", "."))
1071 pdie("pivot_root");
1072
1073 /*
1074 * Now the old root is mounted on top of the new root. Use fchdir to
1075 * change to the old root and unmount it.
1076 */
1077 if (fchdir(oldroot))
1078 pdie("failed to fchdir to old /");
1079 /* The old root might be busy, so use lazy unmount. */
1080 if (umount2(".", MNT_DETACH))
1081 pdie("umount(/)");
1082 /* Change back to the new root. */
1083 if (fchdir(newroot))
1084 return -errno;
1085 if (chroot("/"))
1086 return -errno;
1087 /* Set correct CWD for getcwd(3). */
1088 if (chdir("/"))
1089 return -errno;
1090
1091 return 0;
1092 }
1093
/*
 * Mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	static const char tmpfs_options[] = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_options);
}
1098
remount_proc_readonly(const struct minijail * j)1099 int remount_proc_readonly(const struct minijail *j)
1100 {
1101 const char *kProcPath = "/proc";
1102 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
1103 /*
1104 * Right now, we're holding a reference to our parent's old mount of
1105 * /proc in our namespace, which means using MS_REMOUNT here would
1106 * mutate our parent's mount as well, even though we're in a VFS
1107 * namespace (!). Instead, remove their mount from our namespace
1108 * and make our own. However, if we are in a new user namespace, /proc
1109 * is not seen as mounted, so don't return error if umount() fails.
1110 */
1111 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
1112 return -errno;
1113 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1114 return -errno;
1115 return 0;
1116 }
1117
/*
 * write_pid_to_path: writes |pid| in decimal, followed by a newline, to the
 * file at |path|, truncating or creating it. Any failure is fatal (pdie()).
 */
static void write_pid_to_path(pid_t pid, const char *path)
{
	FILE *out = fopen(path, "w");
	if (out == NULL)
		pdie("failed to open '%s'", path);

	const int printed = fprintf(out, "%d\n", (int)pid);
	if (printed < 0)
		pdie("fprintf(%s)", path);

	/* fclose() flushes, so a failure here means the write was lost. */
	if (fclose(out) != 0)
		pdie("fclose(%s)", path);
}
1129
write_pid_file(const struct minijail * j)1130 static void write_pid_file(const struct minijail *j)
1131 {
1132 write_pid_to_path(j->initpid, j->pid_file_path);
1133 }
1134
add_to_cgroups(const struct minijail * j)1135 static void add_to_cgroups(const struct minijail *j)
1136 {
1137 size_t i;
1138
1139 for (i = 0; i < j->cgroup_count; ++i)
1140 write_pid_to_path(j->initpid, j->cgroups[i]);
1141 }
1142
drop_ugid(const struct minijail * j)1143 void drop_ugid(const struct minijail *j)
1144 {
1145 if (j->flags.usergroups && j->flags.suppl_gids) {
1146 die("tried to inherit *and* set supplementary groups;"
1147 " can only do one");
1148 }
1149
1150 if (j->flags.usergroups) {
1151 if (initgroups(j->user, j->usergid))
1152 pdie("initgroups");
1153 } else if (j->flags.suppl_gids) {
1154 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1155 pdie("setgroups");
1156 }
1157 } else {
1158 /*
1159 * Only attempt to clear supplementary groups if we are changing
1160 * users.
1161 */
1162 if ((j->uid || j->gid) && setgroups(0, NULL))
1163 pdie("setgroups");
1164 }
1165
1166 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1167 pdie("setresgid");
1168
1169 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1170 pdie("setresuid");
1171 }
1172
/*
 * get_last_valid_cap: returns the number of the last capability the running
 * kernel knows about.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failing to open or parse the /proc file is fatal (pdie()).
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		/* Probe upwards until the kernel rejects the cap number. */
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap)
			;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Without this check a missing file would crash in fscanf(). */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
1203
/*
 * drop_caps: restricts the process's capabilities to the set in j->caps.
 *
 * @j:              jail whose |caps| bitmask (bit N == capability N) lists
 *                  the capabilities to keep.
 * @last_valid_cap: highest capability number to consider; bits above it are
 *                  ignored.
 *
 * The effective, permitted and inheritable sets are rebuilt from j->caps,
 * then every capability not in j->caps is dropped from the bounding set.
 * CAP_SETPCAP is kept temporarily for the bounding-set drops and removed at
 * the end unless it was requested. Any failure is fatal.
 */
void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
{
	cap_t caps = cap_get_proc();
	cap_value_t flag[1];
	/* 64-bit one so that |one << i| is well-defined for i >= 32. */
	const uint64_t one = 1;
	unsigned int i;
	if (!caps)
		die("can't get process caps");
	/* Start from empty sets and add back only what is wanted. */
	if (cap_clear_flag(caps, CAP_INHERITABLE))
		die("can't clear inheritable caps");
	if (cap_clear_flag(caps, CAP_EFFECTIVE))
		die("can't clear effective caps");
	if (cap_clear_flag(caps, CAP_PERMITTED))
		die("can't clear permitted caps");
	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
		/* Keep CAP_SETPCAP for dropping bounding set bits. */
		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
			continue;
		flag[0] = i;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
			die("can't add effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
			die("can't add permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
			die("can't add inheritable cap");
	}
	if (cap_set_proc(caps))
		die("can't apply initial cleaned capset");

	/*
	 * Instead of dropping bounding set first, do it here in case
	 * the caller had a more permissive bounding set which could
	 * have been used above to raise a capability that wasn't already
	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
	 */
	for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
		if (j->caps & (one << i))
			continue;
		if (prctl(PR_CAPBSET_DROP, i))
			pdie("prctl(PR_CAPBSET_DROP)");
	}

	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
		flag[0] = CAP_SETPCAP;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
			die("can't clear effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
			die("can't clear permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
			die("can't clear inheritable cap");
	}

	/* Second apply: commits the (possible) removal of CAP_SETPCAP. */
	if (cap_set_proc(caps))
		die("can't apply final cleaned capset");

	cap_free(caps);
}
1262
set_seccomp_filter(const struct minijail * j)1263 void set_seccomp_filter(const struct minijail *j)
1264 {
1265 /*
1266 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1267 * in the kernel source tree for an explanation of the parameters.
1268 */
1269 if (j->flags.no_new_privs) {
1270 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1271 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1272 }
1273
1274 /*
1275 * Code running with ASan
1276 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1277 * will make system calls not included in the syscall filter policy,
1278 * which will likely crash the program. Skip setting seccomp filter in
1279 * that case.
1280 * 'running_with_asan()' has no inputs and is completely defined at
1281 * build time, so this cannot be used by an attacker to skip setting
1282 * seccomp filter.
1283 */
1284 if (j->flags.seccomp_filter && running_with_asan()) {
1285 warn("running with ASan, not setting seccomp filter");
1286 return;
1287 }
1288
1289 /*
1290 * If we're logging seccomp filter failures,
1291 * install the SIGSYS handler first.
1292 */
1293 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1294 if (install_sigsys_handler())
1295 pdie("install SIGSYS handler");
1296 warn("logging seccomp filter failures");
1297 }
1298
1299 /*
1300 * Install the syscall filter.
1301 */
1302 if (j->flags.seccomp_filter) {
1303 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1304 j->filter_prog)) {
1305 if ((errno == EINVAL) && can_softfail()) {
1306 warn("seccomp not supported");
1307 return;
1308 }
1309 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
1310 }
1311 }
1312 }
1313
/*
 * minijail_enter: applies the restrictions described by |j| to the calling
 * process.
 *
 * Steps run in a fixed order: namespaces (mount, IPC, net), chroot or
 * pivot_root, /tmp and /proc mounts, keepcaps/securebits, uid/gid and
 * capability dropping together with the seccomp filter, the alternate
 * syscall table, and finally strict seccomp.
 *
 * Does not return a value: any failure aborts the process via die()/pdie().
 * pid-namespaced jails are rejected here because they need a fork/clone.
 */
void API minijail_enter(const struct minijail *j)
{
	/*
	 * If we're dropping caps, get the last valid cap from /proc now,
	 * since /proc can be unmounted before drop_caps() is called.
	 */
	unsigned int last_valid_cap = 0;
	if (j->flags.caps)
		last_valid_cap = get_last_valid_cap();

	if (j->flags.pids)
		die("tried to enter a pid-namespaced jail;"
		    " try minijail_run()?");

	if (j->flags.usergroups && !j->user)
		die("usergroup inheritance without username");

	/*
	 * We can't recover from failures if we've dropped privileges partially,
	 * so we don't even try. If any of our operations fail, we abort() the
	 * entire process.
	 */
	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
		pdie("setns(CLONE_NEWNS)");

	if (j->flags.vfs) {
		if (unshare(CLONE_NEWNS))
			pdie("unshare(vfs)");
		/*
		 * Remount all filesystems as private. If they are shared
		 * new bind mounts will creep out of our namespace.
		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
		 */
		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
			pdie("mount(/, private)");
	}

	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
		pdie("unshare(ipc)");
	}

	/* Joining an existing net namespace takes priority over a new one. */
	if (j->flags.enter_net) {
		if (setns(j->netns_fd, CLONE_NEWNET))
			pdie("setns(CLONE_NEWNET)");
	} else if (j->flags.net && unshare(CLONE_NEWNET)) {
		pdie("unshare(net)");
	}

	if (j->flags.chroot && enter_chroot(j))
		pdie("chroot");

	if (j->flags.pivot_root && enter_pivot_root(j))
		pdie("pivot_root");

	if (j->flags.mount_tmp && mount_tmp())
		pdie("mount_tmp");

	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
		pdie("remount");

	if (j->flags.caps) {
		/*
		 * POSIX capabilities are a bit tricky. If we drop our
		 * capability to change uids, our attempt to use setuid()
		 * below will fail. Hang on to root caps across setuid(), then
		 * lock securebits.
		 */
		if (prctl(PR_SET_KEEPCAPS, 1))
			pdie("prctl(PR_SET_KEEPCAPS)");
		if (prctl
		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
			pdie("prctl(PR_SET_SECUREBITS)");
	}

	/*
	 * If we're setting no_new_privs, we can drop privileges
	 * before setting seccomp filter. This way filter policies
	 * don't need to allow privilege-dropping syscalls.
	 */
	if (j->flags.no_new_privs) {
		drop_ugid(j);
		if (j->flags.caps)
			drop_caps(j, last_valid_cap);

		set_seccomp_filter(j);
	} else {
		/*
		 * If we're not setting no_new_privs,
		 * we need to set seccomp filter *before* dropping privileges.
		 * WARNING: this means that filter policies *must* allow
		 * setgroups()/setresgid()/setresuid() for dropping root and
		 * capget()/capset()/prctl() for dropping caps.
		 */
		set_seccomp_filter(j);

		drop_ugid(j);
		if (j->flags.caps)
			drop_caps(j, last_valid_cap);
	}

	/*
	 * Select the specified alternate syscall table. The table must not
	 * block prctl(2) if we're using seccomp as well.
	 */
	if (j->flags.alt_syscall) {
		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
			pdie("prctl(PR_ALT_SYSCALL)");
	}

	/*
	 * seccomp has to come last since it cuts off all the other
	 * privilege-dropping syscalls :)
	 */
	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
		/* EINVAL is tolerated only when soft-fail is allowed. */
		if ((errno == EINVAL) && can_softfail()) {
			warn("seccomp not supported");
			return;
		}
		pdie("prctl(PR_SET_SECCOMP)");
	}
}
1435
/* TODO(wad) will visibility affect this variable? */
/*
 * Wait status of the jailed root process, as recorded by init(). It is read
 * from the SIGTERM handler below, so it must be a volatile sig_atomic_t for
 * that access to be well-defined.
 */
static volatile sig_atomic_t init_exitstatus = 0;

/*
 * init_term: SIGTERM handler installed by init(). Exits immediately with the
 * value currently stored in |init_exitstatus|. Uses _exit(), which is
 * async-signal-safe.
 */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
1443
init(pid_t rootpid)1444 int init(pid_t rootpid)
1445 {
1446 pid_t pid;
1447 int status;
1448 /* so that we exit with the right status */
1449 signal(SIGTERM, init_term);
1450 /* TODO(wad) self jail with seccomp_filters here. */
1451 while ((pid = wait(&status)) > 0) {
1452 /*
1453 * This loop will only end when either there are no processes
1454 * left inside our pid namespace or we get a signal.
1455 */
1456 if (pid == rootpid)
1457 init_exitstatus = status;
1458 }
1459 if (!WIFEXITED(init_exitstatus))
1460 _exit(MINIJAIL_ERR_INIT);
1461 _exit(WEXITSTATUS(init_exitstatus));
1462 }
1463
minijail_from_fd(int fd,struct minijail * j)1464 int API minijail_from_fd(int fd, struct minijail *j)
1465 {
1466 size_t sz = 0;
1467 size_t bytes = read(fd, &sz, sizeof(sz));
1468 char *buf;
1469 int r;
1470 if (sizeof(sz) != bytes)
1471 return -EINVAL;
1472 if (sz > USHRT_MAX) /* arbitrary sanity check */
1473 return -E2BIG;
1474 buf = malloc(sz);
1475 if (!buf)
1476 return -ENOMEM;
1477 bytes = read(fd, buf, sz);
1478 if (bytes != sz) {
1479 free(buf);
1480 return -EINVAL;
1481 }
1482 r = minijail_unmarshal(j, buf, sz);
1483 free(buf);
1484 return r;
1485 }
1486
minijail_to_fd(struct minijail * j,int fd)1487 int API minijail_to_fd(struct minijail *j, int fd)
1488 {
1489 char *buf;
1490 size_t sz = minijail_size(j);
1491 ssize_t written;
1492 int r;
1493
1494 if (!sz)
1495 return -EINVAL;
1496 buf = malloc(sz);
1497 r = minijail_marshal(j, buf, sz);
1498 if (r) {
1499 free(buf);
1500 return r;
1501 }
1502 /* Sends [size][minijail]. */
1503 written = write(fd, &sz, sizeof(sz));
1504 if (written != sizeof(sz)) {
1505 free(buf);
1506 return -EFAULT;
1507 }
1508 written = write(fd, buf, sz);
1509 if (written < 0 || (size_t) written != sz) {
1510 free(buf);
1511 return -EFAULT;
1512 }
1513 free(buf);
1514 return 0;
1515 }
1516
setup_preload(void)1517 int setup_preload(void)
1518 {
1519 #if defined(__ANDROID__)
1520 /* Don't use LDPRELOAD on Brillo. */
1521 return 0;
1522 #else
1523 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1524 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1525 if (!newenv)
1526 return -ENOMEM;
1527
1528 /* Only insert a separating space if we have something to separate... */
1529 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1530 PRELOADPATH);
1531
1532 /* setenv() makes a copy of the string we give it. */
1533 setenv(kLdPreloadEnvVar, newenv, 1);
1534 free(newenv);
1535 return 0;
1536 #endif
1537 }
1538
setup_pipe(int fds[2])1539 int setup_pipe(int fds[2])
1540 {
1541 int r = pipe(fds);
1542 char fd_buf[11];
1543 if (r)
1544 return r;
1545 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1546 if (r <= 0)
1547 return -EINVAL;
1548 setenv(kFdEnvVar, fd_buf, 1);
1549 return 0;
1550 }
1551
/*
 * setup_pipe_end: keeps one end of the pipe |fds| and closes the other.
 * |index| selects the end to keep: 0 for the read end, 1 for the write end.
 * Returns the kept descriptor, or -1 (closing nothing) if |index| is invalid.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	if (index != 0 && index != 1)
		return -1;

	const size_t other = (index == 0) ? 1 : 0;

	close(fds[other]);
	return fds[index];
}
1560
/*
 * setup_and_dupe_pipe_end: closes the end of the pipe |fds| that is not
 * selected by |index| (0 = read end, 1 = write end) and duplicates the
 * selected end onto |fd|.
 * Returns the dup2(2) result, or -1 (closing nothing) if |index| is invalid.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	if (index >= 2)
		return -1;

	const int kept_end = fds[index];
	const int unused_end = fds[1 - index];

	close(unused_end);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(kept_end, fd);
}
1570
/*
 * Forward declaration: the public minijail_run*() wrappers below call this
 * before its definition appears later in the file.
 */
int minijail_run_internal(struct minijail *j, const char *filename,
			  char *const argv[], pid_t *pchild_pid,
			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
			  int use_preload);
1575
minijail_run(struct minijail * j,const char * filename,char * const argv[])1576 int API minijail_run(struct minijail *j, const char *filename,
1577 char *const argv[])
1578 {
1579 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1580 true);
1581 }
1582
minijail_run_pid(struct minijail * j,const char * filename,char * const argv[],pid_t * pchild_pid)1583 int API minijail_run_pid(struct minijail *j, const char *filename,
1584 char *const argv[], pid_t *pchild_pid)
1585 {
1586 return minijail_run_internal(j, filename, argv, pchild_pid,
1587 NULL, NULL, NULL, true);
1588 }
1589
minijail_run_pipe(struct minijail * j,const char * filename,char * const argv[],int * pstdin_fd)1590 int API minijail_run_pipe(struct minijail *j, const char *filename,
1591 char *const argv[], int *pstdin_fd)
1592 {
1593 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1594 NULL, NULL, true);
1595 }
1596
minijail_run_pid_pipes(struct minijail * j,const char * filename,char * const argv[],pid_t * pchild_pid,int * pstdin_fd,int * pstdout_fd,int * pstderr_fd)1597 int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
1598 char *const argv[], pid_t *pchild_pid,
1599 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
1600 {
1601 return minijail_run_internal(j, filename, argv, pchild_pid,
1602 pstdin_fd, pstdout_fd, pstderr_fd, true);
1603 }
1604
minijail_run_no_preload(struct minijail * j,const char * filename,char * const argv[])1605 int API minijail_run_no_preload(struct minijail *j, const char *filename,
1606 char *const argv[])
1607 {
1608 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1609 false);
1610 }
1611
minijail_run_pid_pipes_no_preload(struct minijail * j,const char * filename,char * const argv[],pid_t * pchild_pid,int * pstdin_fd,int * pstdout_fd,int * pstderr_fd)1612 int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1613 const char *filename,
1614 char *const argv[],
1615 pid_t *pchild_pid,
1616 int *pstdin_fd, int *pstdout_fd,
1617 int *pstderr_fd) {
1618 return minijail_run_internal(j, filename, argv, pchild_pid,
1619 pstdin_fd, pstdout_fd, pstderr_fd, false);
1620 }
1621
/*
 * minijail_run_internal: common implementation of the minijail_run*() family.
 *
 * Creates a child process (sys_clone() when a pid namespace is requested,
 * fork() otherwise). The child jails itself with minijail_enter() and then
 * execve()s |filename| with |argv|; the parent returns.
 *
 * @j:           jail configuration; j->initpid is set to the child's pid.
 * @filename:    program to execve() in the child.
 * @argv:        argument vector for the program.
 * @pchild_pid:  if non-NULL, receives the child's pid.
 * @pstdin_fd:   if non-NULL, receives the write end of a pipe connected to
 *               the child's stdin.
 * @pstdout_fd:  if non-NULL, receives the read end of a pipe connected to
 *               the child's stdout.
 * @pstderr_fd:  if non-NULL, receives the read end of a pipe connected to
 *               the child's stderr.
 * @use_preload: non-zero to set up the preload environment and send the
 *               marshalled jail to the child over a pipe.
 *
 * Returns 0 in the parent on success, -ENOMEM if the old preload value cannot
 * be copied, or -EFAULT if preload or pipe setup fails. Several other
 * failures abort via die()/pdie(). The child never returns from here.
 *
 * NOTE(review): the early "return -EFAULT" paths before the fork do not free
 * |oldenv_copy|, do not close pipes created earlier in this function, and do
 * not undo the environment change made by setup_preload().
 */
int minijail_run_internal(struct minijail *j, const char *filename,
			  char *const argv[], pid_t *pchild_pid,
			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
			  int use_preload)
{
	char *oldenv, *oldenv_copy = NULL;
	pid_t child_pid;
	int pipe_fds[2];
	int stdin_fds[2];
	int stdout_fds[2];
	int stderr_fds[2];
	int child_sync_pipe_fds[2];
	int sync_child = 0;
	int ret;
	/* We need to remember this across the minijail_preexec() call. */
	int pid_namespace = j->flags.pids;
	int do_init = j->flags.do_init;

	if (use_preload) {
		/* Save the current value so the parent can restore it later. */
		oldenv = getenv(kLdPreloadEnvVar);
		if (oldenv) {
			oldenv_copy = strdup(oldenv);
			if (!oldenv_copy)
				return -ENOMEM;
		}

		if (setup_preload())
			return -EFAULT;
	}

	if (!use_preload) {
		if (j->flags.caps)
			die("capabilities are not supported without "
			    "LD_PRELOAD");
	}

	/*
	 * Make the process group ID of this process equal to its PID, so that
	 * both the Minijail process and the jailed process can be killed
	 * together.
	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
	 * the process is already a process group leader.
	 */
	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
		if (errno != EPERM) {
			pdie("setpgid(0, 0)");
		}
	}

	if (use_preload) {
		/*
		 * Before we fork(2) and execve(2) the child process, we need
		 * to open a pipe(2) to send the minijail configuration over.
		 */
		if (setup_pipe(pipe_fds))
			return -EFAULT;
	}

	/*
	 * If we want to write to the child process' standard input,
	 * create the pipe(2) now.
	 */
	if (pstdin_fd) {
		if (pipe(stdin_fds))
			return -EFAULT;
	}

	/*
	 * If we want to read from the child process' standard output,
	 * create the pipe(2) now.
	 */
	if (pstdout_fd) {
		if (pipe(stdout_fds))
			return -EFAULT;
	}

	/*
	 * If we want to read from the child process' standard error,
	 * create the pipe(2) now.
	 */
	if (pstderr_fd) {
		if (pipe(stderr_fds))
			return -EFAULT;
	}

	/*
	 * If we want to set up a new uid/gid mapping in the user namespace,
	 * or if we need to add the child process to cgroups, create the pipe(2)
	 * to sync between parent and child.
	 */
	if (j->flags.userns || j->flags.cgroups) {
		sync_child = 1;
		if (pipe(child_sync_pipe_fds))
			return -EFAULT;
	}

	/*
	 * Use sys_clone() if and only if we're creating a pid namespace.
	 *
	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
	 *
	 * In multithreaded programs, there are a bunch of locks inside libc,
	 * some of which may be held by other threads at the time that we call
	 * minijail_run_pid(). If we call fork(), glibc does its level best to
	 * ensure that we hold all of these locks before it calls clone()
	 * internally and drop them after clone() returns, but when we call
	 * sys_clone(2) directly, all that gets bypassed and we end up with a
	 * child address space where some of libc's important locks are held by
	 * other threads (which did not get cloned, and hence will never release
	 * those locks). This is okay so long as we call exec() immediately
	 * after, but a bunch of seemingly-innocent libc functions like setenv()
	 * take locks.
	 *
	 * Hence, only call sys_clone() if we need to, in order to get at pid
	 * namespacing. If we follow this path, the child's address space might
	 * have broken locks; you may only call functions that do not acquire
	 * any locks.
	 *
	 * Unfortunately, fork() acquires every lock it can get its hands on, as
	 * previously detailed, so this function is highly likely to deadlock
	 * later on (see "deadlock here") if we're multithreaded.
	 *
	 * We might hack around this by having the clone()d child (init of the
	 * pid namespace) return directly, rather than leaving the clone()d
	 * process hanging around to be init for the new namespace (and having
	 * its fork()ed child return in turn), but that process would be crippled
	 * with its libc locks potentially broken. We might try fork()ing in the
	 * parent before we clone() to ensure that we own all the locks, but
	 * then we have to have the forked child hanging around consuming
	 * resources (and possibly having file descriptors / shared memory
	 * regions / etc attached). We'd need to keep the child around to avoid
	 * having its children get reparented to init.
	 *
	 * TODO(ellyjones): figure out if the "forked child hanging around"
	 * problem is fixable or not. It would be nice if we worked in this
	 * case.
	 */
	if (pid_namespace) {
		int clone_flags = CLONE_NEWPID | SIGCHLD;
		if (j->flags.userns)
			clone_flags |= CLONE_NEWUSER;
		child_pid = syscall(SYS_clone, clone_flags, NULL);
	} else {
		child_pid = fork();
	}

	if (child_pid < 0) {
		if (use_preload) {
			free(oldenv_copy);
		}
		die("failed to fork child");
	}

	/* Parent: finish setup, hand the jail to the child, then return. */
	if (child_pid) {
		if (use_preload) {
			/* Restore parent's LD_PRELOAD. */
			if (oldenv_copy) {
				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
				free(oldenv_copy);
			} else {
				unsetenv(kLdPreloadEnvVar);
			}
			unsetenv(kFdEnvVar);
		}

		j->initpid = child_pid;

		if (j->flags.pid_file)
			write_pid_file(j);

		if (j->flags.cgroups)
			add_to_cgroups(j);

		if (j->flags.userns)
			write_ugid_mappings(j);

		/* Presumably releases the child blocked in wait_for_parent_setup(). */
		if (sync_child)
			parent_setup_complete(child_sync_pipe_fds);

		if (use_preload) {
			/* Send marshalled minijail. */
			close(pipe_fds[0]);	/* read endpoint */
			ret = minijail_to_fd(j, pipe_fds[1]);
			close(pipe_fds[1]);	/* write endpoint */
			if (ret) {
				kill(j->initpid, SIGKILL);
				die("failed to send marshalled minijail");
			}
		}

		if (pchild_pid)
			*pchild_pid = child_pid;

		/*
		 * If we want to write to the child process' standard input,
		 * set up the write end of the pipe.
		 */
		if (pstdin_fd)
			*pstdin_fd = setup_pipe_end(stdin_fds,
						    1 /* write end */);

		/*
		 * If we want to read from the child process' standard output,
		 * set up the read end of the pipe.
		 */
		if (pstdout_fd)
			*pstdout_fd = setup_pipe_end(stdout_fds,
						     0 /* read end */);

		/*
		 * If we want to read from the child process' standard error,
		 * set up the read end of the pipe.
		 */
		if (pstderr_fd)
			*pstderr_fd = setup_pipe_end(stderr_fds,
						     0 /* read end */);

		return 0;
	}
	/* Child from here on; it does not need the saved preload value. */
	free(oldenv_copy);

	if (j->flags.reset_signal_mask) {
		sigset_t signal_mask;
		if (sigemptyset(&signal_mask) != 0)
			pdie("sigemptyset failed");
		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
			pdie("sigprocmask failed");
	}

	if (sync_child)
		wait_for_parent_setup(child_sync_pipe_fds);

	if (j->flags.userns)
		enter_user_namespace(j);

	/*
	 * If we want to write to the jailed process' standard input,
	 * set up the read end of the pipe.
	 */
	if (pstdin_fd) {
		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
					    STDIN_FILENO) < 0)
			die("failed to set up stdin pipe");
	}

	/*
	 * If we want to read from the jailed process' standard output,
	 * set up the write end of the pipe.
	 */
	if (pstdout_fd) {
		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
					    STDOUT_FILENO) < 0)
			die("failed to set up stdout pipe");
	}

	/*
	 * If we want to read from the jailed process' standard error,
	 * set up the write end of the pipe.
	 */
	if (pstderr_fd) {
		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
					    STDERR_FILENO) < 0)
			die("failed to set up stderr pipe");
	}

	/* If running an init program, let it decide when/how to mount /proc. */
	if (pid_namespace && !do_init)
		j->flags.remount_proc_ro = 0;

	if (use_preload) {
		/* Strip out flags that cannot be inherited across execve(2). */
		minijail_preexec(j);
	} else {
		/* Clear pids so minijail_enter() does not reject the jail. */
		j->flags.pids = 0;
	}
	/* Jail this process, then execve() the target. */
	minijail_enter(j);

	if (pid_namespace && do_init) {
		/*
		 * pid namespace: this process will become init inside the new
		 * namespace. We don't want all programs we might exec to have
		 * to know how to be init. Normally (do_init == 1) we fork off
		 * a child to actually run the program. If |do_init == 0|, we
		 * let the program keep pid 1 and be init.
		 *
		 * If we're multithreaded, we'll probably deadlock here. See
		 * WARNING above.
		 */
		child_pid = fork();
		/* NOTE(review): a negative fork() result is passed to _exit(). */
		if (child_pid < 0)
			_exit(child_pid);
		else if (child_pid > 0)
			init(child_pid);	/* never returns */
	}

	/*
	 * If we aren't pid-namespaced, or the jailed program asked to be init:
	 *   calling process
	 *   -> execve()-ing process
	 * If we are:
	 *   calling process
	 *   -> init()-ing process
	 *      -> execve()-ing process
	 */
	/* _exit() is only reached if execve() fails and returns. */
	_exit(execve(filename, argv, environ));
}
1929
minijail_kill(struct minijail * j)1930 int API minijail_kill(struct minijail *j)
1931 {
1932 int st;
1933 if (kill(j->initpid, SIGTERM))
1934 return -errno;
1935 if (waitpid(j->initpid, &st, 0) < 0)
1936 return -errno;
1937 return st;
1938 }
1939
minijail_wait(struct minijail * j)1940 int API minijail_wait(struct minijail *j)
1941 {
1942 int st;
1943 if (waitpid(j->initpid, &st, 0) < 0)
1944 return -errno;
1945
1946 if (!WIFEXITED(st)) {
1947 int error_status = st;
1948 if (WIFSIGNALED(st)) {
1949 int signum = WTERMSIG(st);
1950 warn("child process %d received signal %d",
1951 j->initpid, signum);
1952 /*
1953 * We return MINIJAIL_ERR_JAIL if the process received
1954 * SIGSYS, which happens when a syscall is blocked by
1955 * seccomp filters.
1956 * If not, we do what bash(1) does:
1957 * $? = 128 + signum
1958 */
1959 if (signum == SIGSYS) {
1960 error_status = MINIJAIL_ERR_JAIL;
1961 } else {
1962 error_status = 128 + signum;
1963 }
1964 }
1965 return error_status;
1966 }
1967
1968 int exit_status = WEXITSTATUS(st);
1969 if (exit_status != 0)
1970 info("child process %d exited with status %d",
1971 j->initpid, exit_status);
1972
1973 return exit_status;
1974 }
1975
minijail_destroy(struct minijail * j)1976 void API minijail_destroy(struct minijail *j)
1977 {
1978 size_t i;
1979
1980 if (j->flags.seccomp_filter && j->filter_prog) {
1981 free(j->filter_prog->filter);
1982 free(j->filter_prog);
1983 }
1984 while (j->mounts_head) {
1985 struct mountpoint *m = j->mounts_head;
1986 j->mounts_head = j->mounts_head->next;
1987 free(m->type);
1988 free(m->dest);
1989 free(m->src);
1990 free(m);
1991 }
1992 j->mounts_tail = NULL;
1993 if (j->user)
1994 free(j->user);
1995 if (j->suppl_gid_list)
1996 free(j->suppl_gid_list);
1997 if (j->chrootdir)
1998 free(j->chrootdir);
1999 if (j->alt_syscall_table)
2000 free(j->alt_syscall_table);
2001 for (i = 0; i < j->cgroup_count; ++i)
2002 free(j->cgroups[i]);
2003 free(j);
2004 }
2005