1 /*
2 * Copyright (c) 2019-21 Andrew G Morgan <morgan@kernel.org>
3 *
4 * This file contains a collection of routines that perform thread
5 * synchronization to ensure that a whole process is running as a
6 * single privilege entity - independent of the number of pthreads.
7 *
8 * The whole file would be unnecessary if glibc exported an explicit
9 * psx_syscall()-like function that leveraged the nptl:setxid
10 * mechanism to synchronize thread state over the whole process.
11 */
12 #undef _POSIX_C_SOURCE
13 #define _POSIX_C_SOURCE 199309L
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE
17 #endif
18
19 #include <errno.h>
20 #include <pthread.h>
21 #include <sched.h>
22 #include <signal.h>
23 #include <stdarg.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <sys/syscall.h>
29
30 #include "psx_syscall.h"
31
32 /*
33 * psx_load_syscalls() can be weakly defined in dependent libraries to
34 * provide a mechanism for a library to optionally leverage this psx
35 * mechanism. Specifically, when libcap calls psx_load_syscalls() it
36 * provides a weakly declared default that maps its system calls to
37 * the regular system call functions. However, when linked with psx,
38 * this function here overrides the syscalls to be the psx ones.
39 */
psx_load_syscalls(long int (** syscall_fn)(long int,long int,long int,long int),long int (** syscall6_fn)(long int,long int,long int,long int,long int,long int,long int))40 void psx_load_syscalls(long int (**syscall_fn)(long int,
41 long int, long int, long int),
42 long int (**syscall6_fn)(long int,
43 long int, long int, long int,
44 long int, long int, long int))
45 {
46 *syscall_fn = psx_syscall3;
47 *syscall6_fn = psx_syscall6;
48 }
49
50 /*
51 * type to keep track of registered threads.
52 */
/*
 * registered_thread_t is one entry of the doubly linked list of
 * threads known to psx; the head of that list is psx_tracker.root.
 */
typedef struct registered_thread_s {
    struct registered_thread_s *next;  /* following entry, NULL at the tail */
    struct registered_thread_s *prev;  /* preceding entry, NULL at the head */
    pthread_t thread;                  /* id of the thread this entry tracks */
    pthread_mutex_t mu;                /* held while pending/gone are accessed */
    int pending;                       /* set to 1 when the thread is signalled,
                                          cleared by the signal handler */
    int gone;                          /* set to 1 by the thread's exit handler */
} registered_thread_t;
60
/*
 * One-time initialization control: psx_lock() passes this to
 * pthread_once() so that psx_syscall_start() is run exactly once.
 */
static pthread_once_t psx_tracker_initialized = PTHREAD_ONCE_INIT;
62
/*
 * psx_tracker_state_t enumerates the values of psx_tracker.state.
 * All transitions are made through psx_new_state().
 */
typedef enum {
    /* nothing in progress; every operation below starts from here */
    _PSX_IDLE = 0,
    /* __psx_syscall() is about to make the call on the calling thread */
    _PSX_SETUP = 1,
    /* __psx_syscall() is replaying the call on the other threads */
    _PSX_SYSCALL = 2,
    /* __wrap_pthread_create() is waiting for the new thread to register */
    _PSX_CREATE = 3,
    /* set by _psx_prepare_fork(), cleared by _psx_fork_completed() */
    _PSX_INFORK = 4,
    /* a thread is marking itself as gone in _psx_exiting() */
    _PSX_EXITING = 5,
} psx_tracker_state_t;
71
72 /*
73 * This global structure holds the global coordination state for
74 * libcap's psx_posix_syscall() support.
75 */
76 static struct psx_tracker_s {
77 int has_forked;
78
79 pthread_mutex_t state_mu;
80 pthread_cond_t cond; /* this is only used to wait on 'state' changes */
81 psx_tracker_state_t state;
82 int initialized;
83 int psx_sig;
84
85 struct {
86 long syscall_nr;
87 long arg1, arg2, arg3, arg4, arg5, arg6;
88 int six;
89 int active;
90 } cmd;
91
92 struct sigaction sig_action;
93 struct sigaction chained_action;
94 registered_thread_t *root;
95 } psx_tracker;
96
97 /*
98 * psx_action_key is used for thread local storage of the thread's
99 * registration.
100 */
/*
 * Created by psx_syscall_start(); each registered thread stores the
 * address of its registered_thread_t under this key.
 */
pthread_key_t psx_action_key;
102
103 /*
104 * psx_do_registration called locked and creates a tracker entry for
105 * the current thread with a TLS specific key pointing at the threads
106 * specific tracker.
107 */
psx_do_registration(void)108 static void *psx_do_registration(void) {
109 registered_thread_t *node = calloc(1, sizeof(registered_thread_t));
110 pthread_mutex_init(&node->mu, NULL);
111 node->thread = pthread_self();
112 pthread_setspecific(psx_action_key, node);
113 node->next = psx_tracker.root;
114 if (node->next) {
115 node->next->prev = node;
116 }
117 psx_tracker.root = node;
118 return node;
119 }
120
121 /*
122 * psx_posix_syscall_actor performs the system call on the targeted
123 * thread and signals it is no longer pending.
124 */
psx_posix_syscall_actor(int signum,siginfo_t * info,void * ignore)125 static void psx_posix_syscall_actor(int signum, siginfo_t *info, void *ignore) {
126 /* bail early if this isn't something we recognize */
127 if (signum != psx_tracker.psx_sig || !psx_tracker.cmd.active ||
128 info == NULL || info->si_code != SI_TKILL || info->si_pid != getpid()) {
129 if (psx_tracker.chained_action.sa_sigaction != 0) {
130 psx_tracker.chained_action.sa_sigaction(signum, info, ignore);
131 }
132 return;
133 }
134
135 if (!psx_tracker.cmd.six) {
136 (void) syscall(psx_tracker.cmd.syscall_nr,
137 psx_tracker.cmd.arg1,
138 psx_tracker.cmd.arg2,
139 psx_tracker.cmd.arg3);
140 } else {
141 (void) syscall(psx_tracker.cmd.syscall_nr,
142 psx_tracker.cmd.arg1,
143 psx_tracker.cmd.arg2,
144 psx_tracker.cmd.arg3,
145 psx_tracker.cmd.arg4,
146 psx_tracker.cmd.arg5,
147 psx_tracker.cmd.arg6);
148 }
149
150 /*
151 * This handler can only be called on registered threads which
152 * have had this specific defined at start-up. (But see the
153 * subsequent test.)
154 */
155 registered_thread_t *ref = pthread_getspecific(psx_action_key);
156 if (ref) {
157 pthread_mutex_lock(&ref->mu);
158 ref->pending = 0;
159 pthread_mutex_unlock(&ref->mu);
160 } /*
161 * else thread must be dying and its psx_action_key has already
162 * been cleaned up.
163 */
164 }
165
166 /*
167 * Some forward declarations for the initialization
168 * psx_syscall_start() routine.
169 */
/* the three handlers psx_syscall_start() registers with pthread_atfork() */
static void _psx_prepare_fork(void);
static void _psx_fork_completed(void);
static void _psx_forked_child(void);

/* the psx replacement for pthread_create(), defined later in this file */
int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                          void *(*start_routine)(void *), void *arg);

/*
 * psx requires this function to be provided by the linkage wrapping.
 */
extern int __real_pthread_create(pthread_t *thread,
                                 const pthread_attr_t *attr,
                                 void *(*start_routine)(void *), void *arg);
181
182 /*
183 * psx_confirm_sigaction reconfirms that the psx handler is the first
184 * handler to respond to the psx signal. It assumes that
185 * psx_tracker.psx_sig has been set.
186 */
psx_confirm_sigaction(void)187 static void psx_confirm_sigaction(void) {
188 sigset_t mask, orig;
189 struct sigaction existing_sa;
190
191 /*
192 * Block interrupts while potentially rewriting the handler.
193 */
194 sigemptyset(&mask);
195 sigaddset(&mask, psx_tracker.psx_sig);
196 sigprocmask(SIG_BLOCK, &mask, &orig);
197
198 sigaction(psx_tracker.psx_sig, NULL, &existing_sa);
199 if (existing_sa.sa_sigaction != psx_posix_syscall_actor) {
200 memcpy(&psx_tracker.chained_action, &existing_sa, sizeof(struct sigaction));
201 psx_tracker.sig_action.sa_sigaction = psx_posix_syscall_actor;
202 sigemptyset(&psx_tracker.sig_action.sa_mask);
203 psx_tracker.sig_action.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART;
204 sigaction(psx_tracker.psx_sig, &psx_tracker.sig_action, NULL);
205 }
206
207 sigprocmask(SIG_SETMASK, &orig, NULL);
208 }
209
210 /*
211 * psx_syscall_start initializes the subsystem including initializing
212 * the mutex.
213 */
psx_syscall_start(void)214 static void psx_syscall_start(void) {
215 pthread_mutex_init(&psx_tracker.state_mu, NULL);
216 pthread_cond_init(&psx_tracker.cond, NULL);
217 pthread_key_create(&psx_action_key, NULL);
218 pthread_atfork(_psx_prepare_fork, _psx_fork_completed, _psx_forked_child);
219
220 /*
221 * All sorts of things are assumed by Linux and glibc and/or musl
222 * about signal handlers and which can be blocked. Go has its own
223 * idiosyncrasies too. We tried SIGRTMAX until
224 *
225 * https://bugzilla.kernel.org/show_bug.cgi?id=210533
226 *
227 * Our current strategy is to aggressively intercept SIGSYS.
228 */
229 psx_tracker.psx_sig = SIGSYS;
230
231 psx_confirm_sigaction();
232 psx_do_registration(); // register the main thread.
233
234 psx_tracker.initialized = 1;
235 }
236
237 /*
238 * This is the only way this library globally locks. Note, this is not
239 * to be confused with psx_sig (interrupt) blocking - which is
240 * performed around thread creation and when the signal handler is
241 * being confirmed.
242 */
psx_lock(void)243 static void psx_lock(void)
244 {
245 pthread_once(&psx_tracker_initialized, psx_syscall_start);
246 pthread_mutex_lock(&psx_tracker.state_mu);
247 }
248
249 /*
250 * This is the only way this library unlocks.
251 */
psx_unlock(void)252 static void psx_unlock(void)
253 {
254 pthread_mutex_unlock(&psx_tracker.state_mu);
255 }
256
257 /*
258 * under lock perform a state transition.
259 */
/*
 * psx_new_state() blocks until psx_tracker.state equals 'was' and
 * then changes it to 'is'. The wait and the update are performed with
 * the psx lock held; the lock is released again before returning.
 */
static void psx_new_state(psx_tracker_state_t was, psx_tracker_state_t is)
{
    psx_lock();

    for (;;) {
        if (psx_tracker.state == was) {
            break;
        }
        pthread_cond_wait(&psx_tracker.cond, &psx_tracker.state_mu);
    }
    psx_tracker.state = is;

    /* only announce newly idle states since that is all we wait for */
    if (is == _PSX_IDLE) {
        pthread_cond_signal(&psx_tracker.cond);
    }

    psx_unlock();
}
273
/*
 * psx_syscall3() is the function form of psx_syscall() for system
 * calls taking three arguments; it returns what psx_syscall() returns.
 */
long int psx_syscall3(long int syscall_nr,
                      long int arg1, long int arg2, long int arg3)
{
    long int result = psx_syscall(syscall_nr, arg1, arg2, arg3);
    return result;
}
278
/*
 * psx_syscall6() is the function form of psx_syscall() for system
 * calls taking six arguments; it returns what psx_syscall() returns.
 */
long int psx_syscall6(long int syscall_nr,
                      long int arg1, long int arg2, long int arg3,
                      long int arg4, long int arg5, long int arg6)
{
    long int result = psx_syscall(syscall_nr, arg1, arg2, arg3,
                                  arg4, arg5, arg6);
    return result;
}
284
_psx_prepare_fork(void)285 static void _psx_prepare_fork(void) {
286 /*
287 * obtain global lock - we don't want any syscalls while the fork
288 * is occurring since it may interfere with the preparation for
289 * the fork.
290 */
291 psx_new_state(_PSX_IDLE, _PSX_INFORK);
292 }
293
_psx_fork_completed(void)294 static void _psx_fork_completed(void) {
295 /*
296 * The only way we can get here is if state is _PSX_INFORK and was
297 * previously _PSX_IDLE. Now that the fork has completed, the
298 * parent can continue as if it hadn't happened - the forked child
299 * does not tie its security state to that of the parent process
300 * and threads.
301 *
302 * We don't strictly need to change the psx_tracker.state since we
303 * hold the mutex over the fork, but we do to make deadlock
304 * debugging easier.
305 */
306 psx_new_state(_PSX_INFORK, _PSX_IDLE);
307 }
308
_psx_forked_child(void)309 static void _psx_forked_child(void) {
310 /*
311 * The only way we can get here is if state is _PSX_INFORK and was
312 * previously _PSX_IDLE. However, none of the registered threads
313 * exist in this newly minted child process, so we have to reset
314 * the tracking structure to avoid any confusion. We also scuttle
315 * any chance of the PSX API working on more than one thread in
316 * the child by leaving the state as _PSX_INFORK. We do support
317 * all psx_syscall()s by reverting to them being direct in the
318 * fork()ed child.
319 *
320 * We do this because the glibc man page for fork() suggests that
321 * only a subset of things will work post fork(). Specifically,
322 * only a "async-signal-safe functions (see signal- safety(7))
323 * until such time as it calls execve(2)" can be relied upon. That
324 * man page suggests that you can't expect mutexes to work: "not
325 * async-signal-safe because it uses pthread_mutex_lock(3)
326 * internally.".
327 */
328 registered_thread_t *next, *old_root;
329 old_root = psx_tracker.root;
330 psx_tracker.root = NULL;
331
332 psx_tracker.has_forked = 1;
333
334 for (; old_root; old_root = next) {
335 next = old_root->next;
336 memset(old_root, 0, sizeof(*old_root));
337 free(old_root);
338 }
339 }
340
341 /*
342 * called locked to unregister a node from the tracker.
343 */
/*
 * psx_do_unregister() unlinks 'node' from the psx_tracker.root list,
 * destroys its mutex and frees it. 'node' must not be used after this
 * returns.
 */
static void psx_do_unregister(registered_thread_t *node)
{
    registered_thread_t *after = node->next;
    registered_thread_t *before = node->prev;

    /* removing the head: the list now starts at the following entry */
    if (psx_tracker.root == node) {
        psx_tracker.root = after;
    }
    if (after != NULL) {
        after->prev = before;
    }
    if (before != NULL) {
        before->next = after;
    }

    pthread_mutex_destroy(&node->mu);
    memset(node, 0, sizeof(*node));
    free(node);
}
358
/*
 * psx_starter_t carries what __wrap_pthread_create() hands over to
 * the _psx_start_fn() trampoline running on the new thread.
 */
typedef struct {
    /* the start routine the caller asked pthread_create() to run */
    void *(*fn)(void *);
    /* the argument to pass to fn */
    void *arg;
    /* signal mask the new thread installs before calling fn */
    sigset_t sigbits;
} psx_starter_t;
364
365 /*
366 * _psx_exiting is used to cleanup the node for the thread on its exit
367 * path. This is needed for musl libc:
368 *
369 * https://bugzilla.kernel.org/show_bug.cgi?id=208477
370 *
371 * and likewise for glibc too:
372 *
373 * https://sourceware.org/bugzilla/show_bug.cgi?id=12889
374 */
_psx_exiting(void * node)375 static void _psx_exiting(void *node) {
376 /*
377 * Until we are in the _PSX_EXITING state, we must not block the
378 * psx_sig interrupt for this dying thread. That is, until this
379 * exiting thread can set ref->gone to 1, this dying thread is
380 * still participating in the psx syscall distribution.
381 *
382 * See https://github.com/golang/go/issues/42494 for a situation
383 * where this code is called with psx_tracker.psx_sig blocked.
384 */
385 sigset_t sigbit, orig_sigbits;
386 sigemptyset(&sigbit);
387 pthread_sigmask(SIG_UNBLOCK, &sigbit, &orig_sigbits);
388 sigaddset(&sigbit, psx_tracker.psx_sig);
389 pthread_sigmask(SIG_UNBLOCK, &sigbit, NULL);
390
391 /*
392 * With psx_tracker.psx_sig unblocked we can wait until this
393 * thread can enter the _PSX_EXITING state.
394 */
395 psx_new_state(_PSX_IDLE, _PSX_EXITING);
396
397 /*
398 * We now indicate that this thread is no longer participating in
399 * the psx mechanism.
400 */
401 registered_thread_t *ref = node;
402 pthread_mutex_lock(&ref->mu);
403 ref->gone = 1;
404 pthread_mutex_unlock(&ref->mu);
405
406 /*
407 * At this point, we can restore the calling sigmask to whatever
408 * the caller thought was appropriate for a dying thread to have.
409 */
410 pthread_sigmask(SIG_SETMASK, &orig_sigbits, NULL);
411
412 /*
413 * Allow the rest of the psx system carry on as per normal.
414 */
415 psx_new_state(_PSX_EXITING, _PSX_IDLE);
416 }
417
418 /*
419 * _psx_start_fn is a trampoline for the intended start function, it
420 * is called blocked (_PSX_CREATE), but releases the block before
421 * calling starter->fn. Before releasing the block, the TLS specific
422 * attributes are initialized for use by the interrupt handler under
423 * the psx mutex, so it doesn't race with an interrupt received by
424 * this thread and the interrupt handler does not need to poll for
425 * that specific attribute to be present (which is problematic during
426 * thread shutdown).
427 */
_psx_start_fn(void * data)428 static void *_psx_start_fn(void *data) {
429 void *node = psx_do_registration();
430
431 psx_new_state(_PSX_CREATE, _PSX_IDLE);
432
433 psx_starter_t *starter = data;
434 pthread_sigmask(SIG_SETMASK, &starter->sigbits, NULL);
435 void *(*fn)(void *) = starter->fn;
436 void *arg = starter->arg;
437
438 memset(data, 0, sizeof(*starter));
439 free(data);
440
441 void *ret;
442
443 pthread_cleanup_push(_psx_exiting, node);
444 ret = fn(arg);
445 pthread_cleanup_pop(1);
446
447 return ret;
448 }
449
450 /*
451 * __wrap_pthread_create is the wrapped destination of all regular
452 * pthread_create calls.
453 */
__wrap_pthread_create(pthread_t * thread,const pthread_attr_t * attr,void * (* start_routine)(void *),void * arg)454 int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
455 void *(*start_routine) (void *), void *arg) {
456 psx_starter_t *starter = calloc(1, sizeof(psx_starter_t));
457 starter->fn = start_routine;
458 starter->arg = arg;
459 /*
460 * Until we are in the _PSX_IDLE state and locked, we must not
461 * block the psx_sig interrupt for this parent thread. Arrange
462 * that parent thread and newly created one can restore signal
463 * mask.
464 */
465 sigset_t sigbit, orig_sigbits;
466 sigemptyset(&sigbit);
467 pthread_sigmask(SIG_UNBLOCK, &sigbit, &starter->sigbits);
468 sigaddset(&sigbit, psx_tracker.psx_sig);
469 pthread_sigmask(SIG_UNBLOCK, &sigbit, &orig_sigbits);
470
471 psx_new_state(_PSX_IDLE, _PSX_CREATE);
472
473 /*
474 * until the child thread has been blessed with its own TLS
475 * specific attribute(s) we prevent either the parent thread or
476 * the new one from experiencing a PSX interrupt.
477 */
478 pthread_sigmask(SIG_BLOCK, &sigbit, NULL);
479
480 int ret = __real_pthread_create(thread, attr, _psx_start_fn, starter);
481 if (ret == -1) {
482 psx_new_state(_PSX_CREATE, _PSX_IDLE);
483 memset(starter, 0, sizeof(*starter));
484 free(starter);
485 } /* else unlock happens in _psx_start_fn */
486
487 /* the parent can once again receive psx interrupt signals */
488 pthread_sigmask(SIG_SETMASK, &orig_sigbits, NULL);
489
490 return ret;
491 }
492
493 /*
494 * __psx_immediate_syscall does one syscall using the current
495 * process.
496 */
__psx_immediate_syscall(long int syscall_nr,int count,long int * arg)497 static long int __psx_immediate_syscall(long int syscall_nr,
498 int count, long int *arg) {
499 psx_tracker.cmd.syscall_nr = syscall_nr;
500 psx_tracker.cmd.arg1 = count > 0 ? arg[0] : 0;
501 psx_tracker.cmd.arg2 = count > 1 ? arg[1] : 0;
502 psx_tracker.cmd.arg3 = count > 2 ? arg[2] : 0;
503
504 if (count > 3) {
505 psx_tracker.cmd.six = 1;
506 psx_tracker.cmd.arg4 = arg[3];
507 psx_tracker.cmd.arg5 = count > 4 ? arg[4] : 0;
508 psx_tracker.cmd.arg6 = count > 5 ? arg[5] : 0;
509 return syscall(syscall_nr,
510 psx_tracker.cmd.arg1,
511 psx_tracker.cmd.arg2,
512 psx_tracker.cmd.arg3,
513 psx_tracker.cmd.arg4,
514 psx_tracker.cmd.arg5,
515 psx_tracker.cmd.arg6);
516 }
517
518 psx_tracker.cmd.six = 0;
519 return syscall(syscall_nr, psx_tracker.cmd.arg1,
520 psx_tracker.cmd.arg2, psx_tracker.cmd.arg3);
521 }
522
523 /*
524 * __psx_syscall performs the syscall on the current thread and if no
525 * error is detected it ensures that the syscall is also performed on
526 * all (other) registered threads. The return code is the value for
527 * the first invocation. It uses a trick to figure out how many
528 * arguments the user has supplied. The other half of the trick is
529 * provided by the macro psx_syscall() in the <sys/psx_syscall.h>
530 * file. The trick is the 7th optional argument (8th over all) to
531 * __psx_syscall is the count of arguments supplied to psx_syscall.
532 *
533 * User:
534 * psx_syscall(nr, a, b);
535 * Expanded by macro to:
536 * __psx_syscall(nr, a, b, 6, 5, 4, 3, 2, 1, 0);
537 * The eighth arg is now ------------------------------------^
538 */
__psx_syscall(long int syscall_nr,...)539 long int __psx_syscall(long int syscall_nr, ...) {
540 long int arg[7];
541 int i;
542
543 va_list aptr;
544 va_start(aptr, syscall_nr);
545 for (i = 0; i < 7; i++) {
546 arg[i] = va_arg(aptr, long int);
547 }
548 va_end(aptr);
549
550 int count = arg[6];
551 if (count < 0 || count > 6) {
552 errno = EINVAL;
553 return -1;
554 }
555
556 if (psx_tracker.has_forked) {
557 return __psx_immediate_syscall(syscall_nr, count, arg);
558 }
559
560 psx_new_state(_PSX_IDLE, _PSX_SETUP);
561 psx_confirm_sigaction();
562
563 long int ret;
564
565 ret = __psx_immediate_syscall(syscall_nr, count, arg);
566 if (ret == -1 || !psx_tracker.initialized) {
567 psx_new_state(_PSX_SETUP, _PSX_IDLE);
568 goto defer;
569 }
570
571 int restore_errno = errno;
572
573 psx_new_state(_PSX_SETUP, _PSX_SYSCALL);
574 psx_tracker.cmd.active = 1;
575
576 pthread_t self = pthread_self();
577 registered_thread_t *next = NULL, *ref;
578
579 psx_lock();
580 for (ref = psx_tracker.root; ref; ref = next) {
581 next = ref->next;
582 if (ref->thread == self) {
583 continue;
584 }
585 pthread_mutex_lock(&ref->mu);
586 ref->pending = 1;
587 int gone = ref->gone;
588 if (!gone) {
589 gone = pthread_kill(ref->thread, psx_tracker.psx_sig) != 0;
590 }
591 pthread_mutex_unlock(&ref->mu);
592 if (!gone) {
593 continue;
594 }
595 /*
596 * need to remove invalid thread id from linked list
597 */
598 psx_do_unregister(ref);
599 }
600 psx_unlock();
601
602 for (;;) {
603 int waiting = 0;
604 psx_lock();
605 for (ref = psx_tracker.root; ref; ref = next) {
606 next = ref->next;
607 if (ref->thread == self) {
608 continue;
609 }
610
611 pthread_mutex_lock(&ref->mu);
612 int pending = ref->pending;
613 int gone = ref->gone;
614 if (pending && !gone) {
615 gone = (pthread_kill(ref->thread, 0) != 0);
616 }
617 pthread_mutex_unlock(&ref->mu);
618 if (!gone) {
619 waiting += pending;
620 continue;
621 }
622 /*
623 * need to remove invalid thread id from linked list
624 */
625 psx_do_unregister(ref);
626 }
627 psx_unlock();
628 if (!waiting) {
629 break;
630 }
631 sched_yield();
632 }
633
634 errno = restore_errno;
635 psx_tracker.cmd.active = 0;
636 psx_new_state(_PSX_SYSCALL, _PSX_IDLE);
637
638 defer:
639 return ret;
640 }
641