• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-21 Andrew G Morgan <morgan@kernel.org>
3  *
4  * This file contains a collection of routines that perform thread
5  * synchronization to ensure that a whole process is running as a
6  * single privilege entity - independent of the number of pthreads.
7  *
8  * The whole file would be unnecessary if glibc exported an explicit
9  * psx_syscall()-like function that leveraged the nptl:setxid
10  * mechanism to synchronize thread state over the whole process.
11  */
12 #undef _POSIX_C_SOURCE
13 #define _POSIX_C_SOURCE 199309L
14 
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE
17 #endif
18 
19 #include <errno.h>
20 #include <pthread.h>
21 #include <sched.h>
22 #include <signal.h>
23 #include <stdarg.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <sys/syscall.h>
29 
30 #include "psx_syscall.h"
31 
/*
 * psx_load_syscalls() can be weakly defined in dependent libraries to
 * provide a mechanism for a library to optionally leverage this psx
 * mechanism. Specifically, when libcap calls psx_load_syscalls() it
 * provides a weakly declared default that maps its system calls to
 * the regular system call functions. However, when linked with psx,
 * this function here overrides the syscalls to be the psx ones.
 */
psx_load_syscalls(long int (** syscall_fn)(long int,long int,long int,long int),long int (** syscall6_fn)(long int,long int,long int,long int,long int,long int,long int))40 void psx_load_syscalls(long int (**syscall_fn)(long int,
41 					      long int, long int, long int),
42 		       long int (**syscall6_fn)(long int,
43 					       long int, long int, long int,
44 					       long int, long int, long int))
45 {
46     *syscall_fn = psx_syscall3;
47     *syscall6_fn = psx_syscall6;
48 }
49 
50 /*
51  * type to keep track of registered threads.
52  */
typedef struct registered_thread_s {
    /* doubly linked list pointers; the list head is psx_tracker.root */
    struct registered_thread_s *next;
    struct registered_thread_s *prev;

    /* the thread this entry was registered for */
    pthread_t thread;

    /* pending and gone are read and written while holding mu */
    pthread_mutex_t mu;

    /* set when a psx signal is sent, cleared by the signal handler */
    int pending;

    /* set by the thread's exit path once it stops participating */
    int gone;
} registered_thread_t;
60 
/* one-time guard used by psx_lock() to run psx_syscall_start() */
static pthread_once_t psx_tracker_initialized = PTHREAD_ONCE_INIT;

/*
 * The coordination states of the tracker. Every transition goes
 * through psx_new_state().
 */
typedef enum {
    _PSX_IDLE = 0,    /* nothing in progress */
    _PSX_SETUP = 1,   /* __psx_syscall is performing the local syscall */
    _PSX_SYSCALL = 2, /* __psx_syscall is distributing to other threads */
    _PSX_CREATE = 3,  /* a wrapped pthread_create is in progress */
    _PSX_INFORK = 4,  /* between the atfork prepare and parent handlers */
    _PSX_EXITING = 5, /* a thread is marking itself as gone */
} psx_tracker_state_t;
71 
72 /*
73  * This global structure holds the global coordination state for
74  * libcap's psx_posix_syscall() support.
75  */
76 static struct psx_tracker_s {
77     int has_forked;
78 
79     pthread_mutex_t state_mu;
80     pthread_cond_t cond; /* this is only used to wait on 'state' changes */
81     psx_tracker_state_t state;
82     int initialized;
83     int psx_sig;
84 
85     struct {
86 	long syscall_nr;
87 	long arg1, arg2, arg3, arg4, arg5, arg6;
88 	int six;
89 	int active;
90     } cmd;
91 
92     struct sigaction sig_action;
93     struct sigaction chained_action;
94     registered_thread_t *root;
95 } psx_tracker;
96 
97 /*
98  * psx_action_key is used for thread local storage of the thread's
99  * registration.
100  */
101 pthread_key_t psx_action_key;
102 
/*
 * psx_do_registration is called locked and creates a tracker entry
 * for the current thread, with a TLS specific key pointing at the
 * thread's specific tracker.
 */
psx_do_registration(void)108 static void *psx_do_registration(void) {
109     registered_thread_t *node = calloc(1, sizeof(registered_thread_t));
110     pthread_mutex_init(&node->mu, NULL);
111     node->thread = pthread_self();
112     pthread_setspecific(psx_action_key, node);
113     node->next = psx_tracker.root;
114     if (node->next) {
115 	node->next->prev = node;
116     }
117     psx_tracker.root = node;
118     return node;
119 }
120 
121 /*
122  * psx_posix_syscall_actor performs the system call on the targeted
123  * thread and signals it is no longer pending.
124  */
static void psx_posix_syscall_actor(int signum, siginfo_t *info, void *ignore) {
    /* bail early if this isn't something we recognize */
    if (signum != psx_tracker.psx_sig || !psx_tracker.cmd.active ||
        info == NULL || info->si_code != SI_TKILL || info->si_pid != getpid()) {
        /*
         * Forward to whatever handler was installed before ours,
         * using the calling convention it was registered with.
         * SIG_DFL and SIG_IGN are dispositions, not callable
         * functions, so they are never invoked.
         */
        const struct sigaction *chained = &psx_tracker.chained_action;
        if (chained->sa_flags & SA_SIGINFO) {
            if (chained->sa_sigaction != NULL) {
                chained->sa_sigaction(signum, info, ignore);
            }
        } else if (chained->sa_handler != SIG_DFL &&
                   chained->sa_handler != SIG_IGN) {
            chained->sa_handler(signum);
        }
        return;
    }

    /*
     * The interrupted code may be about to inspect errno, and the
     * replayed system call below is free to overwrite it. Preserve
     * the value across the handler.
     */
    int saved_errno = errno;

    if (!psx_tracker.cmd.six) {
        (void) syscall(psx_tracker.cmd.syscall_nr,
                       psx_tracker.cmd.arg1,
                       psx_tracker.cmd.arg2,
                       psx_tracker.cmd.arg3);
    } else {
        (void) syscall(psx_tracker.cmd.syscall_nr,
                       psx_tracker.cmd.arg1,
                       psx_tracker.cmd.arg2,
                       psx_tracker.cmd.arg3,
                       psx_tracker.cmd.arg4,
                       psx_tracker.cmd.arg5,
                       psx_tracker.cmd.arg6);
    }

    /*
     * This handler can only be called on registered threads which
     * have had this specific defined at start-up. (But see the
     * subsequent test.)
     */
    registered_thread_t *ref = pthread_getspecific(psx_action_key);
    if (ref) {
        /* tell the distributing thread this one is done */
        pthread_mutex_lock(&ref->mu);
        ref->pending = 0;
        pthread_mutex_unlock(&ref->mu);
    } /*
       * else thread must be dying and its psx_action_key has already
       * been cleaned up.
       */

    errno = saved_errno;
}
165 
166 /*
167  * Some forward declarations for the initialization
168  * psx_syscall_start() routine.
169  */
/* the three pthread_atfork() handlers, defined further down */
static void _psx_prepare_fork(void);
static void _psx_fork_completed(void);
static void _psx_forked_child(void);

/* the wrapped destination of regular pthread_create() calls */
int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                          void *(*start_routine) (void *), void *arg);

/*
 * psx requires this function to be provided by the linkage wrapping.
 */
extern int __real_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                                 void *(*start_routine) (void *), void *arg);
181 
182 /*
183  * psx_confirm_sigaction reconfirms that the psx handler is the first
184  * handler to respond to the psx signal. It assumes that
185  * psx_tracker.psx_sig has been set.
186  */
psx_confirm_sigaction(void)187 static void psx_confirm_sigaction(void) {
188     sigset_t mask, orig;
189     struct sigaction existing_sa;
190 
191     /*
192      * Block interrupts while potentially rewriting the handler.
193      */
194     sigemptyset(&mask);
195     sigaddset(&mask, psx_tracker.psx_sig);
196     sigprocmask(SIG_BLOCK, &mask, &orig);
197 
198     sigaction(psx_tracker.psx_sig, NULL, &existing_sa);
199     if (existing_sa.sa_sigaction != psx_posix_syscall_actor) {
200 	memcpy(&psx_tracker.chained_action, &existing_sa, sizeof(struct sigaction));
201 	psx_tracker.sig_action.sa_sigaction = psx_posix_syscall_actor;
202 	sigemptyset(&psx_tracker.sig_action.sa_mask);
203 	psx_tracker.sig_action.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART;
204 	sigaction(psx_tracker.psx_sig, &psx_tracker.sig_action, NULL);
205     }
206 
207     sigprocmask(SIG_SETMASK, &orig, NULL);
208 }
209 
210 /*
211  * psx_syscall_start initializes the subsystem including initializing
212  * the mutex.
213  */
psx_syscall_start(void)214 static void psx_syscall_start(void) {
215     pthread_mutex_init(&psx_tracker.state_mu, NULL);
216     pthread_cond_init(&psx_tracker.cond, NULL);
217     pthread_key_create(&psx_action_key, NULL);
218     pthread_atfork(_psx_prepare_fork, _psx_fork_completed, _psx_forked_child);
219 
220     /*
221      * All sorts of things are assumed by Linux and glibc and/or musl
222      * about signal handlers and which can be blocked. Go has its own
223      * idiosyncrasies too. We tried SIGRTMAX until
224      *
225      *   https://bugzilla.kernel.org/show_bug.cgi?id=210533
226      *
227      * Our current strategy is to aggressively intercept SIGSYS.
228      */
229     psx_tracker.psx_sig = SIGSYS;
230 
231     psx_confirm_sigaction();
232     psx_do_registration(); // register the main thread.
233 
234     psx_tracker.initialized = 1;
235 }
236 
237 /*
238  * This is the only way this library globally locks. Note, this is not
239  * to be confused with psx_sig (interrupt) blocking - which is
240  * performed around thread creation and when the signal handler is
241  * being confirmed.
242  */
psx_lock(void)243 static void psx_lock(void)
244 {
245     pthread_once(&psx_tracker_initialized, psx_syscall_start);
246     pthread_mutex_lock(&psx_tracker.state_mu);
247 }
248 
249 /*
250  * This is the only way this library unlocks.
251  */
psx_unlock(void)252 static void psx_unlock(void)
253 {
254     pthread_mutex_unlock(&psx_tracker.state_mu);
255 }
256 
257 /*
258  * under lock perform a state transition.
259  */
static void psx_new_state(psx_tracker_state_t was, psx_tracker_state_t is)
{
    psx_lock();

    /* sleep on the condition until the tracker is in the 'was' state */
    for (;;) {
        if (psx_tracker.state == was) {
            break;
        }
        pthread_cond_wait(&psx_tracker.cond, &psx_tracker.state_mu);
    }

    psx_tracker.state = is;

    /* only announce newly idle states since that is all we wait for */
    if (is == _PSX_IDLE) {
        pthread_cond_signal(&psx_tracker.cond);
    }

    psx_unlock();
}
273 
/* three argument entry point; forwards to psx_syscall() */
long int psx_syscall3(long int syscall_nr,
                      long int arg1, long int arg2, long int arg3) {
    long int result = psx_syscall(syscall_nr, arg1, arg2, arg3);
    return result;
}
278 
/* six argument entry point; forwards to psx_syscall() */
long int psx_syscall6(long int syscall_nr,
                      long int arg1, long int arg2, long int arg3,
                      long int arg4, long int arg5, long int arg6) {
    long int result = psx_syscall(syscall_nr, arg1, arg2, arg3,
                                  arg4, arg5, arg6);
    return result;
}
284 
_psx_prepare_fork(void)285 static void _psx_prepare_fork(void) {
286     /*
287      * obtain global lock - we don't want any syscalls while the fork
288      * is occurring since it may interfere with the preparation for
289      * the fork.
290      */
291     psx_new_state(_PSX_IDLE, _PSX_INFORK);
292 }
293 
_psx_fork_completed(void)294 static void _psx_fork_completed(void) {
295     /*
296      * The only way we can get here is if state is _PSX_INFORK and was
297      * previously _PSX_IDLE. Now that the fork has completed, the
298      * parent can continue as if it hadn't happened - the forked child
299      * does not tie its security state to that of the parent process
300      * and threads.
301      *
302      * We don't strictly need to change the psx_tracker.state since we
303      * hold the mutex over the fork, but we do to make deadlock
304      * debugging easier.
305      */
306     psx_new_state(_PSX_INFORK, _PSX_IDLE);
307 }
308 
_psx_forked_child(void)309 static void _psx_forked_child(void) {
310     /*
311      * The only way we can get here is if state is _PSX_INFORK and was
312      * previously _PSX_IDLE. However, none of the registered threads
313      * exist in this newly minted child process, so we have to reset
314      * the tracking structure to avoid any confusion. We also scuttle
315      * any chance of the PSX API working on more than one thread in
316      * the child by leaving the state as _PSX_INFORK. We do support
317      * all psx_syscall()s by reverting to them being direct in the
318      * fork()ed child.
319      *
320      * We do this because the glibc man page for fork() suggests that
321      * only a subset of things will work post fork(). Specifically,
322      * only a "async-signal-safe functions (see signal- safety(7))
323      * until such time as it calls execve(2)" can be relied upon. That
324      * man page suggests that you can't expect mutexes to work: "not
325      * async-signal-safe because it uses pthread_mutex_lock(3)
326      * internally.".
327      */
328     registered_thread_t *next, *old_root;
329     old_root = psx_tracker.root;
330     psx_tracker.root = NULL;
331 
332     psx_tracker.has_forked = 1;
333 
334     for (; old_root; old_root = next) {
335 	next = old_root->next;
336 	memset(old_root, 0, sizeof(*old_root));
337 	free(old_root);
338     }
339 }
340 
341 /*
342  * called locked to unregister a node from the tracker.
343  */
static void psx_do_unregister(registered_thread_t *node) {
    registered_thread_t *after = node->next;
    registered_thread_t *before = node->prev;

    /* splice the node out of the doubly linked list */
    if (psx_tracker.root == node) {
        psx_tracker.root = after;
    }
    if (after != NULL) {
        after->prev = before;
    }
    if (before != NULL) {
        before->next = after;
    }

    /* tear down, scrub and release the node itself */
    pthread_mutex_destroy(&node->mu);
    memset(node, 0, sizeof(*node));
    free(node);
}
358 
/*
 * psx_starter_t carries what the thread start trampoline needs from
 * the wrapped pthread_create() call.
 */
typedef struct {
    /* the start routine the caller asked for */
    void *(*fn)(void *);

    /* the argument to hand to fn */
    void *arg;

    /* the signal mask the new thread adopts before fn is called */
    sigset_t sigbits;
} psx_starter_t;
364 
/*
 * _psx_exiting is used to cleanup the node for the thread on its exit
 * path. This is needed for musl libc:
 *
 *    https://bugzilla.kernel.org/show_bug.cgi?id=208477
 *
 * and likewise for glibc too:
 *
 *    https://sourceware.org/bugzilla/show_bug.cgi?id=12889
 */
_psx_exiting(void * node)375 static void _psx_exiting(void *node) {
376     /*
377      * Until we are in the _PSX_EXITING state, we must not block the
378      * psx_sig interrupt for this dying thread. That is, until this
379      * exiting thread can set ref->gone to 1, this dying thread is
380      * still participating in the psx syscall distribution.
381      *
382      * See https://github.com/golang/go/issues/42494 for a situation
383      * where this code is called with psx_tracker.psx_sig blocked.
384      */
385     sigset_t sigbit, orig_sigbits;
386     sigemptyset(&sigbit);
387     pthread_sigmask(SIG_UNBLOCK, &sigbit, &orig_sigbits);
388     sigaddset(&sigbit, psx_tracker.psx_sig);
389     pthread_sigmask(SIG_UNBLOCK, &sigbit, NULL);
390 
391     /*
392      * With psx_tracker.psx_sig unblocked we can wait until this
393      * thread can enter the _PSX_EXITING state.
394      */
395     psx_new_state(_PSX_IDLE, _PSX_EXITING);
396 
397     /*
398      * We now indicate that this thread is no longer participating in
399      * the psx mechanism.
400      */
401     registered_thread_t *ref = node;
402     pthread_mutex_lock(&ref->mu);
403     ref->gone = 1;
404     pthread_mutex_unlock(&ref->mu);
405 
406     /*
407      * At this point, we can restore the calling sigmask to whatever
408      * the caller thought was appropriate for a dying thread to have.
409      */
410     pthread_sigmask(SIG_SETMASK, &orig_sigbits, NULL);
411 
412     /*
413      * Allow the rest of the psx system carry on as per normal.
414      */
415     psx_new_state(_PSX_EXITING, _PSX_IDLE);
416 }
417 
418 /*
419  * _psx_start_fn is a trampoline for the intended start function, it
420  * is called blocked (_PSX_CREATE), but releases the block before
421  * calling starter->fn. Before releasing the block, the TLS specific
422  * attributes are initialized for use by the interrupt handler under
423  * the psx mutex, so it doesn't race with an interrupt received by
424  * this thread and the interrupt handler does not need to poll for
425  * that specific attribute to be present (which is problematic during
426  * thread shutdown).
427  */
_psx_start_fn(void * data)428 static void *_psx_start_fn(void *data) {
429     void *node = psx_do_registration();
430 
431     psx_new_state(_PSX_CREATE, _PSX_IDLE);
432 
433     psx_starter_t *starter = data;
434     pthread_sigmask(SIG_SETMASK, &starter->sigbits, NULL);
435     void *(*fn)(void *) = starter->fn;
436     void *arg = starter->arg;
437 
438     memset(data, 0, sizeof(*starter));
439     free(data);
440 
441     void *ret;
442 
443     pthread_cleanup_push(_psx_exiting, node);
444     ret = fn(arg);
445     pthread_cleanup_pop(1);
446 
447     return ret;
448 }
449 
450 /*
451  * __wrap_pthread_create is the wrapped destination of all regular
452  * pthread_create calls.
453  */
__wrap_pthread_create(pthread_t * thread,const pthread_attr_t * attr,void * (* start_routine)(void *),void * arg)454 int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
455 			  void *(*start_routine) (void *), void *arg) {
456     psx_starter_t *starter = calloc(1, sizeof(psx_starter_t));
457     starter->fn = start_routine;
458     starter->arg = arg;
459     /*
460      * Until we are in the _PSX_IDLE state and locked, we must not
461      * block the psx_sig interrupt for this parent thread. Arrange
462      * that parent thread and newly created one can restore signal
463      * mask.
464      */
465     sigset_t sigbit, orig_sigbits;
466     sigemptyset(&sigbit);
467     pthread_sigmask(SIG_UNBLOCK, &sigbit, &starter->sigbits);
468     sigaddset(&sigbit, psx_tracker.psx_sig);
469     pthread_sigmask(SIG_UNBLOCK, &sigbit, &orig_sigbits);
470 
471     psx_new_state(_PSX_IDLE, _PSX_CREATE);
472 
473     /*
474      * until the child thread has been blessed with its own TLS
475      * specific attribute(s) we prevent either the parent thread or
476      * the new one from experiencing a PSX interrupt.
477      */
478     pthread_sigmask(SIG_BLOCK, &sigbit, NULL);
479 
480     int ret = __real_pthread_create(thread, attr, _psx_start_fn, starter);
481     if (ret == -1) {
482 	psx_new_state(_PSX_CREATE, _PSX_IDLE);
483 	memset(starter, 0, sizeof(*starter));
484 	free(starter);
485     } /* else unlock happens in _psx_start_fn */
486 
487     /* the parent can once again receive psx interrupt signals */
488     pthread_sigmask(SIG_SETMASK, &orig_sigbits, NULL);
489 
490     return ret;
491 }
492 
493 /*
494  * __psx_immediate_syscall does one syscall using the current
495  * process.
496  */
__psx_immediate_syscall(long int syscall_nr,int count,long int * arg)497 static long int __psx_immediate_syscall(long int syscall_nr,
498 					int count, long int *arg) {
499     psx_tracker.cmd.syscall_nr = syscall_nr;
500     psx_tracker.cmd.arg1 = count > 0 ? arg[0] : 0;
501     psx_tracker.cmd.arg2 = count > 1 ? arg[1] : 0;
502     psx_tracker.cmd.arg3 = count > 2 ? arg[2] : 0;
503 
504     if (count > 3) {
505 	psx_tracker.cmd.six = 1;
506 	psx_tracker.cmd.arg4 = arg[3];
507 	psx_tracker.cmd.arg5 = count > 4 ? arg[4] : 0;
508 	psx_tracker.cmd.arg6 = count > 5 ? arg[5] : 0;
509 	return syscall(syscall_nr,
510 		      psx_tracker.cmd.arg1,
511 		      psx_tracker.cmd.arg2,
512 		      psx_tracker.cmd.arg3,
513 		      psx_tracker.cmd.arg4,
514 		      psx_tracker.cmd.arg5,
515 		      psx_tracker.cmd.arg6);
516     }
517 
518     psx_tracker.cmd.six = 0;
519     return syscall(syscall_nr, psx_tracker.cmd.arg1,
520 		   psx_tracker.cmd.arg2, psx_tracker.cmd.arg3);
521 }
522 
523 /*
524  * __psx_syscall performs the syscall on the current thread and if no
525  * error is detected it ensures that the syscall is also performed on
526  * all (other) registered threads. The return code is the value for
527  * the first invocation. It uses a trick to figure out how many
528  * arguments the user has supplied. The other half of the trick is
529  * provided by the macro psx_syscall() in the <sys/psx_syscall.h>
530  * file. The trick is the 7th optional argument (8th over all) to
531  * __psx_syscall is the count of arguments supplied to psx_syscall.
532  *
533  * User:
534  *                       psx_syscall(nr, a, b);
535  * Expanded by macro to:
536  *                       __psx_syscall(nr, a, b, 6, 5, 4, 3, 2, 1, 0);
537  * The eighth arg is now ------------------------------------^
538  */
__psx_syscall(long int syscall_nr,...)539 long int __psx_syscall(long int syscall_nr, ...) {
540     long int arg[7];
541     int i;
542 
543     va_list aptr;
544     va_start(aptr, syscall_nr);
545     for (i = 0; i < 7; i++) {
546 	arg[i] = va_arg(aptr, long int);
547     }
548     va_end(aptr);
549 
550     int count = arg[6];
551     if (count < 0 || count > 6) {
552 	errno = EINVAL;
553 	return -1;
554     }
555 
556     if (psx_tracker.has_forked) {
557 	return __psx_immediate_syscall(syscall_nr, count, arg);
558     }
559 
560     psx_new_state(_PSX_IDLE, _PSX_SETUP);
561     psx_confirm_sigaction();
562 
563     long int ret;
564 
565     ret = __psx_immediate_syscall(syscall_nr, count, arg);
566     if (ret == -1 || !psx_tracker.initialized) {
567 	psx_new_state(_PSX_SETUP, _PSX_IDLE);
568 	goto defer;
569     }
570 
571     int restore_errno = errno;
572 
573     psx_new_state(_PSX_SETUP, _PSX_SYSCALL);
574     psx_tracker.cmd.active = 1;
575 
576     pthread_t self = pthread_self();
577     registered_thread_t *next = NULL, *ref;
578 
579     psx_lock();
580     for (ref = psx_tracker.root; ref; ref = next) {
581 	next = ref->next;
582 	if (ref->thread == self) {
583 	    continue;
584 	}
585 	pthread_mutex_lock(&ref->mu);
586 	ref->pending = 1;
587 	int gone = ref->gone;
588 	if (!gone) {
589 	    gone = pthread_kill(ref->thread, psx_tracker.psx_sig) != 0;
590 	}
591 	pthread_mutex_unlock(&ref->mu);
592 	if (!gone) {
593 	    continue;
594 	}
595 	/*
596 	 * need to remove invalid thread id from linked list
597 	 */
598 	psx_do_unregister(ref);
599     }
600     psx_unlock();
601 
602     for (;;) {
603 	int waiting = 0;
604 	psx_lock();
605 	for (ref = psx_tracker.root; ref; ref = next) {
606 	    next = ref->next;
607 	    if (ref->thread == self) {
608 		continue;
609 	    }
610 
611 	    pthread_mutex_lock(&ref->mu);
612 	    int pending = ref->pending;
613 	    int gone = ref->gone;
614 	    if (pending && !gone) {
615 		gone = (pthread_kill(ref->thread, 0) != 0);
616 	    }
617 	    pthread_mutex_unlock(&ref->mu);
618 	    if (!gone) {
619 		waiting += pending;
620 		continue;
621 	    }
622 	    /*
623 	     * need to remove invalid thread id from linked list
624 	     */
625 	    psx_do_unregister(ref);
626 	}
627 	psx_unlock();
628 	if (!waiting) {
629 	    break;
630 	}
631 	sched_yield();
632     }
633 
634     errno = restore_errno;
635     psx_tracker.cmd.active = 0;
636     psx_new_state(_PSX_SYSCALL, _PSX_IDLE);
637 
638 defer:
639     return ret;
640 }
641