1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Fast Userspace Mutexes (which I call "Futexes!").
4 * (C) Rusty Russell, IBM 2002
5 *
6 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
7 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
8 *
9 * Removed page pinning, fix privately mapped COW pages and other cleanups
10 * (C) Copyright 2003, 2004 Jamie Lokier
11 *
12 * Robust futex support started by Ingo Molnar
13 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
14 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
15 *
16 * PI-futex support started by Ingo Molnar and Thomas Gleixner
17 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19 *
20 * PRIVATE futexes by Eric Dumazet
21 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
22 *
23 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
24 * Copyright (C) IBM Corporation, 2009
25 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
26 *
27 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
28 * enough at me, Linus for the original (flawed) idea, Matthew
29 * Kirkwood for proof-of-concept implementation.
30 *
31 * "The futexes are also cursed."
32 * "But they come in a choice of three flavours!"
33 */
34 #include <linux/compat.h>
35 #include <linux/jhash.h>
36 #include <linux/pagemap.h>
37 #include <linux/syscalls.h>
38 #include <linux/freezer.h>
39 #include <linux/memblock.h>
40 #include <linux/fault-inject.h>
41 #include <linux/time_namespace.h>
42
43 #include <asm/futex.h>
44
45 #include "../locking/rtmutex_common.h"
46 #include <trace/hooks/futex.h>
47
48 /*
49 * READ this before attempting to hack on futexes!
50 *
51 * Basic futex operation and ordering guarantees
52 * =============================================
53 *
54 * The waiter reads the futex value in user space and calls
55 * futex_wait(). This function computes the hash bucket and acquires
56 * the hash bucket lock. After that it reads the futex user space value
57 * again and verifies that the data has not changed. If it has not changed
58 * it enqueues itself into the hash bucket, releases the hash bucket lock
59 * and schedules.
60 *
61 * The waker side modifies the user space value of the futex and calls
62 * futex_wake(). This function computes the hash bucket and acquires the
63 * hash bucket lock. Then it looks for waiters on that futex in the hash
64 * bucket and wakes them.
65 *
66 * In futex wake up scenarios where no tasks are blocked on a futex, taking
67 * the hb spinlock can be avoided and the syscall can simply return. In order for this
68 * optimization to work, ordering guarantees must exist so that the waiter
69 * being added to the list is acknowledged when the list is concurrently being
70 * checked by the waker, avoiding scenarios like the following:
71 *
72 *   CPU 0                                CPU 1
73 *   val = *futex;
74 *   sys_futex(WAIT, futex, val);
75 *     futex_wait(futex, val);
76 *     uval = *futex;
77 *                                        *futex = newval;
78 *                                        sys_futex(WAKE, futex);
79 *                                          futex_wake(futex);
80 *                                          if (queue_empty())
81 *                                            return;
82 *     if (uval == val)
83 *       lock(hash_bucket(futex));
84 *       queue();
85 *       unlock(hash_bucket(futex));
86 *       schedule();
87 *
88 * This would cause the waiter on CPU 0 to wait forever because it
89 * missed the transition of the user space value from val to newval
90 * and the waker did not find the waiter in the hash bucket queue.
91 *
92 * The correct serialization ensures that a waiter either observes
93 * the changed user space value before blocking or is woken by a
94 * concurrent waker:
95 *
96 *   CPU 0                                CPU 1
97 *   val = *futex;
98 *   sys_futex(WAIT, futex, val);
99 *     futex_wait(futex, val);
100 *
101 *     waiters++; (a)
102 *     smp_mb(); (A) <-- paired with -.
103 *                                     |
104 *     lock(hash_bucket(futex));      |
105 *                                     |
106 *     uval = *futex;                 |
107 *                                     |     *futex = newval;
108 *                                     |     sys_futex(WAKE, futex);
109 *                                     |       futex_wake(futex);
110 *                                     |
111 *                                     `--------> smp_mb(); (B)
112 *     if (uval == val)
113 *       queue();
114 *       unlock(hash_bucket(futex));
115 *       schedule();                        if (waiters)
116 *                                            lock(hash_bucket(futex));
117 *     else                                   wake_waiters(futex);
118 *       waiters--; (b)                       unlock(hash_bucket(futex));
119 *
120 * Where (A) orders the waiters increment and the futex value read through
121 * atomic operations (see hb_waiters_inc) and where (B) orders the write
122 * to futex and the waiters read (see hb_waiters_pending()).
123 *
124 * This yields the following case (where X:=waiters, Y:=futex):
125 *
126 *   X = Y = 0
127 *
128 *   w[X]=1          w[Y]=1
129 *   MB              MB
130 *   r[Y]=y          r[X]=x
131 *
132 * Which guarantees that x==0 && y==0 is impossible; which translates back into
133 * the guarantee that we cannot both miss the futex variable change and the
134 * enqueue.
135 *
136 * Note that a new waiter is accounted for in (a) even when it is possible that
137 * the wait call can return an error, in which case we backtrack from it in (b).
138 * Refer to the comment in queue_lock().
139 *
140 * Similarly, in order to account for waiters being requeued on another
141 * address we always increment the waiters for the destination bucket before
142 * acquiring the lock, and decrement them again after releasing it -
143 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
144 * will do the additional required waiter count housekeeping. This is done for
145 * double_lock_hb() and double_unlock_hb(), respectively.
146 */
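/*
 * Editor's illustration, not part of this file: a minimal sketch of the
 * user-space side of the wait/wake protocol described above, in the style
 * of the classic three-state futex lock. All names here (lock_word,
 * example_lock, example_unlock) are invented for the example. Note that the
 * unlock path only enters the kernel when the value indicates possible
 * waiters, which is exactly the case the hb_waiters_pending() optimization
 * above is designed for.
 */
#if 0	/* illustrative user-space sketch only, never built here */
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 0 == unlocked, 1 == locked/no waiters, 2 == locked/maybe waiters */
static atomic_uint lock_word;

static void example_lock(void)
{
	unsigned int expected = 0;

	/* Uncontended fast path: 0 -> 1 without entering the kernel. */
	if (atomic_compare_exchange_strong(&lock_word, &expected, 1))
		return;
	do {
		/* Announce contention (-> 2), then sleep iff still 2. */
		if (expected == 2 ||
		    atomic_compare_exchange_strong(&lock_word, &expected, 2))
			syscall(SYS_futex, &lock_word, FUTEX_WAIT_PRIVATE, 2,
				NULL, NULL, 0);
		expected = 0;
	} while (!atomic_compare_exchange_strong(&lock_word, &expected, 2));
}

static void example_unlock(void)
{
	/* Only enter the kernel when a waiter might be queued. */
	if (atomic_exchange(&lock_word, 0) == 2)
		syscall(SYS_futex, &lock_word, FUTEX_WAKE_PRIVATE, 1,
			NULL, NULL, 0);
}
#endif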
147
148 #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
149 #define futex_cmpxchg_enabled 1
150 #else
151 static int __read_mostly futex_cmpxchg_enabled;
152 #endif
153
154 /*
155 * Futex flags used to encode options to functions and preserve them across
156 * restarts.
157 */
158 #ifdef CONFIG_MMU
159 # define FLAGS_SHARED 0x01
160 #else
161 /*
162 * NOMMU does not have a per-process address space. Let the compiler optimize
163 * code away.
164 */
165 # define FLAGS_SHARED 0x00
166 #endif
167 #define FLAGS_CLOCKRT 0x02
168 #define FLAGS_HAS_TIMEOUT 0x04
169
170 /*
171 * Priority Inheritance state:
172 */
173 struct futex_pi_state {
174 /*
175 * list of 'owned' pi_state instances - these have to be
176 * cleaned up in do_exit() if the task exits prematurely:
177 */
178 struct list_head list;
179
180 /*
181 * The PI object:
182 */
183 struct rt_mutex_base pi_mutex;
184
185 struct task_struct *owner;
186 refcount_t refcount;
187
188 union futex_key key;
189 } __randomize_layout;
190
191 /**
192 * struct futex_q - The hashed futex queue entry, one per waiting task
193 * @list: priority-sorted list of tasks waiting on this futex
194 * @task: the task waiting on the futex
195 * @lock_ptr: the hash bucket lock
196 * @key: the key the futex is hashed on
197 * @pi_state: optional priority inheritance state
198 * @rt_waiter: rt_waiter storage for use with requeue_pi
199 * @requeue_pi_key: the requeue_pi target futex key
200 * @bitset: bitset for the optional bitmasked wakeup
201 * @requeue_state: State field for futex_requeue_pi()
202 * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
203 *
204 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
205 * we can wake only the relevant ones (hashed queues may be shared).
206 *
207 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
208 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
209 * The order of wakeup is always to make the first condition true, then
210 * the second.
211 *
212 * PI futexes are typically woken before they are removed from the hash list via
213 * the rt_mutex code. See unqueue_me_pi().
214 */
215 struct futex_q {
216 struct plist_node list;
217
218 struct task_struct *task;
219 spinlock_t *lock_ptr;
220 union futex_key key;
221 struct futex_pi_state *pi_state;
222 struct rt_mutex_waiter *rt_waiter;
223 union futex_key *requeue_pi_key;
224 u32 bitset;
225 atomic_t requeue_state;
226 #ifdef CONFIG_PREEMPT_RT
227 struct rcuwait requeue_wait;
228 #endif
229 } __randomize_layout;
230
231 /*
232 * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
233 * underlying rtmutex. The task which is about to be requeued could have
234 * just woken up (timeout, signal). After the wake up the task has to
235 * acquire hash bucket lock, which is held by the requeue code. As a task
236 * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
237 * and the hash bucket lock blocking would collide and corrupt state.
238 *
239 * On !PREEMPT_RT this is not a problem and everything could be serialized
240 * on the hash bucket lock, but aside from the benefit of common code,
241 * this allows us to avoid doing the requeue when the task is already on the
242 * way out, and to avoid taking the hash bucket lock of the original uaddr1 when
243 * the requeue has been completed.
244 *
245 * The following state transitions are valid:
246 *
247 * On the waiter side:
248 * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
249 * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
250 *
251 * On the requeue side:
252 * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IN_PROGRESS
253 * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
254 * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
255 * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
256 * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
257 *
258 * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
259 * signals that the waiter is already on the way out. It also means that
260 * the waiter is still on the 'wait' futex, i.e. uaddr1.
261 *
262 * The waiter side signals early wakeup to the requeue side either through
263 * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
264 * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
265 * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
266 * which means the wakeup is interleaving with a requeue in progress, it has
267 * to wait for the requeue side to change the state. Either to DONE/LOCKED
268 * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
269 * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
270 * the requeue side when the requeue attempt failed via deadlock detection
271 * and therefore the waiter q is still on the uaddr1 futex.
272 */
273 enum {
274 Q_REQUEUE_PI_NONE = 0,
275 Q_REQUEUE_PI_IGNORE,
276 Q_REQUEUE_PI_IN_PROGRESS,
277 Q_REQUEUE_PI_WAIT,
278 Q_REQUEUE_PI_DONE,
279 Q_REQUEUE_PI_LOCKED,
280 };
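/*
 * Editor's illustration, not part of this file: a simplified sketch of the
 * waiter-side transitions described above. The function name
 * example_requeue_pi_wakeup_sync() is invented; the in-tree helper that
 * performs this lives with the requeue_pi code further down and differs in
 * detail (it also waits for the requeue side once it has set WAIT).
 */
#if 0	/* illustrative sketch only, never built */
static int example_requeue_pi_wakeup_sync(struct futex_q *q)
{
	int old, new;

	old = atomic_read_acquire(&q->requeue_state);
	do {
		/* Requeue already finished? Nothing left to signal. */
		if (old >= Q_REQUEUE_PI_DONE)
			return old;

		/*
		 * NONE        -> IGNORE: tell the requeue side to skip us.
		 * IN_PROGRESS -> WAIT:   a requeue is interleaving; the
		 *                        caller must then wait for it to
		 *                        reach DONE/LOCKED or IGNORE.
		 */
		new = (old == Q_REQUEUE_PI_NONE) ?
			Q_REQUEUE_PI_IGNORE : Q_REQUEUE_PI_WAIT;
	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));

	return new;
}
#endif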
281
282 static const struct futex_q futex_q_init = {
283 /* list gets initialized in queue_me() */
284 .key = FUTEX_KEY_INIT,
285 .bitset = FUTEX_BITSET_MATCH_ANY,
286 .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
287 };
288
289 /*
290 * Hash buckets are shared by all the futex_keys that hash to the same
291 * location. Each key may have multiple futex_q structures, one for each task
292 * waiting on a futex.
293 */
294 struct futex_hash_bucket {
295 atomic_t waiters;
296 spinlock_t lock;
297 struct plist_head chain;
298 } ____cacheline_aligned_in_smp;
299
300 /*
301 * The base of the bucket array and its size are always used together
302 * (after initialization only in hash_futex()), so ensure that they
303 * reside in the same cacheline.
304 */
305 static struct {
306 struct futex_hash_bucket *queues;
307 unsigned long hashsize;
308 } __futex_data __read_mostly __aligned(2*sizeof(long));
309 #define futex_queues (__futex_data.queues)
310 #define futex_hashsize (__futex_data.hashsize)
311
312
313 /*
314 * Fault injections for futexes.
315 */
316 #ifdef CONFIG_FAIL_FUTEX
317
318 static struct {
319 struct fault_attr attr;
320
321 bool ignore_private;
322 } fail_futex = {
323 .attr = FAULT_ATTR_INITIALIZER,
324 .ignore_private = false,
325 };
326
327 static int __init setup_fail_futex(char *str)
328 {
329 return setup_fault_attr(&fail_futex.attr, str);
330 }
331 __setup("fail_futex=", setup_fail_futex);
332
333 static bool should_fail_futex(bool fshared)
334 {
335 if (fail_futex.ignore_private && !fshared)
336 return false;
337
338 return should_fail(&fail_futex.attr, 1);
339 }
340
341 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
342
343 static int __init fail_futex_debugfs(void)
344 {
345 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
346 struct dentry *dir;
347
348 dir = fault_create_debugfs_attr("fail_futex", NULL,
349 &fail_futex.attr);
350 if (IS_ERR(dir))
351 return PTR_ERR(dir);
352
353 debugfs_create_bool("ignore-private", mode, dir,
354 &fail_futex.ignore_private);
355 return 0;
356 }
357
358 late_initcall(fail_futex_debugfs);
359
360 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
361
362 #else
363 static inline bool should_fail_futex(bool fshared)
364 {
365 return false;
366 }
367 #endif /* CONFIG_FAIL_FUTEX */
368
369 #ifdef CONFIG_COMPAT
370 static void compat_exit_robust_list(struct task_struct *curr);
371 #endif
372
373 /*
374 * Reflects a new waiter being added to the waitqueue.
375 */
376 static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
377 {
378 #ifdef CONFIG_SMP
379 atomic_inc(&hb->waiters);
380 /*
381 * Full barrier (A), see the ordering comment above.
382 */
383 smp_mb__after_atomic();
384 #endif
385 }
386
387 /*
388 * Reflects a waiter being removed from the waitqueue by wakeup
389 * paths.
390 */
391 static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
392 {
393 #ifdef CONFIG_SMP
394 atomic_dec(&hb->waiters);
395 #endif
396 }
397
398 static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
399 {
400 #ifdef CONFIG_SMP
401 /*
402 * Full barrier (B), see the ordering comment above.
403 */
404 smp_mb();
405 return atomic_read(&hb->waiters);
406 #else
407 return 1;
408 #endif
409 }
410
411 /**
412 * hash_futex - Return the hash bucket in the global hash
413 * @key: Pointer to the futex key for which the hash is calculated
414 *
415 * We hash on the keys returned from get_futex_key (see below) and return the
416 * corresponding hash bucket in the global hash.
417 */
418 static struct futex_hash_bucket *hash_futex(union futex_key *key)
419 {
420 u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
421 key->both.offset);
422
423 return &futex_queues[hash & (futex_hashsize - 1)];
424 }
425
426
427 /**
428 * match_futex - Check whether two futex keys are equal
429 * @key1: Pointer to key1
430 * @key2: Pointer to key2
431 *
432 * Return 1 if two futex_keys are equal, 0 otherwise.
433 */
434 static inline int match_futex(union futex_key *key1, union futex_key *key2)
435 {
436 return (key1 && key2
437 && key1->both.word == key2->both.word
438 && key1->both.ptr == key2->both.ptr
439 && key1->both.offset == key2->both.offset);
440 }
441
442 enum futex_access {
443 FUTEX_READ,
444 FUTEX_WRITE
445 };
446
447 /**
448 * futex_setup_timer - set up the sleeping hrtimer.
449 * @time: ptr to the given timeout value
450 * @timeout: the hrtimer_sleeper structure to be set up
451 * @flags: futex flags
452 * @range_ns: optional range in ns
453 *
454 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
455 * value given
456 */
457 static inline struct hrtimer_sleeper *
458 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
459 int flags, u64 range_ns)
460 {
461 if (!time)
462 return NULL;
463
464 hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
465 CLOCK_REALTIME : CLOCK_MONOTONIC,
466 HRTIMER_MODE_ABS);
467 /*
468 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
469 * effectively the same as calling hrtimer_set_expires().
470 */
471 hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
472
473 return timeout;
474 }
475
476 /*
477 * Generate a machine wide unique identifier for this inode.
478 *
479 * This relies on u64 not wrapping in the life-time of the machine; which with
480 * 1ns resolution means almost 585 years.
481 *
482 * This further relies on the fact that a well formed program will not unmap
483 * the file while it has a (shared) futex waiting on it. This mapping will have
484 * a file reference which pins the mount and inode.
485 *
486 * If for some reason an inode gets evicted and read back in again, it will get
487 * a new sequence number and will _NOT_ match, even though it is the exact same
488 * file.
489 *
490 * It is important that match_futex() will never have a false-positive, esp.
491 * for PI futexes that can mess up the state. The above argues that false-negatives
492 * are only possible for malformed programs.
493 */
494 static u64 get_inode_sequence_number(struct inode *inode)
495 {
496 static atomic64_t i_seq;
497 u64 old;
498
499 /* Does the inode already have a sequence number? */
500 old = atomic64_read(&inode->i_sequence);
501 if (likely(old))
502 return old;
503
504 for (;;) {
505 u64 new = atomic64_add_return(1, &i_seq);
506 if (WARN_ON_ONCE(!new))
507 continue;
508
509 old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
510 if (old)
511 return old;
512 return new;
513 }
514 }
515
516 /**
517 * get_futex_key() - Get parameters which are the keys for a futex
518 * @uaddr: virtual address of the futex
519 * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
520 * @key: address where result is stored.
521 * @rw: mapping needs to be read/write (values: FUTEX_READ,
522 * FUTEX_WRITE)
523 *
524 * Return: a negative error code or 0
525 *
526 * The key words are stored in @key on success.
527 *
528 * For shared mappings (when @fshared), the key is:
529 *
530 * ( inode->i_sequence, page->index, offset_within_page )
531 *
532 * [ also see get_inode_sequence_number() ]
533 *
534 * For private mappings (or when !@fshared), the key is:
535 *
536 * ( current->mm, address, 0 )
537 *
538 * This allows (cross process, where applicable) identification of the futex
539 * without keeping the page pinned for the duration of the FUTEX_WAIT.
540 *
541 * lock_page() might sleep, the caller should not hold a spinlock.
542 */
543 static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
544 enum futex_access rw)
545 {
546 unsigned long address = (unsigned long)uaddr;
547 struct mm_struct *mm = current->mm;
548 struct page *page, *tail;
549 struct address_space *mapping;
550 int err, ro = 0;
551
552 /*
553 * The futex address must be "naturally" aligned.
554 */
555 key->both.offset = address % PAGE_SIZE;
556 if (unlikely((address % sizeof(u32)) != 0))
557 return -EINVAL;
558 address -= key->both.offset;
559
560 if (unlikely(!access_ok(uaddr, sizeof(u32))))
561 return -EFAULT;
562
563 if (unlikely(should_fail_futex(fshared)))
564 return -EFAULT;
565
566 /*
567 * PROCESS_PRIVATE futexes are fast.
568 * As the mm cannot disappear under us and the 'key' only needs
569 * virtual address, we don't even have to find the underlying vma.
570 * Note: We do have to check that 'uaddr' is a valid user address,
571 * but access_ok() should be faster than find_vma()
572 */
573 if (!fshared) {
574 /*
575 * On no-MMU, shared futexes are treated as private, therefore
576 * we must not include the current process in the key. Since
577 * there is only one address space, the address is a unique key
578 * on its own.
579 */
580 if (IS_ENABLED(CONFIG_MMU))
581 key->private.mm = mm;
582 else
583 key->private.mm = NULL;
584
585 key->private.address = address;
586 return 0;
587 }
588
589 again:
590 /* Ignore any VERIFY_READ mapping (futex common case) */
591 if (unlikely(should_fail_futex(true)))
592 return -EFAULT;
593
594 err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
595 /*
596 * If write access is not required (eg. FUTEX_WAIT), try
597 * and get read-only access.
598 */
599 if (err == -EFAULT && rw == FUTEX_READ) {
600 err = get_user_pages_fast(address, 1, 0, &page);
601 ro = 1;
602 }
603 if (err < 0)
604 return err;
605 else
606 err = 0;
607
608 /*
609 * The treatment of mapping from this point on is critical. The page
610 * lock protects many things but in this context the page lock
611 * stabilizes mapping, prevents inode freeing in the shared
612 * file-backed region case and guards against movement to swap cache.
613 *
614 * Strictly speaking the page lock is not needed in all cases being
615 * considered here and the page lock forces unnecessary serialization.
616 * From this point on, mapping will be re-verified if necessary and
617 * the page lock will be acquired only if it is unavoidable.
618 *
619 * Mapping checks require the head page for any compound page so the
620 * head page and mapping is looked up now. For anonymous pages, it
621 * does not matter if the page splits in the future as the key is
622 * based on the address. For filesystem-backed pages, the tail is
623 * required as the index of the page determines the key. For
624 * base pages, there is no tail page and tail == page.
625 */
626 tail = page;
627 page = compound_head(page);
628 mapping = READ_ONCE(page->mapping);
629
630 /*
631 * If page->mapping is NULL, then it cannot be a PageAnon
632 * page; but it might be the ZERO_PAGE or in the gate area or
633 * in a special mapping (all cases which we are happy to fail);
634 * or it may have been a good file page when get_user_pages_fast
635 * found it, but truncated or holepunched or subjected to
636 * invalidate_complete_page2 before we got the page lock (also
637 * cases which we are happy to fail). And we hold a reference,
638 * so refcount care in invalidate_complete_page's remove_mapping
639 * prevents drop_caches from setting mapping to NULL beneath us.
640 *
641 * The case we do have to guard against is when memory pressure made
642 * shmem_writepage move it from filecache to swapcache beneath us:
643 * an unlikely race, but we do need to retry for page->mapping.
644 */
645 if (unlikely(!mapping)) {
646 int shmem_swizzled;
647
648 /*
649 * Page lock is required to identify which special case above
650 * applies. If this is really a shmem page then the page lock
651 * will prevent unexpected transitions.
652 */
653 lock_page(page);
654 shmem_swizzled = PageSwapCache(page) || page->mapping;
655 unlock_page(page);
656 put_page(page);
657
658 if (shmem_swizzled)
659 goto again;
660
661 return -EFAULT;
662 }
663
664 /*
665 * Private mappings are handled in a simple way.
666 *
667 * If the futex key is stored on an anonymous page, then the associated
668 * object is the mm which is implicitly pinned by the calling process.
669 *
670 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
671 * it's a read-only handle, it's expected that futexes attach to
672 * the object not the particular process.
673 */
674 if (PageAnon(page)) {
675 /*
676 * A RO anonymous page will never change and thus doesn't make
677 * sense for futex operations.
678 */
679 if (unlikely(should_fail_futex(true)) || ro) {
680 err = -EFAULT;
681 goto out;
682 }
683
684 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
685 key->private.mm = mm;
686 key->private.address = address;
687
688 } else {
689 struct inode *inode;
690
691 /*
692 * The associated futex object in this case is the inode and
693 * the page->mapping must be traversed. Ordinarily this should
694 * be stabilised under page lock but it's not strictly
695 * necessary in this case as we just want to pin the inode, not
696 * update the radix tree or anything like that.
697 *
698 * The RCU read lock is taken as the inode is finally freed
699 * under RCU. If the mapping still matches expectations then the
700 * mapping->host can be safely accessed as being a valid inode.
701 */
702 rcu_read_lock();
703
704 if (READ_ONCE(page->mapping) != mapping) {
705 rcu_read_unlock();
706 put_page(page);
707
708 goto again;
709 }
710
711 inode = READ_ONCE(mapping->host);
712 if (!inode) {
713 rcu_read_unlock();
714 put_page(page);
715
716 goto again;
717 }
718
719 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
720 key->shared.i_seq = get_inode_sequence_number(inode);
721 key->shared.pgoff = page_to_pgoff(tail);
722 rcu_read_unlock();
723 }
724
725 out:
726 put_page(page);
727 return err;
728 }
729
730 /**
731 * fault_in_user_writeable() - Fault in user address and verify RW access
732 * @uaddr: pointer to faulting user space address
733 *
734 * Slow path to fixup the fault we just took in the atomic write
735 * access to @uaddr.
736 *
737 * We have no generic implementation of a non-destructive write to the
738 * user address. We know that we faulted in the atomic pagefault
739 * disabled section so we can as well avoid the #PF overhead by
740 * calling get_user_pages() right away.
741 */
742 static int fault_in_user_writeable(u32 __user *uaddr)
743 {
744 struct mm_struct *mm = current->mm;
745 int ret;
746
747 mmap_read_lock(mm);
748 ret = fixup_user_fault(mm, (unsigned long)uaddr,
749 FAULT_FLAG_WRITE, NULL);
750 mmap_read_unlock(mm);
751
752 return ret < 0 ? ret : 0;
753 }
754
755 /**
756 * futex_top_waiter() - Return the highest priority waiter on a futex
757 * @hb: the hash bucket the futex_q's reside in
758 * @key: the futex key (to distinguish it from other futex futex_q's)
759 *
760 * Must be called with the hb lock held.
761 */
762 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
763 union futex_key *key)
764 {
765 struct futex_q *this;
766
767 plist_for_each_entry(this, &hb->chain, list) {
768 if (match_futex(&this->key, key))
769 return this;
770 }
771 return NULL;
772 }
773
774 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
775 u32 uval, u32 newval)
776 {
777 int ret;
778
779 pagefault_disable();
780 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
781 pagefault_enable();
782
783 return ret;
784 }
785
786 static int get_futex_value_locked(u32 *dest, u32 __user *from)
787 {
788 int ret;
789
790 pagefault_disable();
791 ret = __get_user(*dest, from);
792 pagefault_enable();
793
794 return ret ? -EFAULT : 0;
795 }
796
797
798 /*
799 * PI code:
800 */
801 static int refill_pi_state_cache(void)
802 {
803 struct futex_pi_state *pi_state;
804
805 if (likely(current->pi_state_cache))
806 return 0;
807
808 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
809
810 if (!pi_state)
811 return -ENOMEM;
812
813 INIT_LIST_HEAD(&pi_state->list);
814 /* pi_mutex gets initialized later */
815 pi_state->owner = NULL;
816 refcount_set(&pi_state->refcount, 1);
817 pi_state->key = FUTEX_KEY_INIT;
818
819 current->pi_state_cache = pi_state;
820
821 return 0;
822 }
823
824 static struct futex_pi_state *alloc_pi_state(void)
825 {
826 struct futex_pi_state *pi_state = current->pi_state_cache;
827
828 WARN_ON(!pi_state);
829 current->pi_state_cache = NULL;
830
831 return pi_state;
832 }
833
834 static void pi_state_update_owner(struct futex_pi_state *pi_state,
835 struct task_struct *new_owner)
836 {
837 struct task_struct *old_owner = pi_state->owner;
838
839 lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
840
841 if (old_owner) {
842 raw_spin_lock(&old_owner->pi_lock);
843 WARN_ON(list_empty(&pi_state->list));
844 list_del_init(&pi_state->list);
845 raw_spin_unlock(&old_owner->pi_lock);
846 }
847
848 if (new_owner) {
849 raw_spin_lock(&new_owner->pi_lock);
850 WARN_ON(!list_empty(&pi_state->list));
851 list_add(&pi_state->list, &new_owner->pi_state_list);
852 pi_state->owner = new_owner;
853 raw_spin_unlock(&new_owner->pi_lock);
854 }
855 }
856
857 static void get_pi_state(struct futex_pi_state *pi_state)
858 {
859 WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
860 }
861
862 /*
863 * Drops a reference to the pi_state object and frees or caches it
864 * when the last reference is gone.
865 */
866 static void put_pi_state(struct futex_pi_state *pi_state)
867 {
868 if (!pi_state)
869 return;
870
871 if (!refcount_dec_and_test(&pi_state->refcount))
872 return;
873
874 /*
875 * If pi_state->owner is NULL, the owner is most probably dying
876 * and has cleaned up the pi_state already
877 */
878 if (pi_state->owner) {
879 unsigned long flags;
880
881 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
882 pi_state_update_owner(pi_state, NULL);
883 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
884 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
885 }
886
887 if (current->pi_state_cache) {
888 kfree(pi_state);
889 } else {
890 /*
891 * pi_state->list is already empty.
892 * clear pi_state->owner.
893 * refcount is at 0 - put it back to 1.
894 */
895 pi_state->owner = NULL;
896 refcount_set(&pi_state->refcount, 1);
897 current->pi_state_cache = pi_state;
898 }
899 }
900
901 #ifdef CONFIG_FUTEX_PI
902
903 /*
904 * This task is holding PI mutexes at exit time => bad.
905 * Kernel cleans up PI-state, but userspace is likely hosed.
906 * (Robust-futex cleanup is separate and might save the day for userspace.)
907 */
908 static void exit_pi_state_list(struct task_struct *curr)
909 {
910 struct list_head *next, *head = &curr->pi_state_list;
911 struct futex_pi_state *pi_state;
912 struct futex_hash_bucket *hb;
913 union futex_key key = FUTEX_KEY_INIT;
914
915 if (!futex_cmpxchg_enabled)
916 return;
917 /*
918 * We are a ZOMBIE and nobody can enqueue itself on
919 * pi_state_list anymore, but we have to be careful
920 * versus waiters unqueueing themselves:
921 */
922 raw_spin_lock_irq(&curr->pi_lock);
923 while (!list_empty(head)) {
924 next = head->next;
925 pi_state = list_entry(next, struct futex_pi_state, list);
926 key = pi_state->key;
927 hb = hash_futex(&key);
928
929 /*
930 * We can race against put_pi_state() removing itself from the
931 * list (a waiter going away). put_pi_state() will first
932 * decrement the reference count and then modify the list, so
933 * it's possible to see the list entry but fail this reference
934 * acquire.
935 *
936 * In that case; drop the locks to let put_pi_state() make
937 * progress and retry the loop.
938 */
939 if (!refcount_inc_not_zero(&pi_state->refcount)) {
940 raw_spin_unlock_irq(&curr->pi_lock);
941 cpu_relax();
942 raw_spin_lock_irq(&curr->pi_lock);
943 continue;
944 }
945 raw_spin_unlock_irq(&curr->pi_lock);
946
947 spin_lock(&hb->lock);
948 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
949 raw_spin_lock(&curr->pi_lock);
950 /*
951 * We dropped the pi-lock, so re-check whether this
952 * task still owns the PI-state:
953 */
954 if (head->next != next) {
955 /* retain curr->pi_lock for the loop invariant */
956 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
957 spin_unlock(&hb->lock);
958 put_pi_state(pi_state);
959 continue;
960 }
961
962 WARN_ON(pi_state->owner != curr);
963 WARN_ON(list_empty(&pi_state->list));
964 list_del_init(&pi_state->list);
965 pi_state->owner = NULL;
966
967 raw_spin_unlock(&curr->pi_lock);
968 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
969 spin_unlock(&hb->lock);
970
971 rt_mutex_futex_unlock(&pi_state->pi_mutex);
972 put_pi_state(pi_state);
973
974 raw_spin_lock_irq(&curr->pi_lock);
975 }
976 raw_spin_unlock_irq(&curr->pi_lock);
977 }
978 #else
979 static inline void exit_pi_state_list(struct task_struct *curr) { }
980 #endif
981
982 /*
983 * We need to check the following states:
984 *
985 * Waiter | pi_state | pi->owner | uTID      | uODIED | ?
986 *
987 * [1]  NULL  | ---      | ---       | 0         | 0/1    | Valid
988 * [2]  NULL  | ---      | ---       | >0        | 0/1    | Valid
989 *
990 * [3]  Found | NULL     | --        | Any       | 0/1    | Invalid
991 *
992 * [4]  Found | Found    | NULL      | 0         | 1      | Valid
993 * [5]  Found | Found    | NULL      | >0        | 1      | Invalid
994 *
995 * [6]  Found | Found    | task      | 0         | 1      | Valid
996 *
997 * [7]  Found | Found    | NULL      | Any       | 0      | Invalid
998 *
999 * [8]  Found | Found    | task      | ==taskTID | 0/1    | Valid
1000 * [9]  Found | Found    | task      | 0         | 0      | Invalid
1001 * [10] Found | Found    | task      | !=taskTID | 0/1    | Invalid
1002 *
1003 * [1] Indicates that the kernel can acquire the futex atomically. We
1004 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
1005 *
1006 * [2] Valid, if TID does not belong to a kernel thread. If no matching
1007 * thread is found then it indicates that the owner TID has died.
1008 *
1009 * [3] Invalid. The waiter is queued on a non PI futex
1010 *
1011 * [4] Valid state after exit_robust_list(), which sets the user space
1012 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
1013 *
1014 * [5] The user space value got manipulated between exit_robust_list()
1015 * and exit_pi_state_list()
1016 *
1017 * [6] Valid state after exit_pi_state_list() which sets the new owner in
1018 * the pi_state but cannot access the user space value.
1019 *
1020 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
1021 *
1022 * [8] Owner and user space value match
1023 *
1024 * [9] There is no transient state which sets the user space TID to 0
1025 * except exit_robust_list(), but this is indicated by the
1026 * FUTEX_OWNER_DIED bit. See [4]
1027 *
1028 * [10] There is no transient state which leaves owner and user space
1029 * TID out of sync. Except one error case where the kernel is denied
1030 * write access to the user address, see fixup_pi_state_owner().
1031 *
1032 *
1033 * Serialization and lifetime rules:
1034 *
1035 * hb->lock:
1036 *
1037 * hb -> futex_q, relation
1038 * futex_q -> pi_state, relation
1039 *
1040 * (cannot be raw because hb can contain arbitrary amount
1041 * of futex_q's)
1042 *
1043 * pi_mutex->wait_lock:
1044 *
1045 * {uval, pi_state}
1046 *
1047 * (and pi_mutex 'obviously')
1048 *
1049 * p->pi_lock:
1050 *
1051 * p->pi_state_list -> pi_state->list, relation
1052 * pi_mutex->owner -> pi_state->owner, relation
1053 *
1054 * pi_state->refcount:
1055 *
1056 * pi_state lifetime
1057 *
1058 *
1059 * Lock order:
1060 *
1061 * hb->lock
1062 * pi_mutex->wait_lock
1063 * p->pi_lock
1064 *
1065 */
1066
1067 /*
1068 * Validate that the existing waiter has a pi_state and sanity check
1069 * the pi_state against the user space value. If correct, attach to
1070 * it.
1071 */
1072 static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1073 struct futex_pi_state *pi_state,
1074 struct futex_pi_state **ps)
1075 {
1076 pid_t pid = uval & FUTEX_TID_MASK;
1077 u32 uval2;
1078 int ret;
1079
1080 /*
1081 * Userspace might have messed up non-PI and PI futexes [3]
1082 */
1083 if (unlikely(!pi_state))
1084 return -EINVAL;
1085
1086 /*
1087 * We get here with hb->lock held, and having found a
1088 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1089 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1090 * which in turn means that futex_lock_pi() still has a reference on
1091 * our pi_state.
1092 *
1093 * The waiter holding a reference on @pi_state also protects against
1094 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1095 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1096 * free pi_state before we can take a reference ourselves.
1097 */
1098 WARN_ON(!refcount_read(&pi_state->refcount));
1099
1100 /*
1101 * Now that we have a pi_state, we can acquire wait_lock
1102 * and do the state validation.
1103 */
1104 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1105
1106 /*
1107 * Since {uval, pi_state} is serialized by wait_lock, and our current
1108 * uval was read without holding it, it can have changed. Verify it
1109 * still is what we expect it to be, otherwise retry the entire
1110 * operation.
1111 */
1112 if (get_futex_value_locked(&uval2, uaddr))
1113 goto out_efault;
1114
1115 if (uval != uval2)
1116 goto out_eagain;
1117
1118 /*
1119 * Handle the owner died case:
1120 */
1121 if (uval & FUTEX_OWNER_DIED) {
1122 /*
1123 * exit_pi_state_list sets owner to NULL and wakes the
1124 * topmost waiter. The task which acquires the
1125 * pi_state->rt_mutex will fixup owner.
1126 */
1127 if (!pi_state->owner) {
1128 /*
1129 * No pi state owner, but the user space TID
1130 * is not 0. Inconsistent state. [5]
1131 */
1132 if (pid)
1133 goto out_einval;
1134 /*
1135 * Take a ref on the state and return success. [4]
1136 */
1137 goto out_attach;
1138 }
1139
1140 /*
1141 * If TID is 0, then either the dying owner has not
1142 * yet executed exit_pi_state_list() or some waiter
1143 * acquired the rtmutex in the pi state, but did not
1144 * yet fixup the TID in user space.
1145 *
1146 * Take a ref on the state and return success. [6]
1147 */
1148 if (!pid)
1149 goto out_attach;
1150 } else {
1151 /*
1152 * If the owner died bit is not set, then the pi_state
1153 * must have an owner. [7]
1154 */
1155 if (!pi_state->owner)
1156 goto out_einval;
1157 }
1158
1159 /*
1160 * Bail out if user space manipulated the futex value. If pi
1161 * state exists then the owner TID must be the same as the
1162 * user space TID. [9/10]
1163 */
1164 if (pid != task_pid_vnr(pi_state->owner))
1165 goto out_einval;
1166
1167 out_attach:
1168 get_pi_state(pi_state);
1169 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1170 *ps = pi_state;
1171 return 0;
1172
1173 out_einval:
1174 ret = -EINVAL;
1175 goto out_error;
1176
1177 out_eagain:
1178 ret = -EAGAIN;
1179 goto out_error;
1180
1181 out_efault:
1182 ret = -EFAULT;
1183 goto out_error;
1184
1185 out_error:
1186 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1187 return ret;
1188 }
1189
1190 /**
1191 * wait_for_owner_exiting - Block until the owner has exited
1192 * @ret: owner's current futex lock status
1193 * @exiting: Pointer to the exiting task
1194 *
1195 * Caller must hold a refcount on @exiting.
1196 */
1197 static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
1198 {
1199 if (ret != -EBUSY) {
1200 WARN_ON_ONCE(exiting);
1201 return;
1202 }
1203
1204 if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
1205 return;
1206
1207 mutex_lock(&exiting->futex_exit_mutex);
1208 /*
1209 * No point in doing state checking here. If the waiter got here
1210 * while the task was in exec()->exec_futex_release() then it can
1211 * have any FUTEX_STATE_* value when the waiter has acquired the
1212 * mutex. OK, if running, EXITING or DEAD if it reached exit()
1213 * already. Highly unlikely and not a problem. Just one more round
1214 * through the futex maze.
1215 */
1216 mutex_unlock(&exiting->futex_exit_mutex);
1217
1218 put_task_struct(exiting);
1219 }
1220
1221 static int handle_exit_race(u32 __user *uaddr, u32 uval,
1222 struct task_struct *tsk)
1223 {
1224 u32 uval2;
1225
1226 /*
1227 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
1228 * caller that the alleged owner is busy.
1229 */
1230 if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
1231 return -EBUSY;
1232
1233 /*
1234 * Reread the user space value to handle the following situation:
1235 *
1236 * CPU0                            CPU1
1237 *
1238 * sys_exit()                      sys_futex()
1239 *  do_exit()                       futex_lock_pi()
1240 *                                   futex_lock_pi_atomic()
1241 *   exit_signals(tsk)                 No waiters:
1242 *   tsk->flags |= PF_EXITING;         *uaddr == 0x00000PID
1243 *  mm_release(tsk)                    Set waiter bit
1244 *   exit_robust_list(tsk) {           *uaddr = 0x80000PID;
1245 *      Set owner died                 attach_to_pi_owner() {
1246 *    *uaddr = 0xC0000000;              tsk = get_task(PID);
1247 *   }                                  if (!tsk->flags & PF_EXITING) {
1248 *   ...                                  attach();
1249 *   tsk->futex_state =                 } else {
1250 *      FUTEX_STATE_DEAD;                 if (tsk->futex_state !=
1251 *                                            FUTEX_STATE_DEAD)
1252 *                                          return -EAGAIN;
1253 *                                        return -ESRCH; <--- FAIL
1254 *                                      }
1255 *
1256 * Returning ESRCH unconditionally is wrong here because the
1257 * user space value has been changed by the exiting task.
1258 *
1259 * The same logic applies to the case where the exiting task is
1260 * already gone.
1261 */
1262 if (get_futex_value_locked(&uval2, uaddr))
1263 return -EFAULT;
1264
1265 /* If the user space value has changed, try again. */
1266 if (uval2 != uval)
1267 return -EAGAIN;
1268
1269 /*
1270 * The exiting task did not have a robust list, the robust list was
1271 * corrupted or the user space value in *uaddr is simply bogus.
1272 * Give up and tell user space.
1273 */
1274 return -ESRCH;
1275 }
1276
1277 static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
1278 struct futex_pi_state **ps)
1279 {
1280 /*
1281 * No existing pi state. First waiter. [2]
1282 *
1283 * This creates pi_state, we have hb->lock held, this means nothing can
1284 * observe this state, wait_lock is irrelevant.
1285 */
1286 struct futex_pi_state *pi_state = alloc_pi_state();
1287
1288 /*
1289 * Initialize the pi_mutex in locked state and make @p
1290 * the owner of it:
1291 */
1292 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1293
1294 /* Store the key for possible exit cleanups: */
1295 pi_state->key = *key;
1296
1297 WARN_ON(!list_empty(&pi_state->list));
1298 list_add(&pi_state->list, &p->pi_state_list);
1299 /*
1300 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1301 * because there is no concurrency as the object is not published yet.
1302 */
1303 pi_state->owner = p;
1304
1305 *ps = pi_state;
1306 }
1307 /*
1308 * Lookup the task for the TID provided from user space and attach to
1309 * it after doing proper sanity checks.
1310 */
1311 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1312 struct futex_pi_state **ps,
1313 struct task_struct **exiting)
1314 {
1315 pid_t pid = uval & FUTEX_TID_MASK;
1316 struct task_struct *p;
1317
1318 /*
1319 * We are the first waiter - try to look up the real owner and attach
1320 * the new pi_state to it, but bail out when TID = 0 [1]
1321 *
1322 * The !pid check is paranoid. None of the call sites should end up
1323 * with pid == 0, but better safe than sorry. Let the caller retry
1324 */
1325 if (!pid)
1326 return -EAGAIN;
1327 p = find_get_task_by_vpid(pid);
1328 if (!p)
1329 return handle_exit_race(uaddr, uval, NULL);
1330
1331 if (unlikely(p->flags & PF_KTHREAD)) {
1332 put_task_struct(p);
1333 return -EPERM;
1334 }
1335
1336 /*
1337 * We need to look at the task state to figure out whether the
1338 * task is exiting. To protect against the change of the task state
1339 * in futex_exit_release(), we do this protected by p->pi_lock:
1340 */
1341 raw_spin_lock_irq(&p->pi_lock);
1342 if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
1343 /*
1344 * The task is on the way out. When the futex state is
1345 * FUTEX_STATE_DEAD, we know that the task has finished
1346 * the cleanup:
1347 */
1348 int ret = handle_exit_race(uaddr, uval, p);
1349
1350 raw_spin_unlock_irq(&p->pi_lock);
1351 /*
1352 * If the owner task is between FUTEX_STATE_EXITING and
1353 * FUTEX_STATE_DEAD then store the task pointer and keep
1354 * the reference on the task struct. The calling code will
1355 * drop all locks, wait for the task to reach
1356 * FUTEX_STATE_DEAD and then drop the refcount. This is
1357 * required to prevent a live lock when the current task
1358 * preempted the exiting task between the two states.
1359 */
1360 if (ret == -EBUSY)
1361 *exiting = p;
1362 else
1363 put_task_struct(p);
1364 return ret;
1365 }
1366
1367 __attach_to_pi_owner(p, key, ps);
1368 raw_spin_unlock_irq(&p->pi_lock);
1369
1370 put_task_struct(p);
1371
1372 return 0;
1373 }
1374
1375 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1376 {
1377 int err;
1378 u32 curval;
1379
1380 if (unlikely(should_fail_futex(true)))
1381 return -EFAULT;
1382
1383 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1384 if (unlikely(err))
1385 return err;
1386
1387 /* If user space value changed, let the caller retry */
1388 return curval != uval ? -EAGAIN : 0;
1389 }
1390
1391 /**
1392 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1393 * @uaddr: the pi futex user address
1394 * @hb: the pi futex hash bucket
1395 * @key: the futex key associated with uaddr and hb
1396 * @ps: the pi_state pointer where we store the result of the
1397 * lookup
1398 * @task: the task to perform the atomic lock work for. This will
1399 * be "current" except in the case of requeue pi.
1400 * @exiting: Pointer to store the task pointer of the owner task
1401 * which is in the middle of exiting
1402 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1403 *
1404 * Return:
1405 * - 0 - ready to wait;
1406 * - 1 - acquired the lock;
1407 * - <0 - error
1408 *
1409 * The hb->lock must be held by the caller.
1410 *
1411 * @exiting is only set when the return value is -EBUSY. If so, this holds
1412 * a refcount on the exiting task on return and the caller needs to drop it
1413 * after waiting for the exit to complete.
1414 */
1415 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1416 union futex_key *key,
1417 struct futex_pi_state **ps,
1418 struct task_struct *task,
1419 struct task_struct **exiting,
1420 int set_waiters)
1421 {
1422 u32 uval, newval, vpid = task_pid_vnr(task);
1423 struct futex_q *top_waiter;
1424 int ret;
1425
1426 /*
1427 * Read the user space value first so we can validate a few
1428 * things before proceeding further.
1429 */
1430 if (get_futex_value_locked(&uval, uaddr))
1431 return -EFAULT;
1432
1433 if (unlikely(should_fail_futex(true)))
1434 return -EFAULT;
1435
1436 /*
1437 * Detect deadlocks.
1438 */
1439 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1440 return -EDEADLK;
1441
1442 if ((unlikely(should_fail_futex(true))))
1443 return -EDEADLK;
1444
1445 /*
1446 * Lookup existing state first. If it exists, try to attach to
1447 * its pi_state.
1448 */
1449 top_waiter = futex_top_waiter(hb, key);
1450 if (top_waiter)
1451 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1452
1453 /*
1454 * No waiter and user TID is 0. We are here because the
1455 * waiters or the owner died bit is set or called from
1456 * requeue_cmp_pi or for whatever reason something took the
1457 * syscall.
1458 */
1459 if (!(uval & FUTEX_TID_MASK)) {
1460 /*
1461 * We take over the futex. No other waiters and the user space
1462 * TID is 0. We preserve the owner died bit.
1463 */
1464 newval = uval & FUTEX_OWNER_DIED;
1465 newval |= vpid;
1466
1467 /* The futex requeue_pi code can enforce the waiters bit */
1468 if (set_waiters)
1469 newval |= FUTEX_WAITERS;
1470
1471 ret = lock_pi_update_atomic(uaddr, uval, newval);
1472 if (ret)
1473 return ret;
1474
1475 /*
1476 * If the waiter bit was requested the caller also needs PI
1477 * state attached to the new owner of the user space futex.
1478 *
1479 * @task is guaranteed to be alive and it cannot be exiting
1480 * because it is either sleeping or waiting in
1481 * futex_requeue_pi_wakeup_sync().
1482 *
1483 * No need to do the full attach_to_pi_owner() exercise
1484 * because @task is known and valid.
1485 */
1486 if (set_waiters) {
1487 raw_spin_lock_irq(&task->pi_lock);
1488 __attach_to_pi_owner(task, key, ps);
1489 raw_spin_unlock_irq(&task->pi_lock);
1490 }
1491 return 1;
1492 }
1493
1494 /*
1495 * First waiter. Set the waiters bit before attaching ourself to
1496 * the owner. If owner tries to unlock, it will be forced into
1497 * the kernel and blocked on hb->lock.
1498 */
1499 newval = uval | FUTEX_WAITERS;
1500 ret = lock_pi_update_atomic(uaddr, uval, newval);
1501 if (ret)
1502 return ret;
1503 /*
1504 * If the update of the user space value succeeded, we try to
1505 * attach to the owner. If that fails, no harm done, we only
1506 * set the FUTEX_WAITERS bit in the user space variable.
1507 */
1508 return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
1509 }
1510
1511 /**
1512 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1513 * @q: The futex_q to unqueue
1514 *
1515 * The q->lock_ptr must not be NULL and must be held by the caller.
1516 */
1517 static void __unqueue_futex(struct futex_q *q)
1518 {
1519 struct futex_hash_bucket *hb;
1520
1521 if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
1522 return;
1523 lockdep_assert_held(q->lock_ptr);
1524
1525 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1526 plist_del(&q->list, &hb->chain);
1527 hb_waiters_dec(hb);
1528 }
1529
1530 /*
1531 * The hash bucket lock must be held when this is called.
1532 * Afterwards, the futex_q must not be accessed. Callers
1533 * must ensure to later call wake_up_q() for the actual
1534 * wakeups to occur.
1535 */
1536 static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1537 {
1538 struct task_struct *p = q->task;
1539
1540 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1541 return;
1542
1543 get_task_struct(p);
1544 __unqueue_futex(q);
1545 /*
1546 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1547 * is written, without taking any locks. This is possible in the event
1548 * of a spurious wakeup, for example. A memory barrier is required here
1549 * to prevent the following store to lock_ptr from getting ahead of the
1550 * plist_del in __unqueue_futex().
1551 */
1552 smp_store_release(&q->lock_ptr, NULL);
1553
1554 /*
1555 * Queue the task for later wakeup for after we've released
1556 * the hb->lock.
1557 */
1558 wake_q_add_safe(wake_q, p);
1559 }
1560
1561 /*
1562 * Caller must hold a reference on @pi_state.
1563 */
1564 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1565 {
1566 struct rt_mutex_waiter *top_waiter;
1567 struct task_struct *new_owner;
1568 bool postunlock = false;
1569 DEFINE_RT_WAKE_Q(wqh);
1570 u32 curval, newval;
1571 int ret = 0;
1572
1573 top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
1574 if (WARN_ON_ONCE(!top_waiter)) {
1575 /*
1576 * As per the comment in futex_unlock_pi() this should not happen.
1577 *
1578 * When this happens, give up our locks and try again, giving
1579 * the futex_lock_pi() instance time to complete, either by
1580 * waiting on the rtmutex or removing itself from the futex
1581 * queue.
1582 */
1583 ret = -EAGAIN;
1584 goto out_unlock;
1585 }
1586
1587 new_owner = top_waiter->task;
1588
1589 /*
1590 * We pass it to the next owner. The WAITERS bit is always kept
1591 * enabled while there is PI state around. We cleanup the owner
1592 * died bit, because we are the owner.
1593 */
1594 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1595
1596 if (unlikely(should_fail_futex(true))) {
1597 ret = -EFAULT;
1598 goto out_unlock;
1599 }
1600
1601 ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1602 if (!ret && (curval != uval)) {
1603 /*
1604 * If an unconditional UNLOCK_PI operation (user space did not
1605 * try the TID->0 transition) raced with a waiter setting the
1606 * FUTEX_WAITERS flag between get_user() and locking the hash
1607 * bucket lock, retry the operation.
1608 */
1609 if ((FUTEX_TID_MASK & curval) == uval)
1610 ret = -EAGAIN;
1611 else
1612 ret = -EINVAL;
1613 }
1614
1615 if (!ret) {
1616 /*
1617 * This is a point of no return; once we modified the uval
1618 * there is no going back and subsequent operations must
1619 * not fail.
1620 */
1621 pi_state_update_owner(pi_state, new_owner);
1622 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
1623 }
1624
1625 out_unlock:
1626 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1627
1628 if (postunlock)
1629 rt_mutex_postunlock(&wqh);
1630
1631 return ret;
1632 }
1633
1634 /*
1635 * Express the locking dependencies for lockdep:
1636 */
1637 static inline void
1638 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1639 {
1640 if (hb1 <= hb2) {
1641 spin_lock(&hb1->lock);
1642 if (hb1 < hb2)
1643 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1644 } else { /* hb1 > hb2 */
1645 spin_lock(&hb2->lock);
1646 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1647 }
1648 }
1649
1650 static inline void
1651 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1652 {
1653 spin_unlock(&hb1->lock);
1654 if (hb1 != hb2)
1655 spin_unlock(&hb2->lock);
1656 }
1657
1658 /*
1659 * Wake up waiters matching bitset queued on this futex (uaddr).
1660 */
1661 static int
1662 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1663 {
1664 struct futex_hash_bucket *hb;
1665 struct futex_q *this, *next;
1666 union futex_key key = FUTEX_KEY_INIT;
1667 int ret;
1668 int target_nr;
1669 DEFINE_WAKE_Q(wake_q);
1670
1671 if (!bitset)
1672 return -EINVAL;
1673
1674 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
1675 if (unlikely(ret != 0))
1676 return ret;
1677
1678 hb = hash_futex(&key);
1679
1680 /* Make sure we really have tasks to wake up */
1681 if (!hb_waiters_pending(hb))
1682 return ret;
1683
1684 spin_lock(&hb->lock);
1685
1686 trace_android_vh_futex_wake_traverse_plist(&hb->chain, &target_nr, key, bitset);
1687 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1688 if (match_futex (&this->key, &key)) {
1689 if (this->pi_state || this->rt_waiter) {
1690 ret = -EINVAL;
1691 break;
1692 }
1693
1694 /* Check if one of the bits is set in both bitsets */
1695 if (!(this->bitset & bitset))
1696 continue;
1697
1698 trace_android_vh_futex_wake_this(ret, nr_wake, target_nr, this->task);
1699 mark_wake_futex(&wake_q, this);
1700 if (++ret >= nr_wake)
1701 break;
1702 }
1703 }
1704
1705 spin_unlock(&hb->lock);
1706 wake_up_q(&wake_q);
1707 trace_android_vh_futex_wake_up_q_finish(nr_wake, target_nr);
1708 return ret;
1709 }
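/*
 * Editor's illustration, not part of this file: how user space selects the
 * bitset that the check in futex_wake() above matches against. READERS_BIT,
 * WRITERS_BIT and the helper names are invented for the example; a second
 * channel would work the same way as the reader channel shown here.
 */
#if 0	/* illustrative user-space sketch only, never built here */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#define READERS_BIT	0x1	/* example-only channel assignments */
#define WRITERS_BIT	0x2

static uint32_t word;

/* Sleep on &word, but only be woken by wakers whose bitset overlaps ours. */
static void wait_as_reader(uint32_t expected)
{
	syscall(SYS_futex, &word, FUTEX_WAIT_BITSET_PRIVATE, expected,
		NULL, NULL, READERS_BIT);
}

/* Wake up to INT_MAX waiters whose bitset intersects READERS_BIT. */
static void wake_readers(void)
{
	syscall(SYS_futex, &word, FUTEX_WAKE_BITSET_PRIVATE, INT_MAX,
		NULL, NULL, READERS_BIT);
}
#endif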
1710
1711 static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1712 {
1713 unsigned int op = (encoded_op & 0x70000000) >> 28;
1714 unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
1715 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1716 int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1717 int oldval, ret;
1718
1719 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1720 if (oparg < 0 || oparg > 31) {
1721 char comm[sizeof(current->comm)];
1722 /*
1723 * kill this print and return -EINVAL when userspace
1724 * is sane again
1725 */
1726 pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
1727 get_task_comm(comm, current), oparg);
1728 oparg &= 31;
1729 }
1730 oparg = 1 << oparg;
1731 }
1732
1733 pagefault_disable();
1734 ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1735 pagefault_enable();
1736 if (ret)
1737 return ret;
1738
1739 switch (cmp) {
1740 case FUTEX_OP_CMP_EQ:
1741 return oldval == cmparg;
1742 case FUTEX_OP_CMP_NE:
1743 return oldval != cmparg;
1744 case FUTEX_OP_CMP_LT:
1745 return oldval < cmparg;
1746 case FUTEX_OP_CMP_GE:
1747 return oldval >= cmparg;
1748 case FUTEX_OP_CMP_LE:
1749 return oldval <= cmparg;
1750 case FUTEX_OP_CMP_GT:
1751 return oldval > cmparg;
1752 default:
1753 return -ENOSYS;
1754 }
1755 }
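/*
 * Editor's illustration, not part of this file: encoding the operation that
 * futex_atomic_op_inuser() above decodes, using the uapi FUTEX_OP() macro.
 * The variable and function names are invented for the example.
 */
#if 0	/* illustrative user-space sketch only, never built here */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex1, futex2;

/*
 * In a single syscall: atomically set *(&futex2) = 0 (remembering the old
 * value), wake one waiter on &futex1, and wake all waiters on &futex2 if
 * the old value of futex2 was non-zero.
 */
static void example_wake_op(void)
{
	syscall(SYS_futex, &futex1, FUTEX_WAKE_OP_PRIVATE, 1,
		(void *)(unsigned long)INT_MAX, &futex2,
		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0));
}
#endif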
1756
1757 /*
1758 * Wake up all waiters hashed on the physical page that is mapped
1759 * to this virtual address:
1760 */
1761 static int
1762 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1763 int nr_wake, int nr_wake2, int op)
1764 {
1765 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1766 struct futex_hash_bucket *hb1, *hb2;
1767 struct futex_q *this, *next;
1768 int ret, op_ret;
1769 DEFINE_WAKE_Q(wake_q);
1770
1771 retry:
1772 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1773 if (unlikely(ret != 0))
1774 return ret;
1775 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
1776 if (unlikely(ret != 0))
1777 return ret;
1778
1779 hb1 = hash_futex(&key1);
1780 hb2 = hash_futex(&key2);
1781
1782 retry_private:
1783 double_lock_hb(hb1, hb2);
1784 op_ret = futex_atomic_op_inuser(op, uaddr2);
1785 if (unlikely(op_ret < 0)) {
1786 double_unlock_hb(hb1, hb2);
1787
1788 if (!IS_ENABLED(CONFIG_MMU) ||
1789 unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1790 /*
1791 * we don't get EFAULT from MMU faults if we don't have
1792 * an MMU, but we might get them from range checking
1793 */
1794 ret = op_ret;
1795 return ret;
1796 }
1797
1798 if (op_ret == -EFAULT) {
1799 ret = fault_in_user_writeable(uaddr2);
1800 if (ret)
1801 return ret;
1802 }
1803
1804 cond_resched();
1805 if (!(flags & FLAGS_SHARED))
1806 goto retry_private;
1807 goto retry;
1808 }
1809
1810 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1811 if (match_futex (&this->key, &key1)) {
1812 if (this->pi_state || this->rt_waiter) {
1813 ret = -EINVAL;
1814 goto out_unlock;
1815 }
1816 mark_wake_futex(&wake_q, this);
1817 if (++ret >= nr_wake)
1818 break;
1819 }
1820 }
1821
1822 if (op_ret > 0) {
1823 op_ret = 0;
1824 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1825 if (match_futex (&this->key, &key2)) {
1826 if (this->pi_state || this->rt_waiter) {
1827 ret = -EINVAL;
1828 goto out_unlock;
1829 }
1830 mark_wake_futex(&wake_q, this);
1831 if (++op_ret >= nr_wake2)
1832 break;
1833 }
1834 }
1835 ret += op_ret;
1836 }
1837
1838 out_unlock:
1839 double_unlock_hb(hb1, hb2);
1840 wake_up_q(&wake_q);
1841 return ret;
1842 }
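/*
 * Editor's sketch (not taken from this file): the classic FUTEX_WAKE_OP
 * call backing the code above is "wake one waiter on uaddr1; atomically set
 * *uaddr2 = 0 and, if its old value was > 1, also wake one waiter on
 * uaddr2". Note that nr_wake2 travels in the timeout argument slot:
 *
 *	syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG,
 *		1,				nr_wake  (uaddr1)
 *		(void *)1UL,			nr_wake2 (uaddr2)
 *		uaddr2,
 *		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1));
 */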
1843
1844 /**
1845 * requeue_futex() - Requeue a futex_q from one hb to another
1846 * @q: the futex_q to requeue
1847 * @hb1: the source hash_bucket
1848 * @hb2: the target hash_bucket
1849 * @key2: the new key for the requeued futex_q
1850 */
1851 static inline
1852 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1853 struct futex_hash_bucket *hb2, union futex_key *key2)
1854 {
1855
1856 /*
1857 * If key1 and key2 hash to the same bucket, no need to
1858 * requeue.
1859 */
1860 if (likely(&hb1->chain != &hb2->chain)) {
1861 plist_del(&q->list, &hb1->chain);
1862 hb_waiters_dec(hb1);
1863 hb_waiters_inc(hb2);
1864 plist_add(&q->list, &hb2->chain);
1865 q->lock_ptr = &hb2->lock;
1866 }
1867 q->key = *key2;
1868 }
1869
1870 static inline bool futex_requeue_pi_prepare(struct futex_q *q,
1871 struct futex_pi_state *pi_state)
1872 {
1873 int old, new;
1874
1875 /*
1876 * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
1877 * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
1878 * ignore the waiter.
1879 */
1880 old = atomic_read_acquire(&q->requeue_state);
1881 do {
1882 if (old == Q_REQUEUE_PI_IGNORE)
1883 return false;
1884
1885 /*
1886 * futex_proxy_trylock_atomic() might have set it to
1887 * IN_PROGRESS and an interleaved early wake to WAIT.
1888 *
1889 * It was considered to have an extra state for that
1890 * trylock, but that would just add more conditionals
1891 * all over the place for a dubious value.
1892 */
1893 if (old != Q_REQUEUE_PI_NONE)
1894 break;
1895
1896 new = Q_REQUEUE_PI_IN_PROGRESS;
1897 } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1898
1899 q->pi_state = pi_state;
1900 return true;
1901 }
1902
1903 static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
1904 {
1905 int old, new;
1906
1907 old = atomic_read_acquire(&q->requeue_state);
1908 do {
1909 if (old == Q_REQUEUE_PI_IGNORE)
1910 return;
1911
1912 if (locked >= 0) {
1913 /* Requeue succeeded. Set DONE or LOCKED */
1914 WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
1915 old != Q_REQUEUE_PI_WAIT);
1916 new = Q_REQUEUE_PI_DONE + locked;
1917 } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
1918 /* Deadlock, no early wakeup interleave */
1919 new = Q_REQUEUE_PI_NONE;
1920 } else {
1921 /* Deadlock, early wakeup interleave. */
1922 WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
1923 new = Q_REQUEUE_PI_IGNORE;
1924 }
1925 } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1926
1927 #ifdef CONFIG_PREEMPT_RT
1928 /* If the waiter interleaved with the requeue let it know */
1929 if (unlikely(old == Q_REQUEUE_PI_WAIT))
1930 rcuwait_wake_up(&q->requeue_wait);
1931 #endif
1932 }
1933
1934 static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
1935 {
1936 int old, new;
1937
1938 old = atomic_read_acquire(&q->requeue_state);
1939 do {
1940 /* Is requeue done already? */
1941 if (old >= Q_REQUEUE_PI_DONE)
1942 return old;
1943
1944 /*
1945 * If not done, then tell the requeue code to either ignore
1946 * the waiter or to wake it up once the requeue is done.
1947 */
1948 new = Q_REQUEUE_PI_WAIT;
1949 if (old == Q_REQUEUE_PI_NONE)
1950 new = Q_REQUEUE_PI_IGNORE;
1951 } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
1952
1953 /* If the requeue was in progress, wait for it to complete */
1954 if (old == Q_REQUEUE_PI_IN_PROGRESS) {
1955 #ifdef CONFIG_PREEMPT_RT
1956 rcuwait_wait_event(&q->requeue_wait,
1957 atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
1958 TASK_UNINTERRUPTIBLE);
1959 #else
1960 (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
1961 #endif
1962 }
1963
1964 /*
1965 * Requeue is now either prohibited or complete. Reread state
1966 * because during the wait above it might have changed. Nothing
1967 * will modify q->requeue_state after this point.
1968 */
1969 return atomic_read(&q->requeue_state);
1970 }
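/*
 * Editor's summary of the requeue_state machine implemented by the three
 * helpers above (derived from the code, added purely as a reading aid):
 *
 *	NONE        --prepare-->      IN_PROGRESS  (requeue started)
 *	NONE        --wakeup_sync-->  IGNORE       (early wake, skip requeue)
 *	IN_PROGRESS --wakeup_sync-->  WAIT         (early wake, requeue running)
 *	IN_PROGRESS --complete-->     DONE/LOCKED  (or NONE on deadlock)
 *	WAIT        --complete-->     DONE/LOCKED  (or IGNORE on deadlock)
 *
 * A waiter that reached WAIT blocks in futex_requeue_pi_wakeup_sync() until
 * the requeue side calls futex_requeue_pi_complete().
 */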
1971
1972 /**
1973 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1974 * @q: the futex_q
1975 * @key: the key of the requeue target futex
1976 * @hb: the hash_bucket of the requeue target futex
1977 *
1978 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1979 * target futex if it is uncontended or via a lock steal.
1980 *
1981 * 1) Set @q::key to the requeue target futex key so the waiter can detect
1982 * the wakeup on the right futex.
1983 *
1984 * 2) Dequeue @q from the hash bucket.
1985 *
1986 * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
1987 * acquisition.
1988 *
1989 * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
1990 * the waiter has to fixup the pi state.
1991 *
1992 * 5) Complete the requeue state so the waiter can make progress. After
1993 * this point the waiter task can return from the syscall immediately in
1994 * case that the pi state does not have to be fixed up.
1995 *
1996 * 6) Wake the waiter task.
1997 *
1998 * Must be called with both q->lock_ptr and hb->lock held.
1999 */
2000 static inline
2001 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
2002 struct futex_hash_bucket *hb)
2003 {
2004 q->key = *key;
2005
2006 __unqueue_futex(q);
2007
2008 WARN_ON(!q->rt_waiter);
2009 q->rt_waiter = NULL;
2010
2011 q->lock_ptr = &hb->lock;
2012
2013 /* Signal locked state to the waiter */
2014 futex_requeue_pi_complete(q, 1);
2015 wake_up_state(q->task, TASK_NORMAL);
2016 }
2017
2018 /**
2019 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
2020 * @pifutex: the user address of the to futex
2021 * @hb1: the from futex hash bucket, must be locked by the caller
2022 * @hb2: the to futex hash bucket, must be locked by the caller
2023 * @key1: the from futex key
2024 * @key2: the to futex key
2025 * @ps: address to store the pi_state pointer
2026 * @exiting: Pointer to store the task pointer of the owner task
2027 * which is in the middle of exiting
2028 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
2029 *
2030 * Try and get the lock on behalf of the top waiter if we can do it atomically.
2031 * Wake the top waiter if we succeed. If the caller specified set_waiters,
2032 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
2033 * hb1 and hb2 must be held by the caller.
2034 *
2035 * @exiting is only set when the return value is -EBUSY. If so, this holds
2036 * a refcount on the exiting task on return and the caller needs to drop it
2037 * after waiting for the exit to complete.
2038 *
2039 * Return:
2040 * - 0 - failed to acquire the lock atomically;
2041 * - >0 - acquired the lock, return value is vpid of the top_waiter
2042 * - <0 - error
2043 */
2044 static int
2045 futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
2046 struct futex_hash_bucket *hb2, union futex_key *key1,
2047 union futex_key *key2, struct futex_pi_state **ps,
2048 struct task_struct **exiting, int set_waiters)
2049 {
2050 struct futex_q *top_waiter = NULL;
2051 u32 curval;
2052 int ret;
2053
2054 if (get_futex_value_locked(&curval, pifutex))
2055 return -EFAULT;
2056
2057 if (unlikely(should_fail_futex(true)))
2058 return -EFAULT;
2059
2060 /*
2061 * Find the top_waiter and determine if there are additional waiters.
2062 * If the caller intends to requeue more than 1 waiter to pifutex,
2063 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
2064 * as we have means to handle the possible fault. If not, don't set
2065 * the bit unnecessarily as it will force the subsequent unlock to enter
2066 * the kernel.
2067 */
2068 top_waiter = futex_top_waiter(hb1, key1);
2069
2070 /* There are no waiters, nothing for us to do. */
2071 if (!top_waiter)
2072 return 0;
2073
2074 /*
2075 * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
2076 * and waiting on the 'waitqueue' futex which is always !PI.
2077 */
2078 if (!top_waiter->rt_waiter || top_waiter->pi_state)
2079 return -EINVAL;
2080
2081 /* Ensure we requeue to the expected futex. */
2082 if (!match_futex(top_waiter->requeue_pi_key, key2))
2083 return -EINVAL;
2084
2085 /* Ensure that this does not race against an early wakeup */
2086 if (!futex_requeue_pi_prepare(top_waiter, NULL))
2087 return -EAGAIN;
2088
2089 /*
2090 * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
2091 * in the contended case or if @set_waiters is true.
2092 *
2093 * In the contended case PI state is attached to the lock owner. If
2094 * the user space lock can be acquired then PI state is attached to
2095 * the new owner (@top_waiter->task) when @set_waiters is true.
2096 */
2097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
2098 exiting, set_waiters);
2099 if (ret == 1) {
2100 /*
2101 * Lock was acquired in user space and PI state was
2102 * attached to @top_waiter->task. That means state is fully
2103 * consistent and the waiter can return to user space
2104 * immediately after the wakeup.
2105 */
2106 requeue_pi_wake_futex(top_waiter, key2, hb2);
2107 } else if (ret < 0) {
2108 /* Rewind top_waiter::requeue_state */
2109 futex_requeue_pi_complete(top_waiter, ret);
2110 } else {
2111 /*
2112 * futex_lock_pi_atomic() did not acquire the user space
2113 * futex, but managed to establish the proxy lock and pi
2114 * state. top_waiter::requeue_state cannot be fixed up here
2115 * because the waiter is not enqueued on the rtmutex
2116 * yet. This is handled at the callsite depending on the
2117 * result of rt_mutex_start_proxy_lock() which is
2118 * guaranteed to be reached with this function returning 0.
2119 */
2120 }
2121 return ret;
2122 }
2123
2124 /**
2125 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
2126 * @uaddr1: source futex user address
2127 * @flags: futex flags (FLAGS_SHARED, etc.)
2128 * @uaddr2: target futex user address
2129 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
2130 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
2131 * @cmpval: @uaddr1 expected value (or %NULL)
2132 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
2133 * pi futex (pi to pi requeue is not supported)
2134 *
2135 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
2136 * uaddr2 atomically on behalf of the top waiter.
2137 *
2138 * Return:
2139 * - >=0 - on success, the number of tasks requeued or woken;
2140 * - <0 - on error
2141 */
2142 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
2143 u32 __user *uaddr2, int nr_wake, int nr_requeue,
2144 u32 *cmpval, int requeue_pi)
2145 {
2146 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
2147 int task_count = 0, ret;
2148 struct futex_pi_state *pi_state = NULL;
2149 struct futex_hash_bucket *hb1, *hb2;
2150 struct futex_q *this, *next;
2151 DEFINE_WAKE_Q(wake_q);
2152
2153 if (nr_wake < 0 || nr_requeue < 0)
2154 return -EINVAL;
2155
2156 /*
2157 * When PI not supported: return -ENOSYS if requeue_pi is true,
2158 * consequently the compiler knows requeue_pi is always false past
2159 * this point which will optimize away all the conditional code
2160 * further down.
2161 */
2162 if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
2163 return -ENOSYS;
2164
2165 if (requeue_pi) {
2166 /*
2167 * Requeue PI only works on two distinct uaddrs. This
2168 * check is only valid for private futexes. See below.
2169 */
2170 if (uaddr1 == uaddr2)
2171 return -EINVAL;
2172
2173 /*
2174 * futex_requeue() allows the caller to define the number
2175 * of waiters to wake up via the @nr_wake argument. With
2176 * REQUEUE_PI, waking up more than one waiter is creating
2177 * more problems than it solves. Waking up a waiter makes
2178 * only sense if the PI futex @uaddr2 is uncontended as
2179 * this allows the requeue code to acquire the futex
2180 * @uaddr2 before waking the waiter. The waiter can then
2181 * return to user space without further action. A secondary
2182 * wakeup would just make the futex_wait_requeue_pi()
2183 * handling more complex, because that code would have to
2184 * look up pi_state and do more or less all the handling
2185 * which the requeue code has to do for the to-be-requeued
2186 * waiters. So restrict the number of waiters to wake to
2187 * one, and only wake it up when the PI futex is
2188 * uncontended. Otherwise requeue it and let the unlock of
2189 * the PI futex handle the wakeup.
2190 *
2191 * All REQUEUE_PI users, e.g. pthread_cond_signal() and
2192 * pthread_cond_broadcast() must use nr_wake=1.
2193 */
2194 if (nr_wake != 1)
2195 return -EINVAL;
2196
2197 /*
2198 * requeue_pi requires a pi_state, try to allocate it now
2199 * without any locks in case it fails.
2200 */
2201 if (refill_pi_state_cache())
2202 return -ENOMEM;
2203 }
2204
2205 retry:
2206 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
2207 if (unlikely(ret != 0))
2208 return ret;
2209 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
2210 requeue_pi ? FUTEX_WRITE : FUTEX_READ);
2211 if (unlikely(ret != 0))
2212 return ret;
2213
2214 /*
2215 * The check above which compares uaddrs is not sufficient for
2216 * shared futexes. We need to compare the keys:
2217 */
2218 if (requeue_pi && match_futex(&key1, &key2))
2219 return -EINVAL;
2220
2221 hb1 = hash_futex(&key1);
2222 hb2 = hash_futex(&key2);
2223
2224 retry_private:
2225 hb_waiters_inc(hb2);
2226 double_lock_hb(hb1, hb2);
2227
2228 if (likely(cmpval != NULL)) {
2229 u32 curval;
2230
2231 ret = get_futex_value_locked(&curval, uaddr1);
2232
2233 if (unlikely(ret)) {
2234 double_unlock_hb(hb1, hb2);
2235 hb_waiters_dec(hb2);
2236
2237 ret = get_user(curval, uaddr1);
2238 if (ret)
2239 return ret;
2240
2241 if (!(flags & FLAGS_SHARED))
2242 goto retry_private;
2243
2244 goto retry;
2245 }
2246 if (curval != *cmpval) {
2247 ret = -EAGAIN;
2248 goto out_unlock;
2249 }
2250 }
2251
2252 if (requeue_pi) {
2253 struct task_struct *exiting = NULL;
2254
2255 /*
2256 * Attempt to acquire uaddr2 and wake the top waiter. If we
2257 * intend to requeue waiters, force setting the FUTEX_WAITERS
2258 * bit. We force this here where we are able to easily handle
2259 * faults rather in the requeue loop below.
2260 *
2261 * Updates topwaiter::requeue_state if a top waiter exists.
2262 */
2263 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2264 &key2, &pi_state,
2265 &exiting, nr_requeue);
2266
2267 /*
2268 * At this point the top_waiter has either taken uaddr2 or
2269 * is waiting on it. In both cases pi_state has been
2270 * established and an initial refcount on it. In case of an
2271 * error there's nothing.
2272 *
2273 * The top waiter's requeue_state is up to date:
2274 *
2275 * - If the lock was acquired atomically (ret == 1), then
2276 * the state is Q_REQUEUE_PI_LOCKED.
2277 *
2278 * The top waiter has been dequeued and woken up and can
2279 * return to user space immediately. The kernel/user
2280 * space state is consistent. In case that there must be
2281 * more waiters requeued the WAITERS bit in the user
2282 * space futex is set so the top waiter task has to go
2283 * into the syscall slowpath to unlock the futex. This
2284 * will block until this requeue operation has been
2285 * completed and the hash bucket locks have been
2286 * dropped.
2287 *
2288 * - If the trylock failed with an error (ret < 0) then
2289 * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
2290 * happened", or Q_REQUEUE_PI_IGNORE when there was an
2291 * interleaved early wakeup.
2292 *
2293 * - If the trylock did not succeed (ret == 0) then the
2294 * state is either Q_REQUEUE_PI_IN_PROGRESS or
2295 * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
2296 * This will be cleaned up in the loop below, which
2297 * cannot fail because futex_proxy_trylock_atomic() did
2298 * the same sanity checks for requeue_pi as the loop
2299 * below does.
2300 */
2301 switch (ret) {
2302 case 0:
2303 /* We hold a reference on the pi state. */
2304 break;
2305
2306 case 1:
2307 /*
2308 * futex_proxy_trylock_atomic() acquired the user space
2309 * futex. Adjust task_count.
2310 */
2311 task_count++;
2312 ret = 0;
2313 break;
2314
2315 /*
2316 * If the above failed, then pi_state is NULL and
2317 * waiter::requeue_state is correct.
2318 */
2319 case -EFAULT:
2320 double_unlock_hb(hb1, hb2);
2321 hb_waiters_dec(hb2);
2322 ret = fault_in_user_writeable(uaddr2);
2323 if (!ret)
2324 goto retry;
2325 return ret;
2326 case -EBUSY:
2327 case -EAGAIN:
2328 /*
2329 * Two reasons for this:
2330 * - EBUSY: Owner is exiting and we just wait for the
2331 * exit to complete.
2332 * - EAGAIN: The user space value changed.
2333 */
2334 double_unlock_hb(hb1, hb2);
2335 hb_waiters_dec(hb2);
2336 /*
2337 * Handle the case where the owner is in the middle of
2338 * exiting. Wait for the exit to complete otherwise
2339 * this task might loop forever, aka. live lock.
2340 */
2341 wait_for_owner_exiting(ret, exiting);
2342 cond_resched();
2343 goto retry;
2344 default:
2345 goto out_unlock;
2346 }
2347 }
2348
2349 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2350 if (task_count - nr_wake >= nr_requeue)
2351 break;
2352
2353 if (!match_futex(&this->key, &key1))
2354 continue;
2355
2356 /*
2357 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2358 * be paired with each other and no other futex ops.
2359 *
2360 * We should never be requeueing a futex_q with a pi_state,
2361 * which is awaiting a futex_unlock_pi().
2362 */
2363 if ((requeue_pi && !this->rt_waiter) ||
2364 (!requeue_pi && this->rt_waiter) ||
2365 this->pi_state) {
2366 ret = -EINVAL;
2367 break;
2368 }
2369
2370 /* Plain futexes just wake or requeue and are done */
2371 if (!requeue_pi) {
2372 if (++task_count <= nr_wake)
2373 mark_wake_futex(&wake_q, this);
2374 else
2375 requeue_futex(this, hb1, hb2, &key2);
2376 continue;
2377 }
2378
2379 /* Ensure we requeue to the expected futex for requeue_pi. */
2380 if (!match_futex(this->requeue_pi_key, &key2)) {
2381 ret = -EINVAL;
2382 break;
2383 }
2384
2385 /*
2386 * Requeue nr_requeue waiters and possibly one more in the case
2387 * of requeue_pi if we couldn't acquire the lock atomically.
2388 *
2389 * Prepare the waiter to take the rt_mutex. Take a refcount
2390 * on the pi_state and store the pointer in the futex_q
2391 * object of the waiter.
2392 */
2393 get_pi_state(pi_state);
2394
2395 /* Don't requeue when the waiter is already on the way out. */
2396 if (!futex_requeue_pi_prepare(this, pi_state)) {
2397 /*
2398 * Early woken waiter signaled that it is on the
2399 * way out. Drop the pi_state reference and try the
2400 * next waiter. @this->pi_state is still NULL.
2401 */
2402 put_pi_state(pi_state);
2403 continue;
2404 }
2405
2406 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2407 this->rt_waiter,
2408 this->task);
2409
2410 if (ret == 1) {
2411 /*
2412 * We got the lock. We do neither drop the refcount
2413 * on pi_state nor clear this->pi_state because the
2414 * waiter needs the pi_state for cleaning up the
2415 * user space value. It will drop the refcount
2416 * after doing so. this::requeue_state is updated
2417 * in the wakeup as well.
2418 */
2419 requeue_pi_wake_futex(this, &key2, hb2);
2420 task_count++;
2421 } else if (!ret) {
2422 /* Waiter is queued, move it to hb2 */
2423 requeue_futex(this, hb1, hb2, &key2);
2424 futex_requeue_pi_complete(this, 0);
2425 task_count++;
2426 } else {
2427 /*
2428 * rt_mutex_start_proxy_lock() detected a potential
2429 * deadlock when we tried to queue that waiter.
2430 * Drop the pi_state reference which we took above
2431 * and remove the pointer to the state from the
2432 * waiters futex_q object.
2433 */
2434 this->pi_state = NULL;
2435 put_pi_state(pi_state);
2436 futex_requeue_pi_complete(this, ret);
2437 /*
2438 * We stop queueing more waiters and let user space
2439 * deal with the mess.
2440 */
2441 break;
2442 }
2443 }
2444
2445 /*
2446 * We took an extra initial reference to the pi_state in
2447 * futex_proxy_trylock_atomic(). We need to drop it here again.
2448 */
2449 put_pi_state(pi_state);
2450
2451 out_unlock:
2452 double_unlock_hb(hb1, hb2);
2453 wake_up_q(&wake_q);
2454 hb_waiters_dec(hb2);
2455 return ret ? ret : task_count;
2456 }
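/*
 * Editor's sketch (hypothetical variable names): the non-PI requeue above is
 * what a condvar-style broadcast uses to wake one waiter and shift the rest
 * onto the mutex futex instead of waking them all. nr_requeue is passed in
 * the timeout slot and cmpval in val3:
 *
 *	syscall(SYS_futex, &cond_seq, FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG,
 *		1,				nr_wake
 *		(void *)(long)INT_MAX,		nr_requeue
 *		&mutex_word,			uaddr2
 *		seq_snapshot);			cmpval, checked against *uaddr1
 *
 * If cond_seq changed since seq_snapshot was read, the call fails with
 * EAGAIN (the curval != *cmpval check above) and user space retries.
 */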
2457
2458 /* The key must be already stored in q->key. */
2459 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2460 __acquires(&hb->lock)
2461 {
2462 struct futex_hash_bucket *hb;
2463
2464 hb = hash_futex(&q->key);
2465
2466 /*
2467 * Increment the counter before taking the lock so that
2468 * a potential waker won't miss a to-be-slept task that is
2469 * waiting for the spinlock. This is safe as all queue_lock()
2470 * users end up calling queue_me(). Similarly, for housekeeping,
2471 * decrement the counter at queue_unlock() when some error has
2472 * occurred and we don't end up adding the task to the list.
2473 */
2474 hb_waiters_inc(hb); /* implies smp_mb(); (A) */
2475
2476 q->lock_ptr = &hb->lock;
2477
2478 spin_lock(&hb->lock);
2479 return hb;
2480 }
2481
2482 static inline void
2483 queue_unlock(struct futex_hash_bucket *hb)
2484 __releases(&hb->lock)
2485 {
2486 spin_unlock(&hb->lock);
2487 hb_waiters_dec(hb);
2488 }
2489
2490 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2491 {
2492 int prio;
2493 bool already_on_hb = false;
2494
2495 /*
2496 * The priority used to register this element is
2497 * - either the real thread-priority for the real-time threads
2498 * (i.e. threads with a priority lower than MAX_RT_PRIO)
2499 * - or MAX_RT_PRIO for non-RT threads.
2500 * Thus, all RT-threads are woken first in priority order, and
2501 * the others are woken last, in FIFO order.
2502 */
2503 prio = min(current->normal_prio, MAX_RT_PRIO);
2504
2505 plist_node_init(&q->list, prio);
2506 trace_android_vh_alter_futex_plist_add(&q->list, &hb->chain, &already_on_hb);
2507 if (!already_on_hb)
2508 plist_add(&q->list, &hb->chain);
2509 q->task = current;
2510 }
2511
2512 /**
2513 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2514 * @q: The futex_q to enqueue
2515 * @hb: The destination hash bucket
2516 *
2517 * The hb->lock must be held by the caller, and is released here. A call to
2518 * queue_me() is typically paired with exactly one call to unqueue_me(). The
2519 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2520 * or nothing if the unqueue is done as part of the wake process and the unqueue
2521 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
2522 * an example).
2523 */
2524 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2525 __releases(&hb->lock)
2526 {
2527 __queue_me(q, hb);
2528 spin_unlock(&hb->lock);
2529 }
2530
2531 /**
2532 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2533 * @q: The futex_q to unqueue
2534 *
2535 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2536 * be paired with exactly one earlier call to queue_me().
2537 *
2538 * Return:
2539 * - 1 - if the futex_q was still queued (and we unqueued it);
2540 * - 0 - if the futex_q was already removed by the waking thread
2541 */
2542 static int unqueue_me(struct futex_q *q)
2543 {
2544 spinlock_t *lock_ptr;
2545 int ret = 0;
2546
2547 /* In the common case we don't take the spinlock, which is nice. */
2548 retry:
2549 /*
2550 * q->lock_ptr can change between this read and the following spin_lock.
2551 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2552 * optimizing lock_ptr out of the logic below.
2553 */
2554 lock_ptr = READ_ONCE(q->lock_ptr);
2555 if (lock_ptr != NULL) {
2556 spin_lock(lock_ptr);
2557 /*
2558 * q->lock_ptr can change between reading it and
2559 * spin_lock(), causing us to take the wrong lock. This
2560 * corrects the race condition.
2561 *
2562 * Reasoning goes like this: if we have the wrong lock,
2563 * q->lock_ptr must have changed (maybe several times)
2564 * between reading it and the spin_lock(). It can
2565 * change again after the spin_lock() but only if it was
2566 * already changed before the spin_lock(). It cannot,
2567 * however, change back to the original value. Therefore
2568 * we can detect whether we acquired the correct lock.
2569 */
2570 if (unlikely(lock_ptr != q->lock_ptr)) {
2571 spin_unlock(lock_ptr);
2572 goto retry;
2573 }
2574 __unqueue_futex(q);
2575
2576 BUG_ON(q->pi_state);
2577
2578 spin_unlock(lock_ptr);
2579 ret = 1;
2580 }
2581
2582 return ret;
2583 }
2584
2585 /*
2586 * PI futexes can not be requeued and must remove themselves from the
2587 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
2588 */
2589 static void unqueue_me_pi(struct futex_q *q)
2590 {
2591 __unqueue_futex(q);
2592
2593 BUG_ON(!q->pi_state);
2594 put_pi_state(q->pi_state);
2595 q->pi_state = NULL;
2596 }
2597
2598 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2599 struct task_struct *argowner)
2600 {
2601 struct futex_pi_state *pi_state = q->pi_state;
2602 struct task_struct *oldowner, *newowner;
2603 u32 uval, curval, newval, newtid;
2604 int err = 0;
2605
2606 oldowner = pi_state->owner;
2607
2608 /*
2609 * We are here because either:
2610 *
2611 * - we stole the lock and pi_state->owner needs updating to reflect
2612 * that (@argowner == current),
2613 *
2614 * or:
2615 *
2616 * - someone stole our lock and we need to fix things to point to the
2617 * new owner (@argowner == NULL).
2618 *
2619 * Either way, we have to replace the TID in the user space variable.
2620 * This must be atomic as we have to preserve the owner died bit here.
2621 *
2622 * Note: We write the user space value _before_ changing the pi_state
2623 * because we can fault here. Imagine swapped out pages or a fork
2624 * that marked all the anonymous memory readonly for cow.
2625 *
2626 * Modifying pi_state _before_ the user space value would leave the
2627 * pi_state in an inconsistent state when we fault here, because we
2628 * need to drop the locks to handle the fault. This might be observed
2629 * in the PID checks when attaching to PI state.
2630 */
2631 retry:
2632 if (!argowner) {
2633 if (oldowner != current) {
2634 /*
2635 * We raced against a concurrent self; things are
2636 * already fixed up. Nothing to do.
2637 */
2638 return 0;
2639 }
2640
2641 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2642 /* We got the lock. pi_state is correct. Tell caller. */
2643 return 1;
2644 }
2645
2646 /*
2647 * The trylock just failed, so either there is an owner or
2648 * there is a higher priority waiter than this one.
2649 */
2650 newowner = rt_mutex_owner(&pi_state->pi_mutex);
2651 /*
2652 * If the higher priority waiter has not yet taken over the
2653 * rtmutex then newowner is NULL. We can't return here with
2654 * that state because it's inconsistent vs. the user space
2655 * state. So drop the locks and try again. It's a valid
2656 * situation and not any different from the other retry
2657 * conditions.
2658 */
2659 if (unlikely(!newowner)) {
2660 err = -EAGAIN;
2661 goto handle_err;
2662 }
2663 } else {
2664 WARN_ON_ONCE(argowner != current);
2665 if (oldowner == current) {
2666 /*
2667 * We raced against a concurrent self; things are
2668 * already fixed up. Nothing to do.
2669 */
2670 return 1;
2671 }
2672 newowner = argowner;
2673 }
2674
2675 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2676 /* Owner died? */
2677 if (!pi_state->owner)
2678 newtid |= FUTEX_OWNER_DIED;
2679
2680 err = get_futex_value_locked(&uval, uaddr);
2681 if (err)
2682 goto handle_err;
2683
2684 for (;;) {
2685 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2686
2687 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2688 if (err)
2689 goto handle_err;
2690
2691 if (curval == uval)
2692 break;
2693 uval = curval;
2694 }
2695
2696 /*
2697 * We fixed up user space. Now we need to fix the pi_state
2698 * itself.
2699 */
2700 pi_state_update_owner(pi_state, newowner);
2701
2702 return argowner == current;
2703
2704 /*
2705 * In order to reschedule or handle a page fault, we need to drop the
2706 * locks here. In the case of a fault, this gives the other task
2707 * (either the highest priority waiter itself or the task which stole
2708 * the rtmutex) the chance to try the fixup of the pi_state. So once we
2709 * are back from handling the fault we need to check the pi_state after
2710 * reacquiring the locks and before trying to do another fixup. When
2711 * the fixup has been done already we simply return.
2712 *
2713 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2714 * drop hb->lock since the caller owns the hb -> futex_q relation.
2715 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2716 */
2717 handle_err:
2718 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2719 spin_unlock(q->lock_ptr);
2720
2721 switch (err) {
2722 case -EFAULT:
2723 err = fault_in_user_writeable(uaddr);
2724 break;
2725
2726 case -EAGAIN:
2727 cond_resched();
2728 err = 0;
2729 break;
2730
2731 default:
2732 WARN_ON_ONCE(1);
2733 break;
2734 }
2735
2736 spin_lock(q->lock_ptr);
2737 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2738
2739 /*
2740 * Check if someone else fixed it for us:
2741 */
2742 if (pi_state->owner != oldowner)
2743 return argowner == current;
2744
2745 /* Retry if err was -EAGAIN or the fault in succeeded */
2746 if (!err)
2747 goto retry;
2748
2749 /*
2750 * fault_in_user_writeable() failed so user state is immutable. At
2751 * best we can make the kernel state consistent but user state will
2752 * be most likely hosed and any subsequent unlock operation will be
2753 * rejected due to PI futex rule [10].
2754 *
2755 * Ensure that the rtmutex owner is also the pi_state owner despite
2756 * the user space value claiming something different. There is no
2757 * point in unlocking the rtmutex if current is the owner as it
2758 * would need to wait until the next waiter has taken the rtmutex
2759 * to guarantee consistent state. Keep it simple. Userspace asked
2760 * for this wrecked state.
2761 *
2762 * The rtmutex has an owner - either current or some other
2763 * task. See the EAGAIN loop above.
2764 */
2765 pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
2766
2767 return err;
2768 }
2769
2770 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2771 struct task_struct *argowner)
2772 {
2773 struct futex_pi_state *pi_state = q->pi_state;
2774 int ret;
2775
2776 lockdep_assert_held(q->lock_ptr);
2777
2778 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2779 ret = __fixup_pi_state_owner(uaddr, q, argowner);
2780 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2781 return ret;
2782 }
2783
2784 static long futex_wait_restart(struct restart_block *restart);
2785
2786 /**
2787 * fixup_owner() - Post lock pi_state and corner case management
2788 * @uaddr: user address of the futex
2789 * @q: futex_q (contains pi_state and access to the rt_mutex)
2790 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
2791 *
2792 * After attempting to lock an rt_mutex, this function is called to cleanup
2793 * the pi_state owner as well as handle race conditions that may allow us to
2794 * acquire the lock. Must be called with the hb lock held.
2795 *
2796 * Return:
2797 * - 1 - success, lock taken;
2798 * - 0 - success, lock not taken;
2799 * - <0 - on error (-EFAULT)
2800 */
2801 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2802 {
2803 if (locked) {
2804 /*
2805 * Got the lock. We might not be the anticipated owner if we
2806 * did a lock-steal - fix up the PI-state in that case:
2807 *
2808 * Speculative pi_state->owner read (we don't hold wait_lock);
2809 * since we own the lock pi_state->owner == current is the
2810 * stable state, anything else needs more attention.
2811 */
2812 if (q->pi_state->owner != current)
2813 return fixup_pi_state_owner(uaddr, q, current);
2814 return 1;
2815 }
2816
2817 /*
2818 * If we didn't get the lock; check if anybody stole it from us. In
2819 * that case, we need to fix up the uval to point to them instead of
2820 * us, otherwise bad things happen. [10]
2821 *
2822 * Another speculative read; pi_state->owner == current is unstable
2823 * but needs our attention.
2824 */
2825 if (q->pi_state->owner == current)
2826 return fixup_pi_state_owner(uaddr, q, NULL);
2827
2828 /*
2829 * Paranoia check. If we did not take the lock, then we should not be
2830 * the owner of the rt_mutex. Warn and establish consistent state.
2831 */
2832 if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
2833 return fixup_pi_state_owner(uaddr, q, current);
2834
2835 return 0;
2836 }
2837
2838 /**
2839 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2840 * @hb: the futex hash bucket, must be locked by the caller
2841 * @q: the futex_q to queue up on
2842 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
2843 */
2844 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2845 struct hrtimer_sleeper *timeout)
2846 {
2847 /*
2848 * The task state is guaranteed to be set before another task can
2849 * wake it. set_current_state() is implemented using smp_store_mb() and
2850 * queue_me() calls spin_unlock() upon completion, both serializing
2851 * access to the hash list and forcing another memory barrier.
2852 */
2853 set_current_state(TASK_INTERRUPTIBLE);
2854 queue_me(q, hb);
2855
2856 /* Arm the timer */
2857 if (timeout)
2858 hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
2859
2860 /*
2861 * If we have been removed from the hash list, then another task
2862 * has tried to wake us, and we can skip the call to schedule().
2863 */
2864 if (likely(!plist_node_empty(&q->list))) {
2865 /*
2866 * If the timer has already expired, current will already be
2867 * flagged for rescheduling. Only call schedule if there
2868 * is no timeout, or if it has yet to expire.
2869 */
2870 if (!timeout || timeout->task) {
2871 trace_android_vh_futex_sleep_start(current);
2872 freezable_schedule();
2873 }
2874 }
2875 __set_current_state(TASK_RUNNING);
2876 }
2877
2878 /**
2879 * futex_wait_setup() - Prepare to wait on a futex
2880 * @uaddr: the futex userspace address
2881 * @val: the expected value
2882 * @flags: futex flags (FLAGS_SHARED, etc.)
2883 * @q: the associated futex_q
2884 * @hb: storage for hash_bucket pointer to be returned to caller
2885 *
2886 * Setup the futex_q and locate the hash_bucket. Get the futex value and
2887 * compare it with the expected value. Handle atomic faults internally.
2888 * Return with the hb lock held on success, and unlocked on failure.
2889 *
2890 * Return:
2891 * - 0 - uaddr contains val and hb has been locked;
2892 * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2893 */
2894 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2895 struct futex_q *q, struct futex_hash_bucket **hb)
2896 {
2897 u32 uval;
2898 int ret;
2899
2900 /*
2901 * Access the page AFTER the hash-bucket is locked.
2902 * Order is important:
2903 *
2904 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2905 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
2906 *
2907 * The basic logical guarantee of a futex is that it blocks ONLY
2908 * if cond(var) is known to be true at the time of blocking, for
2909 * any cond. If we locked the hash-bucket after testing *uaddr, that
2910 * would open a race condition where we could block indefinitely with
2911 * cond(var) false, which would violate the guarantee.
2912 *
2913 * On the other hand, we insert q and release the hash-bucket only
2914 * after testing *uaddr. This guarantees that futex_wait() will NOT
2915 * absorb a wakeup if *uaddr does not match the desired values
2916 * while the syscall executes.
2917 */
2918 retry:
2919 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
2920 if (unlikely(ret != 0))
2921 return ret;
2922
2923 retry_private:
2924 *hb = queue_lock(q);
2925
2926 ret = get_futex_value_locked(&uval, uaddr);
2927
2928 if (ret) {
2929 queue_unlock(*hb);
2930
2931 ret = get_user(uval, uaddr);
2932 if (ret)
2933 return ret;
2934
2935 if (!(flags & FLAGS_SHARED))
2936 goto retry_private;
2937
2938 goto retry;
2939 }
2940
2941 if (uval != val) {
2942 queue_unlock(*hb);
2943 ret = -EWOULDBLOCK;
2944 }
2945
2946 return ret;
2947 }
2948
2949 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2950 ktime_t *abs_time, u32 bitset)
2951 {
2952 struct hrtimer_sleeper timeout, *to;
2953 struct restart_block *restart;
2954 struct futex_hash_bucket *hb;
2955 struct futex_q q = futex_q_init;
2956 int ret;
2957
2958 if (!bitset)
2959 return -EINVAL;
2960 q.bitset = bitset;
2961 trace_android_vh_futex_wait_start(flags, bitset);
2962
2963 to = futex_setup_timer(abs_time, &timeout, flags,
2964 current->timer_slack_ns);
2965 retry:
2966 /*
2967 * Prepare to wait on uaddr. On success, it holds hb->lock and q
2968 * is initialized.
2969 */
2970 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2971 if (ret)
2972 goto out;
2973
2974 /* queue_me and wait for wakeup, timeout, or a signal. */
2975 futex_wait_queue_me(hb, &q, to);
2976
2977 /* If we were woken (and unqueued), we succeeded, whatever. */
2978 ret = 0;
2979 if (!unqueue_me(&q))
2980 goto out;
2981 ret = -ETIMEDOUT;
2982 if (to && !to->task)
2983 goto out;
2984
2985 /*
2986 * We expect signal_pending(current), but we might be the
2987 * victim of a spurious wakeup as well.
2988 */
2989 if (!signal_pending(current))
2990 goto retry;
2991
2992 ret = -ERESTARTSYS;
2993 if (!abs_time)
2994 goto out;
2995
2996 restart = &current->restart_block;
2997 restart->futex.uaddr = uaddr;
2998 restart->futex.val = val;
2999 restart->futex.time = *abs_time;
3000 restart->futex.bitset = bitset;
3001 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
3002
3003 ret = set_restart_fn(restart, futex_wait_restart);
3004
3005 out:
3006 if (to) {
3007 hrtimer_cancel(&to->timer);
3008 destroy_hrtimer_on_stack(&to->timer);
3009 }
3010 trace_android_vh_futex_wait_end(flags, bitset);
3011 return ret;
3012 }
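/*
 * Editor's illustration (assumption, not from this file): the matching
 * user-space wait loop for the path above. EAGAIN means the value changed
 * before we queued (the -EWOULDBLOCK case in futex_wait_setup()); EINTR and
 * spurious wake-ups simply re-check the condition:
 *
 *	while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0) {
 *		long r = syscall(SYS_futex, &futex_word,
 *				 FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
 *				 0, NULL, NULL, 0);
 *		if (r == -1 && errno != EAGAIN && errno != EINTR)
 *			break;		e.g. ETIMEDOUT with a real timeout
 *	}
 */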
3013
3014
3015 static long futex_wait_restart(struct restart_block *restart)
3016 {
3017 u32 __user *uaddr = restart->futex.uaddr;
3018 ktime_t t, *tp = NULL;
3019
3020 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
3021 t = restart->futex.time;
3022 tp = &t;
3023 }
3024 restart->fn = do_no_restart_syscall;
3025
3026 return (long)futex_wait(uaddr, restart->futex.flags,
3027 restart->futex.val, tp, restart->futex.bitset);
3028 }
3029
3030
3031 /*
3032 * Userspace tried a 0 -> TID atomic transition of the futex value
3033 * and failed. The kernel side here does the whole locking operation:
3034 * if there are waiters then it will block as a consequence of relying
3035 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
3036 * a 0 value of the futex too.).
3037 *
3038 * Also serves as the futex trylock_pi() operation, with the corresponding semantics.
3039 */
3040 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
3041 ktime_t *time, int trylock)
3042 {
3043 struct hrtimer_sleeper timeout, *to;
3044 struct task_struct *exiting = NULL;
3045 struct rt_mutex_waiter rt_waiter;
3046 struct futex_hash_bucket *hb;
3047 struct futex_q q = futex_q_init;
3048 int res, ret;
3049
3050 if (!IS_ENABLED(CONFIG_FUTEX_PI))
3051 return -ENOSYS;
3052
3053 if (refill_pi_state_cache())
3054 return -ENOMEM;
3055
3056 to = futex_setup_timer(time, &timeout, flags, 0);
3057
3058 retry:
3059 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
3060 if (unlikely(ret != 0))
3061 goto out;
3062
3063 retry_private:
3064 hb = queue_lock(&q);
3065
3066 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
3067 &exiting, 0);
3068 if (unlikely(ret)) {
3069 /*
3070 * Atomic work succeeded and we got the lock,
3071 * or failed. Either way, we do _not_ block.
3072 */
3073 switch (ret) {
3074 case 1:
3075 /* We got the lock. */
3076 ret = 0;
3077 goto out_unlock_put_key;
3078 case -EFAULT:
3079 goto uaddr_faulted;
3080 case -EBUSY:
3081 case -EAGAIN:
3082 /*
3083 * Two reasons for this:
3084 * - EBUSY: Task is exiting and we just wait for the
3085 * exit to complete.
3086 * - EAGAIN: The user space value changed.
3087 */
3088 queue_unlock(hb);
3089 /*
3090 * Handle the case where the owner is in the middle of
3091 * exiting. Wait for the exit to complete otherwise
3092 * this task might loop forever, aka. live lock.
3093 */
3094 wait_for_owner_exiting(ret, exiting);
3095 cond_resched();
3096 goto retry;
3097 default:
3098 goto out_unlock_put_key;
3099 }
3100 }
3101
3102 WARN_ON(!q.pi_state);
3103
3104 /*
3105 * Only actually queue now that the atomic ops are done:
3106 */
3107 __queue_me(&q, hb);
3108
3109 if (trylock) {
3110 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
3111 /* Fixup the trylock return value: */
3112 ret = ret ? 0 : -EWOULDBLOCK;
3113 goto no_block;
3114 }
3115
3116 rt_mutex_init_waiter(&rt_waiter);
3117
3118 /*
3119 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
3120 * hold it while doing rt_mutex_start_proxy(), because then it will
3121 * include hb->lock in the blocking chain, even though we'll not in
3122 * fact hold it while blocking. This will lead it to report -EDEADLK
3123 * and BUG when futex_unlock_pi() interleaves with this.
3124 *
3125 * Therefore acquire wait_lock while holding hb->lock, but drop the
3126 * latter before calling __rt_mutex_start_proxy_lock(). This
3127 * interleaves with futex_unlock_pi() -- which does a similar lock
3128 * handoff -- such that the latter can observe the futex_q::pi_state
3129 * before __rt_mutex_start_proxy_lock() is done.
3130 */
3131 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
3132 spin_unlock(q.lock_ptr);
3133 /*
3134 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
3135 * such that futex_unlock_pi() is guaranteed to observe the waiter when
3136 * it sees the futex_q::pi_state.
3137 */
3138 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
3139 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
3140
3141 if (ret) {
3142 if (ret == 1)
3143 ret = 0;
3144 goto cleanup;
3145 }
3146
3147 if (unlikely(to))
3148 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
3149
3150 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
3151
3152 cleanup:
3153 spin_lock(q.lock_ptr);
3154 /*
3155 * If we failed to acquire the lock (deadlock/signal/timeout), we must
3156 * first acquire the hb->lock before removing the lock from the
3157 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
3158 * lists consistent.
3159 *
3160 * In particular; it is important that futex_unlock_pi() can not
3161 * observe this inconsistency.
3162 */
3163 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
3164 ret = 0;
3165
3166 no_block:
3167 /*
3168 * Fixup the pi_state owner and possibly acquire the lock if we
3169 * haven't already.
3170 */
3171 res = fixup_owner(uaddr, &q, !ret);
3172 /*
3173 * If fixup_owner() returned an error, propagate that. If it acquired
3174 * the lock, clear our -ETIMEDOUT or -EINTR.
3175 */
3176 if (res)
3177 ret = (res < 0) ? res : 0;
3178
3179 unqueue_me_pi(&q);
3180 spin_unlock(q.lock_ptr);
3181 goto out;
3182
3183 out_unlock_put_key:
3184 queue_unlock(hb);
3185
3186 out:
3187 if (to) {
3188 hrtimer_cancel(&to->timer);
3189 destroy_hrtimer_on_stack(&to->timer);
3190 }
3191 return ret != -EINTR ? ret : -ERESTARTNOINTR;
3192
3193 uaddr_faulted:
3194 queue_unlock(hb);
3195
3196 ret = fault_in_user_writeable(uaddr);
3197 if (ret)
3198 goto out;
3199
3200 if (!(flags & FLAGS_SHARED))
3201 goto retry_private;
3202
3203 goto retry;
3204 }
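/*
 * Editor's sketch of the user-space fast path that the slowpath above backs
 * up (gettid() is assumed available, e.g. glibc >= 2.30):
 *
 *	unsigned int zero = 0;
 *	pid_t tid = gettid();
 *
 *	if (!__atomic_compare_exchange_n(uaddr, &zero, tid, 0,
 *					 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
 *		syscall(SYS_futex, uaddr, FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG,
 *			0, NULL, NULL, 0);
 *
 * Only when the 0 -> TID transition fails does the kernel get involved,
 * which is the situation described in the comment above futex_lock_pi().
 */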
3205
3206 /*
3207 * Userspace attempted a TID -> 0 atomic transition, and failed.
3208 * This is the in-kernel slowpath: we look up the PI state (if any),
3209 * and do the rt-mutex unlock.
3210 */
3211 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
3212 {
3213 u32 curval, uval, vpid = task_pid_vnr(current);
3214 union futex_key key = FUTEX_KEY_INIT;
3215 struct futex_hash_bucket *hb;
3216 struct futex_q *top_waiter;
3217 int ret;
3218
3219 if (!IS_ENABLED(CONFIG_FUTEX_PI))
3220 return -ENOSYS;
3221
3222 retry:
3223 if (get_user(uval, uaddr))
3224 return -EFAULT;
3225 /*
3226 * We release only a lock we actually own:
3227 */
3228 if ((uval & FUTEX_TID_MASK) != vpid)
3229 return -EPERM;
3230
3231 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
3232 if (ret)
3233 return ret;
3234
3235 hb = hash_futex(&key);
3236 spin_lock(&hb->lock);
3237
3238 /*
3239 * Check waiters first. We do not trust user space values at
3240 * all and we at least want to know if user space fiddled
3241 * with the futex value instead of blindly unlocking.
3242 */
3243 top_waiter = futex_top_waiter(hb, &key);
3244 if (top_waiter) {
3245 struct futex_pi_state *pi_state = top_waiter->pi_state;
3246
3247 ret = -EINVAL;
3248 if (!pi_state)
3249 goto out_unlock;
3250
3251 /*
3252 * If current does not own the pi_state then the futex is
3253 * inconsistent and user space fiddled with the futex value.
3254 */
3255 if (pi_state->owner != current)
3256 goto out_unlock;
3257
3258 get_pi_state(pi_state);
3259 /*
3260 * By taking wait_lock while still holding hb->lock, we ensure
3261 * there is no point where we hold neither; and therefore
3262 * wake_futex_pi() must observe a state consistent with what we
3263 * observed.
3264 *
3265 * In particular; this forces __rt_mutex_start_proxy() to
3266 * complete such that we're guaranteed to observe the
3267 * rt_waiter. Also see the WARN in wake_futex_pi().
3268 */
3269 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3270 spin_unlock(&hb->lock);
3271
3272 /* drops pi_state->pi_mutex.wait_lock */
3273 ret = wake_futex_pi(uaddr, uval, pi_state);
3274
3275 put_pi_state(pi_state);
3276
3277 /*
3278 * Success, we're done! No tricky corner cases.
3279 */
3280 if (!ret)
3281 return ret;
3282 /*
3283 * The atomic access to the futex value generated a
3284 * pagefault, so retry the user-access and the wakeup:
3285 */
3286 if (ret == -EFAULT)
3287 goto pi_faulted;
3288 /*
3289 * An unconditional UNLOCK_PI op raced against a waiter
3290 * setting the FUTEX_WAITERS bit. Try again.
3291 */
3292 if (ret == -EAGAIN)
3293 goto pi_retry;
3294 /*
3295 * wake_futex_pi has detected invalid state. Tell user
3296 * space.
3297 */
3298 return ret;
3299 }
3300
3301 /*
3302 * We have no kernel internal state, i.e. no waiters in the
3303 * kernel. Waiters which are about to queue themselves are stuck
3304 * on hb->lock. So we can safely ignore them. We do neither
3305 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
3306 * owner.
3307 */
3308 if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3309 spin_unlock(&hb->lock);
3310 switch (ret) {
3311 case -EFAULT:
3312 goto pi_faulted;
3313
3314 case -EAGAIN:
3315 goto pi_retry;
3316
3317 default:
3318 WARN_ON_ONCE(1);
3319 return ret;
3320 }
3321 }
3322
3323 /*
3324 * If uval has changed, let user space handle it.
3325 */
3326 ret = (curval == uval) ? 0 : -EAGAIN;
3327
3328 out_unlock:
3329 spin_unlock(&hb->lock);
3330 return ret;
3331
3332 pi_retry:
3333 cond_resched();
3334 goto retry;
3335
3336 pi_faulted:
3337
3338 ret = fault_in_user_writeable(uaddr);
3339 if (!ret)
3340 goto retry;
3341
3342 return ret;
3343 }
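/*
 * Editor's sketch of the corresponding user-space unlock fast path: the
 * TID -> 0 cmpxchg can only succeed while neither FUTEX_WAITERS nor
 * FUTEX_OWNER_DIED is set; otherwise the kernel must hand the lock over:
 *
 *	unsigned int expected = tid;
 *
 *	if (!__atomic_compare_exchange_n(uaddr, &expected, 0, 0,
 *					 __ATOMIC_RELEASE, __ATOMIC_RELAXED))
 *		syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG,
 *			0, NULL, NULL, 0);
 */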
3344
3345 /**
3346 * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
3347 * @hb: the hash_bucket futex_q was originally enqueued on
3348 * @q: the futex_q woken while waiting to be requeued
3349 * @timeout: the timeout associated with the wait (NULL if none)
3350 *
3351 * Determine the cause for the early wakeup.
3352 *
3353 * Return:
3354 * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
3355 */
3356 static inline
3357 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3358 struct futex_q *q,
3359 struct hrtimer_sleeper *timeout)
3360 {
3361 int ret;
3362
3363 /*
3364 * With the hb lock held, we avoid races while we process the wakeup.
3365 * We only need to hold hb (and not hb2) to ensure atomicity as the
3366 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3367 * It can't be requeued from uaddr2 to something else since we don't
3368 * support a PI aware source futex for requeue.
3369 */
3370 WARN_ON_ONCE(&hb->lock != q->lock_ptr);
3371
3372 /*
3373 * We were woken prior to requeue by a timeout or a signal.
3374 * Unqueue the futex_q and determine which it was.
3375 */
3376 plist_del(&q->list, &hb->chain);
3377 hb_waiters_dec(hb);
3378
3379 /* Handle spurious wakeups gracefully */
3380 ret = -EWOULDBLOCK;
3381 if (timeout && !timeout->task)
3382 ret = -ETIMEDOUT;
3383 else if (signal_pending(current))
3384 ret = -ERESTARTNOINTR;
3385 return ret;
3386 }
3387
3388 /**
3389 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3390 * @uaddr: the futex we initially wait on (non-pi)
3391 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
3392 * the same type, no requeueing from private to shared, etc.
3393 * @val: the expected value of uaddr
3394 * @abs_time: absolute timeout
3395 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
3396 * @uaddr2: the pi futex we will take prior to returning to user-space
3397 *
3398 * The caller will wait on uaddr and will be requeued by futex_requeue() to
3399 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
3400 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3401 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
3402 * without one, the pi logic would not know which task to boost/deboost, if
3403 * there was a need to.
3404 *
3405 * We call schedule in futex_wait_queue_me() when we enqueue and return there
3406 * via the following--
3407 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3408 * 2) wakeup on uaddr2 after a requeue
3409 * 3) signal
3410 * 4) timeout
3411 *
3412 * If 3, cleanup and return -ERESTARTNOINTR.
3413 *
3414 * If 2, we may then block on trying to take the rt_mutex and return via:
3415 * 5) successful lock
3416 * 6) signal
3417 * 7) timeout
3418 * 8) other lock acquisition failure
3419 *
3420 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3421 *
3422 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3423 *
3424 * Return:
3425 * - 0 - On success;
3426 * - <0 - On error
3427 */
3428 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3429 u32 val, ktime_t *abs_time, u32 bitset,
3430 u32 __user *uaddr2)
3431 {
3432 struct hrtimer_sleeper timeout, *to;
3433 struct rt_mutex_waiter rt_waiter;
3434 struct futex_hash_bucket *hb;
3435 union futex_key key2 = FUTEX_KEY_INIT;
3436 struct futex_q q = futex_q_init;
3437 struct rt_mutex_base *pi_mutex;
3438 int res, ret;
3439
3440 if (!IS_ENABLED(CONFIG_FUTEX_PI))
3441 return -ENOSYS;
3442
3443 if (uaddr == uaddr2)
3444 return -EINVAL;
3445
3446 if (!bitset)
3447 return -EINVAL;
3448
3449 to = futex_setup_timer(abs_time, &timeout, flags,
3450 current->timer_slack_ns);
3451
3452 /*
3453 * The waiter is allocated on our stack, manipulated by the requeue
3454 * code while we sleep on uaddr.
3455 */
3456 rt_mutex_init_waiter(&rt_waiter);
3457
3458 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
3459 if (unlikely(ret != 0))
3460 goto out;
3461
3462 q.bitset = bitset;
3463 q.rt_waiter = &rt_waiter;
3464 q.requeue_pi_key = &key2;
3465
3466 /*
3467 * Prepare to wait on uaddr. On success, it holds hb->lock and q
3468 * is initialized.
3469 */
3470 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3471 if (ret)
3472 goto out;
3473
3474 /*
3475 * The check above which compares uaddrs is not sufficient for
3476 * shared futexes. We need to compare the keys:
3477 */
3478 if (match_futex(&q.key, &key2)) {
3479 queue_unlock(hb);
3480 ret = -EINVAL;
3481 goto out;
3482 }
3483
3484 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
3485 futex_wait_queue_me(hb, &q, to);
3486
3487 switch (futex_requeue_pi_wakeup_sync(&q)) {
3488 case Q_REQUEUE_PI_IGNORE:
3489 /* The waiter is still on uaddr1 */
3490 spin_lock(&hb->lock);
3491 ret = handle_early_requeue_pi_wakeup(hb, &q, to);
3492 spin_unlock(&hb->lock);
3493 break;
3494
3495 case Q_REQUEUE_PI_LOCKED:
3496 /* The requeue acquired the lock */
3497 if (q.pi_state && (q.pi_state->owner != current)) {
3498 spin_lock(q.lock_ptr);
3499 ret = fixup_owner(uaddr2, &q, true);
3500 /*
3501 * Drop the reference to the pi state which the
3502 * requeue_pi() code acquired for us.
3503 */
3504 put_pi_state(q.pi_state);
3505 spin_unlock(q.lock_ptr);
3506 /*
3507 * Adjust the return value. It's either -EFAULT or
3508 * success (1) but the caller expects 0 for success.
3509 */
3510 ret = ret < 0 ? ret : 0;
3511 }
3512 break;
3513
3514 case Q_REQUEUE_PI_DONE:
3515 /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
3516 pi_mutex = &q.pi_state->pi_mutex;
3517 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3518
3519 /* Current is no longer pi_blocked_on */
3520 spin_lock(q.lock_ptr);
3521 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3522 ret = 0;
3523
3524 debug_rt_mutex_free_waiter(&rt_waiter);
3525 /*
3526 * Fixup the pi_state owner and possibly acquire the lock if we
3527 * haven't already.
3528 */
3529 res = fixup_owner(uaddr2, &q, !ret);
3530 /*
3531 * If fixup_owner() returned an error, propagate that. If it
3532 * acquired the lock, clear -ETIMEDOUT or -EINTR.
3533 */
3534 if (res)
3535 ret = (res < 0) ? res : 0;
3536
3537 unqueue_me_pi(&q);
3538 spin_unlock(q.lock_ptr);
3539
3540 if (ret == -EINTR) {
3541 /*
3542 * We've already been requeued, but cannot restart
3543 * by calling futex_lock_pi() directly. We could
3544 * restart this syscall, but it would detect that
3545 * the user space "val" changed and return
3546 * -EWOULDBLOCK. Save the overhead of the restart
3547 * and return -EWOULDBLOCK directly.
3548 */
3549 ret = -EWOULDBLOCK;
3550 }
3551 break;
3552 default:
3553 BUG();
3554 }
3555
3556 out:
3557 if (to) {
3558 hrtimer_cancel(&to->timer);
3559 destroy_hrtimer_on_stack(&to->timer);
3560 }
3561 return ret;
3562 }
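/*
 * Illustrative userspace sketch of the interface documented above (not part
 * of the kernel build; the names cond_futex, lock_futex and cond_val are
 * assumptions). A condvar-style waiter blocks on the wait futex and is later
 * requeued by the waker onto the PI lock futex:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <limits.h>
 *	#include <unistd.h>
 *
 *	// waiter: cond_val was sampled while holding the PI lock, which is
 *	// released in userspace before the syscall.
 *	syscall(SYS_futex, &cond_futex, FUTEX_WAIT_REQUEUE_PI, cond_val,
 *		NULL, &lock_futex, 0);
 *	// on success the waiter returns owning lock_futex (or with an error
 *	// from the rt_mutex acquisition, see the return paths listed above)
 *
 *	// waker: hand the PI lock to at most one waiter and requeue the rest
 *	// onto it; nr_wake must be 1 for FUTEX_CMP_REQUEUE_PI, and the
 *	// requeue count travels in the timeout slot of the raw syscall.
 *	syscall(SYS_futex, &cond_futex, FUTEX_CMP_REQUEUE_PI, 1,
 *		(void *)(unsigned long)INT_MAX, &lock_futex, cond_val);
 */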
3563
3564 /*
3565 * Support for robust futexes: the kernel cleans up held futexes at
3566 * thread exit time.
3567 *
3568 * Implementation: user-space maintains a per-thread list of locks it
3569 * is holding. Upon do_exit(), the kernel carefully walks this list,
3570 * and marks all locks that are owned by this thread with the
3571 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3572 * always manipulated with the lock held, so the list is private and
3573 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3574 * field, to allow the kernel to clean up if the thread dies after
3575 * acquiring the lock, but just before it could have added itself to
3576 * the list. There can only be one such pending lock.
3577 */
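/*
 * A minimal userspace sketch of the registration side of this scheme
 * (illustrative only; struct my_mutex and register_robust_list() are
 * assumed names, the ABI structures come from <linux/futex.h>):
 *
 *	#include <linux/futex.h>	// struct robust_list_head
 *	#include <stddef.h>		// offsetof()
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct my_mutex {			// hypothetical lock layout
 *		struct robust_list list;	// links held locks together
 *		unsigned int futex_word;	// owner TID + flag bits
 *	};
 *
 *	static __thread struct robust_list_head rhead;
 *
 *	static void register_robust_list(void)	// once per thread
 *	{
 *		rhead.list.next = &rhead.list;	// empty circular list
 *		rhead.futex_offset = offsetof(struct my_mutex, futex_word) -
 *				     offsetof(struct my_mutex, list);
 *		rhead.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &rhead, sizeof(rhead));
 *	}
 *
 * Before acquiring a lock, userspace points rhead.list_op_pending at the
 * lock's list entry, and clears it again after linking the acquired lock
 * into rhead.list, so the exit-time walk below always finds held locks.
 */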
3578
3579 /**
3580 * sys_set_robust_list() - Set the robust-futex list head of a task
3581 * @head: pointer to the list-head
3582 * @len: length of the list-head, as userspace expects
3583 */
3584 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3585 size_t, len)
3586 {
3587 if (!futex_cmpxchg_enabled)
3588 return -ENOSYS;
3589 /*
3590 * The kernel knows only one size for now:
3591 */
3592 if (unlikely(len != sizeof(*head)))
3593 return -EINVAL;
3594
3595 current->robust_list = head;
3596
3597 return 0;
3598 }
3599
3600 /**
3601 * sys_get_robust_list() - Get the robust-futex list head of a task
3602 * @pid: pid of the process [zero for current task]
3603 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
3604 * @len_ptr: pointer to a length field, the kernel fills in the header size
3605 */
3606 SYSCALL_DEFINE3(get_robust_list, int, pid,
3607 struct robust_list_head __user * __user *, head_ptr,
3608 size_t __user *, len_ptr)
3609 {
3610 struct robust_list_head __user *head;
3611 unsigned long ret;
3612 struct task_struct *p;
3613
3614 if (!futex_cmpxchg_enabled)
3615 return -ENOSYS;
3616
3617 rcu_read_lock();
3618
3619 ret = -ESRCH;
3620 if (!pid)
3621 p = current;
3622 else {
3623 p = find_task_by_vpid(pid);
3624 if (!p)
3625 goto err_unlock;
3626 }
3627
3628 ret = -EPERM;
3629 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3630 goto err_unlock;
3631
3632 head = p->robust_list;
3633 rcu_read_unlock();
3634
3635 if (put_user(sizeof(*head), len_ptr))
3636 return -EFAULT;
3637 return put_user(head, head_ptr);
3638
3639 err_unlock:
3640 rcu_read_unlock();
3641
3642 return ret;
3643 }
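/*
 * Illustrative only: a debugger-style caller can fetch a task's robust
 * list head (pid == 0 for the calling task) via the raw syscall, assuming
 * <sys/syscall.h> and <stdio.h>:
 *
 *	struct robust_list_head *head;
 *	size_t len;
 *
 *	if (syscall(SYS_get_robust_list, pid, &head, &len) == 0)
 *		printf("robust list head %p, header size %zu\n",
 *		       (void *)head, len);
 */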
3644
3645 /* Constants for the pending_op argument of handle_futex_death */
3646 #define HANDLE_DEATH_PENDING true
3647 #define HANDLE_DEATH_LIST false
3648
3649 /*
3650 * Process a futex-list entry, check whether it's owned by the
3651 * dying task, and do notification if so:
3652 */
3653 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3654 bool pi, bool pending_op)
3655 {
3656 u32 uval, nval, mval;
3657 pid_t owner;
3658 int err;
3659
3660 /* Futex address must be 32-bit aligned */
3661 if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3662 return -1;
3663
3664 retry:
3665 if (get_user(uval, uaddr))
3666 return -1;
3667
3668 /*
3669 * Special case for regular (non PI) futexes. The unlock path in
3670 * user space has two race scenarios:
3671 *
3672 * 1. The unlock path releases the user space futex value and
3673 * before it can execute the futex() syscall to wake up
3674 * waiters it is killed.
3675 *
3676 * 2. A woken up waiter is killed before it can acquire the
3677 * futex in user space.
3678 *
3679 * In the second case, the wake up notification could be generated
3680 * by the unlock path in user space after setting the futex value
3681 * to zero or by the kernel after setting the OWNER_DIED bit below.
3682 *
3683 * In both cases the TID validation below prevents a wakeup of
3684 * potential waiters which can cause these waiters to block
3685 * forever.
3686 *
3687 * In both cases the following conditions are met:
3688 *
3689 * 1) task->robust_list->list_op_pending != NULL
3690 * @pending_op == true
3691 * 2) The owner part of user space futex value == 0
3692 * 3) Regular futex: @pi == false
3693 *
3694 * If these conditions are met, it is safe to attempt waking up a
3695 * potential waiter without touching the user space futex value and
3696 * trying to set the OWNER_DIED bit. If the futex value is zero,
3697 * the rest of the user space mutex state is consistent, so a woken
3698 * waiter will just take over the uncontended futex. Setting the
3699 * OWNER_DIED bit would create inconsistent state and malfunction
3700 * of the user space owner died handling. Otherwise, the OWNER_DIED
3701 * bit is already set, and the woken waiter is expected to deal with
3702 * this.
3703 */
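	/*
	 * A concrete instance of case 1 above (timeline is illustrative only):
	 *
	 *	owner (dying)				waiter
	 *	*futex = 0;	// userspace unlock store
	 *	<killed before it can call FUTEX_WAKE>
	 *						still blocked in futex_wait()
	 *	exit_robust_list()
	 *	  handle_futex_death(..., pending_op=true)
	 *	    TID part of uval == 0 && !pi
	 *	    -> futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY)
	 *
	 * The woken waiter takes over the futex whose value is already 0,
	 * without the kernel setting OWNER_DIED.
	 */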
3704 owner = uval & FUTEX_TID_MASK;
3705
3706 if (pending_op && !pi && !owner) {
3707 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3708 return 0;
3709 }
3710
3711 if (owner != task_pid_vnr(curr))
3712 return 0;
3713
3714 /*
3715 * Ok, this dying thread is truly holding a futex
3716 * of interest. Set the OWNER_DIED bit atomically
3717 * via cmpxchg, and if the value had FUTEX_WAITERS
3718 * set, wake up a waiter (if any). (We have to do a
3719 * futex_wake() even if OWNER_DIED is already set -
3720 * to handle the rare but possible case of recursive
3721 * thread-death.) The rest of the cleanup is done in
3722 * userspace.
3723 */
3724 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3725
3726 /*
3727 * We are not holding a lock here, but we want to have
3728 * the pagefault_disable/enable() protection because
3729 * we want to handle the fault gracefully. If the
3730 * access fails we try to fault in the futex with R/W
3731 * verification via get_user_pages. get_user() above
3732 * does not guarantee R/W access. If that fails we
3733 * give up and leave the futex locked.
3734 */
3735 if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3736 switch (err) {
3737 case -EFAULT:
3738 if (fault_in_user_writeable(uaddr))
3739 return -1;
3740 goto retry;
3741
3742 case -EAGAIN:
3743 cond_resched();
3744 goto retry;
3745
3746 default:
3747 WARN_ON_ONCE(1);
3748 return err;
3749 }
3750 }
3751
3752 if (nval != uval)
3753 goto retry;
3754
3755 /*
3756 * Wake robust non-PI futexes here. The wakeup of
3757 * PI futexes happens in exit_pi_state_list():
3758 */
3759 if (!pi && (uval & FUTEX_WAITERS))
3760 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3761
3762 return 0;
3763 }
3764
3765 /*
3766 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3767 */
3768 static inline int fetch_robust_entry(struct robust_list __user **entry,
3769 struct robust_list __user * __user *head,
3770 unsigned int *pi)
3771 {
3772 unsigned long uentry;
3773
3774 if (get_user(uentry, (unsigned long __user *)head))
3775 return -EFAULT;
3776
3777 *entry = (void __user *)(uentry & ~1UL);
3778 *pi = uentry & 1;
3779
3780 return 0;
3781 }
3782
3783 /*
3784 * Walk curr->robust_list (very carefully, it's a userspace list!)
3785 * and mark any locks found there dead, and notify any waiters.
3786 *
3787 * We silently return on any sign of list-walking problem.
3788 */
3789 static void exit_robust_list(struct task_struct *curr)
3790 {
3791 struct robust_list_head __user *head = curr->robust_list;
3792 struct robust_list __user *entry, *next_entry, *pending;
3793 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3794 unsigned int next_pi;
3795 unsigned long futex_offset;
3796 int rc;
3797
3798 if (!futex_cmpxchg_enabled)
3799 return;
3800
3801 /*
3802 * Fetch the list head (which was registered earlier, via
3803 * sys_set_robust_list()):
3804 */
3805 if (fetch_robust_entry(&entry, &head->list.next, &pi))
3806 return;
3807 /*
3808 * Fetch the relative futex offset:
3809 */
3810 if (get_user(futex_offset, &head->futex_offset))
3811 return;
3812 /*
3813 * Fetch any possibly pending lock-add first, and handle it
3814 * if it exists:
3815 */
3816 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3817 return;
3818
3819 next_entry = NULL; /* avoid warning with gcc */
3820 while (entry != &head->list) {
3821 /*
3822 * Fetch the next entry in the list before calling
3823 * handle_futex_death:
3824 */
3825 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3826 /*
3827 * A pending lock might already be on the list, so
3828 * don't process it twice:
3829 */
3830 if (entry != pending) {
3831 if (handle_futex_death((void __user *)entry + futex_offset,
3832 curr, pi, HANDLE_DEATH_LIST))
3833 return;
3834 }
3835 if (rc)
3836 return;
3837 entry = next_entry;
3838 pi = next_pi;
3839 /*
3840 * Avoid excessively long or circular lists:
3841 */
3842 if (!--limit)
3843 break;
3844
3845 cond_resched();
3846 }
3847
3848 if (pending) {
3849 handle_futex_death((void __user *)pending + futex_offset,
3850 curr, pip, HANDLE_DEATH_PENDING);
3851 }
3852 }
3853
3854 static void futex_cleanup(struct task_struct *tsk)
3855 {
3856 if (unlikely(tsk->robust_list)) {
3857 exit_robust_list(tsk);
3858 tsk->robust_list = NULL;
3859 }
3860
3861 #ifdef CONFIG_COMPAT
3862 if (unlikely(tsk->compat_robust_list)) {
3863 compat_exit_robust_list(tsk);
3864 tsk->compat_robust_list = NULL;
3865 }
3866 #endif
3867
3868 if (unlikely(!list_empty(&tsk->pi_state_list)))
3869 exit_pi_state_list(tsk);
3870 }
3871
3872 /**
3873 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
3874 * @tsk: task to set the state on
3875 *
3876 * Set the futex exit state of the task locklessly. The futex waiter code
3877 * observes that state when a task is exiting and loops until the task has
3878 * actually finished the futex cleanup. The worst case for this is that the
3879 * waiter runs through the wait loop until the state becomes visible.
3880 *
3881 * This is called from the recursive fault handling path in do_exit().
3882 *
3883 * This is best effort. Either the futex exit code has run already or
3884 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3885 * take it over. If not, the problem is pushed back to user space. If the
3886 * futex exit code did not run yet, then an already queued waiter might
3887 * block forever, but there is nothing which can be done about that.
3888 */
3889 void futex_exit_recursive(struct task_struct *tsk)
3890 {
3891 /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3892 if (tsk->futex_state == FUTEX_STATE_EXITING)
3893 mutex_unlock(&tsk->futex_exit_mutex);
3894 tsk->futex_state = FUTEX_STATE_DEAD;
3895 }
3896
3897 static void futex_cleanup_begin(struct task_struct *tsk)
3898 {
3899 /*
3900 * Prevent various race issues against a concurrent incoming waiter
3901 * including live locks by forcing the waiter to block on
3902 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3903 * attach_to_pi_owner().
3904 */
3905 mutex_lock(&tsk->futex_exit_mutex);
3906
3907 /*
3908 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3909 *
3910 * This ensures that all subsequent checks of tsk->futex_state in
3911 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3912 * tsk->pi_lock held.
3913 *
3914 * It guarantees also that a pi_state which was queued right before
3915 * the state change under tsk->pi_lock by a concurrent waiter must
3916 * be observed in exit_pi_state_list().
3917 */
3918 raw_spin_lock_irq(&tsk->pi_lock);
3919 tsk->futex_state = FUTEX_STATE_EXITING;
3920 raw_spin_unlock_irq(&tsk->pi_lock);
3921 }
3922
3923 static void futex_cleanup_end(struct task_struct *tsk, int state)
3924 {
3925 /*
3926 * Lockless store. The only side effect is that an observer might
3927 * take another loop until it becomes visible.
3928 */
3929 tsk->futex_state = state;
3930 /*
3931 * Drop the exit protection. This unblocks waiters which observed
3932 * FUTEX_STATE_EXITING to reevaluate the state.
3933 */
3934 mutex_unlock(&tsk->futex_exit_mutex);
3935 }
3936
3937 void futex_exec_release(struct task_struct *tsk)
3938 {
3939 /*
3940 * The state handling is done for consistency, but in the case of
3941 * exec() there is no way to prevent further damage as the PID stays
3942 * the same. But for the unlikely and arguably buggy case that a
3943 * futex is held on exec(), this provides as much state
3944 * consistency protection as is possible.
3945 */
3946 futex_cleanup_begin(tsk);
3947 futex_cleanup(tsk);
3948 /*
3949 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
3950 * exec a new binary.
3951 */
3952 futex_cleanup_end(tsk, FUTEX_STATE_OK);
3953 }
3954
3955 void futex_exit_release(struct task_struct *tsk)
3956 {
3957 futex_cleanup_begin(tsk);
3958 futex_cleanup(tsk);
3959 futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3960 }
3961
3962 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3963 u32 __user *uaddr2, u32 val2, u32 val3)
3964 {
3965 int cmd = op & FUTEX_CMD_MASK;
3966 unsigned int flags = 0;
3967
3968 if (!(op & FUTEX_PRIVATE_FLAG))
3969 flags |= FLAGS_SHARED;
3970
3971 if (op & FUTEX_CLOCK_REALTIME) {
3972 flags |= FLAGS_CLOCKRT;
3973 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
3974 cmd != FUTEX_LOCK_PI2)
3975 return -ENOSYS;
3976 }
3977
3978 switch (cmd) {
3979 case FUTEX_LOCK_PI:
3980 case FUTEX_LOCK_PI2:
3981 case FUTEX_UNLOCK_PI:
3982 case FUTEX_TRYLOCK_PI:
3983 case FUTEX_WAIT_REQUEUE_PI:
3984 case FUTEX_CMP_REQUEUE_PI:
3985 if (!futex_cmpxchg_enabled)
3986 return -ENOSYS;
3987 }
3988
3989 trace_android_vh_do_futex(cmd, &flags, uaddr2);
3990 switch (cmd) {
3991 case FUTEX_WAIT:
3992 val3 = FUTEX_BITSET_MATCH_ANY;
3993 fallthrough;
3994 case FUTEX_WAIT_BITSET:
3995 return futex_wait(uaddr, flags, val, timeout, val3);
3996 case FUTEX_WAKE:
3997 val3 = FUTEX_BITSET_MATCH_ANY;
3998 fallthrough;
3999 case FUTEX_WAKE_BITSET:
4000 return futex_wake(uaddr, flags, val, val3);
4001 case FUTEX_REQUEUE:
4002 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
4003 case FUTEX_CMP_REQUEUE:
4004 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
4005 case FUTEX_WAKE_OP:
4006 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
4007 case FUTEX_LOCK_PI:
4008 flags |= FLAGS_CLOCKRT;
4009 fallthrough;
4010 case FUTEX_LOCK_PI2:
4011 return futex_lock_pi(uaddr, flags, timeout, 0);
4012 case FUTEX_UNLOCK_PI:
4013 return futex_unlock_pi(uaddr, flags);
4014 case FUTEX_TRYLOCK_PI:
4015 return futex_lock_pi(uaddr, flags, NULL, 1);
4016 case FUTEX_WAIT_REQUEUE_PI:
4017 val3 = FUTEX_BITSET_MATCH_ANY;
4018 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
4019 uaddr2);
4020 case FUTEX_CMP_REQUEUE_PI:
4021 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
4022 }
4023 return -ENOSYS;
4024 }
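/*
 * Illustrative userspace sketch of the two most common operations
 * multiplexed above, FUTEX_WAIT and FUTEX_WAKE (not kernel code; the
 * 0 = free / 1 = locked encoding is an assumption of the sketch):
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static unsigned int futex_word;		// shared between threads
 *
 *	static void lock(void)
 *	{
 *		unsigned int expected = 0;
 *
 *		while (!__atomic_compare_exchange_n(&futex_word, &expected, 1,
 *						    0, __ATOMIC_ACQUIRE,
 *						    __ATOMIC_RELAXED)) {
 *			// sleeps only while the word is still 1
 *			syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE,
 *				1, NULL, NULL, 0);
 *			expected = 0;
 *		}
 *	}
 *
 *	static void unlock(void)
 *	{
 *		__atomic_store_n(&futex_word, 0, __ATOMIC_RELEASE);
 *		syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE,
 *			1, NULL, NULL, 0);
 *	}
 */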
4025
4026 static __always_inline bool futex_cmd_has_timeout(u32 cmd)
4027 {
4028 switch (cmd) {
4029 case FUTEX_WAIT:
4030 case FUTEX_LOCK_PI:
4031 case FUTEX_LOCK_PI2:
4032 case FUTEX_WAIT_BITSET:
4033 case FUTEX_WAIT_REQUEUE_PI:
4034 return true;
4035 }
4036 return false;
4037 }
4038
4039 static __always_inline int
4040 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
4041 {
4042 if (!timespec64_valid(ts))
4043 return -EINVAL;
4044
4045 *t = timespec64_to_ktime(*ts);
4046 if (cmd == FUTEX_WAIT)
4047 *t = ktime_add_safe(ktime_get(), *t);
4048 else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
4049 *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
4050 return 0;
4051 }
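/*
 * Userspace consequence of the conversion above (illustrative): FUTEX_WAIT
 * takes a relative timeout that is turned into an absolute one here, while
 * the other timed ops expect an absolute timeout on their selected clock:
 *
 *	struct timespec rel = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
 *	syscall(SYS_futex, &word, FUTEX_WAIT, val, &rel, NULL, 0);
 *
 *	struct timespec abs;
 *	clock_gettime(CLOCK_MONOTONIC, &abs);
 *	abs.tv_sec += 1;	// absolute deadline one second from now
 *	syscall(SYS_futex, &word, FUTEX_WAIT_BITSET, val, &abs, NULL,
 *		FUTEX_BITSET_MATCH_ANY);
 */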
4052
4053 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
4054 const struct __kernel_timespec __user *, utime,
4055 u32 __user *, uaddr2, u32, val3)
4056 {
4057 int ret, cmd = op & FUTEX_CMD_MASK;
4058 ktime_t t, *tp = NULL;
4059 struct timespec64 ts;
4060
4061 if (utime && futex_cmd_has_timeout(cmd)) {
4062 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
4063 return -EFAULT;
4064 if (get_timespec64(&ts, utime))
4065 return -EFAULT;
4066 ret = futex_init_timeout(cmd, op, &ts, &t);
4067 if (ret)
4068 return ret;
4069 tp = &t;
4070 }
4071
4072 return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
4073 }
4074
4075 #ifdef CONFIG_COMPAT
4076 /*
4077 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
4078 */
4079 static inline int
4080 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
4081 compat_uptr_t __user *head, unsigned int *pi)
4082 {
4083 if (get_user(*uentry, head))
4084 return -EFAULT;
4085
4086 *entry = compat_ptr((*uentry) & ~1);
4087 *pi = (unsigned int)(*uentry) & 1;
4088
4089 return 0;
4090 }
4091
4092 static void __user *futex_uaddr(struct robust_list __user *entry,
4093 compat_long_t futex_offset)
4094 {
4095 compat_uptr_t base = ptr_to_compat(entry);
4096 void __user *uaddr = compat_ptr(base + futex_offset);
4097
4098 return uaddr;
4099 }
4100
4101 /*
4102 * Walk curr->robust_list (very carefully, it's a userspace list!)
4103 * and mark any locks found there dead, and notify any waiters.
4104 *
4105 * We silently return on any sign of list-walking problem.
4106 */
4107 static void compat_exit_robust_list(struct task_struct *curr)
4108 {
4109 struct compat_robust_list_head __user *head = curr->compat_robust_list;
4110 struct robust_list __user *entry, *next_entry, *pending;
4111 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
4112 unsigned int next_pi;
4113 compat_uptr_t uentry, next_uentry, upending;
4114 compat_long_t futex_offset;
4115 int rc;
4116
4117 if (!futex_cmpxchg_enabled)
4118 return;
4119
4120 /*
4121 * Fetch the list head (which was registered earlier, via
4122 * sys_set_robust_list()):
4123 */
4124 if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
4125 return;
4126 /*
4127 * Fetch the relative futex offset:
4128 */
4129 if (get_user(futex_offset, &head->futex_offset))
4130 return;
4131 /*
4132 * Fetch any possibly pending lock-add first, and handle it
4133 * if it exists:
4134 */
4135 if (compat_fetch_robust_entry(&upending, &pending,
4136 &head->list_op_pending, &pip))
4137 return;
4138
4139 next_entry = NULL; /* avoid warning with gcc */
4140 while (entry != (struct robust_list __user *) &head->list) {
4141 /*
4142 * Fetch the next entry in the list before calling
4143 * handle_futex_death:
4144 */
4145 rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
4146 (compat_uptr_t __user *)&entry->next, &next_pi);
4147 /*
4148 * A pending lock might already be on the list, so
4149 * don't process it twice:
4150 */
4151 if (entry != pending) {
4152 void __user *uaddr = futex_uaddr(entry, futex_offset);
4153
4154 if (handle_futex_death(uaddr, curr, pi,
4155 HANDLE_DEATH_LIST))
4156 return;
4157 }
4158 if (rc)
4159 return;
4160 uentry = next_uentry;
4161 entry = next_entry;
4162 pi = next_pi;
4163 /*
4164 * Avoid excessively long or circular lists:
4165 */
4166 if (!--limit)
4167 break;
4168
4169 cond_resched();
4170 }
4171 if (pending) {
4172 void __user *uaddr = futex_uaddr(pending, futex_offset);
4173
4174 handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
4175 }
4176 }
4177
4178 COMPAT_SYSCALL_DEFINE2(set_robust_list,
4179 struct compat_robust_list_head __user *, head,
4180 compat_size_t, len)
4181 {
4182 if (!futex_cmpxchg_enabled)
4183 return -ENOSYS;
4184
4185 if (unlikely(len != sizeof(*head)))
4186 return -EINVAL;
4187
4188 current->compat_robust_list = head;
4189
4190 return 0;
4191 }
4192
4193 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
4194 compat_uptr_t __user *, head_ptr,
4195 compat_size_t __user *, len_ptr)
4196 {
4197 struct compat_robust_list_head __user *head;
4198 unsigned long ret;
4199 struct task_struct *p;
4200
4201 if (!futex_cmpxchg_enabled)
4202 return -ENOSYS;
4203
4204 rcu_read_lock();
4205
4206 ret = -ESRCH;
4207 if (!pid)
4208 p = current;
4209 else {
4210 p = find_task_by_vpid(pid);
4211 if (!p)
4212 goto err_unlock;
4213 }
4214
4215 ret = -EPERM;
4216 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
4217 goto err_unlock;
4218
4219 head = p->compat_robust_list;
4220 rcu_read_unlock();
4221
4222 if (put_user(sizeof(*head), len_ptr))
4223 return -EFAULT;
4224 return put_user(ptr_to_compat(head), head_ptr);
4225
4226 err_unlock:
4227 rcu_read_unlock();
4228
4229 return ret;
4230 }
4231 #endif /* CONFIG_COMPAT */
4232
4233 #ifdef CONFIG_COMPAT_32BIT_TIME
4234 SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
4235 const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
4236 u32, val3)
4237 {
4238 int ret, cmd = op & FUTEX_CMD_MASK;
4239 ktime_t t, *tp = NULL;
4240 struct timespec64 ts;
4241
4242 if (utime && futex_cmd_has_timeout(cmd)) {
4243 if (get_old_timespec32(&ts, utime))
4244 return -EFAULT;
4245 ret = futex_init_timeout(cmd, op, &ts, &t);
4246 if (ret)
4247 return ret;
4248 tp = &t;
4249 }
4250
4251 return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
4252 }
4253 #endif /* CONFIG_COMPAT_32BIT_TIME */
4254
4255 static void __init futex_detect_cmpxchg(void)
4256 {
4257 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
4258 u32 curval;
4259
4260 /*
4261 * This will fail and we want it. Some arch implementations do
4262 * runtime detection of the futex_atomic_cmpxchg_inatomic()
4263 * functionality. We want to know that before we call in any
4264 * of the complex code paths. Also we want to prevent
4265 * registration of robust lists in that case. NULL is
4266 * guaranteed to fault and we get -EFAULT on functional
4267 * implementation, the non-functional ones will return
4268 * -ENOSYS.
4269 */
4270 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4271 futex_cmpxchg_enabled = 1;
4272 #endif
4273 }
4274
4275 static int __init futex_init(void)
4276 {
4277 unsigned int futex_shift;
4278 unsigned long i;
4279
4280 #if CONFIG_BASE_SMALL
4281 futex_hashsize = 16;
4282 #else
4283 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4284 #endif
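	/*
	 * Worked example (illustrative): with 6 possible CPUs the non-small
	 * case asks for 256 * 6 = 1536 buckets, which roundup_pow_of_two()
	 * turns into 2048; alloc_large_system_hash() may still adjust the
	 * size, which is why futex_hashsize is re-derived from futex_shift
	 * below.
	 */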
4285
4286 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4287 futex_hashsize, 0,
4288 futex_hashsize < 256 ? HASH_SMALL : 0,
4289 &futex_shift, NULL,
4290 futex_hashsize, futex_hashsize);
4291 futex_hashsize = 1UL << futex_shift;
4292
4293 futex_detect_cmpxchg();
4294
4295 for (i = 0; i < futex_hashsize; i++) {
4296 atomic_set(&futex_queues[i].waiters, 0);
4297 plist_head_init(&futex_queues[i].chain);
4298 spin_lock_init(&futex_queues[i].lock);
4299 }
4300
4301 return 0;
4302 }
4303 core_initcall(futex_init);
4304