• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Fast Userspace Mutexes (which I call "Futexes!").
3  *  (C) Rusty Russell, IBM 2002
4  *
5  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7  *
8  *  Removed page pinning, fix privately mapped COW pages and other cleanups
9  *  (C) Copyright 2003, 2004 Jamie Lokier
10  *
11  *  Robust futex support started by Ingo Molnar
12  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14  *
15  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
16  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18  *
19  *  PRIVATE futexes by Eric Dumazet
20  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21  *
22  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23  *  Copyright (C) IBM Corporation, 2009
24  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
25  *
26  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27  *  enough at me, Linus for the original (flawed) idea, Matthew
28  *  Kirkwood for proof-of-concept implementation.
29  *
30  *  "The futexes are also cursed."
31  *  "But they come in a choice of three flavours!"
32  *
33  *  This program is free software; you can redistribute it and/or modify
34  *  it under the terms of the GNU General Public License as published by
35  *  the Free Software Foundation; either version 2 of the License, or
36  *  (at your option) any later version.
37  *
38  *  This program is distributed in the hope that it will be useful,
39  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
40  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
41  *  GNU General Public License for more details.
42  *
43  *  You should have received a copy of the GNU General Public License
44  *  along with this program; if not, write to the Free Software
45  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46  */
47 #include <linux/compat.h>
48 #include <linux/slab.h>
49 #include <linux/poll.h>
50 #include <linux/fs.h>
51 #include <linux/file.h>
52 #include <linux/jhash.h>
53 #include <linux/init.h>
54 #include <linux/futex.h>
55 #include <linux/mount.h>
56 #include <linux/pagemap.h>
57 #include <linux/syscalls.h>
58 #include <linux/signal.h>
59 #include <linux/export.h>
60 #include <linux/magic.h>
61 #include <linux/pid.h>
62 #include <linux/nsproxy.h>
63 #include <linux/ptrace.h>
64 #include <linux/sched/rt.h>
65 #include <linux/sched/wake_q.h>
66 #include <linux/sched/mm.h>
67 #include <linux/hugetlb.h>
68 #include <linux/freezer.h>
69 #include <linux/bootmem.h>
70 #include <linux/fault-inject.h>
71 
72 #include <asm/futex.h>
73 
74 #include "locking/rtmutex_common.h"
75 
76 /*
77  * READ this before attempting to hack on futexes!
78  *
79  * Basic futex operation and ordering guarantees
80  * =============================================
81  *
82  * The waiter reads the futex value in user space and calls
83  * futex_wait(). This function computes the hash bucket and acquires
84  * the hash bucket lock. After that it reads the futex user space value
85  * again and verifies that the data has not changed. If it has not changed
86  * it enqueues itself into the hash bucket, releases the hash bucket lock
87  * and schedules.
88  *
89  * The waker side modifies the user space value of the futex and calls
90  * futex_wake(). This function computes the hash bucket and acquires the
91  * hash bucket lock. Then it looks for waiters on that futex in the hash
92  * bucket and wakes them.
93  *
94  * In futex wake up scenarios where no tasks are blocked on a futex, the
95  * waker can skip taking the hb spinlock and simply return. In order for this
96  * optimization to work, ordering guarantees must exist so that the waiter
97  * being added to the list is acknowledged when the list is concurrently being
98  * checked by the waker, avoiding scenarios like the following:
99  *
100  * CPU 0                               CPU 1
101  * val = *futex;
102  * sys_futex(WAIT, futex, val);
103  *   futex_wait(futex, val);
104  *   uval = *futex;
105  *                                     *futex = newval;
106  *                                     sys_futex(WAKE, futex);
107  *                                       futex_wake(futex);
108  *                                       if (queue_empty())
109  *                                         return;
110  *   if (uval == val)
111  *      lock(hash_bucket(futex));
112  *      queue();
113  *     unlock(hash_bucket(futex));
114  *     schedule();
115  *
116  * This would cause the waiter on CPU 0 to wait forever because it
117  * missed the transition of the user space value from val to newval
118  * and the waker did not find the waiter in the hash bucket queue.
119  *
120  * The correct serialization ensures that a waiter either observes
121  * the changed user space value before blocking or is woken by a
122  * concurrent waker:
123  *
124  * CPU 0                                 CPU 1
125  * val = *futex;
126  * sys_futex(WAIT, futex, val);
127  *   futex_wait(futex, val);
128  *
129  *   waiters++; (a)
130  *   smp_mb(); (A) <-- paired with -.
131  *                                  |
132  *   lock(hash_bucket(futex));      |
133  *                                  |
134  *   uval = *futex;                 |
135  *                                  |        *futex = newval;
136  *                                  |        sys_futex(WAKE, futex);
137  *                                  |          futex_wake(futex);
138  *                                  |
139  *                                  `--------> smp_mb(); (B)
140  *   if (uval == val)
141  *     queue();
142  *     unlock(hash_bucket(futex));
143  *     schedule();                         if (waiters)
144  *                                           lock(hash_bucket(futex));
145  *   else                                    wake_waiters(futex);
146  *     waiters--; (b)                        unlock(hash_bucket(futex));
147  *
148  * Where (A) orders the waiters increment and the futex value read through
149  * atomic operations (see hb_waiters_inc) and where (B) orders the write
150  * to futex and the waiters read -- this is done by the barriers for both
151  * shared and private futexes in get_futex_key_refs().
152  *
153  * This yields the following case (where X:=waiters, Y:=futex):
154  *
155  *	X = Y = 0
156  *
157  *	w[X]=1		w[Y]=1
158  *	MB		MB
159  *	r[Y]=y		r[X]=x
160  *
161  * Which guarantees that x==0 && y==0 is impossible; which translates back into
162  * the guarantee that we cannot both miss the futex variable change and the
163  * enqueue.
164  *
165  * Note that a new waiter is accounted for in (a) even when it is possible that
166  * the wait call can return error, in which case we backtrack from it in (b).
167  * Refer to the comment in queue_lock().
168  *
169  * Similarly, in order to account for waiters being requeued on another
170  * address we always increment the waiters for the destination bucket before
171  * acquiring the lock. It then decrements them again after releasing it -
172  * the code that actually moves the futex(es) between hash buckets (requeue_futex)
173  * will do the additional required waiter count housekeeping. This is done for
174  * double_lock_hb() and double_unlock_hb(), respectively.
175  */
176 
177 #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
178 #define futex_cmpxchg_enabled 1
179 #else
180 static int  __read_mostly futex_cmpxchg_enabled;
181 #endif
182 
/*
 * Internal futex flag bits: they encode per-call options for the helper
 * functions and are preserved across restarts.
 *
 * NOMMU has no per-process address space, so FLAGS_SHARED is defined as 0
 * there, letting the compiler optimise the shared-only code away.
 */
#ifndef CONFIG_MMU
# define FLAGS_SHARED		0x00
#else
# define FLAGS_SHARED		0x01
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04
198 
199 /*
200  * Priority Inheritance state:
201  */
202 struct futex_pi_state {
203 	/*
204 	 * list of 'owned' pi_state instances - these have to be
205 	 * cleaned up in do_exit() if the task exits prematurely:
206 	 */
207 	struct list_head list;
208 
209 	/*
210 	 * The PI object:
211 	 */
212 	struct rt_mutex pi_mutex;
213 
214 	struct task_struct *owner;
215 	atomic_t refcount;
216 
217 	union futex_key key;
218 } __randomize_layout;
219 
220 /**
221  * struct futex_q - The hashed futex queue entry, one per waiting task
222  * @list:		priority-sorted list of tasks waiting on this futex
223  * @task:		the task waiting on the futex
224  * @lock_ptr:		the hash bucket lock
225  * @key:		the key the futex is hashed on
226  * @pi_state:		optional priority inheritance state
227  * @rt_waiter:		rt_waiter storage for use with requeue_pi
228  * @requeue_pi_key:	the requeue_pi target futex key
229  * @bitset:		bitset for the optional bitmasked wakeup
230  *
231  * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
232  * we can wake only the relevant ones (hashed queues may be shared).
233  *
234  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
235  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
236  * The order of wakeup is always to make the first condition true, then
237  * the second.
238  *
239  * PI futexes are typically woken before they are removed from the hash list via
240  * the rt_mutex code. See unqueue_me_pi().
241  */
242 struct futex_q {
243 	struct plist_node list;
244 
245 	struct task_struct *task;
246 	spinlock_t *lock_ptr;
247 	union futex_key key;
248 	struct futex_pi_state *pi_state;
249 	struct rt_mutex_waiter *rt_waiter;
250 	union futex_key *requeue_pi_key;
251 	u32 bitset;
252 } __randomize_layout;
253 
254 static const struct futex_q futex_q_init = {
255 	/* list gets initialized in queue_me()*/
256 	.key = FUTEX_KEY_INIT,
257 	.bitset = FUTEX_BITSET_MATCH_ANY
258 };
259 
260 /*
261  * Hash buckets are shared by all the futex_keys that hash to the same
262  * location.  Each key may have multiple futex_q structures, one for each task
263  * waiting on a futex.
264  */
265 struct futex_hash_bucket {
266 	atomic_t waiters;
267 	spinlock_t lock;
268 	struct plist_head chain;
269 } ____cacheline_aligned_in_smp;
270 
271 /*
272  * The base of the bucket array and its size are always used together
273  * (after initialization only in hash_futex()), so ensure that they
274  * reside in the same cacheline.
275  */
276 static struct {
277 	struct futex_hash_bucket *queues;
278 	unsigned long            hashsize;
279 } __futex_data __read_mostly __aligned(2*sizeof(long));
280 #define futex_queues   (__futex_data.queues)
281 #define futex_hashsize (__futex_data.hashsize)
282 
283 
284 /*
285  * Fault injections for futexes.
286  */
287 #ifdef CONFIG_FAIL_FUTEX
288 
289 static struct {
290 	struct fault_attr attr;
291 
292 	bool ignore_private;
293 } fail_futex = {
294 	.attr = FAULT_ATTR_INITIALIZER,
295 	.ignore_private = false,
296 };
297 
setup_fail_futex(char * str)298 static int __init setup_fail_futex(char *str)
299 {
300 	return setup_fault_attr(&fail_futex.attr, str);
301 }
302 __setup("fail_futex=", setup_fail_futex);
303 
/*
 * Decide whether a failure should be injected for this futex operation.
 * @fshared: true when the futex is process-shared.
 *
 * Private futexes are never failed while fail_futex.ignore_private is set;
 * in every other case the decision is left to should_fail().
 */
static bool should_fail_futex(bool fshared)
{
	bool skip_private = fail_futex.ignore_private;

	if (fshared || !skip_private)
		return should_fail(&fail_futex.attr, 1);

	return false;
}
311 
312 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
313 
fail_futex_debugfs(void)314 static int __init fail_futex_debugfs(void)
315 {
316 	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
317 	struct dentry *dir;
318 
319 	dir = fault_create_debugfs_attr("fail_futex", NULL,
320 					&fail_futex.attr);
321 	if (IS_ERR(dir))
322 		return PTR_ERR(dir);
323 
324 	if (!debugfs_create_bool("ignore-private", mode, dir,
325 				 &fail_futex.ignore_private)) {
326 		debugfs_remove_recursive(dir);
327 		return -ENOMEM;
328 	}
329 
330 	return 0;
331 }
332 
333 late_initcall(fail_futex_debugfs);
334 
335 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
336 
337 #else
/* Fault injection is compiled out: never request a failure. */
static inline bool should_fail_futex(bool fshared) { return false; }
342 #endif /* CONFIG_FAIL_FUTEX */
343 
/*
 * compat_exit_robust_list() is only forward-declared here when CONFIG_COMPAT
 * is set; without it an empty inline stub is provided instead.
 */
#ifndef CONFIG_COMPAT
static inline void compat_exit_robust_list(struct task_struct *curr)
{
}
#else
static void compat_exit_robust_list(struct task_struct *curr);
#endif
349 
futex_get_mm(union futex_key * key)350 static inline void futex_get_mm(union futex_key *key)
351 {
352 	mmgrab(key->private.mm);
353 	/*
354 	 * Ensure futex_get_mm() implies a full barrier such that
355 	 * get_futex_key() implies a full barrier. This is relied upon
356 	 * as smp_mb(); (B), see the ordering comment above.
357 	 */
358 	smp_mb__after_atomic();
359 }
360 
361 /*
362  * Reflects a new waiter being added to the waitqueue.
363  */
hb_waiters_inc(struct futex_hash_bucket * hb)364 static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
365 {
366 #ifdef CONFIG_SMP
367 	atomic_inc(&hb->waiters);
368 	/*
369 	 * Full barrier (A), see the ordering comment above.
370 	 */
371 	smp_mb__after_atomic();
372 #endif
373 }
374 
375 /*
376  * Reflects a waiter being removed from the waitqueue by wakeup
377  * paths.
378  */
hb_waiters_dec(struct futex_hash_bucket * hb)379 static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
380 {
381 #ifdef CONFIG_SMP
382 	atomic_dec(&hb->waiters);
383 #endif
384 }
385 
hb_waiters_pending(struct futex_hash_bucket * hb)386 static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
387 {
388 #ifdef CONFIG_SMP
389 	return atomic_read(&hb->waiters);
390 #else
391 	return 1;
392 #endif
393 }
394 
395 /**
396  * hash_futex - Return the hash bucket in the global hash
397  * @key:	Pointer to the futex key for which the hash is calculated
398  *
399  * We hash on the keys returned from get_futex_key (see below) and return the
400  * corresponding hash bucket in the global hash.
401  */
hash_futex(union futex_key * key)402 static struct futex_hash_bucket *hash_futex(union futex_key *key)
403 {
404 	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
405 			  key->both.offset);
406 
407 	return &futex_queues[hash & (futex_hashsize - 1)];
408 }
409 
410 
411 /**
412  * match_futex - Check whether two futex keys are equal
413  * @key1:	Pointer to key1
414  * @key2:	Pointer to key2
415  *
416  * Return 1 if two futex_keys are equal, 0 otherwise.
417  */
match_futex(union futex_key * key1,union futex_key * key2)418 static inline int match_futex(union futex_key *key1, union futex_key *key2)
419 {
420 	return (key1 && key2
421 		&& key1->both.word == key2->both.word
422 		&& key1->both.ptr == key2->both.ptr
423 		&& key1->both.offset == key2->both.offset);
424 }
425 
426 /*
427  * Take a reference to the resource addressed by a key.
428  * Can be called while holding spinlocks.
429  *
430  */
get_futex_key_refs(union futex_key * key)431 static void get_futex_key_refs(union futex_key *key)
432 {
433 	if (!key->both.ptr)
434 		return;
435 
436 	/*
437 	 * On MMU less systems futexes are always "private" as there is no per
438 	 * process address space. We need the smp wmb nevertheless - yes,
439 	 * arch/blackfin has MMU less SMP ...
440 	 */
441 	if (!IS_ENABLED(CONFIG_MMU)) {
442 		smp_mb(); /* explicit smp_mb(); (B) */
443 		return;
444 	}
445 
446 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
447 	case FUT_OFF_INODE:
448 		smp_mb();		/* explicit smp_mb(); (B) */
449 		break;
450 	case FUT_OFF_MMSHARED:
451 		futex_get_mm(key); /* implies smp_mb(); (B) */
452 		break;
453 	default:
454 		/*
455 		 * Private futexes do not hold reference on an inode or
456 		 * mm, therefore the only purpose of calling get_futex_key_refs
457 		 * is because we need the barrier for the lockless waiter check.
458 		 */
459 		smp_mb(); /* explicit smp_mb(); (B) */
460 	}
461 }
462 
463 /*
464  * Drop a reference to the resource addressed by a key.
465  * The hash bucket spinlock must not be held. This is
466  * a no-op for private futexes, see comment in the get
467  * counterpart.
468  */
drop_futex_key_refs(union futex_key * key)469 static void drop_futex_key_refs(union futex_key *key)
470 {
471 	if (!key->both.ptr) {
472 		/* If we're here then we tried to put a key we failed to get */
473 		WARN_ON_ONCE(1);
474 		return;
475 	}
476 
477 	if (!IS_ENABLED(CONFIG_MMU))
478 		return;
479 
480 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
481 	case FUT_OFF_INODE:
482 		break;
483 	case FUT_OFF_MMSHARED:
484 		mmdrop(key->private.mm);
485 		break;
486 	}
487 }
488 
489 /*
490  * Generate a machine wide unique identifier for this inode.
491  *
492  * This relies on u64 not wrapping in the life-time of the machine; which with
493  * 1ns resolution means almost 585 years.
494  *
495  * This further relies on the fact that a well formed program will not unmap
496  * the file while it has a (shared) futex waiting on it. This mapping will have
497  * a file reference which pins the mount and inode.
498  *
499  * If for some reason an inode gets evicted and read back in again, it will get
500  * a new sequence number and will _NOT_ match, even though it is the exact same
501  * file.
502  *
503  * It is important that match_futex() will never have a false-positive, esp.
504  * for PI futexes that can mess up the state. The above argues that false-negatives
505  * are only possible for malformed programs.
506  */
get_inode_sequence_number(struct inode * inode)507 static u64 get_inode_sequence_number(struct inode *inode)
508 {
509 	static atomic64_t i_seq;
510 	u64 old;
511 
512 	/* Does the inode already have a sequence number? */
513 	old = atomic64_read(&inode->i_sequence);
514 	if (likely(old))
515 		return old;
516 
517 	for (;;) {
518 		u64 new = atomic64_add_return(1, &i_seq);
519 		if (WARN_ON_ONCE(!new))
520 			continue;
521 
522 		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
523 		if (old)
524 			return old;
525 		return new;
526 	}
527 }
528 
529 /**
530  * get_futex_key() - Get parameters which are the keys for a futex
531  * @uaddr:	virtual address of the futex
532  * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
533  * @key:	address where result is stored.
534  * @rw:		mapping needs to be read/write (values: VERIFY_READ,
535  *              VERIFY_WRITE)
536  *
537  * Return: a negative error code or 0
538  *
539  * The key words are stored in @key on success.
540  *
541  * For shared mappings (when @fshared), the key is:
542  *   ( inode->i_sequence, page->index, offset_within_page )
543  * [ also see get_inode_sequence_number() ]
544  *
545  * For private mappings (or when !@fshared), the key is:
546  *   ( current->mm, address, 0 )
547  *
548  * This allows (cross process, where applicable) identification of the futex
549  * without keeping the page pinned for the duration of the FUTEX_WAIT.
550  *
551  * lock_page() might sleep, the caller should not hold a spinlock.
552  */
553 static int
get_futex_key(u32 __user * uaddr,int fshared,union futex_key * key,int rw)554 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
555 {
556 	unsigned long address = (unsigned long)uaddr;
557 	struct mm_struct *mm = current->mm;
558 	struct page *page, *tail;
559 	struct address_space *mapping;
560 	int err, ro = 0;
561 
562 	/*
563 	 * The futex address must be "naturally" aligned.
564 	 */
565 	key->both.offset = address % PAGE_SIZE;
566 	if (unlikely((address % sizeof(u32)) != 0))
567 		return -EINVAL;
568 	address -= key->both.offset;
569 
570 	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
571 		return -EFAULT;
572 
573 	if (unlikely(should_fail_futex(fshared)))
574 		return -EFAULT;
575 
576 	/*
577 	 * PROCESS_PRIVATE futexes are fast.
578 	 * As the mm cannot disappear under us and the 'key' only needs
579 	 * virtual address, we dont even have to find the underlying vma.
580 	 * Note : We do have to check 'uaddr' is a valid user address,
581 	 *        but access_ok() should be faster than find_vma()
582 	 */
583 	if (!fshared) {
584 		key->private.mm = mm;
585 		key->private.address = address;
586 		get_futex_key_refs(key);  /* implies smp_mb(); (B) */
587 		return 0;
588 	}
589 
590 again:
591 	/* Ignore any VERIFY_READ mapping (futex common case) */
592 	if (unlikely(should_fail_futex(fshared)))
593 		return -EFAULT;
594 
595 	err = get_user_pages_fast(address, 1, 1, &page);
596 	/*
597 	 * If write access is not required (eg. FUTEX_WAIT), try
598 	 * and get read-only access.
599 	 */
600 	if (err == -EFAULT && rw == VERIFY_READ) {
601 		err = get_user_pages_fast(address, 1, 0, &page);
602 		ro = 1;
603 	}
604 	if (err < 0)
605 		return err;
606 	else
607 		err = 0;
608 
609 	/*
610 	 * The treatment of mapping from this point on is critical. The page
611 	 * lock protects many things but in this context the page lock
612 	 * stabilizes mapping, prevents inode freeing in the shared
613 	 * file-backed region case and guards against movement to swap cache.
614 	 *
615 	 * Strictly speaking the page lock is not needed in all cases being
616 	 * considered here and page lock forces unnecessarily serialization
617 	 * From this point on, mapping will be re-verified if necessary and
618 	 * page lock will be acquired only if it is unavoidable
619 	 *
620 	 * Mapping checks require the head page for any compound page so the
621 	 * head page and mapping is looked up now. For anonymous pages, it
622 	 * does not matter if the page splits in the future as the key is
623 	 * based on the address. For filesystem-backed pages, the tail is
624 	 * required as the index of the page determines the key. For
625 	 * base pages, there is no tail page and tail == page.
626 	 */
627 	tail = page;
628 	page = compound_head(page);
629 	mapping = READ_ONCE(page->mapping);
630 
631 	/*
632 	 * If page->mapping is NULL, then it cannot be a PageAnon
633 	 * page; but it might be the ZERO_PAGE or in the gate area or
634 	 * in a special mapping (all cases which we are happy to fail);
635 	 * or it may have been a good file page when get_user_pages_fast
636 	 * found it, but truncated or holepunched or subjected to
637 	 * invalidate_complete_page2 before we got the page lock (also
638 	 * cases which we are happy to fail).  And we hold a reference,
639 	 * so refcount care in invalidate_complete_page's remove_mapping
640 	 * prevents drop_caches from setting mapping to NULL beneath us.
641 	 *
642 	 * The case we do have to guard against is when memory pressure made
643 	 * shmem_writepage move it from filecache to swapcache beneath us:
644 	 * an unlikely race, but we do need to retry for page->mapping.
645 	 */
646 	if (unlikely(!mapping)) {
647 		int shmem_swizzled;
648 
649 		/*
650 		 * Page lock is required to identify which special case above
651 		 * applies. If this is really a shmem page then the page lock
652 		 * will prevent unexpected transitions.
653 		 */
654 		lock_page(page);
655 		shmem_swizzled = PageSwapCache(page) || page->mapping;
656 		unlock_page(page);
657 		put_page(page);
658 
659 		if (shmem_swizzled)
660 			goto again;
661 
662 		return -EFAULT;
663 	}
664 
665 	/*
666 	 * Private mappings are handled in a simple way.
667 	 *
668 	 * If the futex key is stored on an anonymous page, then the associated
669 	 * object is the mm which is implicitly pinned by the calling process.
670 	 *
671 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
672 	 * it's a read-only handle, it's expected that futexes attach to
673 	 * the object not the particular process.
674 	 */
675 	if (PageAnon(page)) {
676 		/*
677 		 * A RO anonymous page will never change and thus doesn't make
678 		 * sense for futex operations.
679 		 */
680 		if (unlikely(should_fail_futex(fshared)) || ro) {
681 			err = -EFAULT;
682 			goto out;
683 		}
684 
685 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
686 		key->private.mm = mm;
687 		key->private.address = address;
688 
689 	} else {
690 		struct inode *inode;
691 
692 		/*
693 		 * The associated futex object in this case is the inode and
694 		 * the page->mapping must be traversed. Ordinarily this should
695 		 * be stabilised under page lock but it's not strictly
696 		 * necessary in this case as we just want to pin the inode, not
697 		 * update the radix tree or anything like that.
698 		 *
699 		 * The RCU read lock is taken as the inode is finally freed
700 		 * under RCU. If the mapping still matches expectations then the
701 		 * mapping->host can be safely accessed as being a valid inode.
702 		 */
703 		rcu_read_lock();
704 
705 		if (READ_ONCE(page->mapping) != mapping) {
706 			rcu_read_unlock();
707 			put_page(page);
708 
709 			goto again;
710 		}
711 
712 		inode = READ_ONCE(mapping->host);
713 		if (!inode) {
714 			rcu_read_unlock();
715 			put_page(page);
716 
717 			goto again;
718 		}
719 
720 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
721 		key->shared.i_seq = get_inode_sequence_number(inode);
722 		key->shared.pgoff = basepage_index(tail);
723 		rcu_read_unlock();
724 	}
725 
726 	get_futex_key_refs(key); /* implies smp_mb(); (B) */
727 
728 out:
729 	put_page(page);
730 	return err;
731 }
732 
/*
 * Release a key obtained from get_futex_key(); simply forwards to
 * drop_futex_key_refs().
 */
static inline void put_futex_key(union futex_key *key)
{
	drop_futex_key_refs(key);
}
737 
738 /**
739  * fault_in_user_writeable() - Fault in user address and verify RW access
740  * @uaddr:	pointer to faulting user space address
741  *
742  * Slow path to fixup the fault we just took in the atomic write
743  * access to @uaddr.
744  *
745  * We have no generic implementation of a non-destructive write to the
746  * user address. We know that we faulted in the atomic pagefault
747  * disabled section so we can as well avoid the #PF overhead by
748  * calling get_user_pages() right away.
749  */
fault_in_user_writeable(u32 __user * uaddr)750 static int fault_in_user_writeable(u32 __user *uaddr)
751 {
752 	struct mm_struct *mm = current->mm;
753 	int ret;
754 
755 	down_read(&mm->mmap_sem);
756 	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
757 			       FAULT_FLAG_WRITE, NULL);
758 	up_read(&mm->mmap_sem);
759 
760 	return ret < 0 ? ret : 0;
761 }
762 
763 /**
764  * futex_top_waiter() - Return the highest priority waiter on a futex
765  * @hb:		the hash bucket the futex_q's reside in
766  * @key:	the futex key (to distinguish it from other futex futex_q's)
767  *
768  * Must be called with the hb lock held.
769  */
futex_top_waiter(struct futex_hash_bucket * hb,union futex_key * key)770 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
771 					union futex_key *key)
772 {
773 	struct futex_q *this;
774 
775 	plist_for_each_entry(this, &hb->chain, list) {
776 		if (match_futex(&this->key, key))
777 			return this;
778 	}
779 	return NULL;
780 }
781 
/*
 * Run futex_atomic_cmpxchg_inatomic() on the user word at @uaddr with page
 * faults disabled and return its result unchanged. @curval, @uval and
 * @newval are passed straight through.
 */
static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int err;

	pagefault_disable();
	err = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return err;
}
793 
/*
 * Read the user word at @from into *@dest with page faults disabled.
 * Returns 0 on success and -EFAULT if __get_user() reports any failure.
 */
static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int err;

	pagefault_disable();
	err = __get_user(*dest, from);
	pagefault_enable();

	if (err)
		return -EFAULT;

	return 0;
}
804 
805 
806 /*
807  * PI code:
808  */
refill_pi_state_cache(void)809 static int refill_pi_state_cache(void)
810 {
811 	struct futex_pi_state *pi_state;
812 
813 	if (likely(current->pi_state_cache))
814 		return 0;
815 
816 	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
817 
818 	if (!pi_state)
819 		return -ENOMEM;
820 
821 	INIT_LIST_HEAD(&pi_state->list);
822 	/* pi_mutex gets initialized later */
823 	pi_state->owner = NULL;
824 	atomic_set(&pi_state->refcount, 1);
825 	pi_state->key = FUTEX_KEY_INIT;
826 
827 	current->pi_state_cache = pi_state;
828 
829 	return 0;
830 }
831 
alloc_pi_state(void)832 static struct futex_pi_state *alloc_pi_state(void)
833 {
834 	struct futex_pi_state *pi_state = current->pi_state_cache;
835 
836 	WARN_ON(!pi_state);
837 	current->pi_state_cache = NULL;
838 
839 	return pi_state;
840 }
841 
get_pi_state(struct futex_pi_state * pi_state)842 static void get_pi_state(struct futex_pi_state *pi_state)
843 {
844 	WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
845 }
846 
847 /*
848  * Drops a reference to the pi_state object and frees or caches it
849  * when the last reference is gone.
850  */
put_pi_state(struct futex_pi_state * pi_state)851 static void put_pi_state(struct futex_pi_state *pi_state)
852 {
853 	if (!pi_state)
854 		return;
855 
856 	if (!atomic_dec_and_test(&pi_state->refcount))
857 		return;
858 
859 	/*
860 	 * If pi_state->owner is NULL, the owner is most probably dying
861 	 * and has cleaned up the pi_state already
862 	 */
863 	if (pi_state->owner) {
864 		struct task_struct *owner;
865 
866 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
867 		owner = pi_state->owner;
868 		if (owner) {
869 			raw_spin_lock(&owner->pi_lock);
870 			list_del_init(&pi_state->list);
871 			raw_spin_unlock(&owner->pi_lock);
872 		}
873 		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
874 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
875 	}
876 
877 	if (current->pi_state_cache) {
878 		kfree(pi_state);
879 	} else {
880 		/*
881 		 * pi_state->list is already empty.
882 		 * clear pi_state->owner.
883 		 * refcount is at 0 - put it back to 1.
884 		 */
885 		pi_state->owner = NULL;
886 		atomic_set(&pi_state->refcount, 1);
887 		current->pi_state_cache = pi_state;
888 	}
889 }
890 
891 /*
892  * Look up the task based on what TID userspace gave us.
893  * We dont trust it.
894  */
futex_find_get_task(pid_t pid)895 static struct task_struct *futex_find_get_task(pid_t pid)
896 {
897 	struct task_struct *p;
898 
899 	rcu_read_lock();
900 	p = find_task_by_vpid(pid);
901 	if (p)
902 		get_task_struct(p);
903 
904 	rcu_read_unlock();
905 
906 	return p;
907 }
908 
#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 *
 * Drains curr->pi_state_list: every pi_state still found on the list is
 * unlinked, gets its owner cleared and has its rt_mutex released through
 * rt_mutex_futex_unlock().
 *
 * Loop invariant: curr->pi_lock is held (IRQs disabled) at the top of
 * every iteration and when the loop terminates.
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/* Bail out early when futex_cmpxchg_enabled is not set. */
	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		/* Always work on the first entry; it is removed below. */
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		/* Copy the key so the hash bucket can be found after unlocking. */
		key = pi_state->key;
		hb = hash_futex(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!atomic_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		/*
		 * Retake the locks in the documented order:
		 * hb->lock, then pi_mutex.wait_lock, then pi_lock.
		 */
		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			/* Drop the reference taken above before retrying. */
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		/* Detach the pi_state from the exiting task. */
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* Release the rt_mutex, then drop the reference taken above. */
		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		/* Re-establish the loop invariant. */
		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
/* Without CONFIG_FUTEX_PI there is no PI state to clean up. */
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
989 
990 /*
991  * We need to check the following states:
992  *
993  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
994  *
995  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
996  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
997  *
998  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
999  *
1000  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
1001  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
1002  *
1003  * [6]  Found  | Found    | task      | 0         | 1      | Valid
1004  *
1005  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
1006  *
1007  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
1008  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
1009  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
1010  *
1011  * [1]	Indicates that the kernel can acquire the futex atomically. We
1012  *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
1013  *
1014  * [2]	Valid, if TID does not belong to a kernel thread. If no matching
1015  *      thread is found then it indicates that the owner TID has died.
1016  *
1017  * [3]	Invalid. The waiter is queued on a non PI futex
1018  *
1019  * [4]	Valid state after exit_robust_list(), which sets the user space
1020  *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
1021  *
1022  * [5]	The user space value got manipulated between exit_robust_list()
1023  *	and exit_pi_state_list()
1024  *
1025  * [6]	Valid state after exit_pi_state_list() which sets the new owner in
1026  *	the pi_state but cannot access the user space value.
1027  *
1028  * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
1029  *
1030  * [8]	Owner and user space value match
1031  *
1032  * [9]	There is no transient state which sets the user space TID to 0
1033  *	except exit_robust_list(), but this is indicated by the
1034  *	FUTEX_OWNER_DIED bit. See [4]
1035  *
1036  * [10] There is no transient state which leaves owner and user space
1037  *	TID out of sync.
1038  *
1039  *
1040  * Serialization and lifetime rules:
1041  *
1042  * hb->lock:
1043  *
1044  *	hb -> futex_q, relation
1045  *	futex_q -> pi_state, relation
1046  *
1047  *	(cannot be raw because hb can contain arbitrary amount
1048  *	 of futex_q's)
1049  *
1050  * pi_mutex->wait_lock:
1051  *
1052  *	{uval, pi_state}
1053  *
1054  *	(and pi_mutex 'obviously')
1055  *
1056  * p->pi_lock:
1057  *
1058  *	p->pi_state_list -> pi_state->list, relation
1059  *
1060  * pi_state->refcount:
1061  *
1062  *	pi_state lifetime
1063  *
1064  *
1065  * Lock order:
1066  *
1067  *   hb->lock
1068  *     pi_mutex->wait_lock
1069  *       p->pi_lock
1070  *
1071  */
1072 
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 *
 * @uaddr:	user address of the futex word, re-read under wait_lock
 * @uval:	value of the futex word as read earlier by the caller
 * @pi_state:	pi_state of the top waiter, may be NULL
 * @ps:		where the attached pi_state is stored on success
 *
 * Returns 0 with an additional reference taken on @pi_state and *@ps set,
 * -EINVAL when the pi_state is missing or inconsistent with the user space
 * value, -EAGAIN when the user space value changed since @uval was read,
 * or -EFAULT when the user space value could not be read.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	/* TID part of the user space value. */
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!atomic_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	/* Success: take our own reference before dropping wait_lock. */
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	/* Common failure exit: no reference was taken, *ps is untouched. */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
1195 
1196 /**
1197  * wait_for_owner_exiting - Block until the owner has exited
1198  * @exiting:	Pointer to the exiting task
1199  *
1200  * Caller must hold a refcount on @exiting.
1201  */
wait_for_owner_exiting(int ret,struct task_struct * exiting)1202 static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
1203 {
1204 	if (ret != -EBUSY) {
1205 		WARN_ON_ONCE(exiting);
1206 		return;
1207 	}
1208 
1209 	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
1210 		return;
1211 
1212 	mutex_lock(&exiting->futex_exit_mutex);
1213 	/*
1214 	 * No point in doing state checking here. If the waiter got here
1215 	 * while the task was in exec()->exec_futex_release() then it can
1216 	 * have any FUTEX_STATE_* value when the waiter has acquired the
1217 	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
1218 	 * already. Highly unlikely and not a problem. Just one more round
1219 	 * through the futex maze.
1220 	 */
1221 	mutex_unlock(&exiting->futex_exit_mutex);
1222 
1223 	put_task_struct(exiting);
1224 }
1225 
/*
 * Decide what to report when the alleged owner of a PI futex is exiting
 * or cannot be found.
 *
 * @uaddr:	user address of the futex word
 * @uval:	value of the futex word the caller acted upon
 * @tsk:	the exiting owner task, or NULL when no task was found
 *
 * Returns -EBUSY while @tsk has not reached FUTEX_STATE_DEAD, -EFAULT if
 * the futex word cannot be read, -EAGAIN if the futex word no longer
 * equals @uval, and -ESRCH otherwise.
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 cur;

	/*
	 * The alleged owner is still on its way out: report it as busy so
	 * the caller can wait for it.
	 */
	if (tsk != NULL && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * The user space value has to be read again because of this race:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * An unconditional -ESRCH would be wrong in that situation, since
	 * the exiting task has modified the user space value. The same
	 * reasoning covers an exiting task that is already gone.
	 */
	if (get_futex_value_locked(&cur, uaddr) != 0)
		return -EFAULT;

	/*
	 * A changed value means the caller should retry. An unchanged value
	 * means the exiting task had no robust list, the robust list was
	 * corrupted, or the value in *uaddr is simply bogus: give up and
	 * tell user space.
	 */
	return cur == uval ? -ESRCH : -EAGAIN;
}
1281 
1282 /*
1283  * Lookup the task for the TID provided from user space and attach to
1284  * it after doing proper sanity checks.
1285  */
attach_to_pi_owner(u32 __user * uaddr,u32 uval,union futex_key * key,struct futex_pi_state ** ps,struct task_struct ** exiting)1286 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1287 			      struct futex_pi_state **ps,
1288 			      struct task_struct **exiting)
1289 {
1290 	pid_t pid = uval & FUTEX_TID_MASK;
1291 	struct futex_pi_state *pi_state;
1292 	struct task_struct *p;
1293 
1294 	/*
1295 	 * We are the first waiter - try to look up the real owner and attach
1296 	 * the new pi_state to it, but bail out when TID = 0 [1]
1297 	 *
1298 	 * The !pid check is paranoid. None of the call sites should end up
1299 	 * with pid == 0, but better safe than sorry. Let the caller retry
1300 	 */
1301 	if (!pid)
1302 		return -EAGAIN;
1303 	p = futex_find_get_task(pid);
1304 	if (!p)
1305 		return handle_exit_race(uaddr, uval, NULL);
1306 
1307 	if (unlikely(p->flags & PF_KTHREAD)) {
1308 		put_task_struct(p);
1309 		return -EPERM;
1310 	}
1311 
1312 	/*
1313 	 * We need to look at the task state to figure out, whether the
1314 	 * task is exiting. To protect against the change of the task state
1315 	 * in futex_exit_release(), we do this protected by p->pi_lock:
1316 	 */
1317 	raw_spin_lock_irq(&p->pi_lock);
1318 	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
1319 		/*
1320 		 * The task is on the way out. When the futex state is
1321 		 * FUTEX_STATE_DEAD, we know that the task has finished
1322 		 * the cleanup:
1323 		 */
1324 		int ret = handle_exit_race(uaddr, uval, p);
1325 
1326 		raw_spin_unlock_irq(&p->pi_lock);
1327 		/*
1328 		 * If the owner task is between FUTEX_STATE_EXITING and
1329 		 * FUTEX_STATE_DEAD then store the task pointer and keep
1330 		 * the reference on the task struct. The calling code will
1331 		 * drop all locks, wait for the task to reach
1332 		 * FUTEX_STATE_DEAD and then drop the refcount. This is
1333 		 * required to prevent a live lock when the current task
1334 		 * preempted the exiting task between the two states.
1335 		 */
1336 		if (ret == -EBUSY)
1337 			*exiting = p;
1338 		else
1339 			put_task_struct(p);
1340 		return ret;
1341 	}
1342 
1343 	/*
1344 	 * No existing pi state. First waiter. [2]
1345 	 *
1346 	 * This creates pi_state, we have hb->lock held, this means nothing can
1347 	 * observe this state, wait_lock is irrelevant.
1348 	 */
1349 	pi_state = alloc_pi_state();
1350 
1351 	/*
1352 	 * Initialize the pi_mutex in locked state and make @p
1353 	 * the owner of it:
1354 	 */
1355 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1356 
1357 	/* Store the key for possible exit cleanups: */
1358 	pi_state->key = *key;
1359 
1360 	WARN_ON(!list_empty(&pi_state->list));
1361 	list_add(&pi_state->list, &p->pi_state_list);
1362 	/*
1363 	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1364 	 * because there is no concurrency as the object is not published yet.
1365 	 */
1366 	pi_state->owner = p;
1367 	raw_spin_unlock_irq(&p->pi_lock);
1368 
1369 	put_task_struct(p);
1370 
1371 	*ps = pi_state;
1372 
1373 	return 0;
1374 }
1375 
/*
 * Find the pi_state to use for the futex identified by @key.
 *
 * With a waiter already queued on the futex its pi_state is validated and
 * attached to. Without one we are the first waiter, and the owner is
 * looked up from @uval instead. The result of whichever helper ran is
 * returned unchanged.
 */
static int lookup_pi_state(u32 __user *uaddr, u32 uval,
			   struct futex_hash_bucket *hb,
			   union futex_key *key, struct futex_pi_state **ps,
			   struct task_struct **exiting)
{
	struct futex_q *waiter = futex_top_waiter(hb, key);

	/* Nobody queued yet: attach to the owner named by @uval. */
	if (!waiter)
		return attach_to_pi_owner(uaddr, uval, key, ps, exiting);

	/* Somebody is queued: validate and share its pi_state. */
	return attach_to_pi_state(uaddr, uval, waiter->pi_state, ps);
}
1396 
/*
 * Atomically replace @uval with @newval in the futex word at @uaddr.
 *
 * Returns 0 when the exchange took place, -EAGAIN when the futex word did
 * not contain @uval, -EFAULT on an injected fault, or the error reported
 * by cmpxchg_futex_value_locked().
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 uninitialized_var(seen);
	int rc;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	rc = cmpxchg_futex_value_locked(&seen, uaddr, uval, newval);
	if (unlikely(rc))
		return rc;

	/* A different value was found: let the caller retry. */
	if (seen != uval)
		return -EAGAIN;

	return 0;
}
1412 
1413 /**
1414  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1415  * @uaddr:		the pi futex user address
1416  * @hb:			the pi futex hash bucket
1417  * @key:		the futex key associated with uaddr and hb
1418  * @ps:			the pi_state pointer where we store the result of the
1419  *			lookup
1420  * @task:		the task to perform the atomic lock work for.  This will
1421  *			be "current" except in the case of requeue pi.
1422  * @exiting:		Pointer to store the task pointer of the owner task
1423  *			which is in the middle of exiting
1424  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
1425  *
1426  * Return:
1427  *  -  0 - ready to wait;
1428  *  -  1 - acquired the lock;
1429  *  - <0 - error
1430  *
1431  * The hb->lock and futex_key refs shall be held by the caller.
1432  *
1433  * @exiting is only set when the return value is -EBUSY. If so, this holds
1434  * a refcount on the exiting task on return and the caller needs to drop it
1435  * after waiting for the exit to complete.
1436  */
futex_lock_pi_atomic(u32 __user * uaddr,struct futex_hash_bucket * hb,union futex_key * key,struct futex_pi_state ** ps,struct task_struct * task,struct task_struct ** exiting,int set_waiters)1437 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1438 				union futex_key *key,
1439 				struct futex_pi_state **ps,
1440 				struct task_struct *task,
1441 				struct task_struct **exiting,
1442 				int set_waiters)
1443 {
1444 	u32 uval, newval, vpid = task_pid_vnr(task);
1445 	struct futex_q *top_waiter;
1446 	int ret;
1447 
1448 	/*
1449 	 * Read the user space value first so we can validate a few
1450 	 * things before proceeding further.
1451 	 */
1452 	if (get_futex_value_locked(&uval, uaddr))
1453 		return -EFAULT;
1454 
1455 	if (unlikely(should_fail_futex(true)))
1456 		return -EFAULT;
1457 
1458 	/*
1459 	 * Detect deadlocks.
1460 	 */
1461 	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1462 		return -EDEADLK;
1463 
1464 	if ((unlikely(should_fail_futex(true))))
1465 		return -EDEADLK;
1466 
1467 	/*
1468 	 * Lookup existing state first. If it exists, try to attach to
1469 	 * its pi_state.
1470 	 */
1471 	top_waiter = futex_top_waiter(hb, key);
1472 	if (top_waiter)
1473 		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1474 
1475 	/*
1476 	 * No waiter and user TID is 0. We are here because the
1477 	 * waiters or the owner died bit is set or called from
1478 	 * requeue_cmp_pi or for whatever reason something took the
1479 	 * syscall.
1480 	 */
1481 	if (!(uval & FUTEX_TID_MASK)) {
1482 		/*
1483 		 * We take over the futex. No other waiters and the user space
1484 		 * TID is 0. We preserve the owner died bit.
1485 		 */
1486 		newval = uval & FUTEX_OWNER_DIED;
1487 		newval |= vpid;
1488 
1489 		/* The futex requeue_pi code can enforce the waiters bit */
1490 		if (set_waiters)
1491 			newval |= FUTEX_WAITERS;
1492 
1493 		ret = lock_pi_update_atomic(uaddr, uval, newval);
1494 		/* If the take over worked, return 1 */
1495 		return ret < 0 ? ret : 1;
1496 	}
1497 
1498 	/*
1499 	 * First waiter. Set the waiters bit before attaching ourself to
1500 	 * the owner. If owner tries to unlock, it will be forced into
1501 	 * the kernel and blocked on hb->lock.
1502 	 */
1503 	newval = uval | FUTEX_WAITERS;
1504 	ret = lock_pi_update_atomic(uaddr, uval, newval);
1505 	if (ret)
1506 		return ret;
1507 	/*
1508 	 * If the update of the user space value succeeded, we try to
1509 	 * attach to the owner. If that fails, no harm done, we only
1510 	 * set the FUTEX_WAITERS bit in the user space variable.
1511 	 */
1512 	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
1513 }
1514 
1515 /**
1516  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1517  * @q:	The futex_q to unqueue
1518  *
1519  * The q->lock_ptr must not be NULL and must be held by the caller.
1520  */
__unqueue_futex(struct futex_q * q)1521 static void __unqueue_futex(struct futex_q *q)
1522 {
1523 	struct futex_hash_bucket *hb;
1524 
1525 	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
1526 	    || WARN_ON(plist_node_empty(&q->list)))
1527 		return;
1528 
1529 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1530 	plist_del(&q->list, &hb->chain);
1531 	hb_waiters_dec(hb);
1532 }
1533 
1534 /*
1535  * The hash bucket lock must be held when this is called.
1536  * Afterwards, the futex_q must not be accessed. Callers
1537  * must ensure to later call wake_up_q() for the actual
1538  * wakeups to occur.
1539  */
mark_wake_futex(struct wake_q_head * wake_q,struct futex_q * q)1540 static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1541 {
1542 	struct task_struct *p = q->task;
1543 
1544 	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1545 		return;
1546 
1547 	get_task_struct(p);
1548 	__unqueue_futex(q);
1549 	/*
1550 	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1551 	 * is written, without taking any locks. This is possible in the event
1552 	 * of a spurious wakeup, for example. A memory barrier is required here
1553 	 * to prevent the following store to lock_ptr from getting ahead of the
1554 	 * plist_del in __unqueue_futex().
1555 	 */
1556 	smp_store_release(&q->lock_ptr, NULL);
1557 
1558 	/*
1559 	 * Queue the task for later wakeup for after we've released
1560 	 * the hb->lock. wake_q_add() grabs reference to p.
1561 	 */
1562 	wake_q_add(wake_q, p);
1563 	put_task_struct(p);
1564 }
1565 
1566 /*
1567  * Caller must hold a reference on @pi_state.
1568  */
wake_futex_pi(u32 __user * uaddr,u32 uval,struct futex_pi_state * pi_state)1569 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1570 {
1571 	u32 uninitialized_var(curval), newval;
1572 	struct task_struct *new_owner;
1573 	bool postunlock = false;
1574 	DEFINE_WAKE_Q(wake_q);
1575 	int ret = 0;
1576 
1577 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1578 	if (WARN_ON_ONCE(!new_owner)) {
1579 		/*
1580 		 * As per the comment in futex_unlock_pi() this should not happen.
1581 		 *
1582 		 * When this happens, give up our locks and try again, giving
1583 		 * the futex_lock_pi() instance time to complete, either by
1584 		 * waiting on the rtmutex or removing itself from the futex
1585 		 * queue.
1586 		 */
1587 		ret = -EAGAIN;
1588 		goto out_unlock;
1589 	}
1590 
1591 	/*
1592 	 * We pass it to the next owner. The WAITERS bit is always kept
1593 	 * enabled while there is PI state around. We cleanup the owner
1594 	 * died bit, because we are the owner.
1595 	 */
1596 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1597 
1598 	if (unlikely(should_fail_futex(true)))
1599 		ret = -EFAULT;
1600 
1601 	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1602 	if (!ret && (curval != uval)) {
1603 		/*
1604 		 * If a unconditional UNLOCK_PI operation (user space did not
1605 		 * try the TID->0 transition) raced with a waiter setting the
1606 		 * FUTEX_WAITERS flag between get_user() and locking the hash
1607 		 * bucket lock, retry the operation.
1608 		 */
1609 		if ((FUTEX_TID_MASK & curval) == uval)
1610 			ret = -EAGAIN;
1611 		else
1612 			ret = -EINVAL;
1613 	}
1614 
1615 	if (ret)
1616 		goto out_unlock;
1617 
1618 	/*
1619 	 * This is a point of no return; once we modify the uval there is no
1620 	 * going back and subsequent operations must not fail.
1621 	 */
1622 
1623 	raw_spin_lock(&pi_state->owner->pi_lock);
1624 	WARN_ON(list_empty(&pi_state->list));
1625 	list_del_init(&pi_state->list);
1626 	raw_spin_unlock(&pi_state->owner->pi_lock);
1627 
1628 	raw_spin_lock(&new_owner->pi_lock);
1629 	WARN_ON(!list_empty(&pi_state->list));
1630 	list_add(&pi_state->list, &new_owner->pi_state_list);
1631 	pi_state->owner = new_owner;
1632 	raw_spin_unlock(&new_owner->pi_lock);
1633 
1634 	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1635 
1636 out_unlock:
1637 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1638 
1639 	if (postunlock)
1640 		rt_mutex_postunlock(&wake_q);
1641 
1642 	return ret;
1643 }
1644 
1645 /*
1646  * Express the locking dependencies for lockdep:
1647  */
1648 static inline void
double_lock_hb(struct futex_hash_bucket * hb1,struct futex_hash_bucket * hb2)1649 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1650 {
1651 	if (hb1 <= hb2) {
1652 		spin_lock(&hb1->lock);
1653 		if (hb1 < hb2)
1654 			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1655 	} else { /* hb1 > hb2 */
1656 		spin_lock(&hb2->lock);
1657 		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1658 	}
1659 }
1660 
1661 static inline void
double_unlock_hb(struct futex_hash_bucket * hb1,struct futex_hash_bucket * hb2)1662 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1663 {
1664 	spin_unlock(&hb1->lock);
1665 	if (hb1 != hb2)
1666 		spin_unlock(&hb2->lock);
1667 }
1668 
1669 /*
1670  * Wake up waiters matching bitset queued on this futex (uaddr).
1671  */
1672 static int
futex_wake(u32 __user * uaddr,unsigned int flags,int nr_wake,u32 bitset)1673 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1674 {
1675 	struct futex_hash_bucket *hb;
1676 	struct futex_q *this, *next;
1677 	union futex_key key = FUTEX_KEY_INIT;
1678 	int ret;
1679 	DEFINE_WAKE_Q(wake_q);
1680 
1681 	if (!bitset)
1682 		return -EINVAL;
1683 
1684 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1685 	if (unlikely(ret != 0))
1686 		goto out;
1687 
1688 	hb = hash_futex(&key);
1689 
1690 	/* Make sure we really have tasks to wakeup */
1691 	if (!hb_waiters_pending(hb))
1692 		goto out_put_key;
1693 
1694 	spin_lock(&hb->lock);
1695 
1696 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
1697 		if (match_futex (&this->key, &key)) {
1698 			if (this->pi_state || this->rt_waiter) {
1699 				ret = -EINVAL;
1700 				break;
1701 			}
1702 
1703 			/* Check if one of the bits is set in both bitsets */
1704 			if (!(this->bitset & bitset))
1705 				continue;
1706 
1707 			mark_wake_futex(&wake_q, this);
1708 			if (++ret >= nr_wake)
1709 				break;
1710 		}
1711 	}
1712 
1713 	spin_unlock(&hb->lock);
1714 	wake_up_q(&wake_q);
1715 out_put_key:
1716 	put_futex_key(&key);
1717 out:
1718 	return ret;
1719 }
1720 
/*
 * Decode @encoded_op, apply the operation to the futex word at @uaddr and
 * compare the old value of that word against the encoded comparison
 * argument.
 *
 * Layout of @encoded_op as decoded below:
 *   bits 28-30  operation, handed to arch_futex_atomic_op_inuser()
 *   bit  31     FUTEX_OP_OPARG_SHIFT flag: use (1 << oparg) as operand
 *   bits 24-27  comparison (FUTEX_OP_CMP_*)
 *   bits 12-23  oparg, signed 12-bit value
 *   bits  0-11  cmparg, signed 12-bit value
 *
 * Returns the result of the comparison (0 or 1), -EFAULT when @uaddr is
 * not writable, -ENOSYS for an unknown comparison, or the error reported
 * by arch_futex_atomic_op_inuser().
 */
static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
	unsigned int op =	  (encoded_op & 0x70000000) >> 28;
	unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24;
	/*
	 * sign_extend32() takes the 0-based index of the sign bit, which is
	 * bit 11 for a 12-bit field. Passing 12 would test a bit that is
	 * always clear after masking, so negative arguments would never be
	 * decoded as negative.
	 */
	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
	int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
		if (oparg < 0 || oparg > 31) {
			char comm[sizeof(current->comm)];
			/*
			 * kill this print and return -EINVAL when userspace
			 * is sane again
			 */
			pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
					get_task_comm(comm, current), oparg);
			/* Clamp to a valid shift count instead of failing. */
			oparg &= 31;
		}
		oparg = 1 << oparg;
	}

	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
		return -EFAULT;

	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	/* Compare the value the futex word held before the operation. */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		return oldval == cmparg;
	case FUTEX_OP_CMP_NE:
		return oldval != cmparg;
	case FUTEX_OP_CMP_LT:
		return oldval < cmparg;
	case FUTEX_OP_CMP_GE:
		return oldval >= cmparg;
	case FUTEX_OP_CMP_LE:
		return oldval <= cmparg;
	case FUTEX_OP_CMP_GT:
		return oldval > cmparg;
	default:
		return -ENOSYS;
	}
}
1767 
/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 *
 * Applies the encoded operation @op to the futex word at @uaddr2, wakes up
 * to @nr_wake waiters queued on @uaddr1 and, if the comparison encoded in
 * @op evaluated to true, additionally up to @nr_wake2 waiters queued on
 * @uaddr2.
 *
 * Returns the total number of waiters marked for wakeup, -EINVAL when a
 * matching waiter is a PI waiter, or a negative error from the key lookup,
 * the atomic operation or fault_in_user_writeable().
 */
static int
futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
	      int nr_wake, int nr_wake2, int op)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb1, *hb2;
	struct futex_q *this, *next;
	int ret, op_ret;
	DEFINE_WAKE_Q(wake_q);

retry:
	/* Full retry: both keys are looked up again from scratch. */
	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
	if (unlikely(ret != 0))
		goto out;
	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out_put_key1;

	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);

retry_private:
	/* Retry for non-shared futexes: the keys are reused as they are. */
	double_lock_hb(hb1, hb2);
	op_ret = futex_atomic_op_inuser(op, uaddr2);
	if (unlikely(op_ret < 0)) {
		/* The operation failed; drop the locks before handling it. */
		double_unlock_hb(hb1, hb2);

		if (!IS_ENABLED(CONFIG_MMU) ||
		    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
			/*
			 * we don't get EFAULT from MMU faults if we don't have
			 * an MMU, but we might get them from range checking
			 */
			ret = op_ret;
			goto out_put_keys;
		}

		/* -EFAULT: fault the page in writeable before retrying. */
		if (op_ret == -EFAULT) {
			ret = fault_in_user_writeable(uaddr2);
			if (ret)
				goto out_put_keys;
		}

		if (!(flags & FLAGS_SHARED)) {
			cond_resched();
			goto retry_private;
		}

		/* Shared futex: release both keys and start over. */
		put_futex_key(&key2);
		put_futex_key(&key1);
		cond_resched();
		goto retry;
	}

	/* First pass: waiters on uaddr1, ret counts them (it is 0 here). */
	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
		if (match_futex (&this->key, &key1)) {
			/* PI waiters must not be woken through this path. */
			if (this->pi_state || this->rt_waiter) {
				ret = -EINVAL;
				goto out_unlock;
			}
			mark_wake_futex(&wake_q, this);
			if (++ret >= nr_wake)
				break;
		}
	}

	/* Second pass: waiters on uaddr2, only if the comparison was true. */
	if (op_ret > 0) {
		/* op_ret is reused as the counter for the second futex. */
		op_ret = 0;
		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
			if (match_futex (&this->key, &key2)) {
				if (this->pi_state || this->rt_waiter) {
					ret = -EINVAL;
					goto out_unlock;
				}
				mark_wake_futex(&wake_q, this);
				if (++op_ret >= nr_wake2)
					break;
			}
		}
		ret += op_ret;
	}

out_unlock:
	double_unlock_hb(hb1, hb2);
	/* Wake everything collected so far, now that the locks are gone. */
	wake_up_q(&wake_q);
out_put_keys:
	put_futex_key(&key2);
out_put_key1:
	put_futex_key(&key1);
out:
	return ret;
}
1864 
1865 /**
1866  * requeue_futex() - Requeue a futex_q from one hb to another
1867  * @q:		the futex_q to requeue
1868  * @hb1:	the source hash_bucket
1869  * @hb2:	the target hash_bucket
1870  * @key2:	the new key for the requeued futex_q
1871  */
1872 static inline
requeue_futex(struct futex_q * q,struct futex_hash_bucket * hb1,struct futex_hash_bucket * hb2,union futex_key * key2)1873 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1874 		   struct futex_hash_bucket *hb2, union futex_key *key2)
1875 {
1876 
1877 	/*
1878 	 * If key1 and key2 hash to the same bucket, no need to
1879 	 * requeue.
1880 	 */
1881 	if (likely(&hb1->chain != &hb2->chain)) {
1882 		plist_del(&q->list, &hb1->chain);
1883 		hb_waiters_dec(hb1);
1884 		hb_waiters_inc(hb2);
1885 		plist_add(&q->list, &hb2->chain);
1886 		q->lock_ptr = &hb2->lock;
1887 	}
1888 	get_futex_key_refs(key2);
1889 	q->key = *key2;
1890 }
1891 
1892 /**
1893  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1894  * @q:		the futex_q
1895  * @key:	the key of the requeue target futex
1896  * @hb:		the hash_bucket of the requeue target futex
1897  *
1898  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1899  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1900  * to the requeue target futex so the waiter can detect the wakeup on the right
1901  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1902  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1903  * to protect access to the pi_state to fixup the owner later.  Must be called
1904  * with both q->lock_ptr and hb->lock held.
1905  */
1906 static inline
requeue_pi_wake_futex(struct futex_q * q,union futex_key * key,struct futex_hash_bucket * hb)1907 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1908 			   struct futex_hash_bucket *hb)
1909 {
1910 	get_futex_key_refs(key);
1911 	q->key = *key;
1912 
1913 	__unqueue_futex(q);
1914 
1915 	WARN_ON(!q->rt_waiter);
1916 	q->rt_waiter = NULL;
1917 
1918 	q->lock_ptr = &hb->lock;
1919 
1920 	wake_up_state(q->task, TASK_NORMAL);
1921 }
1922 
1923 /**
1924  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1925  * @pifutex:		the user address of the to futex
1926  * @hb1:		the from futex hash bucket, must be locked by the caller
1927  * @hb2:		the to futex hash bucket, must be locked by the caller
1928  * @key1:		the from futex key
1929  * @key2:		the to futex key
1930  * @ps:			address to store the pi_state pointer
1931  * @exiting:		Pointer to store the task pointer of the owner task
1932  *			which is in the middle of exiting
1933  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
1934  *
1935  * Try and get the lock on behalf of the top waiter if we can do it atomically.
1936  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1937  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1938  * hb1 and hb2 must be held by the caller.
1939  *
1940  * @exiting is only set when the return value is -EBUSY. If so, this holds
1941  * a refcount on the exiting task on return and the caller needs to drop it
1942  * after waiting for the exit to complete.
1943  *
1944  * Return:
1945  *  -  0 - failed to acquire the lock atomically;
1946  *  - >0 - acquired the lock, return value is vpid of the top_waiter
1947  *  - <0 - error
1948  */
1949 static int
futex_proxy_trylock_atomic(u32 __user * pifutex,struct futex_hash_bucket * hb1,struct futex_hash_bucket * hb2,union futex_key * key1,union futex_key * key2,struct futex_pi_state ** ps,struct task_struct ** exiting,int set_waiters)1950 futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
1951 			   struct futex_hash_bucket *hb2, union futex_key *key1,
1952 			   union futex_key *key2, struct futex_pi_state **ps,
1953 			   struct task_struct **exiting, int set_waiters)
1954 {
1955 	struct futex_q *top_waiter = NULL;
1956 	u32 curval;
1957 	int ret, vpid;
1958 
1959 	if (get_futex_value_locked(&curval, pifutex))
1960 		return -EFAULT;
1961 
1962 	if (unlikely(should_fail_futex(true)))
1963 		return -EFAULT;
1964 
1965 	/*
1966 	 * Find the top_waiter and determine if there are additional waiters.
1967 	 * If the caller intends to requeue more than 1 waiter to pifutex,
1968 	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1969 	 * as we have means to handle the possible fault.  If not, don't set
1970 	 * the bit unecessarily as it will force the subsequent unlock to enter
1971 	 * the kernel.
1972 	 */
1973 	top_waiter = futex_top_waiter(hb1, key1);
1974 
1975 	/* There are no waiters, nothing for us to do. */
1976 	if (!top_waiter)
1977 		return 0;
1978 
1979 	/* Ensure we requeue to the expected futex. */
1980 	if (!match_futex(top_waiter->requeue_pi_key, key2))
1981 		return -EINVAL;
1982 
1983 	/*
1984 	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1985 	 * the contended case or if set_waiters is 1.  The pi_state is returned
1986 	 * in ps in contended cases.
1987 	 */
1988 	vpid = task_pid_vnr(top_waiter->task);
1989 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1990 				   exiting, set_waiters);
1991 	if (ret == 1) {
1992 		requeue_pi_wake_futex(top_waiter, key2, hb2);
1993 		return vpid;
1994 	}
1995 	return ret;
1996 }
1997 
1998 /**
1999  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
2000  * @uaddr1:	source futex user address
2001  * @flags:	futex flags (FLAGS_SHARED, etc.)
2002  * @uaddr2:	target futex user address
2003  * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
2004  * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
2005  * @cmpval:	@uaddr1 expected value (or %NULL)
2006  * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
2007  *		pi futex (pi to pi requeue is not supported)
2008  *
2009  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
2010  * uaddr2 atomically on behalf of the top waiter.
2011  *
2012  * Return:
2013  *  - >=0 - on success, the number of tasks requeued or woken;
2014  *  -  <0 - on error
2015  */
futex_requeue(u32 __user * uaddr1,unsigned int flags,u32 __user * uaddr2,int nr_wake,int nr_requeue,u32 * cmpval,int requeue_pi)2016 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
2017 			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
2018 			 u32 *cmpval, int requeue_pi)
2019 {
2020 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
2021 	int drop_count = 0, task_count = 0, ret;
2022 	struct futex_pi_state *pi_state = NULL;
2023 	struct futex_hash_bucket *hb1, *hb2;
2024 	struct futex_q *this, *next;
2025 	DEFINE_WAKE_Q(wake_q);
2026 
2027 	if (nr_wake < 0 || nr_requeue < 0)
2028 		return -EINVAL;
2029 
2030 	/*
2031 	 * When PI not supported: return -ENOSYS if requeue_pi is true,
2032 	 * consequently the compiler knows requeue_pi is always false past
2033 	 * this point which will optimize away all the conditional code
2034 	 * further down.
2035 	 */
2036 	if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
2037 		return -ENOSYS;
2038 
2039 	if (requeue_pi) {
2040 		/*
2041 		 * Requeue PI only works on two distinct uaddrs. This
2042 		 * check is only valid for private futexes. See below.
2043 		 */
2044 		if (uaddr1 == uaddr2)
2045 			return -EINVAL;
2046 
2047 		/*
2048 		 * requeue_pi requires a pi_state, try to allocate it now
2049 		 * without any locks in case it fails.
2050 		 */
2051 		if (refill_pi_state_cache())
2052 			return -ENOMEM;
2053 		/*
2054 		 * requeue_pi must wake as many tasks as it can, up to nr_wake
2055 		 * + nr_requeue, since it acquires the rt_mutex prior to
2056 		 * returning to userspace, so as to not leave the rt_mutex with
2057 		 * waiters and no owner.  However, second and third wake-ups
2058 		 * cannot be predicted as they involve race conditions with the
2059 		 * first wake and a fault while looking up the pi_state.  Both
2060 		 * pthread_cond_signal() and pthread_cond_broadcast() should
2061 		 * use nr_wake=1.
2062 		 */
2063 		if (nr_wake != 1)
2064 			return -EINVAL;
2065 	}
2066 
2067 retry:
2068 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
2069 	if (unlikely(ret != 0))
2070 		goto out;
2071 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
2072 			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);
2073 	if (unlikely(ret != 0))
2074 		goto out_put_key1;
2075 
2076 	/*
2077 	 * The check above which compares uaddrs is not sufficient for
2078 	 * shared futexes. We need to compare the keys:
2079 	 */
2080 	if (requeue_pi && match_futex(&key1, &key2)) {
2081 		ret = -EINVAL;
2082 		goto out_put_keys;
2083 	}
2084 
2085 	hb1 = hash_futex(&key1);
2086 	hb2 = hash_futex(&key2);
2087 
2088 retry_private:
2089 	hb_waiters_inc(hb2);
2090 	double_lock_hb(hb1, hb2);
2091 
2092 	if (likely(cmpval != NULL)) {
2093 		u32 curval;
2094 
2095 		ret = get_futex_value_locked(&curval, uaddr1);
2096 
2097 		if (unlikely(ret)) {
2098 			double_unlock_hb(hb1, hb2);
2099 			hb_waiters_dec(hb2);
2100 
2101 			ret = get_user(curval, uaddr1);
2102 			if (ret)
2103 				goto out_put_keys;
2104 
2105 			if (!(flags & FLAGS_SHARED))
2106 				goto retry_private;
2107 
2108 			put_futex_key(&key2);
2109 			put_futex_key(&key1);
2110 			goto retry;
2111 		}
2112 		if (curval != *cmpval) {
2113 			ret = -EAGAIN;
2114 			goto out_unlock;
2115 		}
2116 	}
2117 
2118 	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
2119 		struct task_struct *exiting = NULL;
2120 
2121 		/*
2122 		 * Attempt to acquire uaddr2 and wake the top waiter. If we
2123 		 * intend to requeue waiters, force setting the FUTEX_WAITERS
2124 		 * bit.  We force this here where we are able to easily handle
2125 		 * faults rather in the requeue loop below.
2126 		 */
2127 		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2128 						 &key2, &pi_state,
2129 						 &exiting, nr_requeue);
2130 
2131 		/*
2132 		 * At this point the top_waiter has either taken uaddr2 or is
2133 		 * waiting on it.  If the former, then the pi_state will not
2134 		 * exist yet, look it up one more time to ensure we have a
2135 		 * reference to it. If the lock was taken, ret contains the
2136 		 * vpid of the top waiter task.
2137 		 * If the lock was not taken, we have pi_state and an initial
2138 		 * refcount on it. In case of an error we have nothing.
2139 		 */
2140 		if (ret > 0) {
2141 			WARN_ON(pi_state);
2142 			drop_count++;
2143 			task_count++;
2144 			/*
2145 			 * If we acquired the lock, then the user space value
2146 			 * of uaddr2 should be vpid. It cannot be changed by
2147 			 * the top waiter as it is blocked on hb2 lock if it
2148 			 * tries to do so. If something fiddled with it behind
2149 			 * our back the pi state lookup might unearth it. So
2150 			 * we rather use the known value than rereading and
2151 			 * handing potential crap to lookup_pi_state.
2152 			 *
2153 			 * If that call succeeds then we have pi_state and an
2154 			 * initial refcount on it.
2155 			 */
2156 			ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
2157 					      &pi_state, &exiting);
2158 		}
2159 
2160 		switch (ret) {
2161 		case 0:
2162 			/* We hold a reference on the pi state. */
2163 			break;
2164 
2165 			/* If the above failed, then pi_state is NULL */
2166 		case -EFAULT:
2167 			double_unlock_hb(hb1, hb2);
2168 			hb_waiters_dec(hb2);
2169 			put_futex_key(&key2);
2170 			put_futex_key(&key1);
2171 			ret = fault_in_user_writeable(uaddr2);
2172 			if (!ret)
2173 				goto retry;
2174 			goto out;
2175 		case -EBUSY:
2176 		case -EAGAIN:
2177 			/*
2178 			 * Two reasons for this:
2179 			 * - EBUSY: Owner is exiting and we just wait for the
2180 			 *   exit to complete.
2181 			 * - EAGAIN: The user space value changed.
2182 			 */
2183 			double_unlock_hb(hb1, hb2);
2184 			hb_waiters_dec(hb2);
2185 			put_futex_key(&key2);
2186 			put_futex_key(&key1);
2187 			/*
2188 			 * Handle the case where the owner is in the middle of
2189 			 * exiting. Wait for the exit to complete otherwise
2190 			 * this task might loop forever, aka. live lock.
2191 			 */
2192 			wait_for_owner_exiting(ret, exiting);
2193 			cond_resched();
2194 			goto retry;
2195 		default:
2196 			goto out_unlock;
2197 		}
2198 	}
2199 
2200 	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2201 		if (task_count - nr_wake >= nr_requeue)
2202 			break;
2203 
2204 		if (!match_futex(&this->key, &key1))
2205 			continue;
2206 
2207 		/*
2208 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2209 		 * be paired with each other and no other futex ops.
2210 		 *
2211 		 * We should never be requeueing a futex_q with a pi_state,
2212 		 * which is awaiting a futex_unlock_pi().
2213 		 */
2214 		if ((requeue_pi && !this->rt_waiter) ||
2215 		    (!requeue_pi && this->rt_waiter) ||
2216 		    this->pi_state) {
2217 			ret = -EINVAL;
2218 			break;
2219 		}
2220 
2221 		/*
2222 		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
2223 		 * lock, we already woke the top_waiter.  If not, it will be
2224 		 * woken by futex_unlock_pi().
2225 		 */
2226 		if (++task_count <= nr_wake && !requeue_pi) {
2227 			mark_wake_futex(&wake_q, this);
2228 			continue;
2229 		}
2230 
2231 		/* Ensure we requeue to the expected futex for requeue_pi. */
2232 		if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
2233 			ret = -EINVAL;
2234 			break;
2235 		}
2236 
2237 		/*
2238 		 * Requeue nr_requeue waiters and possibly one more in the case
2239 		 * of requeue_pi if we couldn't acquire the lock atomically.
2240 		 */
2241 		if (requeue_pi) {
2242 			/*
2243 			 * Prepare the waiter to take the rt_mutex. Take a
2244 			 * refcount on the pi_state and store the pointer in
2245 			 * the futex_q object of the waiter.
2246 			 */
2247 			get_pi_state(pi_state);
2248 			this->pi_state = pi_state;
2249 			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2250 							this->rt_waiter,
2251 							this->task);
2252 			if (ret == 1) {
2253 				/*
2254 				 * We got the lock. We do neither drop the
2255 				 * refcount on pi_state nor clear
2256 				 * this->pi_state because the waiter needs the
2257 				 * pi_state for cleaning up the user space
2258 				 * value. It will drop the refcount after
2259 				 * doing so.
2260 				 */
2261 				requeue_pi_wake_futex(this, &key2, hb2);
2262 				drop_count++;
2263 				continue;
2264 			} else if (ret) {
2265 				/*
2266 				 * rt_mutex_start_proxy_lock() detected a
2267 				 * potential deadlock when we tried to queue
2268 				 * that waiter. Drop the pi_state reference
2269 				 * which we took above and remove the pointer
2270 				 * to the state from the waiters futex_q
2271 				 * object.
2272 				 */
2273 				this->pi_state = NULL;
2274 				put_pi_state(pi_state);
2275 				/*
2276 				 * We stop queueing more waiters and let user
2277 				 * space deal with the mess.
2278 				 */
2279 				break;
2280 			}
2281 		}
2282 		requeue_futex(this, hb1, hb2, &key2);
2283 		drop_count++;
2284 	}
2285 
2286 	/*
2287 	 * We took an extra initial reference to the pi_state either
2288 	 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
2289 	 * need to drop it here again.
2290 	 */
2291 	put_pi_state(pi_state);
2292 
2293 out_unlock:
2294 	double_unlock_hb(hb1, hb2);
2295 	wake_up_q(&wake_q);
2296 	hb_waiters_dec(hb2);
2297 
2298 	/*
2299 	 * drop_futex_key_refs() must be called outside the spinlocks. During
2300 	 * the requeue we moved futex_q's from the hash bucket at key1 to the
2301 	 * one at key2 and updated their key pointer.  We no longer need to
2302 	 * hold the references to key1.
2303 	 */
2304 	while (--drop_count >= 0)
2305 		drop_futex_key_refs(&key1);
2306 
2307 out_put_keys:
2308 	put_futex_key(&key2);
2309 out_put_key1:
2310 	put_futex_key(&key1);
2311 out:
2312 	return ret ? ret : task_count;
2313 }
2314 
2315 /* The key must be already stored in q->key. */
queue_lock(struct futex_q * q)2316 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2317 	__acquires(&hb->lock)
2318 {
2319 	struct futex_hash_bucket *hb;
2320 
2321 	hb = hash_futex(&q->key);
2322 
2323 	/*
2324 	 * Increment the counter before taking the lock so that
2325 	 * a potential waker won't miss a to-be-slept task that is
2326 	 * waiting for the spinlock. This is safe as all queue_lock()
2327 	 * users end up calling queue_me(). Similarly, for housekeeping,
2328 	 * decrement the counter at queue_unlock() when some error has
2329 	 * occurred and we don't end up adding the task to the list.
2330 	 */
2331 	hb_waiters_inc(hb);
2332 
2333 	q->lock_ptr = &hb->lock;
2334 
2335 	spin_lock(&hb->lock); /* implies smp_mb(); (A) */
2336 	return hb;
2337 }
2338 
2339 static inline void
queue_unlock(struct futex_hash_bucket * hb)2340 queue_unlock(struct futex_hash_bucket *hb)
2341 	__releases(&hb->lock)
2342 {
2343 	spin_unlock(&hb->lock);
2344 	hb_waiters_dec(hb);
2345 }
2346 
__queue_me(struct futex_q * q,struct futex_hash_bucket * hb)2347 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2348 {
2349 	int prio;
2350 
2351 	/*
2352 	 * The priority used to register this element is
2353 	 * - either the real thread-priority for the real-time threads
2354 	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
2355 	 * - or MAX_RT_PRIO for non-RT threads.
2356 	 * Thus, all RT-threads are woken first in priority order, and
2357 	 * the others are woken last, in FIFO order.
2358 	 */
2359 	prio = min(current->normal_prio, MAX_RT_PRIO);
2360 
2361 	plist_node_init(&q->list, prio);
2362 	plist_add(&q->list, &hb->chain);
2363 	q->task = current;
2364 }
2365 
2366 /**
2367  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2368  * @q:	The futex_q to enqueue
2369  * @hb:	The destination hash bucket
2370  *
2371  * The hb->lock must be held by the caller, and is released here. A call to
2372  * queue_me() is typically paired with exactly one call to unqueue_me().  The
2373  * exceptions involve the PI related operations, which may use unqueue_me_pi()
2374  * or nothing if the unqueue is done as part of the wake process and the unqueue
2375  * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2376  * an example).
2377  */
queue_me(struct futex_q * q,struct futex_hash_bucket * hb)2378 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2379 	__releases(&hb->lock)
2380 {
2381 	__queue_me(q, hb);
2382 	spin_unlock(&hb->lock);
2383 }
2384 
2385 /**
2386  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2387  * @q:	The futex_q to unqueue
2388  *
2389  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2390  * be paired with exactly one earlier call to queue_me().
2391  *
2392  * Return:
2393  *  - 1 - if the futex_q was still queued (and we removed unqueued it);
2394  *  - 0 - if the futex_q was already removed by the waking thread
2395  */
unqueue_me(struct futex_q * q)2396 static int unqueue_me(struct futex_q *q)
2397 {
2398 	spinlock_t *lock_ptr;
2399 	int ret = 0;
2400 
2401 	/* In the common case we don't take the spinlock, which is nice. */
2402 retry:
2403 	/*
2404 	 * q->lock_ptr can change between this read and the following spin_lock.
2405 	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2406 	 * optimizing lock_ptr out of the logic below.
2407 	 */
2408 	lock_ptr = READ_ONCE(q->lock_ptr);
2409 	if (lock_ptr != NULL) {
2410 		spin_lock(lock_ptr);
2411 		/*
2412 		 * q->lock_ptr can change between reading it and
2413 		 * spin_lock(), causing us to take the wrong lock.  This
2414 		 * corrects the race condition.
2415 		 *
2416 		 * Reasoning goes like this: if we have the wrong lock,
2417 		 * q->lock_ptr must have changed (maybe several times)
2418 		 * between reading it and the spin_lock().  It can
2419 		 * change again after the spin_lock() but only if it was
2420 		 * already changed before the spin_lock().  It cannot,
2421 		 * however, change back to the original value.  Therefore
2422 		 * we can detect whether we acquired the correct lock.
2423 		 */
2424 		if (unlikely(lock_ptr != q->lock_ptr)) {
2425 			spin_unlock(lock_ptr);
2426 			goto retry;
2427 		}
2428 		__unqueue_futex(q);
2429 
2430 		BUG_ON(q->pi_state);
2431 
2432 		spin_unlock(lock_ptr);
2433 		ret = 1;
2434 	}
2435 
2436 	drop_futex_key_refs(&q->key);
2437 	return ret;
2438 }
2439 
2440 /*
2441  * PI futexes can not be requeued and must remove themself from the
2442  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
2443  * and dropped here.
2444  */
unqueue_me_pi(struct futex_q * q)2445 static void unqueue_me_pi(struct futex_q *q)
2446 	__releases(q->lock_ptr)
2447 {
2448 	__unqueue_futex(q);
2449 
2450 	BUG_ON(!q->pi_state);
2451 	put_pi_state(q->pi_state);
2452 	q->pi_state = NULL;
2453 
2454 	spin_unlock(q->lock_ptr);
2455 }
2456 
/*
 * Bring the user space TID in *uaddr and pi_state->owner in line with the
 * real owner of the rt_mutex.
 *
 * @uaddr:	user address of the futex
 * @q:		futex_q whose pi_state is fixed up; q->lock_ptr must be held
 * @argowner:	current if the caller took the lock and must become the
 *		owner, NULL if the owner has to be looked up from the rt_mutex
 *
 * Returns 0 when the state is consistent (fixed up here, already fixed by
 * somebody else, or nothing to fix) and a non-zero error when a fault on
 * the user space value could not be resolved. q->lock_ptr is dropped and
 * retaken while an error is handled.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, uninitialized_var(curval), newval;
	u32 newtid;
	int ret, err = 0;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID check in lookup_pi_state.
	 */
retry:
	/* Every early exit taken from this section reports success. */
	ret = 0;

	if (argowner) {
		WARN_ON_ONCE(argowner != current);

		/*
		 * We raced against a concurrent self; things are
		 * already fixed up. Nothing to do.
		 */
		if (oldowner == current)
			goto out_unlock;

		newowner = argowner;
	} else {
		/*
		 * We raced against a concurrent self; things are
		 * already fixed up. Nothing to do.
		 */
		if (oldowner != current)
			goto out_unlock;

		/* We got the lock after all, nothing to fix. */
		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex))
			goto out_unlock;

		/*
		 * Since we just failed the trylock; there must be an owner.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		BUG_ON(!newowner);
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = get_futex_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	/* Install the new TID, keeping the owner died bit of the old value. */
	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;

		/* The value changed under us; try again with what we saw. */
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself: take it off the list of the previous owner, if any, and
	 * hand it to the new one.
	 */
	if (pi_state->owner) {
		raw_spin_lock(&pi_state->owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&pi_state->owner->pi_lock);
	}

	pi_state->owner = newowner;

	raw_spin_lock(&newowner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &newowner->pi_state_list);
	raw_spin_unlock(&newowner->pi_lock);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	return 0;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	if (err == -EFAULT) {
		ret = fault_in_user_writeable(uaddr);
	} else if (err == -EAGAIN) {
		cond_resched();
		ret = 0;
	} else {
		WARN_ON_ONCE(1);
		ret = err;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * If someone else fixed it for us while the locks were dropped there
	 * is nothing left to do. Otherwise try again, unless the error could
	 * not be resolved.
	 */
	if (pi_state->owner != oldowner)
		ret = 0;
	else if (!ret)
		goto retry;

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
2625 
2626 static long futex_wait_restart(struct restart_block *restart);
2627 
2628 /**
2629  * fixup_owner() - Post lock pi_state and corner case management
2630  * @uaddr:	user address of the futex
2631  * @q:		futex_q (contains pi_state and access to the rt_mutex)
2632  * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
2633  *
2634  * After attempting to lock an rt_mutex, this function is called to cleanup
2635  * the pi_state owner as well as handle race conditions that may allow us to
2636  * acquire the lock. Must be called with the hb lock held.
2637  *
2638  * Return:
2639  *  -  1 - success, lock taken;
2640  *  -  0 - success, lock not taken;
2641  *  - <0 - on error (-EFAULT)
2642  */
fixup_owner(u32 __user * uaddr,struct futex_q * q,int locked)2643 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2644 {
2645 	int ret = 0;
2646 
2647 	if (locked) {
2648 		/*
2649 		 * Got the lock. We might not be the anticipated owner if we
2650 		 * did a lock-steal - fix up the PI-state in that case:
2651 		 *
2652 		 * Speculative pi_state->owner read (we don't hold wait_lock);
2653 		 * since we own the lock pi_state->owner == current is the
2654 		 * stable state, anything else needs more attention.
2655 		 */
2656 		if (q->pi_state->owner != current)
2657 			ret = fixup_pi_state_owner(uaddr, q, current);
2658 		goto out;
2659 	}
2660 
2661 	/*
2662 	 * If we didn't get the lock; check if anybody stole it from us. In
2663 	 * that case, we need to fix up the uval to point to them instead of
2664 	 * us, otherwise bad things happen. [10]
2665 	 *
2666 	 * Another speculative read; pi_state->owner == current is unstable
2667 	 * but needs our attention.
2668 	 */
2669 	if (q->pi_state->owner == current) {
2670 		ret = fixup_pi_state_owner(uaddr, q, NULL);
2671 		goto out;
2672 	}
2673 
2674 	/*
2675 	 * Paranoia check. If we did not take the lock, then we should not be
2676 	 * the owner of the rt_mutex.
2677 	 */
2678 	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
2679 		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2680 				"pi-state %p\n", ret,
2681 				q->pi_state->pi_mutex.owner,
2682 				q->pi_state->owner);
2683 	}
2684 
2685 out:
2686 	return ret ? ret : locked;
2687 }
2688 
2689 /**
2690  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2691  * @hb:		the futex hash bucket, must be locked by the caller
2692  * @q:		the futex_q to queue up on
2693  * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
2694  */
futex_wait_queue_me(struct futex_hash_bucket * hb,struct futex_q * q,struct hrtimer_sleeper * timeout)2695 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2696 				struct hrtimer_sleeper *timeout)
2697 {
2698 	/*
2699 	 * The task state is guaranteed to be set before another task can
2700 	 * wake it. set_current_state() is implemented using smp_store_mb() and
2701 	 * queue_me() calls spin_unlock() upon completion, both serializing
2702 	 * access to the hash list and forcing another memory barrier.
2703 	 */
2704 	set_current_state(TASK_INTERRUPTIBLE);
2705 	queue_me(q, hb);
2706 
2707 	/* Arm the timer */
2708 	if (timeout)
2709 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
2710 
2711 	/*
2712 	 * If we have been removed from the hash list, then another task
2713 	 * has tried to wake us, and we can skip the call to schedule().
2714 	 */
2715 	if (likely(!plist_node_empty(&q->list))) {
2716 		/*
2717 		 * If the timer has already expired, current will already be
2718 		 * flagged for rescheduling. Only call schedule if there
2719 		 * is no timeout, or if it has yet to expire.
2720 		 */
2721 		if (!timeout || timeout->task)
2722 			freezable_schedule();
2723 	}
2724 	__set_current_state(TASK_RUNNING);
2725 }
2726 
2727 /**
2728  * futex_wait_setup() - Prepare to wait on a futex
2729  * @uaddr:	the futex userspace address
2730  * @val:	the expected value
2731  * @flags:	futex flags (FLAGS_SHARED, etc.)
2732  * @q:		the associated futex_q
2733  * @hb:		storage for hash_bucket pointer to be returned to caller
2734  *
2735  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2736  * compare it with the expected value.  Handle atomic faults internally.
2737  * Return with the hb lock held and a q.key reference on success, and unlocked
2738  * with no q.key reference on failure.
2739  *
2740  * Return:
2741  *  -  0 - uaddr contains val and hb has been locked;
2742  *  - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2743  */
futex_wait_setup(u32 __user * uaddr,u32 val,unsigned int flags,struct futex_q * q,struct futex_hash_bucket ** hb)2744 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2745 			   struct futex_q *q, struct futex_hash_bucket **hb)
2746 {
2747 	u32 uval;
2748 	int ret;
2749 
2750 	/*
2751 	 * Access the page AFTER the hash-bucket is locked.
2752 	 * Order is important:
2753 	 *
2754 	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2755 	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2756 	 *
2757 	 * The basic logical guarantee of a futex is that it blocks ONLY
2758 	 * if cond(var) is known to be true at the time of blocking, for
2759 	 * any cond.  If we locked the hash-bucket after testing *uaddr, that
2760 	 * would open a race condition where we could block indefinitely with
2761 	 * cond(var) false, which would violate the guarantee.
2762 	 *
2763 	 * On the other hand, we insert q and release the hash-bucket only
2764 	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
2765 	 * absorb a wakeup if *uaddr does not match the desired values
2766 	 * while the syscall executes.
2767 	 */
2768 retry:
2769 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
2770 	if (unlikely(ret != 0))
2771 		return ret;
2772 
2773 retry_private:
2774 	*hb = queue_lock(q);
2775 
2776 	ret = get_futex_value_locked(&uval, uaddr);
2777 
2778 	if (ret) {
2779 		queue_unlock(*hb);
2780 
2781 		ret = get_user(uval, uaddr);
2782 		if (ret)
2783 			goto out;
2784 
2785 		if (!(flags & FLAGS_SHARED))
2786 			goto retry_private;
2787 
2788 		put_futex_key(&q->key);
2789 		goto retry;
2790 	}
2791 
2792 	if (uval != val) {
2793 		queue_unlock(*hb);
2794 		ret = -EWOULDBLOCK;
2795 	}
2796 
2797 out:
2798 	if (ret)
2799 		put_futex_key(&q->key);
2800 	return ret;
2801 }
2802 
/*
 * Wait on the futex at @uaddr as long as it contains @val.
 *
 * @uaddr:	the futex userspace address
 * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, ...)
 * @val:	the value @uaddr is expected to contain
 * @abs_time:	absolute timeout, or NULL to wait without a timeout
 * @bitset:	stored in the futex_q as its bitset; must not be 0
 *
 * Returns 0 when woken, -EINVAL for an empty bitset, -ETIMEDOUT when the
 * timeout expired, -ERESTARTSYS (no timeout) or -ERESTART_RESTARTBLOCK
 * (with timeout) when interrupted by a signal, or the error reported by
 * futex_wait_setup().
 */
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
		      ktime_t *abs_time, u32 bitset)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct futex_q q = futex_q_init;
	struct futex_hash_bucket *hb;
	struct restart_block *restart;
	int ret;

	if (!bitset)
		return -EINVAL;
	q.bitset = bitset;

	if (abs_time) {
		const clockid_t clock = (flags & FLAGS_CLOCKRT) ?
					CLOCK_REALTIME : CLOCK_MONOTONIC;

		to = &timeout;

		hrtimer_init_on_stack(&to->timer, clock, HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
					     current->timer_slack_ns);
	}

retry:
	/*
	 * Prepare to wait on uaddr. On success, holds hb lock and increments
	 * q.key refs.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
	if (ret)
		goto out;

	/* queue_me and wait for wakeup, timeout, or a signal. */
	futex_wait_queue_me(hb, &q, to);

	/*
	 * If we were woken (and unqueued), we succeeded, whatever.
	 * unqueue_me() drops the q.key ref.
	 */
	if (!unqueue_me(&q)) {
		ret = 0;
		goto out;
	}

	/* Still queued and the timer fired: the wait timed out. */
	if (to && !to->task) {
		ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * We expect signal_pending(current), but we might be the
	 * victim of a spurious wakeup as well.
	 */
	if (!signal_pending(current))
		goto retry;

	/* Without a timeout the syscall can simply be restarted as is. */
	if (!abs_time) {
		ret = -ERESTARTSYS;
		goto out;
	}

	/* Record everything futex_wait_restart() needs to resume the wait. */
	restart = &current->restart_block;
	restart->fn = futex_wait_restart;
	restart->futex.uaddr = uaddr;
	restart->futex.val = val;
	restart->futex.time = *abs_time;
	restart->futex.bitset = bitset;
	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

	ret = -ERESTART_RESTARTBLOCK;

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret;
}
2876 
2877 
futex_wait_restart(struct restart_block * restart)2878 static long futex_wait_restart(struct restart_block *restart)
2879 {
2880 	u32 __user *uaddr = restart->futex.uaddr;
2881 	ktime_t t, *tp = NULL;
2882 
2883 	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2884 		t = restart->futex.time;
2885 		tp = &t;
2886 	}
2887 	restart->fn = do_no_restart_syscall;
2888 
2889 	return (long)futex_wait(uaddr, restart->futex.flags,
2890 				restart->futex.val, tp, restart->futex.bitset);
2891 }
2892 
2893 
2894 /*
2895  * Userspace tried a 0 -> TID atomic transition of the futex value
2896  * and failed. The kernel side here does the whole locking operation:
2897  * if there are waiters then it will block as a consequence of relying
2898  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2899  * a 0 value of the futex too.).
2900  *
2901  * Also serves as futex trylock_pi()'ing, and due semantics.
 *
 * @uaddr:	user address of the PI futex word
 * @flags:	futex flags; FLAGS_SHARED is used for the key lookup and to
 *		decide whether the key must be re-fetched on retry
 * @time:	absolute timeout (the timer is set up on CLOCK_REALTIME), or
 *		NULL for no timeout
 * @trylock:	non-zero to try the rt_mutex once instead of blocking
 *
 * Returns 0 when the lock was acquired, -ENOSYS without CONFIG_FUTEX_PI,
 * -ENOMEM when refill_pi_state_cache() fails, otherwise a negative error
 * from the helpers called below. -EINTR is never returned to the caller;
 * it is converted to -ERESTARTNOINTR.
2902  */
futex_lock_pi(u32 __user * uaddr,unsigned int flags,ktime_t * time,int trylock)2903 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2904 			 ktime_t *time, int trylock)
2905 {
2906 	struct hrtimer_sleeper timeout, *to = NULL;
2907 	struct futex_pi_state *pi_state = NULL;
2908 	struct task_struct *exiting = NULL;
2909 	struct rt_mutex_waiter rt_waiter;
2910 	struct futex_hash_bucket *hb;
2911 	struct futex_q q = futex_q_init;
2912 	int res, ret;
2913 
2914 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
2915 		return -ENOSYS;
2916 
2917 	if (refill_pi_state_cache())
2918 		return -ENOMEM;
2919 
2920 	if (time) {
2921 		to = &timeout;
2922 		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2923 				      HRTIMER_MODE_ABS);
2924 		hrtimer_init_sleeper(to, current);
2925 		hrtimer_set_expires(&to->timer, *time);
2926 	}
2927 
2928 retry:
2929 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2930 	if (unlikely(ret != 0))
2931 		goto out;
2932 
2933 retry_private:
2934 	hb = queue_lock(&q);
2935 
2936 	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
2937 				   &exiting, 0);
2938 	if (unlikely(ret)) {
2939 		/*
2940 		 * Atomic work succeeded and we got the lock,
2941 		 * or failed. Either way, we do _not_ block.
2942 		 */
2943 		switch (ret) {
2944 		case 1:
2945 			/* We got the lock. */
2946 			ret = 0;
2947 			goto out_unlock_put_key;
2948 		case -EFAULT:
2949 			goto uaddr_faulted;
2950 		case -EBUSY:
2951 		case -EAGAIN:
2952 			/*
2953 			 * Two reasons for this:
2954 			 * - EBUSY: Task is exiting and we just wait for the
2955 			 *   exit to complete.
2956 			 * - EAGAIN: The user space value changed.
2957 			 */
2958 			queue_unlock(hb);
2959 			put_futex_key(&q.key);
2960 			/*
2961 			 * Handle the case where the owner is in the middle of
2962 			 * exiting. Wait for the exit to complete otherwise
2963 			 * this task might loop forever, aka. live lock.
2964 			 */
2965 			wait_for_owner_exiting(ret, exiting);
2966 			cond_resched();
2967 			goto retry;
2968 		default:
2969 			goto out_unlock_put_key;
2970 		}
2971 	}
2972 
2973 	WARN_ON(!q.pi_state);
2974 
2975 	/*
2976 	 * Only actually queue now that the atomic ops are done:
2977 	 */
2978 	__queue_me(&q, hb);
2979 
2980 	if (trylock) {
2981 		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2982 		/* Fixup the trylock return value: */
2983 		ret = ret ? 0 : -EWOULDBLOCK;
2984 		goto no_block;
2985 	}
2986 
2987 	rt_mutex_init_waiter(&rt_waiter);
2988 
2989 	/*
2990 	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2991 	 * hold it while doing rt_mutex_start_proxy(), because then it will
2992 	 * include hb->lock in the blocking chain, even though we'll not in
2993 	 * fact hold it while blocking. This will lead it to report -EDEADLK
2994 	 * and BUG when futex_unlock_pi() interleaves with this.
2995 	 *
2996 	 * Therefore acquire wait_lock while holding hb->lock, but drop the
2997 	 * latter before calling __rt_mutex_start_proxy_lock(). This
2998 	 * interleaves with futex_unlock_pi() -- which does a similar lock
2999 	 * handoff -- such that the latter can observe the futex_q::pi_state
3000 	 * before __rt_mutex_start_proxy_lock() is done.
3001 	 */
3002 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
3003 	spin_unlock(q.lock_ptr);
3004 	/*
3005 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
3006 	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
3007 	 * it sees the futex_q::pi_state.
3008 	 */
3009 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
3010 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
3011 
	/* A return of 1 is mapped to success; any non-zero value skips the wait. */
3012 	if (ret) {
3013 		if (ret == 1)
3014 			ret = 0;
3015 		goto cleanup;
3016 	}
3017 
3018 	if (unlikely(to))
3019 		hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
3020 
3021 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
3022 
3023 cleanup:
3024 	spin_lock(q.lock_ptr);
3025 	/*
3026 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
3027 	 * first acquire the hb->lock before removing the lock from the
3028 	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
3029 	 * lists consistent.
3030 	 *
3031 	 * In particular; it is important that futex_unlock_pi() can not
3032 	 * observe this inconsistency.
3033 	 */
3034 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
3035 		ret = 0;
3036 
3037 no_block:
3038 	/*
3039 	 * Fixup the pi_state owner and possibly acquire the lock if we
3040 	 * haven't already.
3041 	 */
3042 	res = fixup_owner(uaddr, &q, !ret);
3043 	/*
3044 	 * If fixup_owner() returned an error, propagate that.  If it acquired
3045 	 * the lock, clear our -ETIMEDOUT or -EINTR.
3046 	 */
3047 	if (res)
3048 		ret = (res < 0) ? res : 0;
3049 
3050 	/*
3051 	 * If fixup_owner() faulted and was unable to handle the fault, unlock
3052 	 * it and return the fault to userspace.
3053 	 */
3054 	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
3055 		pi_state = q.pi_state;
3056 		get_pi_state(pi_state);
3057 	}
3058 
3059 	/* Unqueue and drop the lock */
3060 	unqueue_me_pi(&q);
3061 
	/* Release the rt_mutex using the extra pi_state reference taken above. */
3062 	if (pi_state) {
3063 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
3064 		put_pi_state(pi_state);
3065 	}
3066 
3067 	goto out_put_key;
3068 
3069 out_unlock_put_key:
3070 	queue_unlock(hb);
3071 
3072 out_put_key:
3073 	put_futex_key(&q.key);
3074 out:
3075 	if (to) {
3076 		hrtimer_cancel(&to->timer);
3077 		destroy_hrtimer_on_stack(&to->timer);
3078 	}
3079 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
3080 
3081 uaddr_faulted:
3082 	queue_unlock(hb);
3083 
3084 	ret = fault_in_user_writeable(uaddr);
3085 	if (ret)
3086 		goto out_put_key;
3087 
	/* Private futexes keep their key; shared ones must look it up again. */
3088 	if (!(flags & FLAGS_SHARED))
3089 		goto retry_private;
3090 
3091 	put_futex_key(&q.key);
3092 	goto retry;
3093 }
3094 
3095 /*
3096  * Userspace attempted a TID -> 0 atomic transition, and failed.
3097  * This is the in-kernel slowpath: we look up the PI state (if any),
3098  * and do the rt-mutex unlock.
 *
 * @uaddr:	user address of the PI futex word
 * @flags:	futex flags; FLAGS_SHARED is used for the key lookup
 *
 * Returns 0 on success, -ENOSYS without CONFIG_FUTEX_PI, -EFAULT when the
 * futex word cannot be read, -EPERM when the TID in the futex word is not
 * the caller's, -EINVAL when the top waiter's pi_state is missing or not
 * owned by the caller, -EAGAIN when the futex word changed under us in the
 * no-waiter path, or another error from the helpers called below.
3099  */
futex_unlock_pi(u32 __user * uaddr,unsigned int flags)3100 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
3101 {
3102 	u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
3103 	union futex_key key = FUTEX_KEY_INIT;
3104 	struct futex_hash_bucket *hb;
3105 	struct futex_q *top_waiter;
3106 	int ret;
3107 
3108 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
3109 		return -ENOSYS;
3110 
3111 retry:
3112 	if (get_user(uval, uaddr))
3113 		return -EFAULT;
3114 	/*
3115 	 * We release only a lock we actually own:
3116 	 */
3117 	if ((uval & FUTEX_TID_MASK) != vpid)
3118 		return -EPERM;
3119 
3120 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
3121 	if (ret)
3122 		return ret;
3123 
3124 	hb = hash_futex(&key);
3125 	spin_lock(&hb->lock);
3126 
3127 	/*
3128 	 * Check waiters first. We do not trust user space values at
3129 	 * all and we at least want to know if user space fiddled
3130 	 * with the futex value instead of blindly unlocking.
3131 	 */
3132 	top_waiter = futex_top_waiter(hb, &key);
3133 	if (top_waiter) {
3134 		struct futex_pi_state *pi_state = top_waiter->pi_state;
3135 
3136 		ret = -EINVAL;
3137 		if (!pi_state)
3138 			goto out_unlock;
3139 
3140 		/*
3141 		 * If current does not own the pi_state then the futex is
3142 		 * inconsistent and user space fiddled with the futex value.
3143 		 */
3144 		if (pi_state->owner != current)
3145 			goto out_unlock;
3146 
3147 		get_pi_state(pi_state);
3148 		/*
3149 		 * By taking wait_lock while still holding hb->lock, we ensure
3150 		 * there is no point where we hold neither; and therefore
3151 		 * wake_futex_pi() must observe a state consistent with what we
3152 		 * observed.
3153 		 *
3154 		 * In particular; this forces __rt_mutex_start_proxy() to
3155 		 * complete such that we're guaranteed to observe the
3156 		 * rt_waiter. Also see the WARN in wake_futex_pi().
3157 		 */
3158 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3159 		spin_unlock(&hb->lock);
3160 
3161 		/* drops pi_state->pi_mutex.wait_lock */
3162 		ret = wake_futex_pi(uaddr, uval, pi_state);
3163 
3164 		put_pi_state(pi_state);
3165 
3166 		/*
3167 		 * Success, we're done! No tricky corner cases.
3168 		 */
3169 		if (!ret)
3170 			goto out_putkey;
3171 		/*
3172 		 * The atomic access to the futex value generated a
3173 		 * pagefault, so retry the user-access and the wakeup:
3174 		 */
3175 		if (ret == -EFAULT)
3176 			goto pi_faulted;
3177 		/*
3178 		 * An unconditional UNLOCK_PI op raced against a waiter
3179 		 * setting the FUTEX_WAITERS bit. Try again.
3180 		 */
3181 		if (ret == -EAGAIN)
3182 			goto pi_retry;
3183 		/*
3184 		 * wake_futex_pi has detected invalid state. Tell user
3185 		 * space.
3186 		 */
3187 		goto out_putkey;
3188 	}
3189 
3190 	/*
3191 	 * We have no kernel internal state, i.e. no waiters in the
3192 	 * kernel. Waiters which are about to queue themselves are stuck
3193 	 * on hb->lock. So we can safely ignore them. We preserve neither
3194 	 * the WAITERS bit nor the OWNER_DIED one. We are the
3195 	 * owner.
3196 	 */
3197 	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3198 		spin_unlock(&hb->lock);
3199 		switch (ret) {
3200 		case -EFAULT:
3201 			goto pi_faulted;
3202 
3203 		case -EAGAIN:
3204 			goto pi_retry;
3205 
3206 		default:
3207 			WARN_ON_ONCE(1);
3208 			goto out_putkey;
3209 		}
3210 	}
3211 
3212 	/*
3213 	 * If uval has changed, let user space handle it.
3214 	 */
3215 	ret = (curval == uval) ? 0 : -EAGAIN;
3216 
3217 out_unlock:
3218 	spin_unlock(&hb->lock);
3219 out_putkey:
3220 	put_futex_key(&key);
3221 	return ret;
3222 
	/* Both retry paths drop the key first; "retry" looks it up again. */
3223 pi_retry:
3224 	put_futex_key(&key);
3225 	cond_resched();
3226 	goto retry;
3227 
3228 pi_faulted:
3229 	put_futex_key(&key);
3230 
3231 	ret = fault_in_user_writeable(uaddr);
3232 	if (!ret)
3233 		goto retry;
3234 
3235 	return ret;
3236 }
3237 
3238 /**
3239  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
3240  * @hb:		the hash_bucket futex_q was original enqueued on
3241  * @q:		the futex_q woken while waiting to be requeued
3242  * @key2:	the futex_key of the requeue target futex
3243  * @timeout:	the timeout associated with the wait (NULL if none)
3244  *
3245  * Detect if the task was woken on the initial futex as opposed to the requeue
3246  * target futex.  If so, determine if it was a timeout or a signal that caused
3247  * the wakeup and return the appropriate error code to the caller.  Must be
3248  * called with the hb lock held.
3249  *
3250  * Return:
3251  *  -  0 = no early wakeup detected;
3252  *  - <0 = -ETIMEDOUT or -ERESTARTNOINTR
3253  */
3254 static inline
handle_early_requeue_pi_wakeup(struct futex_hash_bucket * hb,struct futex_q * q,union futex_key * key2,struct hrtimer_sleeper * timeout)3255 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3256 				   struct futex_q *q, union futex_key *key2,
3257 				   struct hrtimer_sleeper *timeout)
3258 {
3259 	int ret = 0;
3260 
3261 	/*
3262 	 * With the hb lock held, we avoid races while we process the wakeup.
3263 	 * We only need to hold hb (and not hb2) to ensure atomicity as the
3264 	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3265 	 * It can't be requeued from uaddr2 to something else since we don't
3266 	 * support a PI aware source futex for requeue.
3267 	 */
3268 	if (!match_futex(&q->key, key2)) {
3269 		WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
3270 		/*
3271 		 * We were woken prior to requeue by a timeout or a signal.
3272 		 * Unqueue the futex_q and determine which it was.
3273 		 */
3274 		plist_del(&q->list, &hb->chain);
3275 		hb_waiters_dec(hb);
3276 
3277 		/* Handle spurious wakeups gracefully */
3278 		ret = -EWOULDBLOCK;
3279 		if (timeout && !timeout->task)
3280 			ret = -ETIMEDOUT;
3281 		else if (signal_pending(current))
3282 			ret = -ERESTARTNOINTR;
3283 	}
3284 	return ret;
3285 }
3286 
3287 /**
3288  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3289  * @uaddr:	the futex we initially wait on (non-pi)
3290  * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
3291  *		the same type, no requeueing from private to shared, etc.
3292  * @val:	the expected value of uaddr
3293  * @abs_time:	absolute timeout
3294  * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
3295  * @uaddr2:	the pi futex we will take prior to returning to user-space
3296  *
3297  * The caller will wait on uaddr and will be requeued by futex_requeue() to
3298  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
3299  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3300  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
3301  * without one, the pi logic would not know which task to boost/deboost, if
3302  * there was a need to.
3303  *
3304  * We call schedule in futex_wait_queue_me() when we enqueue and return there
3305  * via the following--
3306  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3307  * 2) wakeup on uaddr2 after a requeue
3308  * 3) signal
3309  * 4) timeout
3310  *
3311  * If 3, cleanup and return -ERESTARTNOINTR.
3312  *
3313  * If 2, we may then block on trying to take the rt_mutex and return via:
3314  * 5) successful lock
3315  * 6) signal
3316  * 7) timeout
3317  * 8) other lock acquisition failure
3318  *
3319  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3320  *
3321  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3322  *
3323  * Return:
3324  *  -  0 - On success;
3325  *  - <0 - On error
3326  */
futex_wait_requeue_pi(u32 __user * uaddr,unsigned int flags,u32 val,ktime_t * abs_time,u32 bitset,u32 __user * uaddr2)3327 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3328 				 u32 val, ktime_t *abs_time, u32 bitset,
3329 				 u32 __user *uaddr2)
3330 {
3331 	struct hrtimer_sleeper timeout, *to = NULL;
3332 	struct futex_pi_state *pi_state = NULL;
3333 	struct rt_mutex_waiter rt_waiter;
3334 	struct futex_hash_bucket *hb;
3335 	union futex_key key2 = FUTEX_KEY_INIT;
3336 	struct futex_q q = futex_q_init;
3337 	int res, ret;
3338 
3339 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
3340 		return -ENOSYS;
3341 
3342 	if (uaddr == uaddr2)
3343 		return -EINVAL;
3344 
3345 	if (!bitset)
3346 		return -EINVAL;
3347 
3348 	if (abs_time) {
3349 		to = &timeout;
3350 		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
3351 				      CLOCK_REALTIME : CLOCK_MONOTONIC,
3352 				      HRTIMER_MODE_ABS);
3353 		hrtimer_init_sleeper(to, current);
3354 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
3355 					     current->timer_slack_ns);
3356 	}
3357 
3358 	/*
3359 	 * The waiter is allocated on our stack, manipulated by the requeue
3360 	 * code while we sleep on uaddr.
3361 	 */
3362 	rt_mutex_init_waiter(&rt_waiter);
3363 
3364 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
3365 	if (unlikely(ret != 0))
3366 		goto out;
3367 
3368 	q.bitset = bitset;
3369 	q.rt_waiter = &rt_waiter;
3370 	q.requeue_pi_key = &key2;
3371 
3372 	/*
3373 	 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
3374 	 * count.
3375 	 */
3376 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3377 	if (ret)
3378 		goto out_key2;
3379 
3380 	/*
3381 	 * The check above which compares uaddrs is not sufficient for
3382 	 * shared futexes. We need to compare the keys:
3383 	 */
3384 	if (match_futex(&q.key, &key2)) {
3385 		queue_unlock(hb);
3386 		ret = -EINVAL;
3387 		goto out_put_keys;
3388 	}
3389 
3390 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
3391 	futex_wait_queue_me(hb, &q, to);
3392 
3393 	spin_lock(&hb->lock);
3394 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
3395 	spin_unlock(&hb->lock);
3396 	if (ret)
3397 		goto out_put_keys;
3398 
3399 	/*
3400 	 * In order for us to be here, we know our q.key == key2, and since
3401 	 * we took the hb->lock above, we also know that futex_requeue() has
3402 	 * completed and we no longer have to concern ourselves with a wakeup
3403 	 * race with the atomic proxy lock acquisition by the requeue code. The
3404 	 * futex_requeue dropped our key1 reference and incremented our key2
3405 	 * reference count.
3406 	 */
3407 
3408 	/* Check if the requeue code acquired the second futex for us. */
3409 	if (!q.rt_waiter) {
3410 		/*
3411 		 * Got the lock. We might not be the anticipated owner if we
3412 		 * did a lock-steal - fix up the PI-state in that case.
3413 		 */
3414 		if (q.pi_state && (q.pi_state->owner != current)) {
3415 			spin_lock(q.lock_ptr);
3416 			ret = fixup_pi_state_owner(uaddr2, &q, current);
3417 			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3418 				pi_state = q.pi_state;
3419 				get_pi_state(pi_state);
3420 			}
3421 			/*
3422 			 * Drop the reference to the pi state which
3423 			 * the requeue_pi() code acquired for us.
3424 			 */
3425 			put_pi_state(q.pi_state);
3426 			spin_unlock(q.lock_ptr);
3427 		}
3428 	} else {
3429 		struct rt_mutex *pi_mutex;
3430 
3431 		/*
3432 		 * We have been woken up by futex_unlock_pi(), a timeout, or a
3433 		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
3434 		 * the pi_state.
3435 		 */
3436 		WARN_ON(!q.pi_state);
3437 		pi_mutex = &q.pi_state->pi_mutex;
3438 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3439 
3440 		spin_lock(q.lock_ptr);
3441 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3442 			ret = 0;
3443 
3444 		debug_rt_mutex_free_waiter(&rt_waiter);
3445 		/*
3446 		 * Fixup the pi_state owner and possibly acquire the lock if we
3447 		 * haven't already.
3448 		 */
3449 		res = fixup_owner(uaddr2, &q, !ret);
3450 		/*
3451 		 * If fixup_owner() returned an error, propagate that.  If it
3452 		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
3453 		 */
3454 		if (res)
3455 			ret = (res < 0) ? res : 0;
3456 
3457 		/*
3458 		 * If fixup_pi_state_owner() faulted and was unable to handle
3459 		 * the fault, unlock the rt_mutex and return the fault to
3460 		 * userspace.
3461 		 */
3462 		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3463 			pi_state = q.pi_state;
3464 			get_pi_state(pi_state);
3465 		}
3466 
3467 		/* Unqueue and drop the lock. */
3468 		unqueue_me_pi(&q);
3469 	}
3470 
	/* Release the rt_mutex using the extra pi_state reference taken above. */
3471 	if (pi_state) {
3472 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
3473 		put_pi_state(pi_state);
3474 	}
3475 
3476 	if (ret == -EINTR) {
3477 		/*
3478 		 * We've already been requeued, but cannot restart by calling
3479 		 * futex_lock_pi() directly. We could restart this syscall, but
3480 		 * it would detect that the user space "val" changed and return
3481 		 * -EWOULDBLOCK.  Save the overhead of the restart and return
3482 		 * -EWOULDBLOCK directly.
3483 		 */
3484 		ret = -EWOULDBLOCK;
3485 	}
3486 
	/* The labels below release q.key, then key2, then the timer, in order. */
3487 out_put_keys:
3488 	put_futex_key(&q.key);
3489 out_key2:
3490 	put_futex_key(&key2);
3491 
3492 out:
3493 	if (to) {
3494 		hrtimer_cancel(&to->timer);
3495 		destroy_hrtimer_on_stack(&to->timer);
3496 	}
3497 	return ret;
3498 }
3499 
3500 /*
3501  * Support for robust futexes: the kernel cleans up held futexes at
3502  * thread exit time.
3503  *
3504  * Implementation: user-space maintains a per-thread list of locks it
3505  * is holding. Upon do_exit(), the kernel carefully walks this list,
3506  * and marks all locks that are owned by this thread with the
3507  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3508  * always manipulated with the lock held, so the list is private and
3509  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3510  * field, to allow the kernel to clean up if the thread dies after
3511  * acquiring the lock, but just before it could have added itself to
3512  * the list. There can only be one such pending lock.
3513  */
3514 
3515 /**
3516  * sys_set_robust_list() - Set the robust-futex list head of a task
3517  * @head:	pointer to the list-head
3518  * @len:	length of the list-head, as userspace expects
3519  */
SYSCALL_DEFINE2(set_robust_list,struct robust_list_head __user *,head,size_t,len)3520 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3521 		size_t, len)
3522 {
3523 	if (!futex_cmpxchg_enabled)
3524 		return -ENOSYS;
3525 	/*
3526 	 * The kernel knows only one size for now:
3527 	 */
3528 	if (unlikely(len != sizeof(*head)))
3529 		return -EINVAL;
3530 
3531 	current->robust_list = head;
3532 
3533 	return 0;
3534 }
3535 
3536 /**
3537  * sys_get_robust_list() - Get the robust-futex list head of a task
3538  * @pid:	pid of the process [zero for current task]
3539  * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
3540  * @len_ptr:	pointer to a length field, the kernel fills in the header size
3541  */
SYSCALL_DEFINE3(get_robust_list,int,pid,struct robust_list_head __user * __user *,head_ptr,size_t __user *,len_ptr)3542 SYSCALL_DEFINE3(get_robust_list, int, pid,
3543 		struct robust_list_head __user * __user *, head_ptr,
3544 		size_t __user *, len_ptr)
3545 {
3546 	struct robust_list_head __user *head;
3547 	unsigned long ret;
3548 	struct task_struct *p;
3549 
3550 	if (!futex_cmpxchg_enabled)
3551 		return -ENOSYS;
3552 
3553 	rcu_read_lock();
3554 
3555 	ret = -ESRCH;
3556 	if (!pid)
3557 		p = current;
3558 	else {
3559 		p = find_task_by_vpid(pid);
3560 		if (!p)
3561 			goto err_unlock;
3562 	}
3563 
3564 	ret = -EPERM;
3565 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3566 		goto err_unlock;
3567 
3568 	head = p->robust_list;
3569 	rcu_read_unlock();
3570 
3571 	if (put_user(sizeof(*head), len_ptr))
3572 		return -EFAULT;
3573 	return put_user(head, head_ptr);
3574 
3575 err_unlock:
3576 	rcu_read_unlock();
3577 
3578 	return ret;
3579 }
3580 
3581 /* Constants for the pending_op argument of handle_futex_death */
3582 #define HANDLE_DEATH_PENDING	true
3583 #define HANDLE_DEATH_LIST	false
3584 
3585 /*
3586  * Process a futex-list entry, check whether it's owned by the
3587  * dying task, and do notification if so:
 *
 * @uaddr:	user address of the futex word; must be aligned to its size
 * @curr:	the dying task whose TID is compared with the futex word
 * @pi:		true if the entry is a PI futex (no futex_wake() is done here)
 * @pending_op:	true when called for the list_op_pending entry
 *		(HANDLE_DEATH_PENDING), false for regular list entries
 *
 * Returns 0 when the entry was handled or is not owned by @curr, -1 on a
 * misaligned or inaccessible @uaddr, or the unexpected error code from
 * cmpxchg_futex_value_locked().
3588  */
handle_futex_death(u32 __user * uaddr,struct task_struct * curr,bool pi,bool pending_op)3589 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3590 			      bool pi, bool pending_op)
3591 {
3592 	u32 uval, uninitialized_var(nval), mval;
3593 	int err;
3594 
3595 	/* Futex address must be 32bit aligned */
3596 	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3597 		return -1;
3598 
3599 retry:
3600 	if (get_user(uval, uaddr))
3601 		return -1;
3602 
3603 	/*
3604 	 * Special case for regular (non PI) futexes. The unlock path in
3605 	 * user space has two race scenarios:
3606 	 *
3607 	 * 1. The unlock path releases the user space futex value and
3608 	 *    before it can execute the futex() syscall to wake up
3609 	 *    waiters it is killed.
3610 	 *
3611 	 * 2. A woken up waiter is killed before it can acquire the
3612 	 *    futex in user space.
3613 	 *
3614 	 * In both cases the TID validation below prevents a wakeup of
3615 	 * potential waiters which can cause these waiters to block
3616 	 * forever.
3617 	 *
3618 	 * In both cases the following conditions are met:
3619 	 *
3620 	 *	1) task->robust_list->list_op_pending != NULL
3621 	 *	   @pending_op == true
3622 	 *	2) User space futex value == 0
3623 	 *	3) Regular futex: @pi == false
3624 	 *
3625 	 * If these conditions are met, it is safe to attempt waking up a
3626 	 * potential waiter without touching the user space futex value and
3627 	 * trying to set the OWNER_DIED bit. The user space futex value is
3628 	 * uncontended and the rest of the user space mutex state is
3629 	 * consistent, so a woken waiter will just take over the
3630 	 * uncontended futex. Setting the OWNER_DIED bit would create
3631 	 * inconsistent state and malfunction of the user space owner died
3632 	 * handling.
3633 	 */
3634 	if (pending_op && !pi && !uval) {
3635 		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3636 		return 0;
3637 	}
3638 
	/* Not owned by the dying task: nothing to do for this entry. */
3639 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3640 		return 0;
3641 
3642 	/*
3643 	 * Ok, this dying thread is truly holding a futex
3644 	 * of interest. Set the OWNER_DIED bit atomically
3645 	 * via cmpxchg, and if the value had FUTEX_WAITERS
3646 	 * set, wake up a waiter (if any). (We have to do a
3647 	 * futex_wake() even if OWNER_DIED is already set -
3648 	 * to handle the rare but possible case of recursive
3649 	 * thread-death.) The rest of the cleanup is done in
3650 	 * userspace.
3651 	 */
3652 	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3653 
3654 	/*
3655 	 * We are not holding a lock here, but we want to have
3656 	 * the pagefault_disable/enable() protection because
3657 	 * we want to handle the fault gracefully. If the
3658 	 * access fails we try to fault in the futex with R/W
3659 	 * verification via get_user_pages. get_user() above
3660 	 * does not guarantee R/W access. If that fails we
3661 	 * give up and leave the futex locked.
3662 	 */
3663 	if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3664 		switch (err) {
3665 		case -EFAULT:
3666 			if (fault_in_user_writeable(uaddr))
3667 				return -1;
3668 			goto retry;
3669 
3670 		case -EAGAIN:
3671 			cond_resched();
3672 			goto retry;
3673 
3674 		default:
3675 			WARN_ON_ONCE(1);
3676 			return err;
3677 		}
3678 	}
3679 
	/* The futex word changed between the read and the cmpxchg: start over. */
3680 	if (nval != uval)
3681 		goto retry;
3682 
3683 	/*
3684 	 * Wake robust non-PI futexes here. The wakeup of
3685 	 * PI futexes happens in exit_pi_state():
3686 	 */
3687 	if (!pi && (uval & FUTEX_WAITERS))
3688 		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3689 
3690 	return 0;
3691 }
3692 
3693 /*
3694  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3695  */
fetch_robust_entry(struct robust_list __user ** entry,struct robust_list __user * __user * head,unsigned int * pi)3696 static inline int fetch_robust_entry(struct robust_list __user **entry,
3697 				     struct robust_list __user * __user *head,
3698 				     unsigned int *pi)
3699 {
3700 	unsigned long uentry;
3701 
3702 	if (get_user(uentry, (unsigned long __user *)head))
3703 		return -EFAULT;
3704 
3705 	*entry = (void __user *)(uentry & ~1UL);
3706 	*pi = uentry & 1;
3707 
3708 	return 0;
3709 }
3710 
3711 /*
3712  * Walk curr->robust_list (very carefully, it's a userspace list!)
3713  * and mark any locks found there dead, and notify any waiters.
3714  *
3715  * We silently return on any sign of list-walking problem.
 *
 * @curr:	the task whose robust list is walked
 *
 * Each entry is passed to handle_futex_death() at the entry's address plus
 * the futex_offset read from the list head. The walk visits at most
 * ROBUST_LIST_LIMIT entries. The list_op_pending entry, if any, is handled
 * last and is skipped while walking the list itself.
3716  */
exit_robust_list(struct task_struct * curr)3717 static void exit_robust_list(struct task_struct *curr)
3718 {
3719 	struct robust_list_head __user *head = curr->robust_list;
3720 	struct robust_list __user *entry, *next_entry, *pending;
3721 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3722 	unsigned int uninitialized_var(next_pi);
3723 	unsigned long futex_offset;
3724 	int rc;
3725 
3726 	if (!futex_cmpxchg_enabled)
3727 		return;
3728 
3729 	/*
3730 	 * Fetch the list head (which was registered earlier, via
3731 	 * sys_set_robust_list()):
3732 	 */
3733 	if (fetch_robust_entry(&entry, &head->list.next, &pi))
3734 		return;
3735 	/*
3736 	 * Fetch the relative futex offset:
3737 	 */
3738 	if (get_user(futex_offset, &head->futex_offset))
3739 		return;
3740 	/*
3741 	 * Fetch any possibly pending lock-add first, and handle it
3742 	 * if it exists:
3743 	 */
3744 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3745 		return;
3746 
3747 	next_entry = NULL;	/* avoid warning with gcc */
	/* The list is circular: it ends when an entry points back at the head. */
3748 	while (entry != &head->list) {
3749 		/*
3750 		 * Fetch the next entry in the list before calling
3751 		 * handle_futex_death:
3752 		 */
3753 		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3754 		/*
3755 		 * A pending lock might already be on the list, so
3756 		 * don't process it twice:
3757 		 */
3758 		if (entry != pending) {
3759 			if (handle_futex_death((void __user *)entry + futex_offset,
3760 						curr, pi, HANDLE_DEATH_LIST))
3761 				return;
3762 		}
		/* The current entry has been handled; stop if the next fetch failed. */
3763 		if (rc)
3764 			return;
3765 		entry = next_entry;
3766 		pi = next_pi;
3767 		/*
3768 		 * Avoid excessively long or circular lists:
3769 		 */
3770 		if (!--limit)
3771 			break;
3772 
3773 		cond_resched();
3774 	}
3775 
3776 	if (pending) {
3777 		handle_futex_death((void __user *)pending + futex_offset,
3778 				   curr, pip, HANDLE_DEATH_PENDING);
3779 	}
3780 }
3781 
futex_cleanup(struct task_struct * tsk)3782 static void futex_cleanup(struct task_struct *tsk)
3783 {
3784 	if (unlikely(tsk->robust_list)) {
3785 		exit_robust_list(tsk);
3786 		tsk->robust_list = NULL;
3787 	}
3788 
3789 #ifdef CONFIG_COMPAT
3790 	if (unlikely(tsk->compat_robust_list)) {
3791 		compat_exit_robust_list(tsk);
3792 		tsk->compat_robust_list = NULL;
3793 	}
3794 #endif
3795 
3796 	if (unlikely(!list_empty(&tsk->pi_state_list)))
3797 		exit_pi_state_list(tsk);
3798 }
3799 
3800 /**
3801  * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
3802  * @tsk:	task to set the state on
3803  *
3804  * Set the futex exit state of the task lockless. The futex waiter code
3805  * observes that state when a task is exiting and loops until the task has
3806  * actually finished the futex cleanup. The worst case for this is that the
3807  * waiter runs through the wait loop until the state becomes visible.
3808  *
3809  * This is called from the recursive fault handling path in do_exit().
3810  *
3811  * This is best effort. Either the futex exit code has run already or
3812  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3813  * take it over. If not, the problem is pushed back to user space. If the
3814  * futex exit code did not run yet, then an already queued waiter might
3815  * block forever, but there is nothing which can be done about that.
3816  */
futex_exit_recursive(struct task_struct * tsk)3817 void futex_exit_recursive(struct task_struct *tsk)
3818 {
3819 	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3820 	if (tsk->futex_state == FUTEX_STATE_EXITING)
3821 		mutex_unlock(&tsk->futex_exit_mutex);
3822 	tsk->futex_state = FUTEX_STATE_DEAD;
3823 }
3824 
futex_cleanup_begin(struct task_struct * tsk)3825 static void futex_cleanup_begin(struct task_struct *tsk)
3826 {
3827 	/*
3828 	 * Prevent various race issues against a concurrent incoming waiter
3829 	 * including live locks by forcing the waiter to block on
3830 	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3831 	 * attach_to_pi_owner().
3832 	 */
3833 	mutex_lock(&tsk->futex_exit_mutex);
3834 
3835 	/*
3836 	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3837 	 *
3838 	 * This ensures that all subsequent checks of tsk->futex_state in
3839 	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3840 	 * tsk->pi_lock held.
3841 	 *
3842 	 * It guarantees also that a pi_state which was queued right before
3843 	 * the state change under tsk->pi_lock by a concurrent waiter must
3844 	 * be observed in exit_pi_state_list().
3845 	 */
3846 	raw_spin_lock_irq(&tsk->pi_lock);
3847 	tsk->futex_state = FUTEX_STATE_EXITING;
3848 	raw_spin_unlock_irq(&tsk->pi_lock);
3849 }
3850 
futex_cleanup_end(struct task_struct * tsk,int state)3851 static void futex_cleanup_end(struct task_struct *tsk, int state)
3852 {
3853 	/*
3854 	 * Lockless store. The only side effect is that an observer might
3855 	 * take another loop until it becomes visible.
3856 	 */
3857 	tsk->futex_state = state;
3858 	/*
3859 	 * Drop the exit protection. This unblocks waiters which observed
3860 	 * FUTEX_STATE_EXITING to reevaluate the state.
3861 	 */
3862 	mutex_unlock(&tsk->futex_exit_mutex);
3863 }
3864 
futex_exec_release(struct task_struct * tsk)3865 void futex_exec_release(struct task_struct *tsk)
3866 {
3867 	/*
3868 	 * The state handling is done for consistency, but in the case of
3869 	 * exec() there is no way to prevent futher damage as the PID stays
3870 	 * the same. But for the unlikely and arguably buggy case that a
3871 	 * futex is held on exec(), this provides at least as much state
3872 	 * consistency protection which is possible.
3873 	 */
3874 	futex_cleanup_begin(tsk);
3875 	futex_cleanup(tsk);
3876 	/*
3877 	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
3878 	 * exec a new binary.
3879 	 */
3880 	futex_cleanup_end(tsk, FUTEX_STATE_OK);
3881 }
3882 
futex_exit_release(struct task_struct * tsk)3883 void futex_exit_release(struct task_struct *tsk)
3884 {
3885 	futex_cleanup_begin(tsk);
3886 	futex_cleanup(tsk);
3887 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3888 }
3889 
/*
 * do_futex - decode a futex operation and dispatch it to its handler
 * @uaddr:	user address of the (first) futex word
 * @op:		command in the FUTEX_CMD_MASK bits, optionally combined
 *		with FUTEX_PRIVATE_FLAG and FUTEX_CLOCK_REALTIME
 * @val:	command specific value
 * @timeout:	timeout for the commands which take one, or NULL
 * @uaddr2:	user address of the second futex word, where used
 * @val2:	command specific second value
 * @val3:	command specific third value; overwritten with
 *		FUTEX_BITSET_MATCH_ANY for the commands without a bitset
 *
 * Return: the result of the selected handler, or -ENOSYS when
 *  - the command is unknown,
 *  - FUTEX_CLOCK_REALTIME is combined with a command other than
 *    FUTEX_WAIT, FUTEX_WAIT_BITSET or FUTEX_WAIT_REQUEUE_PI,
 *  - a PI command is requested while futex_cmpxchg_enabled is not set.
 */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
		u32 __user *uaddr2, u32 val2, u32 val3)
{
	int cmd = op & FUTEX_CMD_MASK;
	unsigned int flags = 0;

	/* Futexes are shared unless user space explicitly says private */
	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;

	if (op & FUTEX_CLOCK_REALTIME) {
		flags |= FLAGS_CLOCKRT;
		if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
		    cmd != FUTEX_WAIT_REQUEUE_PI)
			return -ENOSYS;
	}

	/* All PI commands depend on a working futex cmpxchg */
	switch (cmd) {
	case FUTEX_LOCK_PI:
	case FUTEX_UNLOCK_PI:
	case FUTEX_TRYLOCK_PI:
	case FUTEX_WAIT_REQUEUE_PI:
	case FUTEX_CMP_REQUEUE_PI:
		if (!futex_cmpxchg_enabled)
			return -ENOSYS;
		break;
	default:
		break;
	}

	switch (cmd) {
	case FUTEX_WAIT:
		val3 = FUTEX_BITSET_MATCH_ANY;
		/* fall through */
	case FUTEX_WAIT_BITSET:
		return futex_wait(uaddr, flags, val, timeout, val3);
	case FUTEX_WAKE:
		val3 = FUTEX_BITSET_MATCH_ANY;
		/* fall through */
	case FUTEX_WAKE_BITSET:
		return futex_wake(uaddr, flags, val, val3);
	case FUTEX_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
	case FUTEX_CMP_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
	case FUTEX_WAKE_OP:
		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
	case FUTEX_LOCK_PI:
		return futex_lock_pi(uaddr, flags, timeout, 0);
	case FUTEX_UNLOCK_PI:
		return futex_unlock_pi(uaddr, flags);
	case FUTEX_TRYLOCK_PI:
		return futex_lock_pi(uaddr, flags, NULL, 1);
	case FUTEX_WAIT_REQUEUE_PI:
		val3 = FUTEX_BITSET_MATCH_ANY;
		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
					     uaddr2);
	case FUTEX_CMP_REQUEUE_PI:
		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
	}

	/* Unknown command */
	return -ENOSYS;
}
3946 
3947 
/*
 * futex() system call entry: convert the multiplexed @utime argument and
 * hand everything over to do_futex().
 *
 * Depending on the command, @utime is either a pointer to a user space
 * timespec (wait and lock commands) or an integer smuggled through the
 * pointer argument (requeue and wake-op commands).
 */
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
		struct timespec __user *, utime, u32 __user *, uaddr2,
		u32, val3)
{
	const int cmd = op & FUTEX_CMD_MASK;
	ktime_t *tp = NULL;
	struct timespec ts;
	u32 val2 = 0;
	ktime_t t;

	switch (cmd) {
	case FUTEX_WAIT:
	case FUTEX_LOCK_PI:
	case FUTEX_WAIT_BITSET:
	case FUTEX_WAIT_REQUEUE_PI:
		/* These commands take an optional timeout in 'utime' */
		if (!utime)
			break;

		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
			return -EFAULT;
		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
			return -EFAULT;
		if (!timespec_valid(&ts))
			return -EINVAL;

		t = timespec_to_ktime(ts);
		/*
		 * Only FUTEX_WAIT is converted here by adding the current
		 * ktime_get() time; the other commands pass the value on
		 * unchanged.
		 *
		 * NOTE(review): ktime_get() is also used when
		 * FUTEX_CLOCK_REALTIME is set in 'op' - confirm that
		 * futex_wait() copes with that time base.
		 */
		if (cmd == FUTEX_WAIT)
			t = ktime_add_safe(ktime_get(), t);
		tp = &t;
		break;

	case FUTEX_REQUEUE:
	case FUTEX_CMP_REQUEUE:
	case FUTEX_CMP_REQUEUE_PI:
	case FUTEX_WAKE_OP:
		/*
		 * 'utime' carries the requeue parameter for the
		 * FUTEX_*_REQUEUE_* commands and the number of waiters to
		 * wake for FUTEX_WAKE_OP.
		 */
		val2 = (u32) (unsigned long) utime;
		break;

	default:
		break;
	}

	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
3982 
3983 #ifdef CONFIG_COMPAT
3984 /*
3985  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3986  */
3987 static inline int
compat_fetch_robust_entry(compat_uptr_t * uentry,struct robust_list __user ** entry,compat_uptr_t __user * head,unsigned int * pi)3988 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
3989 		   compat_uptr_t __user *head, unsigned int *pi)
3990 {
3991 	if (get_user(*uentry, head))
3992 		return -EFAULT;
3993 
3994 	*entry = compat_ptr((*uentry) & ~1);
3995 	*pi = (unsigned int)(*uentry) & 1;
3996 
3997 	return 0;
3998 }
3999 
futex_uaddr(struct robust_list __user * entry,compat_long_t futex_offset)4000 static void __user *futex_uaddr(struct robust_list __user *entry,
4001 				compat_long_t futex_offset)
4002 {
4003 	compat_uptr_t base = ptr_to_compat(entry);
4004 	void __user *uaddr = compat_ptr(base + futex_offset);
4005 
4006 	return uaddr;
4007 }
4008 
4009 /*
4010  * Walk curr->robust_list (very carefully, it's a userspace list!)
4011  * and mark any locks found there dead, and notify any waiters.
4012  *
4013  * We silently return on any sign of list-walking problem.
4014  */
compat_exit_robust_list(struct task_struct * curr)4015 static void compat_exit_robust_list(struct task_struct *curr)
4016 {
4017 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
4018 	struct robust_list __user *entry, *next_entry, *pending;
4019 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
4020 	unsigned int uninitialized_var(next_pi);
4021 	compat_uptr_t uentry, next_uentry, upending;
4022 	compat_long_t futex_offset;
4023 	int rc;
4024 
4025 	if (!futex_cmpxchg_enabled)
4026 		return;
4027 
4028 	/*
4029 	 * Fetch the list head (which was registered earlier, via
4030 	 * sys_set_robust_list()):
4031 	 */
4032 	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
4033 		return;
4034 	/*
4035 	 * Fetch the relative futex offset:
4036 	 */
4037 	if (get_user(futex_offset, &head->futex_offset))
4038 		return;
4039 	/*
4040 	 * Fetch any possibly pending lock-add first, and handle it
4041 	 * if it exists:
4042 	 */
4043 	if (compat_fetch_robust_entry(&upending, &pending,
4044 			       &head->list_op_pending, &pip))
4045 		return;
4046 
4047 	next_entry = NULL;	/* avoid warning with gcc */
4048 	while (entry != (struct robust_list __user *) &head->list) {
4049 		/*
4050 		 * Fetch the next entry in the list before calling
4051 		 * handle_futex_death:
4052 		 */
4053 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
4054 			(compat_uptr_t __user *)&entry->next, &next_pi);
4055 		/*
4056 		 * A pending lock might already be on the list, so
4057 		 * dont process it twice:
4058 		 */
4059 		if (entry != pending) {
4060 			void __user *uaddr = futex_uaddr(entry, futex_offset);
4061 
4062 			if (handle_futex_death(uaddr, curr, pi,
4063 					       HANDLE_DEATH_LIST))
4064 				return;
4065 		}
4066 		if (rc)
4067 			return;
4068 		uentry = next_uentry;
4069 		entry = next_entry;
4070 		pi = next_pi;
4071 		/*
4072 		 * Avoid excessively long or circular lists:
4073 		 */
4074 		if (!--limit)
4075 			break;
4076 
4077 		cond_resched();
4078 	}
4079 	if (pending) {
4080 		void __user *uaddr = futex_uaddr(pending, futex_offset);
4081 
4082 		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
4083 	}
4084 }
4085 
/*
 * Compat variant of set_robust_list(): register @head as the compat
 * robust list head of the calling task.
 *
 * Returns 0 on success, -ENOSYS when futex_cmpxchg_enabled is not set and
 * -EINVAL when @len is not the size of the compat list head.
 */
COMPAT_SYSCALL_DEFINE2(set_robust_list,
		struct compat_robust_list_head __user *, head,
		compat_size_t, len)
{
	if (!futex_cmpxchg_enabled)
		return -ENOSYS;

	/* Only the exact size of the head structure is accepted */
	if (unlikely(len != sizeof(struct compat_robust_list_head)))
		return -EINVAL;

	current->compat_robust_list = head;
	return 0;
}
4100 
/*
 * Compat variant of get_robust_list(): report the compat robust list head
 * of a task.
 *
 * @pid:	task to query; 0 selects the calling task
 * @head_ptr:	user pointer which receives the list head address
 * @len_ptr:	user pointer which receives the size of the list head
 *
 * Returns -ENOSYS when futex_cmpxchg_enabled is not set, -ESRCH when no
 * task with @pid is found, -EPERM when ptrace_may_access() denies read
 * access to the task, -EFAULT when @len_ptr cannot be written, and
 * otherwise the result of storing the head address through @head_ptr.
 */
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
			compat_uptr_t __user *, head_ptr,
			compat_size_t __user *, len_ptr)
{
	struct compat_robust_list_head __user *head;
	struct task_struct *p;
	/* Holds negative errno values, hence signed */
	long ret;

	if (!futex_cmpxchg_enabled)
		return -ENOSYS;

	/* The task lookup and the accesses to the task run under RCU */
	rcu_read_lock();

	ret = -ESRCH;
	if (!pid)
		p = current;
	else {
		p = find_task_by_vpid(pid);
		if (!p)
			goto err_unlock;
	}

	ret = -EPERM;
	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
		goto err_unlock;

	head = p->compat_robust_list;
	rcu_read_unlock();

	/* The user space accesses are done outside the RCU read section */
	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(ptr_to_compat(head), head_ptr);

err_unlock:
	rcu_read_unlock();

	return ret;
}
4139 
/*
 * Compat futex() system call entry: convert the multiplexed @utime
 * argument and hand everything over to do_futex().
 *
 * Depending on the command, @utime is either a pointer to a user space
 * compat_timespec (wait and lock commands) or an integer smuggled through
 * the pointer argument (requeue and wake-op commands).
 */
COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
		struct compat_timespec __user *, utime, u32 __user *, uaddr2,
		u32, val3)
{
	struct timespec ts;
	ktime_t t, *tp = NULL;
	/* Same type as the val2 parameter of do_futex() */
	u32 val2 = 0;
	int cmd = op & FUTEX_CMD_MASK;

	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
		      cmd == FUTEX_WAIT_BITSET ||
		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
		if (compat_get_timespec(&ts, utime))
			return -EFAULT;
		if (!timespec_valid(&ts))
			return -EINVAL;

		t = timespec_to_ktime(ts);
		/*
		 * Only FUTEX_WAIT is converted here by adding the current
		 * ktime_get() time; the other commands pass the value on
		 * unchanged.
		 */
		if (cmd == FUTEX_WAIT)
			t = ktime_add_safe(ktime_get(), t);
		tp = &t;
	}
	/*
	 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
	 */
	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
		val2 = (u32) (unsigned long) utime;

	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
4168 #endif /* CONFIG_COMPAT */
4169 
futex_detect_cmpxchg(void)4170 static void __init futex_detect_cmpxchg(void)
4171 {
4172 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
4173 	u32 curval;
4174 
4175 	/*
4176 	 * This will fail and we want it. Some arch implementations do
4177 	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
4178 	 * functionality. We want to know that before we call in any
4179 	 * of the complex code paths. Also we want to prevent
4180 	 * registration of robust lists in that case. NULL is
4181 	 * guaranteed to fault and we get -EFAULT on functional
4182 	 * implementation, the non-functional ones will return
4183 	 * -ENOSYS.
4184 	 */
4185 	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4186 		futex_cmpxchg_enabled = 1;
4187 #endif
4188 }
4189 
futex_init(void)4190 static int __init futex_init(void)
4191 {
4192 	unsigned int futex_shift;
4193 	unsigned long i;
4194 
4195 #if CONFIG_BASE_SMALL
4196 	futex_hashsize = 16;
4197 #else
4198 	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4199 #endif
4200 
4201 	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4202 					       futex_hashsize, 0,
4203 					       futex_hashsize < 256 ? HASH_SMALL : 0,
4204 					       &futex_shift, NULL,
4205 					       futex_hashsize, futex_hashsize);
4206 	futex_hashsize = 1UL << futex_shift;
4207 
4208 	futex_detect_cmpxchg();
4209 
4210 	for (i = 0; i < futex_hashsize; i++) {
4211 		atomic_set(&futex_queues[i].waiters, 0);
4212 		plist_head_init(&futex_queues[i].chain);
4213 		spin_lock_init(&futex_queues[i].lock);
4214 	}
4215 
4216 	return 0;
4217 }
4218 core_initcall(futex_init);
4219