1 /*
2 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002
4 *
5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7 *
8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier
10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27 * enough at me, Linus for the original (flawed) idea, Matthew
28 * Kirkwood for proof-of-concept implementation.
29 *
30 * "The futexes are also cursed."
31 * "But they come in a choice of three flavours!"
32 *
33 * This program is free software; you can redistribute it and/or modify
34 * it under the terms of the GNU General Public License as published by
35 * the Free Software Foundation; either version 2 of the License, or
36 * (at your option) any later version.
37 *
38 * This program is distributed in the hope that it will be useful,
39 * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
41 * GNU General Public License for more details.
42 *
43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46 */
47 #include <linux/slab.h>
48 #include <linux/poll.h>
49 #include <linux/fs.h>
50 #include <linux/file.h>
51 #include <linux/jhash.h>
52 #include <linux/init.h>
53 #include <linux/futex.h>
54 #include <linux/mount.h>
55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h>
57 #include <linux/signal.h>
58 #include <linux/export.h>
59 #include <linux/magic.h>
60 #include <linux/pid.h>
61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h>
64 #include <linux/freezer.h>
65
66 #include <asm/futex.h>
67
68 #include "rtmutex_common.h"
69
70 int __read_mostly futex_cmpxchg_enabled;
71
72 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
73
74 /*
75 * Futex flags used to encode options to functions and preserve them across
76 * restarts.
77 */
78 #define FLAGS_SHARED 0x01
79 #define FLAGS_CLOCKRT 0x02
80 #define FLAGS_HAS_TIMEOUT 0x04
81
82 /*
83 * Priority Inheritance state:
84 */
85 struct futex_pi_state {
86 /*
87 * list of 'owned' pi_state instances - these have to be
88 * cleaned up in do_exit() if the task exits prematurely:
89 */
90 struct list_head list;
91
92 /*
93 * The PI object:
94 */
95 struct rt_mutex pi_mutex;
96
97 struct task_struct *owner;
98 atomic_t refcount;
99
100 union futex_key key;
101 };
102
103 /**
104 * struct futex_q - The hashed futex queue entry, one per waiting task
105 * @list: priority-sorted list of tasks waiting on this futex
106 * @task: the task waiting on the futex
107 * @lock_ptr: the hash bucket lock
108 * @key: the key the futex is hashed on
109 * @pi_state: optional priority inheritance state
110 * @rt_waiter: rt_waiter storage for use with requeue_pi
111 * @requeue_pi_key: the requeue_pi target futex key
112 * @bitset: bitset for the optional bitmasked wakeup
113 *
114 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
115 * we can wake only the relevant ones (hashed queues may be shared).
116 *
117 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
118 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
119 * The order of wakeup is always to make the first condition true, then
120 * the second.
121 *
122 * PI futexes are typically woken before they are removed from the hash list via
123 * the rt_mutex code. See unqueue_me_pi().
124 */
125 struct futex_q {
126 struct plist_node list;
127
128 struct task_struct *task;
129 spinlock_t *lock_ptr;
130 union futex_key key;
131 struct futex_pi_state *pi_state;
132 struct rt_mutex_waiter *rt_waiter;
133 union futex_key *requeue_pi_key;
134 u32 bitset;
135 };
136
137 static const struct futex_q futex_q_init = {
138 /* list gets initialized in queue_me()*/
139 .key = FUTEX_KEY_INIT,
140 .bitset = FUTEX_BITSET_MATCH_ANY
141 };
142
143 /*
144 * Hash buckets are shared by all the futex_keys that hash to the same
145 * location. Each key may have multiple futex_q structures, one for each task
146 * waiting on a futex.
147 */
148 struct futex_hash_bucket {
149 spinlock_t lock;
150 struct plist_head chain;
151 };
152
153 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
154
155 /*
156 * We hash on the keys returned from get_futex_key (see below).
157 */
158 static struct futex_hash_bucket *hash_futex(union futex_key *key)
159 {
160 u32 hash = jhash2((u32*)&key->both.word,
161 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
162 key->both.offset);
163 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
164 }
165
166 /*
167 * Return 1 if two futex_keys are equal, 0 otherwise.
168 */
169 static inline int match_futex(union futex_key *key1, union futex_key *key2)
170 {
171 return (key1 && key2
172 && key1->both.word == key2->both.word
173 && key1->both.ptr == key2->both.ptr
174 && key1->both.offset == key2->both.offset);
175 }
176
177 /*
178 * Take a reference to the resource addressed by a key.
179 * Can be called while holding spinlocks.
180 *
181 */
182 static void get_futex_key_refs(union futex_key *key)
183 {
184 if (!key->both.ptr)
185 return;
186
187 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
188 case FUT_OFF_INODE:
189 ihold(key->shared.inode);
190 break;
191 case FUT_OFF_MMSHARED:
192 atomic_inc(&key->private.mm->mm_count);
193 break;
194 }
195 }
196
197 /*
198 * Drop a reference to the resource addressed by a key.
199 * The hash bucket spinlock must not be held.
200 */
201 static void drop_futex_key_refs(union futex_key *key)
202 {
203 if (!key->both.ptr) {
204 /* If we're here then we tried to put a key we failed to get */
205 WARN_ON_ONCE(1);
206 return;
207 }
208
209 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
210 case FUT_OFF_INODE:
211 iput(key->shared.inode);
212 break;
213 case FUT_OFF_MMSHARED:
214 mmdrop(key->private.mm);
215 break;
216 }
217 }
218
219 /**
220 * get_futex_key() - Get parameters which are the keys for a futex
221 * @uaddr: virtual address of the futex
222 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
223 * @key: address where result is stored.
224 * @rw: mapping needs to be read/write (values: VERIFY_READ,
225 * VERIFY_WRITE)
226 *
227 * Return: a negative error code or 0
228 *
229 * The key words are stored in *key on success.
230 *
231 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
232 * offset_within_page). For private mappings, it's (uaddr, current->mm).
233 * We can usually work out the index without swapping in the page.
234 *
235 * lock_page() might sleep, the caller should not hold a spinlock.
236 */
237 static int
238 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 {
240 unsigned long address = (unsigned long)uaddr;
241 struct mm_struct *mm = current->mm;
242 struct page *page, *page_head;
243 int err, ro = 0;
244
245 /*
246 * The futex address must be "naturally" aligned.
247 */
248 key->both.offset = address % PAGE_SIZE;
249 if (unlikely((address % sizeof(u32)) != 0))
250 return -EINVAL;
251 address -= key->both.offset;
252
253 /*
254 * PROCESS_PRIVATE futexes are fast.
255  * As the mm cannot disappear under us and the 'key' only needs the
256  * virtual address, we don't even have to find the underlying vma.
257  * Note: we do have to check that 'uaddr' is a valid user address,
258  * but access_ok() should be faster than find_vma().
259 */
260 if (!fshared) {
261 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
262 return -EFAULT;
263 key->private.mm = mm;
264 key->private.address = address;
265 get_futex_key_refs(key);
266 return 0;
267 }
268
269 again:
270 err = get_user_pages_fast(address, 1, 1, &page);
271 /*
272 * If write access is not required (eg. FUTEX_WAIT), try
273 * and get read-only access.
274 */
275 if (err == -EFAULT && rw == VERIFY_READ) {
276 err = get_user_pages_fast(address, 1, 0, &page);
277 ro = 1;
278 }
279 if (err < 0)
280 return err;
281 else
282 err = 0;
283
284 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
285 page_head = page;
286 if (unlikely(PageTail(page))) {
287 put_page(page);
288 /* serialize against __split_huge_page_splitting() */
289 local_irq_disable();
290 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
291 page_head = compound_head(page);
292 /*
293 * page_head is valid pointer but we must pin
294 * it before taking the PG_lock and/or
295 * PG_compound_lock. The moment we re-enable
296 * irqs __split_huge_page_splitting() can
297 * return and the head page can be freed from
298 * under us. We can't take the PG_lock and/or
299 * PG_compound_lock on a page that could be
300 * freed from under us.
301 */
302 if (page != page_head) {
303 get_page(page_head);
304 put_page(page);
305 }
306 local_irq_enable();
307 } else {
308 local_irq_enable();
309 goto again;
310 }
311 }
312 #else
313 page_head = compound_head(page);
314 if (page != page_head) {
315 get_page(page_head);
316 put_page(page);
317 }
318 #endif
319
320 lock_page(page_head);
321
322 /*
323 * If page_head->mapping is NULL, then it cannot be a PageAnon
324 * page; but it might be the ZERO_PAGE or in the gate area or
325 * in a special mapping (all cases which we are happy to fail);
326 * or it may have been a good file page when get_user_pages_fast
327 * found it, but truncated or holepunched or subjected to
328 * invalidate_complete_page2 before we got the page lock (also
329 * cases which we are happy to fail). And we hold a reference,
330 * so refcount care in invalidate_complete_page's remove_mapping
331 * prevents drop_caches from setting mapping to NULL beneath us.
332 *
333 * The case we do have to guard against is when memory pressure made
334 * shmem_writepage move it from filecache to swapcache beneath us:
335 * an unlikely race, but we do need to retry for page_head->mapping.
336 */
337 if (!page_head->mapping) {
338 int shmem_swizzled = PageSwapCache(page_head);
339 unlock_page(page_head);
340 put_page(page_head);
341 if (shmem_swizzled)
342 goto again;
343 return -EFAULT;
344 }
345
346 /*
347 * Private mappings are handled in a simple way.
348 *
349 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
350 * it's a read-only handle, it's expected that futexes attach to
351 * the object not the particular process.
352 */
353 if (PageAnon(page_head)) {
354 /*
355 * A RO anonymous page will never change and thus doesn't make
356 * sense for futex operations.
357 */
358 if (ro) {
359 err = -EFAULT;
360 goto out;
361 }
362
363 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
364 key->private.mm = mm;
365 key->private.address = address;
366 } else {
367 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
368 key->shared.inode = page_head->mapping->host;
369 key->shared.pgoff = page_head->index;
370 }
371
372 get_futex_key_refs(key);
373
374 out:
375 unlock_page(page_head);
376 put_page(page_head);
377 return err;
378 }
379
380 static inline void put_futex_key(union futex_key *key)
381 {
382 drop_futex_key_refs(key);
383 }
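
/*
 * Illustrative sketch (not kernel code): how the two key flavours above
 * look from user space. A futex word in ordinary, process-private memory
 * is keyed on (current->mm, address), while a word living in a MAP_SHARED
 * mapping is keyed on (inode, pgoff, offset) so that unrelated processes
 * mapping the same file hash to the same bucket. The futex() wrapper and
 * variable names below are hypothetical helpers, shown only to make the
 * FUTEX_PRIVATE_FLAG distinction concrete:
 *
 *	static int futex(u32 *uaddr, int op, u32 val,
 *			 const struct timespec *timeout, u32 *uaddr2, u32 val3)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
 *	}
 *
 *	u32 *priv = &some_process_local_word;		// private key: (mm, address)
 *	u32 *shrd = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);		// shared key: (inode, pgoff)
 *
 *	futex(priv, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, NULL, NULL, 0);
 *	futex(shrd, FUTEX_WAIT, 0, NULL, NULL, 0);	// no private flag
 */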
384
385 /**
386 * fault_in_user_writeable() - Fault in user address and verify RW access
387 * @uaddr: pointer to faulting user space address
388 *
389 * Slow path to fixup the fault we just took in the atomic write
390 * access to @uaddr.
391 *
392 * We have no generic implementation of a non-destructive write to the
393 * user address. We know that we faulted in the atomic pagefault
394 * disabled section so we can as well avoid the #PF overhead by
395 * calling get_user_pages() right away.
396 */
397 static int fault_in_user_writeable(u32 __user *uaddr)
398 {
399 struct mm_struct *mm = current->mm;
400 int ret;
401
402 down_read(&mm->mmap_sem);
403 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
404 FAULT_FLAG_WRITE);
405 up_read(&mm->mmap_sem);
406
407 return ret < 0 ? ret : 0;
408 }
409
410 /**
411 * futex_top_waiter() - Return the highest priority waiter on a futex
412 * @hb: the hash bucket the futex_q's reside in
413 * @key: the futex key (to distinguish it from other futex futex_q's)
414 *
415 * Must be called with the hb lock held.
416 */
417 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
418 union futex_key *key)
419 {
420 struct futex_q *this;
421
422 plist_for_each_entry(this, &hb->chain, list) {
423 if (match_futex(&this->key, key))
424 return this;
425 }
426 return NULL;
427 }
428
429 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
430 u32 uval, u32 newval)
431 {
432 int ret;
433
434 pagefault_disable();
435 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
436 pagefault_enable();
437
438 return ret;
439 }
440
441 static int get_futex_value_locked(u32 *dest, u32 __user *from)
442 {
443 int ret;
444
445 pagefault_disable();
446 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
447 pagefault_enable();
448
449 return ret ? -EFAULT : 0;
450 }
451
452
453 /*
454 * PI code:
455 */
456 static int refill_pi_state_cache(void)
457 {
458 struct futex_pi_state *pi_state;
459
460 if (likely(current->pi_state_cache))
461 return 0;
462
463 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
464
465 if (!pi_state)
466 return -ENOMEM;
467
468 INIT_LIST_HEAD(&pi_state->list);
469 /* pi_mutex gets initialized later */
470 pi_state->owner = NULL;
471 atomic_set(&pi_state->refcount, 1);
472 pi_state->key = FUTEX_KEY_INIT;
473
474 current->pi_state_cache = pi_state;
475
476 return 0;
477 }
478
479 static struct futex_pi_state * alloc_pi_state(void)
480 {
481 struct futex_pi_state *pi_state = current->pi_state_cache;
482
483 WARN_ON(!pi_state);
484 current->pi_state_cache = NULL;
485
486 return pi_state;
487 }
488
489 static void free_pi_state(struct futex_pi_state *pi_state)
490 {
491 if (!atomic_dec_and_test(&pi_state->refcount))
492 return;
493
494 /*
495 * If pi_state->owner is NULL, the owner is most probably dying
496 * and has cleaned up the pi_state already
497 */
498 if (pi_state->owner) {
499 raw_spin_lock_irq(&pi_state->owner->pi_lock);
500 list_del_init(&pi_state->list);
501 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
502
503 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
504 }
505
506 if (current->pi_state_cache)
507 kfree(pi_state);
508 else {
509 /*
510 * pi_state->list is already empty.
511 * clear pi_state->owner.
512 * refcount is at 0 - put it back to 1.
513 */
514 pi_state->owner = NULL;
515 atomic_set(&pi_state->refcount, 1);
516 current->pi_state_cache = pi_state;
517 }
518 }
519
520 /*
521 * Look up the task based on what TID userspace gave us.
522  * We don't trust it.
523 */
524 static struct task_struct * futex_find_get_task(pid_t pid)
525 {
526 struct task_struct *p;
527
528 rcu_read_lock();
529 p = find_task_by_vpid(pid);
530 if (p)
531 get_task_struct(p);
532
533 rcu_read_unlock();
534
535 return p;
536 }
537
538 /*
539 * This task is holding PI mutexes at exit time => bad.
540 * Kernel cleans up PI-state, but userspace is likely hosed.
541 * (Robust-futex cleanup is separate and might save the day for userspace.)
542 */
543 void exit_pi_state_list(struct task_struct *curr)
544 {
545 struct list_head *next, *head = &curr->pi_state_list;
546 struct futex_pi_state *pi_state;
547 struct futex_hash_bucket *hb;
548 union futex_key key = FUTEX_KEY_INIT;
549
550 if (!futex_cmpxchg_enabled)
551 return;
552 /*
553 * We are a ZOMBIE and nobody can enqueue itself on
554 * pi_state_list anymore, but we have to be careful
555 * versus waiters unqueueing themselves:
556 */
557 raw_spin_lock_irq(&curr->pi_lock);
558 while (!list_empty(head)) {
559
560 next = head->next;
561 pi_state = list_entry(next, struct futex_pi_state, list);
562 key = pi_state->key;
563 hb = hash_futex(&key);
564 raw_spin_unlock_irq(&curr->pi_lock);
565
566 spin_lock(&hb->lock);
567
568 raw_spin_lock_irq(&curr->pi_lock);
569 /*
570 * We dropped the pi-lock, so re-check whether this
571 * task still owns the PI-state:
572 */
573 if (head->next != next) {
574 spin_unlock(&hb->lock);
575 continue;
576 }
577
578 WARN_ON(pi_state->owner != curr);
579 WARN_ON(list_empty(&pi_state->list));
580 list_del_init(&pi_state->list);
581 pi_state->owner = NULL;
582 raw_spin_unlock_irq(&curr->pi_lock);
583
584 rt_mutex_unlock(&pi_state->pi_mutex);
585
586 spin_unlock(&hb->lock);
587
588 raw_spin_lock_irq(&curr->pi_lock);
589 }
590 raw_spin_unlock_irq(&curr->pi_lock);
591 }
592
593 /*
594 * We need to check the following states:
595 *
596 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
597 *
598 * [1] NULL | --- | --- | 0 | 0/1 | Valid
599 * [2] NULL | --- | --- | >0 | 0/1 | Valid
600 *
601 * [3] Found | NULL | -- | Any | 0/1 | Invalid
602 *
603 * [4] Found | Found | NULL | 0 | 1 | Valid
604 * [5] Found | Found | NULL | >0 | 1 | Invalid
605 *
606 * [6] Found | Found | task | 0 | 1 | Valid
607 *
608 * [7] Found | Found | NULL | Any | 0 | Invalid
609 *
610 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
611 * [9] Found | Found | task | 0 | 0 | Invalid
612 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
613 *
614 * [1] Indicates that the kernel can acquire the futex atomically. We
615  *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
616 *
617 * [2] Valid, if TID does not belong to a kernel thread. If no matching
618 * thread is found then it indicates that the owner TID has died.
619 *
620 * [3] Invalid. The waiter is queued on a non PI futex
621 *
622 * [4] Valid state after exit_robust_list(), which sets the user space
623 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
624 *
625 * [5] The user space value got manipulated between exit_robust_list()
626 * and exit_pi_state_list()
627 *
628 * [6] Valid state after exit_pi_state_list() which sets the new owner in
629 * the pi_state but cannot access the user space value.
630 *
631 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
632 *
633 * [8] Owner and user space value match
634 *
635 * [9] There is no transient state which sets the user space TID to 0
636 * except exit_robust_list(), but this is indicated by the
637 * FUTEX_OWNER_DIED bit. See [4]
638 *
639 * [10] There is no transient state which leaves owner and user space
640 * TID out of sync.
641 */
642 static int
643 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
644 union futex_key *key, struct futex_pi_state **ps)
645 {
646 struct futex_pi_state *pi_state = NULL;
647 struct futex_q *this, *next;
648 struct plist_head *head;
649 struct task_struct *p;
650 pid_t pid = uval & FUTEX_TID_MASK;
651
652 head = &hb->chain;
653
654 plist_for_each_entry_safe(this, next, head, list) {
655 if (match_futex(&this->key, key)) {
656 /*
657 * Sanity check the waiter before increasing
658 * the refcount and attaching to it.
659 */
660 pi_state = this->pi_state;
661 /*
662 * Userspace might have messed up non-PI and
663 * PI futexes [3]
664 */
665 if (unlikely(!pi_state))
666 return -EINVAL;
667
668 WARN_ON(!atomic_read(&pi_state->refcount));
669
670 /*
671 * Handle the owner died case:
672 */
673 if (uval & FUTEX_OWNER_DIED) {
674 /*
675 * exit_pi_state_list sets owner to NULL and
676 * wakes the topmost waiter. The task which
677 * acquires the pi_state->rt_mutex will fixup
678 * owner.
679 */
680 if (!pi_state->owner) {
681 /*
682 * No pi state owner, but the user
683 * space TID is not 0. Inconsistent
684 * state. [5]
685 */
686 if (pid)
687 return -EINVAL;
688 /*
689 * Take a ref on the state and
690 * return. [4]
691 */
692 goto out_state;
693 }
694
695 /*
696 * If TID is 0, then either the dying owner
697 * has not yet executed exit_pi_state_list()
698 * or some waiter acquired the rtmutex in the
699 * pi state, but did not yet fixup the TID in
700 * user space.
701 *
702 * Take a ref on the state and return. [6]
703 */
704 if (!pid)
705 goto out_state;
706 } else {
707 /*
708 * If the owner died bit is not set,
709 * then the pi_state must have an
710 * owner. [7]
711 */
712 if (!pi_state->owner)
713 return -EINVAL;
714 }
715
716 /*
717 * Bail out if user space manipulated the
718 * futex value. If pi state exists then the
719 * owner TID must be the same as the user
720 * space TID. [9/10]
721 */
722 if (pid != task_pid_vnr(pi_state->owner))
723 return -EINVAL;
724
725 out_state:
726 atomic_inc(&pi_state->refcount);
727 *ps = pi_state;
728 return 0;
729 }
730 }
731
732 /*
733 * We are the first waiter - try to look up the real owner and attach
734 * the new pi_state to it, but bail out when TID = 0 [1]
735 */
736 if (!pid)
737 return -ESRCH;
738 p = futex_find_get_task(pid);
739 if (!p)
740 return -ESRCH;
741
742 /*
743 	 * We need to look at the task state flags to figure out whether
744 	 * the task is exiting. To protect against do_exit() changing
745 	 * the task flags, we do this under
746 	 * p->pi_lock:
747 */
748 raw_spin_lock_irq(&p->pi_lock);
749 if (unlikely(p->flags & PF_EXITING)) {
750 /*
751 * The task is on the way out. When PF_EXITPIDONE is
752 * set, we know that the task has finished the
753 * cleanup:
754 */
755 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
756
757 raw_spin_unlock_irq(&p->pi_lock);
758 put_task_struct(p);
759 return ret;
760 }
761
762 /*
763 * No existing pi state. First waiter. [2]
764 */
765 pi_state = alloc_pi_state();
766
767 /*
768 * Initialize the pi_mutex in locked state and make 'p'
769 * the owner of it:
770 */
771 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
772
773 /* Store the key for possible exit cleanups: */
774 pi_state->key = *key;
775
776 WARN_ON(!list_empty(&pi_state->list));
777 list_add(&pi_state->list, &p->pi_state_list);
778 pi_state->owner = p;
779 raw_spin_unlock_irq(&p->pi_lock);
780
781 put_task_struct(p);
782
783 *ps = pi_state;
784
785 return 0;
786 }
787
788 /**
789 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
790 * @uaddr: the pi futex user address
791 * @hb: the pi futex hash bucket
792 * @key: the futex key associated with uaddr and hb
793 * @ps: the pi_state pointer where we store the result of the
794 * lookup
795 * @task: the task to perform the atomic lock work for. This will
796 * be "current" except in the case of requeue pi.
797 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
798 *
799 * Return:
800 * 0 - ready to wait;
801 * 1 - acquired the lock;
802 * <0 - error
803 *
804 * The hb->lock and futex_key refs shall be held by the caller.
805 */
806 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
807 union futex_key *key,
808 struct futex_pi_state **ps,
809 struct task_struct *task, int set_waiters)
810 {
811 int lock_taken, ret, force_take = 0;
812 u32 uval, newval, curval, vpid = task_pid_vnr(task);
813
814 retry:
815 ret = lock_taken = 0;
816
817 /*
818 * To avoid races, we attempt to take the lock here again
819 * (by doing a 0 -> TID atomic cmpxchg), while holding all
820 * the locks. It will most likely not succeed.
821 */
822 newval = vpid;
823 if (set_waiters)
824 newval |= FUTEX_WAITERS;
825
826 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
827 return -EFAULT;
828
829 /*
830 * Detect deadlocks.
831 */
832 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
833 return -EDEADLK;
834
835 /*
836 * Surprise - we got the lock, but we do not trust user space at all.
837 */
838 if (unlikely(!curval)) {
839 /*
840 * We verify whether there is kernel state for this
841 		 * We verify whether there is kernel state for this
842 		 * futex. If not, we can safely assume that the 0 ->
843 * not bother to fixup the user space state as it was
844 * corrupted already.
845 */
846 return futex_top_waiter(hb, key) ? -EINVAL : 1;
847 }
848
849 uval = curval;
850
851 /*
852 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
853 * to wake at the next unlock.
854 */
855 newval = curval | FUTEX_WAITERS;
856
857 /*
858 * Should we force take the futex? See below.
859 */
860 if (unlikely(force_take)) {
861 /*
862 * Keep the OWNER_DIED and the WAITERS bit and set the
863 * new TID value.
864 */
865 newval = (curval & ~FUTEX_TID_MASK) | vpid;
866 force_take = 0;
867 lock_taken = 1;
868 }
869
870 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
871 return -EFAULT;
872 if (unlikely(curval != uval))
873 goto retry;
874
875 /*
876 * We took the lock due to forced take over.
877 */
878 if (unlikely(lock_taken))
879 return 1;
880
881 /*
882 	 * We don't have the lock. Look up the PI state (or create it if
883 * we are the first waiter):
884 */
885 ret = lookup_pi_state(uval, hb, key, ps);
886
887 if (unlikely(ret)) {
888 switch (ret) {
889 case -ESRCH:
890 /*
891 * We failed to find an owner for this
892 * futex. So we have no pi_state to block
893 * on. This can happen in two cases:
894 *
895 * 1) The owner died
896 * 2) A stale FUTEX_WAITERS bit
897 *
898 * Re-read the futex value.
899 */
900 if (get_futex_value_locked(&curval, uaddr))
901 return -EFAULT;
902
903 /*
904 * If the owner died or we have a stale
905 * WAITERS bit the owner TID in the user space
906 * futex is 0.
907 */
908 if (!(curval & FUTEX_TID_MASK)) {
909 force_take = 1;
910 goto retry;
911 }
912 default:
913 break;
914 }
915 }
916
917 return ret;
918 }
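
/*
 * User space side of the PI protocol, as a hedged sketch (the helper calls
 * and names are illustrative, not from this file). The fast path is a
 * 0 -> TID compare-and-swap on the futex word done entirely in user space;
 * only on contention does the task enter the kernel, where
 * futex_lock_pi_atomic() retries the same transition under the hb lock:
 *
 *	pid_t tid = syscall(SYS_gettid);
 *	if (!__sync_bool_compare_and_swap(lock, 0, tid))
 *		syscall(SYS_futex, lock, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 * On the unlock side the owner may cmpxchg the word from its TID back to 0;
 * once FUTEX_WAITERS is set it must call FUTEX_UNLOCK_PI instead, so the
 * kernel can hand the rt_mutex to the top waiter (see wake_futex_pi()).
 */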
919
920 /**
921 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
922 * @q: The futex_q to unqueue
923 *
924 * The q->lock_ptr must not be NULL and must be held by the caller.
925 */
926 static void __unqueue_futex(struct futex_q *q)
927 {
928 struct futex_hash_bucket *hb;
929
930 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
931 || WARN_ON(plist_node_empty(&q->list)))
932 return;
933
934 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
935 plist_del(&q->list, &hb->chain);
936 }
937
938 /*
939 * The hash bucket lock must be held when this is called.
940 * Afterwards, the futex_q must not be accessed.
941 */
942 static void wake_futex(struct futex_q *q)
943 {
944 struct task_struct *p = q->task;
945
946 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
947 return;
948
949 /*
950 * We set q->lock_ptr = NULL _before_ we wake up the task. If
951 * a non-futex wake up happens on another CPU then the task
952 * might exit and p would dereference a non-existing task
953 * struct. Prevent this by holding a reference on p across the
954 * wake up.
955 */
956 get_task_struct(p);
957
958 __unqueue_futex(q);
959 /*
960 * The waiting task can free the futex_q as soon as
961 * q->lock_ptr = NULL is written, without taking any locks. A
962 * memory barrier is required here to prevent the following
963 * store to lock_ptr from getting ahead of the plist_del.
964 */
965 smp_wmb();
966 q->lock_ptr = NULL;
967
968 wake_up_state(p, TASK_NORMAL);
969 put_task_struct(p);
970 }
971
972 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
973 {
974 struct task_struct *new_owner;
975 struct futex_pi_state *pi_state = this->pi_state;
976 u32 uninitialized_var(curval), newval;
977 int ret = 0;
978
979 if (!pi_state)
980 return -EINVAL;
981
982 /*
983 * If current does not own the pi_state then the futex is
984 * inconsistent and user space fiddled with the futex value.
985 */
986 if (pi_state->owner != current)
987 return -EINVAL;
988
989 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
990 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
991
992 /*
993 * It is possible that the next waiter (the one that brought
994 * this owner to the kernel) timed out and is no longer
995 * waiting on the lock.
996 */
997 if (!new_owner)
998 new_owner = this->task;
999
1000 /*
1001 * We pass it to the next owner. The WAITERS bit is always
1002 * kept enabled while there is PI state around. We cleanup the
1003 * owner died bit, because we are the owner.
1004 */
1005 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1006
1007 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1008 ret = -EFAULT;
1009 else if (curval != uval)
1010 ret = -EINVAL;
1011 if (ret) {
1012 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1013 return ret;
1014 }
1015
1016 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1017 WARN_ON(list_empty(&pi_state->list));
1018 list_del_init(&pi_state->list);
1019 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1020
1021 raw_spin_lock_irq(&new_owner->pi_lock);
1022 WARN_ON(!list_empty(&pi_state->list));
1023 list_add(&pi_state->list, &new_owner->pi_state_list);
1024 pi_state->owner = new_owner;
1025 raw_spin_unlock_irq(&new_owner->pi_lock);
1026
1027 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1028 rt_mutex_unlock(&pi_state->pi_mutex);
1029
1030 return 0;
1031 }
1032
1033 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1034 {
1035 u32 uninitialized_var(oldval);
1036
1037 /*
1038 	 * There is no waiter, so we unlock the futex. The owner-died
1039 	 * bit does not need to be preserved here; we are the owner:
1040 */
1041 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1042 return -EFAULT;
1043 if (oldval != uval)
1044 return -EAGAIN;
1045
1046 return 0;
1047 }
1048
1049 /*
1050 * Express the locking dependencies for lockdep:
1051 */
1052 static inline void
1053 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1054 {
1055 if (hb1 <= hb2) {
1056 spin_lock(&hb1->lock);
1057 if (hb1 < hb2)
1058 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1059 } else { /* hb1 > hb2 */
1060 spin_lock(&hb2->lock);
1061 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1062 }
1063 }
1064
1065 static inline void
1066 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1067 {
1068 spin_unlock(&hb1->lock);
1069 if (hb1 != hb2)
1070 spin_unlock(&hb2->lock);
1071 }
1072
1073 /*
1074 * Wake up waiters matching bitset queued on this futex (uaddr).
1075 */
1076 static int
1077 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1078 {
1079 struct futex_hash_bucket *hb;
1080 struct futex_q *this, *next;
1081 struct plist_head *head;
1082 union futex_key key = FUTEX_KEY_INIT;
1083 int ret;
1084
1085 if (!bitset)
1086 return -EINVAL;
1087
1088 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1089 if (unlikely(ret != 0))
1090 goto out;
1091
1092 hb = hash_futex(&key);
1093 spin_lock(&hb->lock);
1094 head = &hb->chain;
1095
1096 plist_for_each_entry_safe(this, next, head, list) {
1097 if (match_futex (&this->key, &key)) {
1098 if (this->pi_state || this->rt_waiter) {
1099 ret = -EINVAL;
1100 break;
1101 }
1102
1103 /* Check if one of the bits is set in both bitsets */
1104 if (!(this->bitset & bitset))
1105 continue;
1106
1107 wake_futex(this);
1108 if (++ret >= nr_wake)
1109 break;
1110 }
1111 }
1112
1113 spin_unlock(&hb->lock);
1114 put_futex_key(&key);
1115 out:
1116 return ret;
1117 }
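
/*
 * For reference, a hedged sketch of the classic user space counterpart of
 * FUTEX_WAIT/FUTEX_WAKE, in the style of Drepper's "Futexes Are Tricky"
 * (not part of this file). The word holds 0 = unlocked, 1 = locked,
 * 2 = locked with possible waiters; FUTEX_WAIT only blocks while the word
 * still contains the expected value, and FUTEX_WAKE is issued only when a
 * waiter may exist:
 *
 *	void lock(u32 *f)
 *	{
 *		u32 c = __sync_val_compare_and_swap(f, 0, 1);
 *		while (c != 0) {
 *			if (c == 2 || __sync_val_compare_and_swap(f, 1, 2) != 0)
 *				syscall(SYS_futex, f, FUTEX_WAIT, 2, NULL, NULL, 0);
 *			c = __sync_val_compare_and_swap(f, 0, 2);
 *		}
 *	}
 *
 *	void unlock(u32 *f)
 *	{
 *		if (__sync_fetch_and_sub(f, 1) != 1) {
 *			*f = 0;
 *			syscall(SYS_futex, f, FUTEX_WAKE, 1, NULL, NULL, 0);
 *		}
 *	}
 */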
1118
1119 /*
1120 * Wake up all waiters hashed on the physical page that is mapped
1121 * to this virtual address:
1122 */
1123 static int
1124 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1125 int nr_wake, int nr_wake2, int op)
1126 {
1127 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1128 struct futex_hash_bucket *hb1, *hb2;
1129 struct plist_head *head;
1130 struct futex_q *this, *next;
1131 int ret, op_ret;
1132
1133 retry:
1134 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1135 if (unlikely(ret != 0))
1136 goto out;
1137 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1138 if (unlikely(ret != 0))
1139 goto out_put_key1;
1140
1141 hb1 = hash_futex(&key1);
1142 hb2 = hash_futex(&key2);
1143
1144 retry_private:
1145 double_lock_hb(hb1, hb2);
1146 op_ret = futex_atomic_op_inuser(op, uaddr2);
1147 if (unlikely(op_ret < 0)) {
1148
1149 double_unlock_hb(hb1, hb2);
1150
1151 #ifndef CONFIG_MMU
1152 /*
1153 * we don't get EFAULT from MMU faults if we don't have an MMU,
1154 * but we might get them from range checking
1155 */
1156 ret = op_ret;
1157 goto out_put_keys;
1158 #endif
1159
1160 if (unlikely(op_ret != -EFAULT)) {
1161 ret = op_ret;
1162 goto out_put_keys;
1163 }
1164
1165 ret = fault_in_user_writeable(uaddr2);
1166 if (ret)
1167 goto out_put_keys;
1168
1169 if (!(flags & FLAGS_SHARED))
1170 goto retry_private;
1171
1172 put_futex_key(&key2);
1173 put_futex_key(&key1);
1174 goto retry;
1175 }
1176
1177 head = &hb1->chain;
1178
1179 plist_for_each_entry_safe(this, next, head, list) {
1180 if (match_futex (&this->key, &key1)) {
1181 if (this->pi_state || this->rt_waiter) {
1182 ret = -EINVAL;
1183 goto out_unlock;
1184 }
1185 wake_futex(this);
1186 if (++ret >= nr_wake)
1187 break;
1188 }
1189 }
1190
1191 if (op_ret > 0) {
1192 head = &hb2->chain;
1193
1194 op_ret = 0;
1195 plist_for_each_entry_safe(this, next, head, list) {
1196 if (match_futex (&this->key, &key2)) {
1197 if (this->pi_state || this->rt_waiter) {
1198 ret = -EINVAL;
1199 goto out_unlock;
1200 }
1201 wake_futex(this);
1202 if (++op_ret >= nr_wake2)
1203 break;
1204 }
1205 }
1206 ret += op_ret;
1207 }
1208
1209 out_unlock:
1210 double_unlock_hb(hb1, hb2);
1211 out_put_keys:
1212 put_futex_key(&key2);
1213 out_put_key1:
1214 put_futex_key(&key1);
1215 out:
1216 return ret;
1217 }
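
/*
 * Hedged sketch of how user space drives the operation above (the exact op
 * encoding lives in <linux/futex.h>; the example arguments are assumptions,
 * not taken from this file). FUTEX_WAKE_OP applies an atomic op to *uaddr2,
 * wakes up to val waiters on uaddr1 and, if the comparison against the old
 * value of *uaddr2 succeeds, also wakes up to val2 waiters on uaddr2, all
 * under both hash bucket locks:
 *
 *	// set *uaddr2 = 0; wake 1 on uaddr1; if old *uaddr2 > 0, wake 1 on uaddr2
 *	syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, (void *)1UL, uaddr2,
 *		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 0));
 */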
1218
1219 /**
1220 * requeue_futex() - Requeue a futex_q from one hb to another
1221 * @q: the futex_q to requeue
1222 * @hb1: the source hash_bucket
1223 * @hb2: the target hash_bucket
1224 * @key2: the new key for the requeued futex_q
1225 */
1226 static inline
1227 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1228 struct futex_hash_bucket *hb2, union futex_key *key2)
1229 {
1230
1231 /*
1232 * If key1 and key2 hash to the same bucket, no need to
1233 * requeue.
1234 */
1235 if (likely(&hb1->chain != &hb2->chain)) {
1236 plist_del(&q->list, &hb1->chain);
1237 plist_add(&q->list, &hb2->chain);
1238 q->lock_ptr = &hb2->lock;
1239 }
1240 get_futex_key_refs(key2);
1241 q->key = *key2;
1242 }
1243
1244 /**
1245 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1246 * @q: the futex_q
1247 * @key: the key of the requeue target futex
1248 * @hb: the hash_bucket of the requeue target futex
1249 *
1250 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1251 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1252 * to the requeue target futex so the waiter can detect the wakeup on the right
1253 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1254 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1255 * to protect access to the pi_state to fixup the owner later. Must be called
1256 * with both q->lock_ptr and hb->lock held.
1257 */
1258 static inline
1259 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1260 struct futex_hash_bucket *hb)
1261 {
1262 get_futex_key_refs(key);
1263 q->key = *key;
1264
1265 __unqueue_futex(q);
1266
1267 WARN_ON(!q->rt_waiter);
1268 q->rt_waiter = NULL;
1269
1270 q->lock_ptr = &hb->lock;
1271
1272 wake_up_state(q->task, TASK_NORMAL);
1273 }
1274
1275 /**
1276 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1277 * @pifutex: the user address of the to futex
1278 * @hb1: the from futex hash bucket, must be locked by the caller
1279 * @hb2: the to futex hash bucket, must be locked by the caller
1280 * @key1: the from futex key
1281 * @key2: the to futex key
1282 * @ps: address to store the pi_state pointer
1283 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1284 *
1285 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1286 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1287 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1288 * hb1 and hb2 must be held by the caller.
1289 *
1290 * Return:
1291 * 0 - failed to acquire the lock atomically;
1292 * 1 - acquired the lock;
1293 * <0 - error
1294 */
1295 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1296 struct futex_hash_bucket *hb1,
1297 struct futex_hash_bucket *hb2,
1298 union futex_key *key1, union futex_key *key2,
1299 struct futex_pi_state **ps, int set_waiters)
1300 {
1301 struct futex_q *top_waiter = NULL;
1302 u32 curval;
1303 int ret;
1304
1305 if (get_futex_value_locked(&curval, pifutex))
1306 return -EFAULT;
1307
1308 /*
1309 * Find the top_waiter and determine if there are additional waiters.
1310 * If the caller intends to requeue more than 1 waiter to pifutex,
1311 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1312 * as we have means to handle the possible fault. If not, don't set
1313 	 * the bit unnecessarily as it will force the subsequent unlock to enter
1314 * the kernel.
1315 */
1316 top_waiter = futex_top_waiter(hb1, key1);
1317
1318 /* There are no waiters, nothing for us to do. */
1319 if (!top_waiter)
1320 return 0;
1321
1322 /* Ensure we requeue to the expected futex. */
1323 if (!match_futex(top_waiter->requeue_pi_key, key2))
1324 return -EINVAL;
1325
1326 /*
1327 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1328 * the contended case or if set_waiters is 1. The pi_state is returned
1329 * in ps in contended cases.
1330 */
1331 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1332 set_waiters);
1333 if (ret == 1)
1334 requeue_pi_wake_futex(top_waiter, key2, hb2);
1335
1336 return ret;
1337 }
1338
1339 /**
1340 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1341 * @uaddr1: source futex user address
1342 * @flags: futex flags (FLAGS_SHARED, etc.)
1343 * @uaddr2: target futex user address
1344 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1345 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1346 * @cmpval: @uaddr1 expected value (or %NULL)
1347 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1348 * pi futex (pi to pi requeue is not supported)
1349 *
1350 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1351 * uaddr2 atomically on behalf of the top waiter.
1352 *
1353 * Return:
1354 * >=0 - on success, the number of tasks requeued or woken;
1355 * <0 - on error
1356 */
1357 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1358 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1359 u32 *cmpval, int requeue_pi)
1360 {
1361 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1362 int drop_count = 0, task_count = 0, ret;
1363 struct futex_pi_state *pi_state = NULL;
1364 struct futex_hash_bucket *hb1, *hb2;
1365 struct plist_head *head1;
1366 struct futex_q *this, *next;
1367 u32 curval2;
1368
1369 if (requeue_pi) {
1370 /*
1371 * Requeue PI only works on two distinct uaddrs. This
1372 * check is only valid for private futexes. See below.
1373 */
1374 if (uaddr1 == uaddr2)
1375 return -EINVAL;
1376
1377 /*
1378 * requeue_pi requires a pi_state, try to allocate it now
1379 * without any locks in case it fails.
1380 */
1381 if (refill_pi_state_cache())
1382 return -ENOMEM;
1383 /*
1384 * requeue_pi must wake as many tasks as it can, up to nr_wake
1385 * + nr_requeue, since it acquires the rt_mutex prior to
1386 * returning to userspace, so as to not leave the rt_mutex with
1387 * waiters and no owner. However, second and third wake-ups
1388 * cannot be predicted as they involve race conditions with the
1389 * first wake and a fault while looking up the pi_state. Both
1390 * pthread_cond_signal() and pthread_cond_broadcast() should
1391 * use nr_wake=1.
1392 */
1393 if (nr_wake != 1)
1394 return -EINVAL;
1395 }
1396
1397 retry:
1398 if (pi_state != NULL) {
1399 /*
1400 * We will have to lookup the pi_state again, so free this one
1401 * to keep the accounting correct.
1402 */
1403 free_pi_state(pi_state);
1404 pi_state = NULL;
1405 }
1406
1407 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1408 if (unlikely(ret != 0))
1409 goto out;
1410 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1411 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1412 if (unlikely(ret != 0))
1413 goto out_put_key1;
1414
1415 /*
1416 * The check above which compares uaddrs is not sufficient for
1417 * shared futexes. We need to compare the keys:
1418 */
1419 if (requeue_pi && match_futex(&key1, &key2)) {
1420 ret = -EINVAL;
1421 goto out_put_keys;
1422 }
1423
1424 hb1 = hash_futex(&key1);
1425 hb2 = hash_futex(&key2);
1426
1427 retry_private:
1428 double_lock_hb(hb1, hb2);
1429
1430 if (likely(cmpval != NULL)) {
1431 u32 curval;
1432
1433 ret = get_futex_value_locked(&curval, uaddr1);
1434
1435 if (unlikely(ret)) {
1436 double_unlock_hb(hb1, hb2);
1437
1438 ret = get_user(curval, uaddr1);
1439 if (ret)
1440 goto out_put_keys;
1441
1442 if (!(flags & FLAGS_SHARED))
1443 goto retry_private;
1444
1445 put_futex_key(&key2);
1446 put_futex_key(&key1);
1447 goto retry;
1448 }
1449 if (curval != *cmpval) {
1450 ret = -EAGAIN;
1451 goto out_unlock;
1452 }
1453 }
1454
1455 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1456 /*
1457 * Attempt to acquire uaddr2 and wake the top waiter. If we
1458 * intend to requeue waiters, force setting the FUTEX_WAITERS
1459 * bit. We force this here where we are able to easily handle
1460 		 * faults rather than in the requeue loop below.
1461 */
1462 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1463 &key2, &pi_state, nr_requeue);
1464
1465 /*
1466 * At this point the top_waiter has either taken uaddr2 or is
1467 * waiting on it. If the former, then the pi_state will not
1468 * exist yet, look it up one more time to ensure we have a
1469 * reference to it.
1470 */
1471 if (ret == 1) {
1472 WARN_ON(pi_state);
1473 drop_count++;
1474 task_count++;
1475 ret = get_futex_value_locked(&curval2, uaddr2);
1476 if (!ret)
1477 ret = lookup_pi_state(curval2, hb2, &key2,
1478 &pi_state);
1479 }
1480
1481 switch (ret) {
1482 case 0:
1483 break;
1484 case -EFAULT:
1485 double_unlock_hb(hb1, hb2);
1486 put_futex_key(&key2);
1487 put_futex_key(&key1);
1488 ret = fault_in_user_writeable(uaddr2);
1489 if (!ret)
1490 goto retry;
1491 goto out;
1492 case -EAGAIN:
1493 /* The owner was exiting, try again. */
1494 double_unlock_hb(hb1, hb2);
1495 put_futex_key(&key2);
1496 put_futex_key(&key1);
1497 cond_resched();
1498 goto retry;
1499 default:
1500 goto out_unlock;
1501 }
1502 }
1503
1504 head1 = &hb1->chain;
1505 plist_for_each_entry_safe(this, next, head1, list) {
1506 if (task_count - nr_wake >= nr_requeue)
1507 break;
1508
1509 if (!match_futex(&this->key, &key1))
1510 continue;
1511
1512 /*
1513 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1514 * be paired with each other and no other futex ops.
1515 *
1516 * We should never be requeueing a futex_q with a pi_state,
1517 * which is awaiting a futex_unlock_pi().
1518 */
1519 if ((requeue_pi && !this->rt_waiter) ||
1520 (!requeue_pi && this->rt_waiter) ||
1521 this->pi_state) {
1522 ret = -EINVAL;
1523 break;
1524 }
1525
1526 /*
1527 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1528 * lock, we already woke the top_waiter. If not, it will be
1529 * woken by futex_unlock_pi().
1530 */
1531 if (++task_count <= nr_wake && !requeue_pi) {
1532 wake_futex(this);
1533 continue;
1534 }
1535
1536 /* Ensure we requeue to the expected futex for requeue_pi. */
1537 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1538 ret = -EINVAL;
1539 break;
1540 }
1541
1542 /*
1543 * Requeue nr_requeue waiters and possibly one more in the case
1544 * of requeue_pi if we couldn't acquire the lock atomically.
1545 */
1546 if (requeue_pi) {
1547 /* Prepare the waiter to take the rt_mutex. */
1548 atomic_inc(&pi_state->refcount);
1549 this->pi_state = pi_state;
1550 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1551 this->rt_waiter,
1552 this->task, 1);
1553 if (ret == 1) {
1554 /* We got the lock. */
1555 requeue_pi_wake_futex(this, &key2, hb2);
1556 drop_count++;
1557 continue;
1558 } else if (ret) {
1559 /* -EDEADLK */
1560 this->pi_state = NULL;
1561 free_pi_state(pi_state);
1562 goto out_unlock;
1563 }
1564 }
1565 requeue_futex(this, hb1, hb2, &key2);
1566 drop_count++;
1567 }
1568
1569 out_unlock:
1570 double_unlock_hb(hb1, hb2);
1571
1572 /*
1573 * drop_futex_key_refs() must be called outside the spinlocks. During
1574 * the requeue we moved futex_q's from the hash bucket at key1 to the
1575 * one at key2 and updated their key pointer. We no longer need to
1576 * hold the references to key1.
1577 */
1578 while (--drop_count >= 0)
1579 drop_futex_key_refs(&key1);
1580
1581 out_put_keys:
1582 put_futex_key(&key2);
1583 out_put_key1:
1584 put_futex_key(&key1);
1585 out:
1586 if (pi_state != NULL)
1587 free_pi_state(pi_state);
1588 return ret ? ret : task_count;
1589 }
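
/*
 * Hedged sketch of the main consumer of requeueing (field names and helpers
 * are hypothetical, not from this file): a condition variable broadcast.
 * Waking every waiter would only make them all pile up on the mutex, so user
 * space wakes one waiter and requeues the remaining ones onto the mutex
 * futex, where they are released one at a time as the mutex is unlocked:
 *
 *	// wait side: sample the sequence word, drop the mutex, block on it
 *	u32 seq = cond->seq;
 *	mutex_unlock(&cond->mutex);
 *	syscall(SYS_futex, &cond->seq, FUTEX_WAIT, seq, NULL, NULL, 0);
 *	mutex_lock(&cond->mutex);
 *
 *	// broadcast side: bump the sequence, wake one, requeue the rest
 *	u32 newseq = ++cond->seq;
 *	syscall(SYS_futex, &cond->seq, FUTEX_CMP_REQUEUE, 1, (void *)INT_MAX,
 *		&cond->mutex.word, newseq);
 *
 * The PI variant handled above pairs FUTEX_WAIT_REQUEUE_PI with
 * FUTEX_CMP_REQUEUE_PI so that the top waiter can take the PI futex (or be
 * queued on its rt_mutex) without losing priority inheritance across the
 * requeue.
 */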
1590
1591 /* The key must be already stored in q->key. */
1592 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1593 __acquires(&hb->lock)
1594 {
1595 struct futex_hash_bucket *hb;
1596
1597 hb = hash_futex(&q->key);
1598 q->lock_ptr = &hb->lock;
1599
1600 spin_lock(&hb->lock);
1601 return hb;
1602 }
1603
1604 static inline void
1605 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1606 __releases(&hb->lock)
1607 {
1608 spin_unlock(&hb->lock);
1609 }
1610
1611 /**
1612 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1613 * @q: The futex_q to enqueue
1614 * @hb: The destination hash bucket
1615 *
1616 * The hb->lock must be held by the caller, and is released here. A call to
1617 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1618 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1619  * or nothing if the unqueue is done as part of the wake process and the
1620  * unqueue state is implicit in the state of the woken task (see
1621  * futex_wait_requeue_pi() for an example).
1622 */
1623 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1624 __releases(&hb->lock)
1625 {
1626 int prio;
1627
1628 /*
1629 * The priority used to register this element is
1630 * - either the real thread-priority for the real-time threads
1631 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1632 * - or MAX_RT_PRIO for non-RT threads.
1633 * Thus, all RT-threads are woken first in priority order, and
1634 * the others are woken last, in FIFO order.
1635 */
1636 prio = min(current->normal_prio, MAX_RT_PRIO);
1637
1638 plist_node_init(&q->list, prio);
1639 plist_add(&q->list, &hb->chain);
1640 q->task = current;
1641 spin_unlock(&hb->lock);
1642 }
1643
1644 /**
1645 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1646 * @q: The futex_q to unqueue
1647 *
1648 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1649 * be paired with exactly one earlier call to queue_me().
1650 *
1651 * Return:
1652  *   1 - if the futex_q was still queued (and we unqueued it);
1653 * 0 - if the futex_q was already removed by the waking thread
1654 */
1655 static int unqueue_me(struct futex_q *q)
1656 {
1657 spinlock_t *lock_ptr;
1658 int ret = 0;
1659
1660 /* In the common case we don't take the spinlock, which is nice. */
1661 retry:
1662 lock_ptr = q->lock_ptr;
1663 barrier();
1664 if (lock_ptr != NULL) {
1665 spin_lock(lock_ptr);
1666 /*
1667 * q->lock_ptr can change between reading it and
1668 * spin_lock(), causing us to take the wrong lock. This
1669 * corrects the race condition.
1670 *
1671 * Reasoning goes like this: if we have the wrong lock,
1672 * q->lock_ptr must have changed (maybe several times)
1673 * between reading it and the spin_lock(). It can
1674 * change again after the spin_lock() but only if it was
1675 * already changed before the spin_lock(). It cannot,
1676 * however, change back to the original value. Therefore
1677 * we can detect whether we acquired the correct lock.
1678 */
1679 if (unlikely(lock_ptr != q->lock_ptr)) {
1680 spin_unlock(lock_ptr);
1681 goto retry;
1682 }
1683 __unqueue_futex(q);
1684
1685 BUG_ON(q->pi_state);
1686
1687 spin_unlock(lock_ptr);
1688 ret = 1;
1689 }
1690
1691 drop_futex_key_refs(&q->key);
1692 return ret;
1693 }
1694
1695 /*
1696  * PI futexes cannot be requeued and must remove themselves from the
1697 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1698 * and dropped here.
1699 */
1700 static void unqueue_me_pi(struct futex_q *q)
1701 __releases(q->lock_ptr)
1702 {
1703 __unqueue_futex(q);
1704
1705 BUG_ON(!q->pi_state);
1706 free_pi_state(q->pi_state);
1707 q->pi_state = NULL;
1708
1709 spin_unlock(q->lock_ptr);
1710 }
1711
1712 /*
1713 * Fixup the pi_state owner with the new owner.
1714 *
1715 * Must be called with hash bucket lock held and mm->sem held for non
1716 * private futexes.
1717 */
1718 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1719 struct task_struct *newowner)
1720 {
1721 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1722 struct futex_pi_state *pi_state = q->pi_state;
1723 struct task_struct *oldowner = pi_state->owner;
1724 u32 uval, uninitialized_var(curval), newval;
1725 int ret;
1726
1727 /* Owner died? */
1728 if (!pi_state->owner)
1729 newtid |= FUTEX_OWNER_DIED;
1730
1731 /*
1732 * We are here either because we stole the rtmutex from the
1733 * previous highest priority waiter or we are the highest priority
1734 * waiter but failed to get the rtmutex the first time.
1735 * We have to replace the newowner TID in the user space variable.
1736 * This must be atomic as we have to preserve the owner died bit here.
1737 *
1738 * Note: We write the user space value _before_ changing the pi_state
1739 * because we can fault here. Imagine swapped out pages or a fork
1740 * that marked all the anonymous memory readonly for cow.
1741 *
1742 * Modifying pi_state _before_ the user space value would
1743 * leave the pi_state in an inconsistent state when we fault
1744 * here, because we need to drop the hash bucket lock to
1745 * handle the fault. This might be observed in the PID check
1746 * in lookup_pi_state.
1747 */
1748 retry:
1749 if (get_futex_value_locked(&uval, uaddr))
1750 goto handle_fault;
1751
1752 while (1) {
1753 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1754
1755 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1756 goto handle_fault;
1757 if (curval == uval)
1758 break;
1759 uval = curval;
1760 }
1761
1762 /*
1763 * We fixed up user space. Now we need to fix the pi_state
1764 * itself.
1765 */
1766 if (pi_state->owner != NULL) {
1767 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1768 WARN_ON(list_empty(&pi_state->list));
1769 list_del_init(&pi_state->list);
1770 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1771 }
1772
1773 pi_state->owner = newowner;
1774
1775 raw_spin_lock_irq(&newowner->pi_lock);
1776 WARN_ON(!list_empty(&pi_state->list));
1777 list_add(&pi_state->list, &newowner->pi_state_list);
1778 raw_spin_unlock_irq(&newowner->pi_lock);
1779 return 0;
1780
1781 /*
1782 * To handle the page fault we need to drop the hash bucket
1783 * lock here. That gives the other task (either the highest priority
1784 * waiter itself or the task which stole the rtmutex) the
1785 * chance to try the fixup of the pi_state. So once we are
1786 * back from handling the fault we need to check the pi_state
1787 * after reacquiring the hash bucket lock and before trying to
1788 * do another fixup. When the fixup has been done already we
1789 * simply return.
1790 */
1791 handle_fault:
1792 spin_unlock(q->lock_ptr);
1793
1794 ret = fault_in_user_writeable(uaddr);
1795
1796 spin_lock(q->lock_ptr);
1797
1798 /*
1799 * Check if someone else fixed it for us:
1800 */
1801 if (pi_state->owner != oldowner)
1802 return 0;
1803
1804 if (ret)
1805 return ret;
1806
1807 goto retry;
1808 }
1809
1810 static long futex_wait_restart(struct restart_block *restart);
1811
1812 /**
1813 * fixup_owner() - Post lock pi_state and corner case management
1814 * @uaddr: user address of the futex
1815 * @q: futex_q (contains pi_state and access to the rt_mutex)
1816 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1817 *
1818 * After attempting to lock an rt_mutex, this function is called to cleanup
1819 * the pi_state owner as well as handle race conditions that may allow us to
1820 * acquire the lock. Must be called with the hb lock held.
1821 *
1822 * Return:
1823 * 1 - success, lock taken;
1824 * 0 - success, lock not taken;
1825 * <0 - on error (-EFAULT)
1826 */
1827 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1828 {
1829 struct task_struct *owner;
1830 int ret = 0;
1831
1832 if (locked) {
1833 /*
1834 * Got the lock. We might not be the anticipated owner if we
1835 * did a lock-steal - fix up the PI-state in that case:
1836 */
1837 if (q->pi_state->owner != current)
1838 ret = fixup_pi_state_owner(uaddr, q, current);
1839 goto out;
1840 }
1841
1842 /*
1843 * Catch the rare case, where the lock was released when we were on the
1844 * way back before we locked the hash bucket.
1845 */
1846 if (q->pi_state->owner == current) {
1847 /*
1848 * Try to get the rt_mutex now. This might fail as some other
1849 * task acquired the rt_mutex after we removed ourself from the
1850 * rt_mutex waiters list.
1851 */
1852 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1853 locked = 1;
1854 goto out;
1855 }
1856
1857 /*
1858 * pi_state is incorrect, some other task did a lock steal and
1859 * we returned due to timeout or signal without taking the
1860 * rt_mutex. Too late.
1861 */
1862 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1863 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1864 if (!owner)
1865 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1866 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1867 ret = fixup_pi_state_owner(uaddr, q, owner);
1868 goto out;
1869 }
1870
1871 /*
1872 * Paranoia check. If we did not take the lock, then we should not be
1873 * the owner of the rt_mutex.
1874 */
1875 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1876 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1877 "pi-state %p\n", ret,
1878 q->pi_state->pi_mutex.owner,
1879 q->pi_state->owner);
1880
1881 out:
1882 return ret ? ret : locked;
1883 }
1884
1885 /**
1886 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1887 * @hb: the futex hash bucket, must be locked by the caller
1888 * @q: the futex_q to queue up on
1889 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1890 */
1891 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1892 struct hrtimer_sleeper *timeout)
1893 {
1894 /*
1895 * The task state is guaranteed to be set before another task can
1896 * wake it. set_current_state() is implemented using set_mb() and
1897 * queue_me() calls spin_unlock() upon completion, both serializing
1898 * access to the hash list and forcing another memory barrier.
1899 */
1900 set_current_state(TASK_INTERRUPTIBLE);
1901 queue_me(q, hb);
1902
1903 /* Arm the timer */
1904 if (timeout) {
1905 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1906 if (!hrtimer_active(&timeout->timer))
1907 timeout->task = NULL;
1908 }
1909
1910 /*
1911 * If we have been removed from the hash list, then another task
1912 * has tried to wake us, and we can skip the call to schedule().
1913 */
1914 if (likely(!plist_node_empty(&q->list))) {
1915 /*
1916 * If the timer has already expired, current will already be
1917 * flagged for rescheduling. Only call schedule if there
1918 * is no timeout, or if it has yet to expire.
1919 */
1920 if (!timeout || timeout->task)
1921 freezable_schedule();
1922 }
1923 __set_current_state(TASK_RUNNING);
1924 }
1925
1926 /**
1927 * futex_wait_setup() - Prepare to wait on a futex
1928 * @uaddr: the futex userspace address
1929 * @val: the expected value
1930 * @flags: futex flags (FLAGS_SHARED, etc.)
1931 * @q: the associated futex_q
1932 * @hb: storage for hash_bucket pointer to be returned to caller
1933 *
1934 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1935 * compare it with the expected value. Handle atomic faults internally.
1936 * Return with the hb lock held and a q.key reference on success, and unlocked
1937 * with no q.key reference on failure.
1938 *
1939 * Return:
1940 * 0 - uaddr contains val and hb has been locked;
1941 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1942 */
1943 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1944 struct futex_q *q, struct futex_hash_bucket **hb)
1945 {
1946 u32 uval;
1947 int ret;
1948
1949 /*
1950 * Access the page AFTER the hash-bucket is locked.
1951 * Order is important:
1952 *
1953 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1954 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
1955 *
1956 * The basic logical guarantee of a futex is that it blocks ONLY
1957 * if cond(var) is known to be true at the time of blocking, for
1958 * any cond. If we locked the hash-bucket after testing *uaddr, that
1959 * would open a race condition where we could block indefinitely with
1960 * cond(var) false, which would violate the guarantee.
1961 *
1962 * On the other hand, we insert q and release the hash-bucket only
1963 * after testing *uaddr. This guarantees that futex_wait() will NOT
1964 	 * absorb a wakeup if *uaddr does not match the desired value
1965 * while the syscall executes.
1966 */
1967 retry:
1968 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1969 if (unlikely(ret != 0))
1970 return ret;
1971
1972 retry_private:
1973 *hb = queue_lock(q);
1974
1975 ret = get_futex_value_locked(&uval, uaddr);
1976
1977 if (ret) {
1978 queue_unlock(q, *hb);
1979
1980 ret = get_user(uval, uaddr);
1981 if (ret)
1982 goto out;
1983
1984 if (!(flags & FLAGS_SHARED))
1985 goto retry_private;
1986
1987 put_futex_key(&q->key);
1988 goto retry;
1989 }
1990
1991 if (uval != val) {
1992 queue_unlock(q, *hb);
1993 ret = -EWOULDBLOCK;
1994 }
1995
1996 out:
1997 if (ret)
1998 put_futex_key(&q->key);
1999 return ret;
2000 }
2001
2002 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2003 ktime_t *abs_time, u32 bitset)
2004 {
2005 struct hrtimer_sleeper timeout, *to = NULL;
2006 struct restart_block *restart;
2007 struct futex_hash_bucket *hb;
2008 struct futex_q q = futex_q_init;
2009 int ret;
2010
2011 if (!bitset)
2012 return -EINVAL;
2013 q.bitset = bitset;
2014
2015 if (abs_time) {
2016 to = &timeout;
2017
2018 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2019 CLOCK_REALTIME : CLOCK_MONOTONIC,
2020 HRTIMER_MODE_ABS);
2021 hrtimer_init_sleeper(to, current);
2022 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2023 current->timer_slack_ns);
2024 }
2025
2026 retry:
2027 /*
2028 * Prepare to wait on uaddr. On success, holds hb lock and increments
2029 * q.key refs.
2030 */
2031 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2032 if (ret)
2033 goto out;
2034
2035 /* queue_me and wait for wakeup, timeout, or a signal. */
2036 futex_wait_queue_me(hb, &q, to);
2037
2038 /* If we were woken (and unqueued), we succeeded, whatever. */
2039 ret = 0;
2040 /* unqueue_me() drops q.key ref */
2041 if (!unqueue_me(&q))
2042 goto out;
2043 ret = -ETIMEDOUT;
2044 if (to && !to->task)
2045 goto out;
2046
2047 /*
2048 * We expect signal_pending(current), but we might be the
2049 * victim of a spurious wakeup as well.
2050 */
2051 if (!signal_pending(current))
2052 goto retry;
2053
2054 ret = -ERESTARTSYS;
2055 if (!abs_time)
2056 goto out;
2057
2058 restart = ¤t_thread_info()->restart_block;
2059 restart->fn = futex_wait_restart;
2060 restart->futex.uaddr = uaddr;
2061 restart->futex.val = val;
2062 restart->futex.time = abs_time->tv64;
2063 restart->futex.bitset = bitset;
2064 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2065
2066 ret = -ERESTART_RESTARTBLOCK;
2067
2068 out:
2069 if (to) {
2070 hrtimer_cancel(&to->timer);
2071 destroy_hrtimer_on_stack(&to->timer);
2072 }
2073 return ret;
2074 }
2075
2076
2077 static long futex_wait_restart(struct restart_block *restart)
2078 {
2079 u32 __user *uaddr = restart->futex.uaddr;
2080 ktime_t t, *tp = NULL;
2081
2082 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2083 t.tv64 = restart->futex.time;
2084 tp = &t;
2085 }
2086 restart->fn = do_no_restart_syscall;
2087
2088 return (long)futex_wait(uaddr, restart->futex.flags,
2089 restart->futex.val, tp, restart->futex.bitset);
2090 }
2091
2092
2093 /*
2094 * Userspace tried a 0 -> TID atomic transition of the futex value
2095 * and failed. The kernel side here does the whole locking operation:
2096 * if there are waiters then it will block, it does PI, etc. (Due to
2097 * races the kernel might see a 0 value of the futex too.)
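 *
 * As a rough sketch of the userspace fast path this slow path backs up
 * (illustrative only, not code from this file; futex_word and timeout are
 * placeholder names):
 *
 *	if (cmpxchg(&futex_word, 0, gettid()) != 0)
 *		syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, timeout, NULL, 0);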
2098 */
2099 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2100 ktime_t *time, int trylock)
2101 {
2102 struct hrtimer_sleeper timeout, *to = NULL;
2103 struct futex_hash_bucket *hb;
2104 struct futex_q q = futex_q_init;
2105 int res, ret;
2106
2107 if (refill_pi_state_cache())
2108 return -ENOMEM;
2109
2110 if (time) {
2111 to = &timeout;
2112 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2113 HRTIMER_MODE_ABS);
2114 hrtimer_init_sleeper(to, current);
2115 hrtimer_set_expires(&to->timer, *time);
2116 }
2117
2118 retry:
2119 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2120 if (unlikely(ret != 0))
2121 goto out;
2122
2123 retry_private:
2124 hb = queue_lock(&q);
2125
2126 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2127 if (unlikely(ret)) {
2128 switch (ret) {
2129 case 1:
2130 /* We got the lock. */
2131 ret = 0;
2132 goto out_unlock_put_key;
2133 case -EFAULT:
2134 goto uaddr_faulted;
2135 case -EAGAIN:
2136 /*
2137 * Task is exiting and we just wait for the
2138 * exit to complete.
2139 */
2140 queue_unlock(&q, hb);
2141 put_futex_key(&q.key);
2142 cond_resched();
2143 goto retry;
2144 default:
2145 goto out_unlock_put_key;
2146 }
2147 }
2148
2149 /*
2150 * Only actually queue now that the atomic ops are done:
2151 */
2152 queue_me(&q, hb);
2153
2154 WARN_ON(!q.pi_state);
2155 /*
2156 * Block on the PI mutex:
2157 */
2158 if (!trylock)
2159 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2160 else {
2161 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2162 /* Fixup the trylock return value: */
2163 ret = ret ? 0 : -EWOULDBLOCK;
2164 }
2165
2166 spin_lock(q.lock_ptr);
2167 /*
2168 * Fixup the pi_state owner and possibly acquire the lock if we
2169 * haven't already.
2170 */
2171 res = fixup_owner(uaddr, &q, !ret);
2172 /*
2173 	 * If fixup_owner() returned an error, propagate that. If it acquired
2174 * the lock, clear our -ETIMEDOUT or -EINTR.
2175 */
2176 if (res)
2177 ret = (res < 0) ? res : 0;
2178
2179 /*
2180 * If fixup_owner() faulted and was unable to handle the fault, unlock
2181 * it and return the fault to userspace.
2182 */
2183 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2184 rt_mutex_unlock(&q.pi_state->pi_mutex);
2185
2186 /* Unqueue and drop the lock */
2187 unqueue_me_pi(&q);
2188
2189 goto out_put_key;
2190
2191 out_unlock_put_key:
2192 queue_unlock(&q, hb);
2193
2194 out_put_key:
2195 put_futex_key(&q.key);
2196 out:
2197 if (to)
2198 destroy_hrtimer_on_stack(&to->timer);
2199 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2200
2201 uaddr_faulted:
2202 queue_unlock(&q, hb);
2203
2204 ret = fault_in_user_writeable(uaddr);
2205 if (ret)
2206 goto out_put_key;
2207
2208 if (!(flags & FLAGS_SHARED))
2209 goto retry_private;
2210
2211 put_futex_key(&q.key);
2212 goto retry;
2213 }
2214
2215 /*
2216 * Userspace attempted a TID -> 0 atomic transition, and failed.
2217 * This is the in-kernel slowpath: we look up the PI state (if any),
2218 * and do the rt-mutex unlock.
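 *
 * A sketch of the matching userspace fast path (illustrative only;
 * futex_word is a placeholder name):
 *
 *	if (cmpxchg(&futex_word, gettid(), 0) != gettid())
 *		syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);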
2219 */
2220 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2221 {
2222 struct futex_hash_bucket *hb;
2223 struct futex_q *this, *next;
2224 struct plist_head *head;
2225 union futex_key key = FUTEX_KEY_INIT;
2226 u32 uval, vpid = task_pid_vnr(current);
2227 int ret;
2228
2229 retry:
2230 if (get_user(uval, uaddr))
2231 return -EFAULT;
2232 /*
2233 * We release only a lock we actually own:
2234 */
2235 if ((uval & FUTEX_TID_MASK) != vpid)
2236 return -EPERM;
2237
2238 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2239 if (unlikely(ret != 0))
2240 goto out;
2241
2242 hb = hash_futex(&key);
2243 spin_lock(&hb->lock);
2244
2245 /*
2246 * To avoid races, try to do the TID -> 0 atomic transition
2247 * again. If it succeeds then we can return without waking
2248 	 * anyone else up. We only try this if neither the waiters bit nor
2249 	 * the owner-died bit is set.
2250 */
2251 if (!(uval & ~FUTEX_TID_MASK) &&
2252 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2253 goto pi_faulted;
2254 /*
2255 * Rare case: we managed to release the lock atomically,
2256 * no need to wake anyone else up:
2257 */
2258 if (unlikely(uval == vpid))
2259 goto out_unlock;
2260
2261 /*
2262 * Ok, other tasks may need to be woken up - check waiters
2263 * and do the wakeup if necessary:
2264 */
2265 head = &hb->chain;
2266
2267 plist_for_each_entry_safe(this, next, head, list) {
2268 		if (!match_futex(&this->key, &key))
2269 continue;
2270 ret = wake_futex_pi(uaddr, uval, this);
2271 /*
2272 * The atomic access to the futex value
2273 * generated a pagefault, so retry the
2274 * user-access and the wakeup:
2275 */
2276 if (ret == -EFAULT)
2277 goto pi_faulted;
2278 goto out_unlock;
2279 }
2280 /*
2281 * No waiters - kernel unlocks the futex:
2282 */
2283 ret = unlock_futex_pi(uaddr, uval);
2284 if (ret == -EFAULT)
2285 goto pi_faulted;
2286
2287 out_unlock:
2288 spin_unlock(&hb->lock);
2289 put_futex_key(&key);
2290
2291 out:
2292 return ret;
2293
2294 pi_faulted:
2295 spin_unlock(&hb->lock);
2296 put_futex_key(&key);
2297
2298 ret = fault_in_user_writeable(uaddr);
2299 if (!ret)
2300 goto retry;
2301
2302 return ret;
2303 }
2304
2305 /**
2306 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2307  * @hb:		the hash_bucket futex_q was originally enqueued on
2308 * @q: the futex_q woken while waiting to be requeued
2309 * @key2: the futex_key of the requeue target futex
2310 * @timeout: the timeout associated with the wait (NULL if none)
2311 *
2312 * Detect if the task was woken on the initial futex as opposed to the requeue
2313 * target futex. If so, determine if it was a timeout or a signal that caused
2314 * the wakeup and return the appropriate error code to the caller. Must be
2315 * called with the hb lock held.
2316 *
2317 * Return:
2318 * 0 = no early wakeup detected;
2319  *  <0 = -ETIMEDOUT, -ERESTARTNOINTR, or -EWOULDBLOCK (spurious wakeup)
2320 */
2321 static inline
2322 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2323 struct futex_q *q, union futex_key *key2,
2324 struct hrtimer_sleeper *timeout)
2325 {
2326 int ret = 0;
2327
2328 /*
2329 * With the hb lock held, we avoid races while we process the wakeup.
2330 * We only need to hold hb (and not hb2) to ensure atomicity as the
2331 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2332 * It can't be requeued from uaddr2 to something else since we don't
2333 * support a PI aware source futex for requeue.
2334 */
2335 if (!match_futex(&q->key, key2)) {
2336 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2337 /*
2338 * We were woken prior to requeue by a timeout or a signal.
2339 * Unqueue the futex_q and determine which it was.
2340 */
2341 plist_del(&q->list, &hb->chain);
2342
2343 /* Handle spurious wakeups gracefully */
2344 ret = -EWOULDBLOCK;
2345 if (timeout && !timeout->task)
2346 ret = -ETIMEDOUT;
2347 else if (signal_pending(current))
2348 ret = -ERESTARTNOINTR;
2349 }
2350 return ret;
2351 }
2352
2353 /**
2354 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2355 * @uaddr: the futex we initially wait on (non-pi)
2356 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2357 * the same type, no requeueing from private to shared, etc.
2358 * @val: the expected value of uaddr
2359 * @abs_time: absolute timeout
2360 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2361 * @uaddr2: the pi futex we will take prior to returning to user-space
2362 *
2363 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2364  * uaddr2 which must be PI aware and distinct from uaddr. Normal wakeup will wake
2365 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2366 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2367 * without one, the pi logic would not know which task to boost/deboost, if
2368 * there was a need to.
2369 *
2370 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2371 * via the following--
2372 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2373 * 2) wakeup on uaddr2 after a requeue
2374 * 3) signal
2375 * 4) timeout
2376 *
2377 * If 3, cleanup and return -ERESTARTNOINTR.
2378 *
2379 * If 2, we may then block on trying to take the rt_mutex and return via:
2380 * 5) successful lock
2381 * 6) signal
2382 * 7) timeout
2383 * 8) other lock acquisition failure
2384 *
2385 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2386 *
2387 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
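 *
 * A userspace sketch of the intended pairing, following the futex(2)
 * calling convention (illustrative only; cond, mutex, val and nr_requeue
 * are placeholder names):
 *
 *	waiter: futex(&cond, FUTEX_WAIT_REQUEUE_PI, val, timeout, &mutex, 0);
 *	waker:  futex(&cond, FUTEX_CMP_REQUEUE_PI, 1, nr_requeue, &mutex, val);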
2388 *
2389 * Return:
2390 * 0 - On success;
2391 * <0 - On error
2392 */
2393 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2394 u32 val, ktime_t *abs_time, u32 bitset,
2395 u32 __user *uaddr2)
2396 {
2397 struct hrtimer_sleeper timeout, *to = NULL;
2398 struct rt_mutex_waiter rt_waiter;
2399 struct rt_mutex *pi_mutex = NULL;
2400 struct futex_hash_bucket *hb;
2401 union futex_key key2 = FUTEX_KEY_INIT;
2402 struct futex_q q = futex_q_init;
2403 int res, ret;
2404
2405 if (uaddr == uaddr2)
2406 return -EINVAL;
2407
2408 if (!bitset)
2409 return -EINVAL;
2410
2411 if (abs_time) {
2412 to = &timeout;
2413 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2414 CLOCK_REALTIME : CLOCK_MONOTONIC,
2415 HRTIMER_MODE_ABS);
2416 hrtimer_init_sleeper(to, current);
2417 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2418 current->timer_slack_ns);
2419 }
2420
2421 /*
2422 * The waiter is allocated on our stack, manipulated by the requeue
2423 * code while we sleep on uaddr.
2424 */
2425 debug_rt_mutex_init_waiter(&rt_waiter);
2426 rt_waiter.task = NULL;
2427
2428 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2429 if (unlikely(ret != 0))
2430 goto out;
2431
2432 q.bitset = bitset;
2433 q.rt_waiter = &rt_waiter;
2434 q.requeue_pi_key = &key2;
2435
2436 /*
2437 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2438 * count.
2439 */
2440 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2441 if (ret)
2442 goto out_key2;
2443
2444 /*
2445 * The check above which compares uaddrs is not sufficient for
2446 * shared futexes. We need to compare the keys:
2447 */
2448 if (match_futex(&q.key, &key2)) {
2449 ret = -EINVAL;
2450 goto out_put_keys;
2451 }
2452
2453 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2454 futex_wait_queue_me(hb, &q, to);
2455
2456 spin_lock(&hb->lock);
2457 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2458 spin_unlock(&hb->lock);
2459 if (ret)
2460 goto out_put_keys;
2461
2462 /*
2463 * In order for us to be here, we know our q.key == key2, and since
2464 * we took the hb->lock above, we also know that futex_requeue() has
2465 * completed and we no longer have to concern ourselves with a wakeup
2466 * race with the atomic proxy lock acquisition by the requeue code. The
2467 * futex_requeue dropped our key1 reference and incremented our key2
2468 * reference count.
2469 */
2470
2471 /* Check if the requeue code acquired the second futex for us. */
2472 if (!q.rt_waiter) {
2473 /*
2474 * Got the lock. We might not be the anticipated owner if we
2475 * did a lock-steal - fix up the PI-state in that case.
2476 */
2477 if (q.pi_state && (q.pi_state->owner != current)) {
2478 spin_lock(q.lock_ptr);
2479 ret = fixup_pi_state_owner(uaddr2, &q, current);
2480 spin_unlock(q.lock_ptr);
2481 }
2482 } else {
2483 /*
2484 * We have been woken up by futex_unlock_pi(), a timeout, or a
2485 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2486 * the pi_state.
2487 */
2488 WARN_ON(!q.pi_state);
2489 pi_mutex = &q.pi_state->pi_mutex;
2490 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2491 debug_rt_mutex_free_waiter(&rt_waiter);
2492
2493 spin_lock(q.lock_ptr);
2494 /*
2495 * Fixup the pi_state owner and possibly acquire the lock if we
2496 * haven't already.
2497 */
2498 res = fixup_owner(uaddr2, &q, !ret);
2499 /*
2500 		 * If fixup_owner() returned an error, propagate that. If it
2501 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2502 */
2503 if (res)
2504 ret = (res < 0) ? res : 0;
2505
2506 /* Unqueue and drop the lock. */
2507 unqueue_me_pi(&q);
2508 }
2509
2510 /*
2511 * If fixup_pi_state_owner() faulted and was unable to handle the
2512 * fault, unlock the rt_mutex and return the fault to userspace.
2513 */
2514 if (ret == -EFAULT) {
2515 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2516 rt_mutex_unlock(pi_mutex);
2517 } else if (ret == -EINTR) {
2518 /*
2519 * We've already been requeued, but cannot restart by calling
2520 * futex_lock_pi() directly. We could restart this syscall, but
2521 * it would detect that the user space "val" changed and return
2522 * -EWOULDBLOCK. Save the overhead of the restart and return
2523 * -EWOULDBLOCK directly.
2524 */
2525 ret = -EWOULDBLOCK;
2526 }
2527
2528 out_put_keys:
2529 put_futex_key(&q.key);
2530 out_key2:
2531 put_futex_key(&key2);
2532
2533 out:
2534 if (to) {
2535 hrtimer_cancel(&to->timer);
2536 destroy_hrtimer_on_stack(&to->timer);
2537 }
2538 return ret;
2539 }
2540
2541 /*
2542 * Support for robust futexes: the kernel cleans up held futexes at
2543 * thread exit time.
2544 *
2545 * Implementation: user-space maintains a per-thread list of locks it
2546 * is holding. Upon do_exit(), the kernel carefully walks this list,
2547 * and marks all locks that are owned by this thread with the
2548 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2549 * always manipulated with the lock held, so the list is private and
2550 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2551 * field, to allow the kernel to clean up if the thread dies after
2552 * acquiring the lock, but just before it could have added itself to
2553 * the list. There can only be one such pending lock.
2554 */
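
/*
 * For reference, the userspace-visible structures walked below are
 * declared in the futex UAPI header roughly as follows (sketch for
 * orientation only; the authoritative definitions live in
 * include/uapi/linux/futex.h):
 *
 *	struct robust_list {
 *		struct robust_list *next;
 *	};
 *
 *	struct robust_list_head {
 *		struct robust_list list;
 *		long futex_offset;
 *		struct robust_list *list_op_pending;
 *	};
 */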
2555
2556 /**
2557 * sys_set_robust_list() - Set the robust-futex list head of a task
2558 * @head: pointer to the list-head
2559 * @len: length of the list-head, as userspace expects
2560 */
2561 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2562 size_t, len)
2563 {
2564 if (!futex_cmpxchg_enabled)
2565 return -ENOSYS;
2566 /*
2567 * The kernel knows only one size for now:
2568 */
2569 if (unlikely(len != sizeof(*head)))
2570 return -EINVAL;
2571
2572 current->robust_list = head;
2573
2574 return 0;
2575 }
2576
2577 /**
2578 * sys_get_robust_list() - Get the robust-futex list head of a task
2579 * @pid: pid of the process [zero for current task]
2580 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2581 * @len_ptr: pointer to a length field, the kernel fills in the header size
2582 */
2583 SYSCALL_DEFINE3(get_robust_list, int, pid,
2584 struct robust_list_head __user * __user *, head_ptr,
2585 size_t __user *, len_ptr)
2586 {
2587 struct robust_list_head __user *head;
2588 unsigned long ret;
2589 struct task_struct *p;
2590
2591 if (!futex_cmpxchg_enabled)
2592 return -ENOSYS;
2593
2594 rcu_read_lock();
2595
2596 ret = -ESRCH;
2597 if (!pid)
2598 p = current;
2599 else {
2600 p = find_task_by_vpid(pid);
2601 if (!p)
2602 goto err_unlock;
2603 }
2604
2605 ret = -EPERM;
2606 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2607 goto err_unlock;
2608
2609 head = p->robust_list;
2610 rcu_read_unlock();
2611
2612 if (put_user(sizeof(*head), len_ptr))
2613 return -EFAULT;
2614 return put_user(head, head_ptr);
2615
2616 err_unlock:
2617 rcu_read_unlock();
2618
2619 return ret;
2620 }
2621
2622 /*
2623 * Process a futex-list entry, check whether it's owned by the
2624 * dying task, and do notification if so:
2625 */
2626 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2627 {
2628 u32 uval, uninitialized_var(nval), mval;
2629
2630 retry:
2631 if (get_user(uval, uaddr))
2632 return -1;
2633
2634 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2635 /*
2636 * Ok, this dying thread is truly holding a futex
2637 * of interest. Set the OWNER_DIED bit atomically
2638 * via cmpxchg, and if the value had FUTEX_WAITERS
2639 * set, wake up a waiter (if any). (We have to do a
2640 * futex_wake() even if OWNER_DIED is already set -
2641 * to handle the rare but possible case of recursive
2642 * thread-death.) The rest of the cleanup is done in
2643 * userspace.
2644 */
2645 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2646 /*
2647 * We are not holding a lock here, but we want to have
2648 * the pagefault_disable/enable() protection because
2649 * we want to handle the fault gracefully. If the
2650 * access fails we try to fault in the futex with R/W
2651 * verification via get_user_pages. get_user() above
2652 * does not guarantee R/W access. If that fails we
2653 * give up and leave the futex locked.
2654 */
2655 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2656 if (fault_in_user_writeable(uaddr))
2657 return -1;
2658 goto retry;
2659 }
2660 if (nval != uval)
2661 goto retry;
2662
2663 /*
2664 * Wake robust non-PI futexes here. The wakeup of
2665 * PI futexes happens in exit_pi_state():
2666 */
2667 if (!pi && (uval & FUTEX_WAITERS))
2668 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2669 }
2670 return 0;
2671 }
2672
2673 /*
2674 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2675 */
2676 static inline int fetch_robust_entry(struct robust_list __user **entry,
2677 struct robust_list __user * __user *head,
2678 unsigned int *pi)
2679 {
2680 unsigned long uentry;
2681
2682 if (get_user(uentry, (unsigned long __user *)head))
2683 return -EFAULT;
2684
2685 *entry = (void __user *)(uentry & ~1UL);
2686 *pi = uentry & 1;
2687
2688 return 0;
2689 }
2690
2691 /*
2692 * Walk curr->robust_list (very carefully, it's a userspace list!)
2693 * and mark any locks found there dead, and notify any waiters.
2694 *
2695 * We silently return on any sign of list-walking problem.
2696 */
2697 void exit_robust_list(struct task_struct *curr)
2698 {
2699 struct robust_list_head __user *head = curr->robust_list;
2700 struct robust_list __user *entry, *next_entry, *pending;
2701 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2702 unsigned int uninitialized_var(next_pi);
2703 unsigned long futex_offset;
2704 int rc;
2705
2706 if (!futex_cmpxchg_enabled)
2707 return;
2708
2709 /*
2710 * Fetch the list head (which was registered earlier, via
2711 * sys_set_robust_list()):
2712 */
2713 if (fetch_robust_entry(&entry, &head->list.next, &pi))
2714 return;
2715 /*
2716 * Fetch the relative futex offset:
2717 */
2718 if (get_user(futex_offset, &head->futex_offset))
2719 return;
2720 /*
2721 * Fetch any possibly pending lock-add first, and handle it
2722 * if it exists:
2723 */
2724 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2725 return;
2726
2727 next_entry = NULL; /* avoid warning with gcc */
2728 while (entry != &head->list) {
2729 /*
2730 * Fetch the next entry in the list before calling
2731 * handle_futex_death:
2732 */
2733 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2734 /*
2735 * A pending lock might already be on the list, so
2736 * don't process it twice:
2737 */
2738 if (entry != pending)
2739 if (handle_futex_death((void __user *)entry + futex_offset,
2740 curr, pi))
2741 return;
2742 if (rc)
2743 return;
2744 entry = next_entry;
2745 pi = next_pi;
2746 /*
2747 * Avoid excessively long or circular lists:
2748 */
2749 if (!--limit)
2750 break;
2751
2752 cond_resched();
2753 }
2754
2755 if (pending)
2756 handle_futex_death((void __user *)pending + futex_offset,
2757 curr, pip);
2758 }
2759
2760 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2761 u32 __user *uaddr2, u32 val2, u32 val3)
2762 {
2763 int cmd = op & FUTEX_CMD_MASK;
2764 unsigned int flags = 0;
2765
2766 if (!(op & FUTEX_PRIVATE_FLAG))
2767 flags |= FLAGS_SHARED;
2768
2769 if (op & FUTEX_CLOCK_REALTIME) {
2770 flags |= FLAGS_CLOCKRT;
2771 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2772 return -ENOSYS;
2773 }
2774
2775 switch (cmd) {
2776 case FUTEX_LOCK_PI:
2777 case FUTEX_UNLOCK_PI:
2778 case FUTEX_TRYLOCK_PI:
2779 case FUTEX_WAIT_REQUEUE_PI:
2780 case FUTEX_CMP_REQUEUE_PI:
2781 if (!futex_cmpxchg_enabled)
2782 return -ENOSYS;
2783 }
2784
2785 switch (cmd) {
2786 case FUTEX_WAIT:
2787 val3 = FUTEX_BITSET_MATCH_ANY;
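		/* fall through */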
2788 case FUTEX_WAIT_BITSET:
2789 return futex_wait(uaddr, flags, val, timeout, val3);
2790 case FUTEX_WAKE:
2791 val3 = FUTEX_BITSET_MATCH_ANY;
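		/* fall through */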
2792 case FUTEX_WAKE_BITSET:
2793 return futex_wake(uaddr, flags, val, val3);
2794 case FUTEX_REQUEUE:
2795 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2796 case FUTEX_CMP_REQUEUE:
2797 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2798 case FUTEX_WAKE_OP:
2799 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2800 case FUTEX_LOCK_PI:
2801 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2802 case FUTEX_UNLOCK_PI:
2803 return futex_unlock_pi(uaddr, flags);
2804 case FUTEX_TRYLOCK_PI:
2805 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2806 case FUTEX_WAIT_REQUEUE_PI:
2807 val3 = FUTEX_BITSET_MATCH_ANY;
2808 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2809 uaddr2);
2810 case FUTEX_CMP_REQUEUE_PI:
2811 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2812 }
2813 return -ENOSYS;
2814 }
2815
2816
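/*
 * For orientation, a minimal sketch of how userspace typically reaches
 * this entry point for the plain wait/wake operations (illustrative
 * only; see the futex(2) man page for the full calling convention):
 *
 *	waiter: syscall(SYS_futex, &word, FUTEX_WAIT, expected, timeout, NULL, 0);
 *	waker:  syscall(SYS_futex, &word, FUTEX_WAKE, nr_wake, NULL, NULL, 0);
 */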
2817 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2818 struct timespec __user *, utime, u32 __user *, uaddr2,
2819 u32, val3)
2820 {
2821 struct timespec ts;
2822 ktime_t t, *tp = NULL;
2823 u32 val2 = 0;
2824 int cmd = op & FUTEX_CMD_MASK;
2825
2826 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2827 cmd == FUTEX_WAIT_BITSET ||
2828 cmd == FUTEX_WAIT_REQUEUE_PI)) {
2829 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2830 return -EFAULT;
2831 if (!timespec_valid(&ts))
2832 return -EINVAL;
2833
2834 t = timespec_to_ktime(ts);
2835 if (cmd == FUTEX_WAIT)
2836 t = ktime_add_safe(ktime_get(), t);
2837 tp = &t;
2838 }
2839 /*
2840 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2841 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2842 */
2843 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2844 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2845 val2 = (u32) (unsigned long) utime;
2846
2847 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2848 }
2849
2850 static int __init futex_init(void)
2851 {
2852 u32 curval;
2853 int i;
2854
2855 /*
2856 	 * This will fail, and that is what we want. Some arch implementations do
2857 * runtime detection of the futex_atomic_cmpxchg_inatomic()
2858 * functionality. We want to know that before we call in any
2859 * of the complex code paths. Also we want to prevent
2860 * registration of robust lists in that case. NULL is
2861 * guaranteed to fault and we get -EFAULT on functional
2862 * implementation, the non-functional ones will return
2863 * -ENOSYS.
2864 */
2865 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2866 futex_cmpxchg_enabled = 1;
2867
2868 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2869 plist_head_init(&futex_queues[i].chain);
2870 spin_lock_init(&futex_queues[i].lock);
2871 }
2872
2873 return 0;
2874 }
2875 __initcall(futex_init);
2876