1 // SPDX-License-Identifier: GPL-2.0
2 /* kernel/rwsem.c: R/W semaphores, public implementation
3 *
4 * Written by David Howells (dhowells@redhat.com).
5 * Derived from asm-i386/semaphore.h
6 *
7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8 * and Michel Lespinasse <walken@google.com>
9 *
10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12 *
13 * Rwsem count bit fields re-definition and rwsem rearchitecture by
14 * Waiman Long <longman@redhat.com> and
15 * Peter Zijlstra <peterz@infradead.org>.
16 */
17
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/sched/rt.h>
22 #include <linux/sched/task.h>
23 #include <linux/sched/debug.h>
24 #include <linux/sched/wake_q.h>
25 #include <linux/sched/signal.h>
26 #include <linux/sched/clock.h>
27 #include <linux/export.h>
28 #include <linux/rwsem.h>
29 #include <linux/atomic.h>
30 #include <trace/events/lock.h>
31
32 #ifndef CONFIG_PREEMPT_RT
33 #include "lock_events.h"
34 #include <trace/hooks/dtask.h>
35 #include <trace/hooks/rwsem.h>
36
37 /*
38 * The least significant 2 bits of the owner value have the following
39 * meanings when set.
40 * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
41 * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
42 *
43 * When the rwsem is reader-owned and a spinning writer has timed out,
44 * the nonspinnable bit will be set to disable optimistic spinning.
45 *
46 * When a writer acquires a rwsem, it puts its task_struct pointer
47 * into the owner field. It is cleared after an unlock.
48 *
49 * When a reader acquires a rwsem, it also puts its task_struct
50 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
51 * On unlock, the owner field will largely be left untouched. So
52 * for a free or reader-owned rwsem, the owner value may contain
53 * information about the last reader that acquired the rwsem.
54 *
55 * That information may be helpful in debugging cases where the system
56 * seems to hang on a reader-owned rwsem, especially if only one reader
57 * is involved. Ideally we would like to track all the readers that own
58 * a rwsem, but the overhead is simply too big.
59 *
60 * A fast path reader optimistic lock stealing is supported when the rwsem
61 * is previously owned by a writer and the following conditions are met:
62 * - rwsem is not currently writer owned
63 * - the handoff isn't set.
64 */
65 #define RWSEM_READER_OWNED (1UL << 0)
66 #define RWSEM_NONSPINNABLE (1UL << 1)
67 #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
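
/*
 * Illustrative example (hypothetical address): a reader whose
 * task_struct sits at 0xffff888001234500 records itself as
 * 0xffff888001234501 (pointer | RWSEM_READER_OWNED); if a spinning
 * writer later times out on that reader-owned lock, the value becomes
 * 0xffff888001234503 once RWSEM_NONSPINNABLE is also set.
 */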
68
69 #ifdef CONFIG_DEBUG_RWSEMS
70 # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
71 if (!debug_locks_silent && \
72 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
73 #c, atomic_long_read(&(sem)->count), \
74 (unsigned long) sem->magic, \
75 atomic_long_read(&(sem)->owner), (long)current, \
76 list_empty(&(sem)->wait_list) ? "" : "not ")) \
77 debug_locks_off(); \
78 } while (0)
79 #else
80 # define DEBUG_RWSEMS_WARN_ON(c, sem)
81 #endif
82
83 /*
84 * On 64-bit architectures, the bit definitions of the count are:
85 *
86 * Bit 0 - writer locked bit
87 * Bit 1 - waiters present bit
88 * Bit 2 - lock handoff bit
89 * Bits 3-7 - reserved
90 * Bits 8-62 - 55-bit reader count
91 * Bit 63 - read fail bit
92 *
93 * On 32-bit architectures, the bit definitions of the count are:
94 *
95 * Bit 0 - writer locked bit
96 * Bit 1 - waiters present bit
97 * Bit 2 - lock handoff bit
98 * Bits 3-7 - reserved
99 * Bits 8-30 - 23-bit reader count
100 * Bit 31 - read fail bit
101 *
102 * It is not likely that the most significant bit (read fail bit) will ever
103 * be set. This guard bit is still checked anyway in the down_read() fastpath
104 * just in case we need to use up more of the reader bits for other purposes
105 * in the future.
106 *
107 * atomic_long_fetch_add() is used to obtain reader lock, whereas
108 * atomic_long_cmpxchg() will be used to obtain writer lock.
109 *
110 * There are three places where the lock handoff bit may be set or cleared.
111 * 1) rwsem_mark_wake() for readers -- set, clear
112 * 2) rwsem_try_write_lock() for writers -- set, clear
113 * 3) rwsem_del_waiter() -- clear
114 *
115 * For all the above cases, wait_lock will be held. A writer must also
116 * be the first one in the wait_list to be eligible for setting the handoff
117 * bit. So concurrent setting/clearing of handoff bit is not possible.
118 */
119 #define RWSEM_WRITER_LOCKED (1UL << 0)
120 #define RWSEM_FLAG_WAITERS (1UL << 1)
121 #define RWSEM_FLAG_HANDOFF (1UL << 2)
122 #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
123
124 #define RWSEM_READER_SHIFT 8
125 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
126 #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
127 #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
128 #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
129 #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
130 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
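
/*
 * Worked example of the count encoding (illustrative value): on a
 * 64-bit system a count of 0x302 decodes as three readers
 * (bits 8-62 == 3) plus the waiters bit (bit 1), with no writer,
 * no handoff and no read-fail bit set.
 */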
131
132 /*
133 * All writes to owner are protected by WRITE_ONCE() to make sure that
134 * store tearing can't happen as optimistic spinners may read and use
135 * the owner value concurrently without lock. Read from owner, however,
136 * may not need READ_ONCE() as long as the pointer value is only used
137 * for comparison and isn't being dereferenced.
138 *
139 * Both rwsem_{set,clear}_owner() functions should be in the same
140 * preempt disable section as the atomic op that changes sem->count.
141 */
142 static inline void rwsem_set_owner(struct rw_semaphore *sem)
143 {
144 lockdep_assert_preemption_disabled();
145 atomic_long_set(&sem->owner, (long)current);
146 trace_android_vh_record_rwsem_writer_owned(sem);
147 }
148
149 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
150 {
151 lockdep_assert_preemption_disabled();
152 atomic_long_set(&sem->owner, 0);
153 trace_android_vh_clear_rwsem_writer_owned(sem);
154 }
155
156 /*
157 * Test the flags in the owner field.
158 */
159 static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
160 {
161 return atomic_long_read(&sem->owner) & flags;
162 }
163
164 /*
165 * The task_struct pointer of the last owning reader will be left in
166 * the owner field.
167 *
168 * Note that the owner value just indicates the task has owned the rwsem
169 * previously; it may not be the real owner or one of the real owners
170 * anymore when that field is examined, so take it with a grain of salt.
171 *
172 * The reader non-spinnable bit is preserved.
173 */
174 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
175 struct task_struct *owner)
176 {
177 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
178 (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
179
180 atomic_long_set(&sem->owner, val);
181 }
182
183 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
184 {
185 __rwsem_set_reader_owned(sem, current);
186 trace_android_vh_record_rwsem_reader_owned(sem, NULL);
187 }
188
189 #ifdef CONFIG_DEBUG_RWSEMS
190 /*
191 * Return just the real task structure pointer of the owner
192 */
193 static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
194 {
195 return (struct task_struct *)
196 (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
197 }
198
199 /*
200 * Return true if the rwsem is owned by a reader.
201 */
202 static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
203 {
204 /*
205 * Check the count to see if it is write-locked.
206 */
207 long count = atomic_long_read(&sem->count);
208
209 if (count & RWSEM_WRITER_MASK)
210 return false;
211 return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
212 }
213
214 /*
215 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
216 * is a task pointer in the owner field of a reader-owned rwsem, it will be the
217 * real owner or one of the real owners. The only exception is when the
218 * unlock is done by up_read_non_owner().
219 */
220 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
221 {
222 unsigned long val = atomic_long_read(&sem->owner);
223
224 trace_android_vh_clear_rwsem_reader_owned(sem);
225 while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
226 if (atomic_long_try_cmpxchg(&sem->owner, &val,
227 val & RWSEM_OWNER_FLAGS_MASK))
228 return;
229 }
230 }
231 #else
232 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
233 {
234 trace_android_vh_clear_rwsem_reader_owned(sem);
235 }
236 #endif
237
238 /*
239 * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
240 * remains set. Otherwise, the operation will be aborted.
241 */
242 static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
243 {
244 unsigned long owner = atomic_long_read(&sem->owner);
245
246 do {
247 if (!(owner & RWSEM_READER_OWNED))
248 break;
249 if (owner & RWSEM_NONSPINNABLE)
250 break;
251 } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
252 owner | RWSEM_NONSPINNABLE));
253 }
254
255 static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
256 {
257 int ret = 0;
258
259 *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
260
261 if (WARN_ON_ONCE(*cntp < 0))
262 rwsem_set_nonspinnable(sem);
263
264 if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
265 rwsem_set_reader_owned(sem);
266 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
267 return true;
268 }
269
270 trace_android_vh_rwsem_read_trylock_failed(sem, cntp, &ret);
271 if (ret) {
272 rwsem_set_reader_owned(sem);
273 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
274 return true;
275 }
276
277 return false;
278 }
279
280 static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
281 {
282 long tmp = RWSEM_UNLOCKED_VALUE;
283
284 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
285 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
286 rwsem_set_owner(sem);
287 return true;
288 }
289
290 return false;
291 }
292
293 /*
294 * Return the real task structure pointer of the owner and the embedded
295 * flags in the owner. pflags must be non-NULL.
296 */
297 static inline struct task_struct *
298 rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
299 {
300 unsigned long owner = atomic_long_read(&sem->owner);
301
302 *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
303 return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
304 }
305
306 /*
307 * Guide to the rw_semaphore's count field.
308 *
309 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
310 * by a writer.
311 *
312 * The lock is owned by readers when
313 * (1) the RWSEM_WRITER_LOCKED isn't set in count,
314 * (2) some of the reader bits are set in count, and
315 * (3) the owner field has the RWSEM_READER_OWNED bit set.
316 *
317 * Having some reader bits set is not enough to guarantee a reader-owned
318 * lock as the readers may be in the process of backing out from the count
319 * and a writer has just released the lock. So another writer may steal
320 * the lock immediately after that.
321 */
322
323 /*
324 * Initialize an rwsem:
325 */
326 void __init_rwsem(struct rw_semaphore *sem, const char *name,
327 struct lock_class_key *key)
328 {
329 #ifdef CONFIG_DEBUG_LOCK_ALLOC
330 /*
331 * Make sure we are not reinitializing a held semaphore:
332 */
333 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
334 lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
335 #endif
336 #ifdef CONFIG_DEBUG_RWSEMS
337 sem->magic = sem;
338 #endif
339 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
340 raw_spin_lock_init(&sem->wait_lock);
341 INIT_LIST_HEAD(&sem->wait_list);
342 atomic_long_set(&sem->owner, 0L);
343 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
344 osq_lock_init(&sem->osq);
345 #endif
346 android_init_vendor_data(sem, 1);
347 android_init_oem_data(sem, 1);
348 trace_android_vh_rwsem_init(sem);
349 }
350 EXPORT_SYMBOL(__init_rwsem);
351
352 enum rwsem_waiter_type {
353 RWSEM_WAITING_FOR_WRITE,
354 RWSEM_WAITING_FOR_READ
355 };
356
357 struct rwsem_waiter {
358 struct list_head list;
359 struct task_struct *task;
360 enum rwsem_waiter_type type;
361 unsigned long timeout;
362 bool handoff_set;
363 };
364 #define rwsem_first_waiter(sem) \
365 list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
366
367 enum rwsem_wake_type {
368 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
369 RWSEM_WAKE_READERS, /* Wake readers only */
370 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
371 };
372
373 /*
374 * The typical HZ value is either 250 or 1000. So set the minimum waiting
375 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
376 * queue before initiating the handoff protocol.
377 */
378 #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
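
/*
 * For example, DIV_ROUND_UP(HZ, 250) is 1 jiffy (4ms) with HZ == 250,
 * 4 jiffies (4ms) with HZ == 1000, and 1 jiffy (10ms) with HZ == 100,
 * the "higher than 4ms" case mentioned above.
 */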
379
380 /*
381 * Magic number to batch-wakeup waiting readers, even when writers are
382 * also present in the queue. This both limits the amount of work the
383 * waking thread must do and also prevents any potential counter overflow,
384 * however unlikely.
385 */
386 #define MAX_READERS_WAKEUP 0x100
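
/*
 * Waking at most 0x100 (256) readers per call bounds the count
 * adjustment of one wakeup pass to 256 * RWSEM_READER_BIAS == 0x10000,
 * far below even the 23-bit reader count of 32-bit architectures.
 */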
387
388 static inline void
389 rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
390 {
391 lockdep_assert_held(&sem->wait_lock);
392 list_add_tail(&waiter->list, &sem->wait_list);
393 /* caller will set RWSEM_FLAG_WAITERS */
394 }
395
396 /*
397 * Remove a waiter from the wait_list and clear flags.
398 *
399 * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
400 * this function. Modify with care.
401 *
402 * Return: true if wait_list isn't empty and false otherwise
403 */
404 static inline bool
405 rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
406 {
407 lockdep_assert_held(&sem->wait_lock);
408 list_del(&waiter->list);
409 if (likely(!list_empty(&sem->wait_list)))
410 return true;
411
412 atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
413 return false;
414 }
415
416 /*
417 * handle the lock release when processes blocked on it can now run
418 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
419 * have been set.
420 * - there must be someone on the queue
421 * - the wait_lock must be held by the caller
422 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
423 * to actually wakeup the blocked task(s) and drop the reference count,
424 * preferably when the wait_lock is released
425 * - woken process blocks are discarded from the list after having task zeroed
426 * - writers are only marked woken if downgrading is false
427 *
428 * Implies rwsem_del_waiter() for all woken readers.
429 */
430 static void rwsem_mark_wake(struct rw_semaphore *sem,
431 enum rwsem_wake_type wake_type,
432 struct wake_q_head *wake_q)
433 {
434 struct rwsem_waiter *waiter, *tmp;
435 long oldcount, woken = 0, adjustment = 0;
436 struct list_head wlist;
437
438 lockdep_assert_held(&sem->wait_lock);
439
440 /*
441 * Take a peek at the queue head waiter such that we can determine
442 * the wakeup(s) to perform.
443 */
444 waiter = rwsem_first_waiter(sem);
445
446 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
447 if (wake_type == RWSEM_WAKE_ANY) {
448 /*
449 * Mark writer at the front of the queue for wakeup.
450 * Until the task is actually awoken later by
451 * the caller, other writers are able to steal it.
452 * Readers, on the other hand, will block as they
453 * will notice the queued writer.
454 */
455 wake_q_add(wake_q, waiter->task);
456 lockevent_inc(rwsem_wake_writer);
457 }
458
459 return;
460 }
461
462 /*
463 * No reader wakeup if there are too many of them already.
464 */
465 if (unlikely(atomic_long_read(&sem->count) < 0))
466 return;
467
468 /*
469 * Writers might steal the lock before we grant it to the next reader.
470 * We prefer to do the first reader grant before counting readers
471 * so we can bail out early if a writer stole the lock.
472 */
473 if (wake_type != RWSEM_WAKE_READ_OWNED) {
474 struct task_struct *owner;
475
476 adjustment = RWSEM_READER_BIAS;
477 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
478 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
479 /*
480 * When we've been waiting "too" long (for writers
481 * to give up the lock), request a HANDOFF to
482 * force the issue.
483 */
484 if (time_after(jiffies, waiter->timeout)) {
485 if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
486 adjustment -= RWSEM_FLAG_HANDOFF;
487 lockevent_inc(rwsem_rlock_handoff);
488 }
489 waiter->handoff_set = true;
490 }
491
492 atomic_long_add(-adjustment, &sem->count);
493 return;
494 }
495 /*
496 * Set it to reader-owned to give spinners an early
497 * indication that readers now have the lock.
498 * The reader nonspinnable bit seen at slowpath entry of
499 * the reader is copied over.
500 */
501 owner = waiter->task;
502 __rwsem_set_reader_owned(sem, owner);
503 }
504
505 /*
506 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
507 * queue. We know that the number woken will be at least 1 as we accounted
508 * for above. Note we increment the 'active part' of the count by the
509 * number of readers before waking any processes up.
510 *
511 * This is an adaptation of the phase-fair R/W locks where at the
512 * reader phase (first waiter is a reader), all readers are eligible
513 * to acquire the lock at the same time irrespective of their order
514 * in the queue. The writers acquire the lock according to their
515 * order in the queue.
516 *
517 * We have to do wakeup in 2 passes to prevent the possibility that
518 * the reader count may be decremented before it is incremented. It
519 * is because the to-be-woken waiter may not have slept yet. So it
520 * may see waiter->task got cleared, finish its critical section and
521 * do an unlock before the reader count increment.
522 *
523 * 1) Collect the read-waiters in a separate list, count them and
524 * fully increment the reader count in rwsem.
525 * 2) For each waiter in the new list, clear waiter->task and
526 * put them into wake_q to be woken up later.
527 */
528 INIT_LIST_HEAD(&wlist);
529 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
530 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
531 continue;
532
533 woken++;
534 list_move_tail(&waiter->list, &wlist);
535
536 /*
537 * Limit # of readers that can be woken up per wakeup call.
538 */
539 if (unlikely(woken >= MAX_READERS_WAKEUP))
540 break;
541 }
542
543 adjustment = woken * RWSEM_READER_BIAS - adjustment;
544 lockevent_cond_inc(rwsem_wake_reader, woken);
545
546 oldcount = atomic_long_read(&sem->count);
547 if (list_empty(&sem->wait_list)) {
548 /*
549 * Combined with list_move_tail() above, this implies
550 * rwsem_del_waiter().
551 */
552 adjustment -= RWSEM_FLAG_WAITERS;
553 if (oldcount & RWSEM_FLAG_HANDOFF)
554 adjustment -= RWSEM_FLAG_HANDOFF;
555 } else if (woken) {
556 /*
557 * When we've woken a reader, we no longer need to force
558 * writers to give up the lock and we can clear HANDOFF.
559 */
560 if (oldcount & RWSEM_FLAG_HANDOFF)
561 adjustment -= RWSEM_FLAG_HANDOFF;
562 }
563
564 if (adjustment)
565 atomic_long_add(adjustment, &sem->count);
566 trace_android_vh_record_rwsem_reader_owned(sem, &wlist);
567
568 /* 2nd pass */
569 list_for_each_entry_safe(waiter, tmp, &wlist, list) {
570 struct task_struct *tsk;
571
572 tsk = waiter->task;
573 get_task_struct(tsk);
574
575 /*
576 * Ensure calling get_task_struct() before setting the reader
577 * waiter to nil such that rwsem_down_read_slowpath() cannot
578 * race with do_exit() by always holding a reference count
579 * to the task to wakeup.
580 */
581 smp_store_release(&waiter->task, NULL);
582 /*
583 * Ensure issuing the wakeup (either by us or someone else)
584 * after setting the reader waiter to nil.
585 */
586 wake_q_add_safe(wake_q, tsk);
587 }
588 }
589
590 /*
591 * Remove a waiter and try to wake up other waiters in the wait queue
592 * This function is called from the out_nolock path of both the reader and
593 * writer slowpaths with wait_lock held. It releases the wait_lock and
594 * optionally wake up waiters before it returns.
595 */
596 static inline void
597 rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
598 struct wake_q_head *wake_q)
599 __releases(&sem->wait_lock)
600 {
601 bool first = rwsem_first_waiter(sem) == waiter;
602
603 wake_q_init(wake_q);
604
605 /*
606 * If the wait_list isn't empty and the waiter to be deleted is
607 * the first waiter, we wake up the remaining waiters as they may
608 * be eligible to acquire or spin on the lock.
609 */
610 if (rwsem_del_waiter(sem, waiter) && first)
611 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
612 raw_spin_unlock_irq(&sem->wait_lock);
613 if (!wake_q_empty(wake_q))
614 wake_up_q(wake_q);
615 }
616
617 /*
618 * This function must be called with the sem->wait_lock held to prevent
619 * race conditions between checking the rwsem wait list and setting the
620 * sem->count accordingly.
621 *
622 * Implies rwsem_del_waiter() on success.
623 */
624 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
625 struct rwsem_waiter *waiter)
626 {
627 struct rwsem_waiter *first = rwsem_first_waiter(sem);
628 long count, new;
629
630 lockdep_assert_held(&sem->wait_lock);
631
632 count = atomic_long_read(&sem->count);
633 do {
634 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
635
636 if (has_handoff) {
637 /*
638 * Honor handoff bit and yield only when the first
639 * waiter is the one that set it. Otherwise, we
640 * still try to acquire the rwsem.
641 */
642 if (first->handoff_set && (waiter != first))
643 return false;
644 }
645
646 new = count;
647
648 if (count & RWSEM_LOCK_MASK) {
649 /*
650 * A waiter (first or not) can set the handoff bit
651 * if it is an RT task or has waited in the wait queue
652 * for too long.
653 */
654 if (has_handoff || (!rt_or_dl_task(waiter->task) &&
655 !time_after(jiffies, waiter->timeout)))
656 return false;
657
658 new |= RWSEM_FLAG_HANDOFF;
659 } else {
660 new |= RWSEM_WRITER_LOCKED;
661 new &= ~RWSEM_FLAG_HANDOFF;
662
663 if (list_is_singular(&sem->wait_list))
664 new &= ~RWSEM_FLAG_WAITERS;
665 }
666 } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
667
668 /*
669 * We have either acquired the lock with handoff bit cleared or set
670 * the handoff bit. Only the first waiter can have its handoff_set
671 * set here to enable optimistic spinning in slowpath loop.
672 */
673 if (new & RWSEM_FLAG_HANDOFF) {
674 first->handoff_set = true;
675 lockevent_inc(rwsem_wlock_handoff);
676 return false;
677 }
678
679 /*
680 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
681 * success.
682 */
683 list_del(&waiter->list);
684 rwsem_set_owner(sem);
685 return true;
686 }
687
688 /*
689 * The rwsem_spin_on_owner() function returns the following 4 values
690 * depending on the lock owner state.
691 * OWNER_NULL : owner is currently NULL
692 * OWNER_WRITER: when owner changes and is a writer
693 * OWNER_READER: when owner changes and the new owner may be a reader.
694 * OWNER_NONSPINNABLE:
695 * when optimistic spinning has to stop because either the
696 * owner stops running, is unknown, or its timeslice has
697 * been used up.
698 */
699 enum owner_state {
700 OWNER_NULL = 1 << 0,
701 OWNER_WRITER = 1 << 1,
702 OWNER_READER = 1 << 2,
703 OWNER_NONSPINNABLE = 1 << 3,
704 };
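
/*
 * The states are one-hot bit values so a returned state can be tested
 * against a mask of acceptable states in a single AND, as done with
 * OWNER_SPINNABLE in rwsem_optimistic_spin() below.
 */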
705
706 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
707 /*
708 * Try to acquire write lock before the writer has been put on wait queue.
709 */
710 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
711 {
712 long count = atomic_long_read(&sem->count);
713
714 while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
715 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
716 count | RWSEM_WRITER_LOCKED)) {
717 rwsem_set_owner(sem);
718 lockevent_inc(rwsem_opt_lock);
719 return true;
720 }
721 }
722 return false;
723 }
724
725 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
726 {
727 struct task_struct *owner;
728 unsigned long flags;
729 bool ret = true;
730
731 if (need_resched()) {
732 lockevent_inc(rwsem_opt_fail);
733 return false;
734 }
735
736 /*
737 * Disabling preemption is equivalent to an RCU read-side critical
738 * section, thus the task_struct structure won't go away.
739 */
740 owner = rwsem_owner_flags(sem, &flags);
741 /*
742 * Don't check the read-owner as the entry may be stale.
743 */
744 if ((flags & RWSEM_NONSPINNABLE) ||
745 (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
746 ret = false;
747 trace_android_vh_rwsem_can_spin_on_owner(sem, &ret);
748
749 lockevent_cond_inc(rwsem_opt_fail, !ret);
750 return ret;
751 }
752
753 #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
754
755 static inline enum owner_state
756 rwsem_owner_state(struct task_struct *owner, unsigned long flags)
757 {
758 if (flags & RWSEM_NONSPINNABLE)
759 return OWNER_NONSPINNABLE;
760
761 if (flags & RWSEM_READER_OWNED)
762 return OWNER_READER;
763
764 return owner ? OWNER_WRITER : OWNER_NULL;
765 }
766
767 static noinline enum owner_state
768 rwsem_spin_on_owner(struct rw_semaphore *sem)
769 {
770 struct task_struct *new, *owner;
771 unsigned long flags, new_flags;
772 enum owner_state state;
773 int cnt = 0;
774 bool time_out = false;
775
776 lockdep_assert_preemption_disabled();
777
778 owner = rwsem_owner_flags(sem, &flags);
779 state = rwsem_owner_state(owner, flags);
780 if (state != OWNER_WRITER)
781 return state;
782
783 for (;;) {
784 trace_android_vh_rwsem_opt_spin_start(sem, &time_out, &cnt, true);
785 if (time_out)
786 break;
787 /*
788 * When a waiting writer sets the handoff flag, it may spin
789 * on the owner as well. Once that writer acquires the lock,
790 * we can spin on it. So we don't need to quit even when the
791 * handoff bit is set.
792 */
793 new = rwsem_owner_flags(sem, &new_flags);
794 if ((new != owner) || (new_flags != flags)) {
795 state = rwsem_owner_state(new, new_flags);
796 break;
797 }
798
799 /*
800 * Ensure we emit the owner->on_cpu dereference _after_
801 * checking that sem->owner still matches owner. If that fails,
802 * owner might point to free()d memory. If it still matches,
803 * our spinning context has already disabled preemption, which
804 * is equivalent to an RCU read-side critical section and
805 * ensures the memory stays valid.
806 */
807 barrier();
808
809 if (need_resched() || !owner_on_cpu(owner)) {
810 state = OWNER_NONSPINNABLE;
811 break;
812 }
813
814 cpu_relax();
815 }
816
817 return state;
818 }
819
820 /*
821 * Calculate reader-owned rwsem spinning threshold for writer
822 *
823 * The more readers own the rwsem, the longer it will take for them to
824 * wind down and free the rwsem. So the empirical formula used to
825 * determine the actual spinning time limit here is:
826 *
827 * Spinning threshold = (10 + nr_readers/2)us
828 *
829 * The limit is capped to a maximum of 25us (30 readers). This is just
830 * a heuristic and is subject to change in the future.
831 */
832 static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
833 {
834 long count = atomic_long_read(&sem->count);
835 int readers = count >> RWSEM_READER_SHIFT;
836 u64 delta;
837
838 if (readers > 30)
839 readers = 30;
840 delta = (20 + readers) * NSEC_PER_USEC / 2;
841
842 return sched_clock() + delta;
843 }
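
/*
 * Example with illustrative numbers: for 8 readers the delta above is
 * (20 + 8) * NSEC_PER_USEC / 2 == 14000ns, matching the
 * (10 + 8/2)us == 14us threshold given by the formula above.
 */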
844
845 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
846 {
847 bool taken = false;
848 int prev_owner_state = OWNER_NULL;
849 int loop = 0;
850 u64 rspin_threshold = 0;
851 int cnt = 0;
852 bool time_out = false;
853
854 /* sem->wait_lock should not be held when doing optimistic spinning */
855 if (!osq_lock(&sem->osq))
856 goto done;
857
858 /*
859 * Optimistically spin on the owner field and attempt to acquire the
860 * lock whenever the owner changes. Spinning will be stopped when:
861 * 1) the owning writer isn't running; or
862 * 2) readers own the lock and spinning time has exceeded limit.
863 */
864 for (;;) {
865 enum owner_state owner_state;
866
867 trace_android_vh_rwsem_opt_spin_start(sem, &time_out, &cnt, false);
868 if (time_out)
869 break;
870 owner_state = rwsem_spin_on_owner(sem);
871 if (!(owner_state & OWNER_SPINNABLE))
872 break;
873
874 /*
875 * Try to acquire the lock
876 */
877 taken = rwsem_try_write_lock_unqueued(sem);
878
879 if (taken)
880 break;
881
882 /*
883 * Time-based reader-owned rwsem optimistic spinning
884 */
885 if (owner_state == OWNER_READER) {
886 /*
887 * Re-initialize rspin_threshold every time
888 * the owner state changes from non-reader to reader.
889 * This allows a writer to steal the lock in between
890 * 2 reader phases and have the threshold reset at
891 * the beginning of the 2nd reader phase.
892 */
893 if (prev_owner_state != OWNER_READER) {
894 if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
895 break;
896 rspin_threshold = rwsem_rspin_threshold(sem);
897 loop = 0;
898 }
899
900 /*
901 * Check time threshold once every 16 iterations to
902 * avoid calling sched_clock() too frequently so
903 * as to reduce the average latency between the times
904 * when the lock becomes free and when the spinner
905 * is ready to do a trylock.
906 */
907 else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
908 rwsem_set_nonspinnable(sem);
909 lockevent_inc(rwsem_opt_nospin);
910 break;
911 }
912 }
913
914 /*
915 * An RT task cannot do optimistic spinning if it cannot
916 * be sure the lock holder is running or live-lock may
917 * happen if the current task and the lock holder happen
918 * to run on the same CPU. However, aborting optimistic
919 * spinning while a NULL owner is detected may miss some
920 * opportunity where spinning can continue without causing
921 * problem.
922 *
923 * There are 2 possible cases where an RT task may be able
924 * to continue spinning.
925 *
926 * 1) The lock owner is in the process of releasing the
927 * lock, sem->owner is cleared but the lock has not
928 * been released yet.
929 * 2) The lock was free and owner cleared, but another
930 * task just comes in and acquires the lock before
931 * we try to get it. The new owner may be a spinnable
932 * writer.
933 *
934 * To take advantage of the two scenarios listed above, the RT
935 * task is made to retry one more time to see if it can
936 * acquire the lock or continue spinning on the new owning
937 * writer. Of course, if the time lag is long enough or the
938 * new owner is not a writer or spinnable, the RT task will
939 * quit spinning.
940 *
941 * If the owner is a writer, the need_resched() check is
942 * done inside rwsem_spin_on_owner(). If the owner is not
943 * a writer, need_resched() check needs to be done here.
944 */
945 if (owner_state != OWNER_WRITER) {
946 if (need_resched())
947 break;
948 if (rt_or_dl_task(current) &&
949 (prev_owner_state != OWNER_WRITER))
950 break;
951 }
952 prev_owner_state = owner_state;
953
954 /*
955 * The cpu_relax() call is a compiler barrier which forces
956 * everything in this loop to be re-loaded. We don't need
957 * memory barriers as we'll eventually observe the right
958 * values at the cost of a few extra spins.
959 */
960 cpu_relax();
961 }
962 osq_unlock(&sem->osq);
963 trace_android_vh_rwsem_opt_spin_finish(sem, taken);
964 done:
965 lockevent_cond_inc(rwsem_opt_fail, !taken);
966 return taken;
967 }
968
969 /*
970 * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
971 * only be called when the reader count reaches 0.
972 */
973 static inline void clear_nonspinnable(struct rw_semaphore *sem)
974 {
975 if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
976 atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
977 }
978
979 #else
980 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
981 {
982 return false;
983 }
984
985 static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
986 {
987 return false;
988 }
989
990 static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
991
992 static inline enum owner_state
993 rwsem_spin_on_owner(struct rw_semaphore *sem)
994 {
995 return OWNER_NONSPINNABLE;
996 }
997 #endif
998
999 /*
1000 * Prepare to wake up waiter(s) in the wait queue by putting them into the
1001 * given wake_q if the rwsem lock owner isn't a writer. If the rwsem is
1002 * likely reader-owned, wake up the read lock waiters at the front of the
1003 * queue; otherwise wake up whichever waiter is at the front.
1004 *
1005 * This is being called from both reader and writer slow paths.
1006 */
1007 static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
1008 struct wake_q_head *wake_q)
1009 {
1010 enum rwsem_wake_type wake_type;
1011
1012 if (count & RWSEM_WRITER_MASK)
1013 return;
1014
1015 if (count & RWSEM_READER_MASK) {
1016 wake_type = RWSEM_WAKE_READERS;
1017 } else {
1018 wake_type = RWSEM_WAKE_ANY;
1019 clear_nonspinnable(sem);
1020 }
1021 rwsem_mark_wake(sem, wake_type, wake_q);
1022 }
1023
1024 /*
1025 * Wait for the read lock to be granted
1026 */
1027 static struct rw_semaphore __sched *
1028 rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
1029 {
1030 long adjustment = -RWSEM_READER_BIAS;
1031 long rcnt = (count >> RWSEM_READER_SHIFT);
1032 struct rwsem_waiter waiter;
1033 DEFINE_WAKE_Q(wake_q);
1034 bool already_on_list = false;
1035 bool steal = true;
1036 bool rspin = false;
1037
1038 /*
1039 * To prevent a constant stream of readers from starving a sleeping
1040 * writer, don't attempt optimistic lock stealing if the lock is
1041 * very likely owned by readers.
1042 */
1043 if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
1044 (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
1045 goto queue;
1046
1047 /*
1048 * Reader optimistic lock stealing.
1049 */
1050 trace_android_vh_rwsem_direct_rsteal(sem, &steal);
1051 if (steal && !(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
1052 rwsem_set_reader_owned(sem);
1053 lockevent_inc(rwsem_rlock_steal);
1054
1055 /*
1056 * Wake up other readers in the wait queue if it is
1057 * the first reader.
1058 */
1059 wake_readers:
1060 if ((rcnt == 1 || rspin) && (count & RWSEM_FLAG_WAITERS)) {
1061 raw_spin_lock_irq(&sem->wait_lock);
1062 if (!list_empty(&sem->wait_list))
1063 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1064 &wake_q);
1065 raw_spin_unlock_irq(&sem->wait_lock);
1066 wake_up_q(&wake_q);
1067 }
1068 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1069 return sem;
1070 }
1071 /*
1072 * Reader optimistic spinning and stealing.
1073 */
1074 trace_android_vh_rwsem_optimistic_rspin(sem, &adjustment, &rspin);
1075 if (rspin)
1076 goto wake_readers;
1077
1078 queue:
1079 waiter.task = current;
1080 waiter.type = RWSEM_WAITING_FOR_READ;
1081 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1082 waiter.handoff_set = false;
1083
1084 raw_spin_lock_irq(&sem->wait_lock);
1085 if (list_empty(&sem->wait_list)) {
1086 /*
1087 * In case the wait queue is empty and the lock isn't owned
1088 * by a writer, this reader can exit the slowpath and return
1089 * immediately as its RWSEM_READER_BIAS has already been set
1090 * in the count.
1091 */
1092 if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
1093 /* Provide lock ACQUIRE */
1094 smp_acquire__after_ctrl_dep();
1095 raw_spin_unlock_irq(&sem->wait_lock);
1096 rwsem_set_reader_owned(sem);
1097 lockevent_inc(rwsem_rlock_fast);
1098 return sem;
1099 }
1100 adjustment += RWSEM_FLAG_WAITERS;
1101 }
1102 trace_android_vh_alter_rwsem_list_add(
1103 &waiter,
1104 sem, &already_on_list);
1105 if (!already_on_list)
1106 rwsem_add_waiter(sem, &waiter);
1107
1108 /* we're now waiting on the lock, but no longer actively locking */
1109 count = atomic_long_add_return(adjustment, &sem->count);
1110
1111 rwsem_cond_wake_waiter(sem, count, &wake_q);
1112 trace_android_vh_rwsem_wake(sem);
1113 raw_spin_unlock_irq(&sem->wait_lock);
1114
1115 if (!wake_q_empty(&wake_q))
1116 wake_up_q(&wake_q);
1117
1118 trace_contention_begin(sem, LCB_F_READ);
1119
1120 /* wait to be given the lock */
1121 trace_android_vh_rwsem_read_wait_start(sem);
1122 for (;;) {
1123 set_current_state(state);
1124 if (!smp_load_acquire(&waiter.task)) {
1125 /* Matches rwsem_mark_wake()'s smp_store_release(). */
1126 break;
1127 }
1128 if (signal_pending_state(state, current)) {
1129 raw_spin_lock_irq(&sem->wait_lock);
1130 if (waiter.task)
1131 goto out_nolock;
1132 raw_spin_unlock_irq(&sem->wait_lock);
1133 /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
1134 break;
1135 }
1136 schedule_preempt_disabled();
1137 lockevent_inc(rwsem_sleep_reader);
1138 }
1139
1140 __set_current_state(TASK_RUNNING);
1141 trace_android_vh_rwsem_read_wait_finish(sem);
1142 lockevent_inc(rwsem_rlock);
1143 trace_contention_end(sem, 0);
1144 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1145 return sem;
1146
1147 out_nolock:
1148 rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1149 __set_current_state(TASK_RUNNING);
1150 trace_android_vh_rwsem_read_wait_finish(sem);
1151 lockevent_inc(rwsem_rlock_fail);
1152 trace_contention_end(sem, -EINTR);
1153 return ERR_PTR(-EINTR);
1154 }
1155
1156 /*
1157 * Wait until we successfully acquire the write lock
1158 */
1159 static struct rw_semaphore __sched *
1160 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1161 {
1162 struct rwsem_waiter waiter;
1163 DEFINE_WAKE_Q(wake_q);
1164 bool already_on_list = false;
1165
1166 /* do optimistic spinning and steal lock if possible */
1167 if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1168 /* rwsem_optimistic_spin() implies ACQUIRE on success */
1169 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1170 return sem;
1171 }
1172
1173 /*
1174 * Optimistic spinning failed, proceed to the slowpath
1175 * and block until we can acquire the sem.
1176 */
1177 waiter.task = current;
1178 waiter.type = RWSEM_WAITING_FOR_WRITE;
1179 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1180 waiter.handoff_set = false;
1181
1182 raw_spin_lock_irq(&sem->wait_lock);
1183 trace_android_vh_alter_rwsem_list_add(
1184 &waiter,
1185 sem, &already_on_list);
1186 if (!already_on_list)
1187 rwsem_add_waiter(sem, &waiter);
1188
1189 /* we're now waiting on the lock */
1190 if (rwsem_first_waiter(sem) != &waiter) {
1191 rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
1192 &wake_q);
1193 if (!wake_q_empty(&wake_q)) {
1194 /*
1195 * We want to minimize wait_lock hold time especially
1196 * when a large number of readers are to be woken up.
1197 */
1198 raw_spin_unlock_irq(&sem->wait_lock);
1199 wake_up_q(&wake_q);
1200 raw_spin_lock_irq(&sem->wait_lock);
1201 }
1202 } else {
1203 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1204 }
1205
1206 trace_android_vh_rwsem_wake(sem);
1207 /* wait until we successfully acquire the lock */
1208 trace_android_vh_rwsem_write_wait_start(sem);
1209 set_current_state(state);
1210 trace_contention_begin(sem, LCB_F_WRITE);
1211
1212 for (;;) {
1213 if (rwsem_try_write_lock(sem, &waiter)) {
1214 /* rwsem_try_write_lock() implies ACQUIRE on success */
1215 break;
1216 }
1217
1218 raw_spin_unlock_irq(&sem->wait_lock);
1219
1220 if (signal_pending_state(state, current))
1221 goto out_nolock;
1222
1223 /*
1224 * After setting the handoff bit and failing to acquire
1225 * the lock, attempt to spin on owner to accelerate lock
1226 * transfer. If the previous owner is an on-cpu writer and it
1227 * has just released the lock, OWNER_NULL will be returned.
1228 * In this case, we attempt to acquire the lock again
1229 * without sleeping.
1230 */
1231 if (waiter.handoff_set) {
1232 enum owner_state owner_state;
1233
1234 owner_state = rwsem_spin_on_owner(sem);
1235 if (owner_state == OWNER_NULL)
1236 goto trylock_again;
1237 }
1238
1239 schedule_preempt_disabled();
1240 lockevent_inc(rwsem_sleep_writer);
1241 set_current_state(state);
1242 trylock_again:
1243 raw_spin_lock_irq(&sem->wait_lock);
1244 }
1245 __set_current_state(TASK_RUNNING);
1246 trace_android_vh_rwsem_write_wait_finish(sem);
1247 raw_spin_unlock_irq(&sem->wait_lock);
1248 lockevent_inc(rwsem_wlock);
1249 trace_contention_end(sem, 0);
1250 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1251 return sem;
1252
1253 out_nolock:
1254 __set_current_state(TASK_RUNNING);
1255 trace_android_vh_rwsem_write_wait_finish(sem);
1256 raw_spin_lock_irq(&sem->wait_lock);
1257 rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1258 lockevent_inc(rwsem_wlock_fail);
1259 trace_contention_end(sem, -EINTR);
1260 return ERR_PTR(-EINTR);
1261 }
1262
1263 /*
1264 * handle waking up a waiter on the semaphore
1265 * - up_read/up_write has decremented the active part of count if we come here
1266 */
1267 static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
1268 {
1269 unsigned long flags;
1270 DEFINE_WAKE_Q(wake_q);
1271
1272 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1273
1274 if (!list_empty(&sem->wait_list))
1275 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1276 trace_android_vh_rwsem_wake_finish(sem);
1277
1278 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1279 wake_up_q(&wake_q);
1280
1281 return sem;
1282 }
1283
1284 /*
1285 * downgrade a write lock into a read lock
1286 * - caller incremented waiting part of count and discovered it still negative
1287 * - just wake up any readers at the front of the queue
1288 */
1289 static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1290 {
1291 unsigned long flags;
1292 DEFINE_WAKE_Q(wake_q);
1293
1294 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1295
1296 if (!list_empty(&sem->wait_list))
1297 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1298
1299 trace_android_vh_rwsem_downgrade_wake_finish(sem);
1300
1301 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1302 wake_up_q(&wake_q);
1303
1304 return sem;
1305 }
1306
1307 /*
1308 * lock for reading
1309 */
1310 static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
1311 {
1312 int ret = 0;
1313 long count;
1314
1315 preempt_disable();
1316 if (!rwsem_read_trylock(sem, &count)) {
1317 if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
1318 ret = -EINTR;
1319 goto out;
1320 }
1321 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1322 }
1323 out:
1324 preempt_enable();
1325 return ret;
1326 }
1327
1328 static __always_inline void __down_read(struct rw_semaphore *sem)
1329 {
1330 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
1331 }
1332
1333 static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
1334 {
1335 return __down_read_common(sem, TASK_INTERRUPTIBLE);
1336 }
1337
1338 static __always_inline int __down_read_killable(struct rw_semaphore *sem)
1339 {
1340 return __down_read_common(sem, TASK_KILLABLE);
1341 }
1342
1343 static inline int __down_read_trylock(struct rw_semaphore *sem)
1344 {
1345 int ret = 0;
1346 long tmp;
1347
1348 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1349
1350 preempt_disable();
1351 tmp = atomic_long_read(&sem->count);
1352 while (!(tmp & RWSEM_READ_FAILED_MASK)) {
1353 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1354 tmp + RWSEM_READER_BIAS)) {
1355 rwsem_set_reader_owned(sem);
1356 ret = 1;
1357 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1358 break;
1359 }
1360 }
1361
1362 if (!ret) {
1363 trace_android_vh_rwsem_read_trylock_failed(sem, NULL, &ret);
1364 if (ret) {
1365 rwsem_set_reader_owned(sem);
1366 trace_android_vh_record_rwsem_lock_starttime(sem, jiffies);
1367 }
1368 }
1369
1370 preempt_enable();
1371 return ret;
1372 }
1373
1374 /*
1375 * lock for writing
1376 */
1377 static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
1378 {
1379 int ret = 0;
1380
1381 preempt_disable();
1382 if (unlikely(!rwsem_write_trylock(sem))) {
1383 if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
1384 ret = -EINTR;
1385 }
1386 preempt_enable();
1387 return ret;
1388 }
1389
1390 static __always_inline void __down_write(struct rw_semaphore *sem)
1391 {
1392 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
1393 }
1394
1395 static __always_inline int __down_write_killable(struct rw_semaphore *sem)
1396 {
1397 return __down_write_common(sem, TASK_KILLABLE);
1398 }
1399
1400 static inline int __down_write_trylock(struct rw_semaphore *sem)
1401 {
1402 int ret;
1403
1404 preempt_disable();
1405 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1406 ret = rwsem_write_trylock(sem);
1407 preempt_enable();
1408
1409 return ret;
1410 }
1411
1412 /*
1413 * unlock after reading
1414 */
1415 static inline void __up_read(struct rw_semaphore *sem)
1416 {
1417 long tmp;
1418
1419 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1420 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1421
1422 preempt_disable();
1423 rwsem_clear_reader_owned(sem);
1424 tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1425 DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1426 if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1427 RWSEM_FLAG_WAITERS)) {
1428 clear_nonspinnable(sem);
1429 rwsem_wake(sem);
1430 }
1431 trace_android_vh_record_rwsem_lock_starttime(sem, 0);
1432 preempt_enable();
1433 }
1434
1435 /*
1436 * unlock after writing
1437 */
1438 static inline void __up_write(struct rw_semaphore *sem)
1439 {
1440 long tmp;
1441
1442 DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1443 /*
1444 * sem->owner may differ from current if the ownership is transferred
1445 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
1446 */
1447 DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1448 !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1449
1450 preempt_disable();
1451 rwsem_clear_owner(sem);
1452 tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1453 if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1454 rwsem_wake(sem);
1455 trace_android_vh_record_rwsem_lock_starttime(sem, 0);
1456 preempt_enable();
1457 }
1458
1459 /*
1460 * downgrade write lock to read lock
1461 */
1462 static inline void __downgrade_write(struct rw_semaphore *sem)
1463 {
1464 long tmp;
1465
1466 /*
1467 * When downgrading from exclusive to shared ownership,
1468 * anything inside the write-locked region cannot leak
1469 * into the read side. In contrast, anything in the
1470 * read-locked region is ok to be re-ordered into the
1471 * write side. As such, rely on RELEASE semantics.
1472 */
1473 DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1474 preempt_disable();
1475 tmp = atomic_long_fetch_add_release(
1476 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1477 rwsem_set_reader_owned(sem);
1478 if (tmp & RWSEM_FLAG_WAITERS)
1479 rwsem_downgrade_wake(sem);
1480 preempt_enable();
1481 }
1482
1483 #else /* !CONFIG_PREEMPT_RT */
1484
1485 #define RT_MUTEX_BUILD_MUTEX
1486 #include "rtmutex.c"
1487
1488 #define rwbase_set_and_save_current_state(state) \
1489 set_current_state(state)
1490
1491 #define rwbase_restore_current_state() \
1492 __set_current_state(TASK_RUNNING)
1493
1494 #define rwbase_rtmutex_lock_state(rtm, state) \
1495 __rt_mutex_lock(rtm, state)
1496
1497 #define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \
1498 __rt_mutex_slowlock_locked(rtm, NULL, state, wq)
1499
1500 #define rwbase_rtmutex_unlock(rtm) \
1501 __rt_mutex_unlock(rtm)
1502
1503 #define rwbase_rtmutex_trylock(rtm) \
1504 __rt_mutex_trylock(rtm)
1505
1506 #define rwbase_signal_pending_state(state, current) \
1507 signal_pending_state(state, current)
1508
1509 #define rwbase_pre_schedule() \
1510 rt_mutex_pre_schedule()
1511
1512 #define rwbase_schedule() \
1513 rt_mutex_schedule()
1514
1515 #define rwbase_post_schedule() \
1516 rt_mutex_post_schedule()
1517
1518 #include "rwbase_rt.c"
1519
1520 void __init_rwsem(struct rw_semaphore *sem, const char *name,
1521 struct lock_class_key *key)
1522 {
1523 init_rwbase_rt(&(sem)->rwbase);
1524
1525 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1526 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
1527 lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
1528 #endif
1529 }
1530 EXPORT_SYMBOL(__init_rwsem);
1531
1532 static inline void __down_read(struct rw_semaphore *sem)
1533 {
1534 rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1535 }
1536
1537 static inline int __down_read_interruptible(struct rw_semaphore *sem)
1538 {
1539 return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
1540 }
1541
1542 static inline int __down_read_killable(struct rw_semaphore *sem)
1543 {
1544 return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
1545 }
1546
1547 static inline int __down_read_trylock(struct rw_semaphore *sem)
1548 {
1549 return rwbase_read_trylock(&sem->rwbase);
1550 }
1551
1552 static inline void __up_read(struct rw_semaphore *sem)
1553 {
1554 rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
1555 }
1556
1557 static inline void __sched __down_write(struct rw_semaphore *sem)
1558 {
1559 rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1560 }
1561
1562 static inline int __sched __down_write_killable(struct rw_semaphore *sem)
1563 {
1564 return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
1565 }
1566
1567 static inline int __down_write_trylock(struct rw_semaphore *sem)
1568 {
1569 return rwbase_write_trylock(&sem->rwbase);
1570 }
1571
1572 static inline void __up_write(struct rw_semaphore *sem)
1573 {
1574 rwbase_write_unlock(&sem->rwbase);
1575 }
1576
1577 static inline void __downgrade_write(struct rw_semaphore *sem)
1578 {
1579 rwbase_write_downgrade(&sem->rwbase);
1580 }
1581
1582 /* Debug stubs for the common API */
1583 #define DEBUG_RWSEMS_WARN_ON(c, sem)
1584
1585 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
1586 struct task_struct *owner)
1587 {
1588 }
1589
1590 static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1591 {
1592 int count = atomic_read(&sem->rwbase.readers);
1593
1594 return count < 0 && count != READER_BIAS;
1595 }
1596
1597 #endif /* CONFIG_PREEMPT_RT */
1598
1599 /*
1600 * lock for reading
1601 */
1602 void __sched down_read(struct rw_semaphore *sem)
1603 {
1604 might_sleep();
1605 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1606
1607 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1608 }
1609 EXPORT_SYMBOL(down_read);
1610
1611 int __sched down_read_interruptible(struct rw_semaphore *sem)
1612 {
1613 might_sleep();
1614 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1615
1616 if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
1617 rwsem_release(&sem->dep_map, _RET_IP_);
1618 return -EINTR;
1619 }
1620
1621 return 0;
1622 }
1623 EXPORT_SYMBOL(down_read_interruptible);
1624
1625 int __sched down_read_killable(struct rw_semaphore *sem)
1626 {
1627 might_sleep();
1628 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1629
1630 if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1631 rwsem_release(&sem->dep_map, _RET_IP_);
1632 return -EINTR;
1633 }
1634
1635 return 0;
1636 }
1637 EXPORT_SYMBOL(down_read_killable);
1638
1639 /*
1640 * trylock for reading -- returns 1 if successful, 0 if contention
1641 */
1642 int down_read_trylock(struct rw_semaphore *sem)
1643 {
1644 int ret = __down_read_trylock(sem);
1645
1646 if (ret == 1)
1647 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1648 return ret;
1649 }
1650 EXPORT_SYMBOL(down_read_trylock);
1651
1652 /*
1653 * lock for writing
1654 */
1655 void __sched down_write(struct rw_semaphore *sem)
1656 {
1657 might_sleep();
1658 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1659 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1660 }
1661 EXPORT_SYMBOL(down_write);
1662
1663 /*
1664 * lock for writing
1665 */
1666 int __sched down_write_killable(struct rw_semaphore *sem)
1667 {
1668 might_sleep();
1669 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1670
1671 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1672 __down_write_killable)) {
1673 rwsem_release(&sem->dep_map, _RET_IP_);
1674 return -EINTR;
1675 }
1676
1677 return 0;
1678 }
1679 EXPORT_SYMBOL(down_write_killable);
1680
1681 /*
1682 * trylock for writing -- returns 1 if successful, 0 if contention
1683 */
1684 int down_write_trylock(struct rw_semaphore *sem)
1685 {
1686 int ret = __down_write_trylock(sem);
1687
1688 if (ret == 1)
1689 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1690
1691 return ret;
1692 }
1693 EXPORT_SYMBOL(down_write_trylock);
1694
1695 /*
1696 * release a read lock
1697 */
1698 void up_read(struct rw_semaphore *sem)
1699 {
1700 rwsem_release(&sem->dep_map, _RET_IP_);
1701 __up_read(sem);
1702 }
1703 EXPORT_SYMBOL(up_read);
1704
1705 /*
1706 * release a write lock
1707 */
1708 void up_write(struct rw_semaphore *sem)
1709 {
1710 rwsem_release(&sem->dep_map, _RET_IP_);
1711 trace_android_vh_rwsem_write_finished(sem);
1712 __up_write(sem);
1713 }
1714 EXPORT_SYMBOL(up_write);
1715
1716 /*
1717 * downgrade write lock to read lock
1718 */
1719 void downgrade_write(struct rw_semaphore *sem)
1720 {
1721 lock_downgrade(&sem->dep_map, _RET_IP_);
1722 trace_android_vh_rwsem_write_finished(sem);
1723 __downgrade_write(sem);
1724 }
1725 EXPORT_SYMBOL(downgrade_write);
1726
1727 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1728
1729 void down_read_nested(struct rw_semaphore *sem, int subclass)
1730 {
1731 might_sleep();
1732 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1733 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1734 }
1735 EXPORT_SYMBOL(down_read_nested);
1736
1737 int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
1738 {
1739 might_sleep();
1740 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1741
1742 if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1743 rwsem_release(&sem->dep_map, _RET_IP_);
1744 return -EINTR;
1745 }
1746
1747 return 0;
1748 }
1749 EXPORT_SYMBOL(down_read_killable_nested);
1750
1751 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1752 {
1753 might_sleep();
1754 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1755 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1756 }
1757 EXPORT_SYMBOL(_down_write_nest_lock);
1758
1759 void down_read_non_owner(struct rw_semaphore *sem)
1760 {
1761 might_sleep();
1762 __down_read(sem);
1763 /*
1764 * The owner value for a reader-owned lock is mostly for debugging
1765 * purposes only and is not critical to the correct functioning of
1766 * rwsem. So it is perfectly fine to set it in a preempt-enabled
1767 * context here.
1768 */
1769 __rwsem_set_reader_owned(sem, NULL);
1770 }
1771 EXPORT_SYMBOL(down_read_non_owner);
1772
1773 void down_write_nested(struct rw_semaphore *sem, int subclass)
1774 {
1775 might_sleep();
1776 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1777 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1778 }
1779 EXPORT_SYMBOL(down_write_nested);
1780
1781 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1782 {
1783 might_sleep();
1784 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1785
1786 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1787 __down_write_killable)) {
1788 rwsem_release(&sem->dep_map, _RET_IP_);
1789 return -EINTR;
1790 }
1791
1792 return 0;
1793 }
1794 EXPORT_SYMBOL(down_write_killable_nested);
1795
1796 void up_read_non_owner(struct rw_semaphore *sem)
1797 {
1798 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1799 __up_read(sem);
1800 }
1801 EXPORT_SYMBOL(up_read_non_owner);
1802
1803 #endif
1804