1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2009-2011 Red Hat, Inc.
4 *
5 * Author: Mikulas Patocka <mpatocka@redhat.com>
6 *
7 * This file is released under the GPL.
8 */
9
10 #include <linux/dm-bufio.h>
11
12 #include <linux/device-mapper.h>
13 #include <linux/dm-io.h>
14 #include <linux/slab.h>
15 #include <linux/sched/mm.h>
16 #include <linux/jiffies.h>
17 #include <linux/vmalloc.h>
18 #include <linux/shrinker.h>
19 #include <linux/module.h>
20 #include <linux/rbtree.h>
21 #include <linux/stacktrace.h>
22 #include <linux/jump_label.h>
23
24 #include "dm.h"
25
26 #define DM_MSG_PREFIX "bufio"
27
28 /*
29 * Memory management policy:
30 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
31 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
32 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
33 * Start background writeback when the number of dirty buffers exceeds
34 * DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
35 */
36 #define DM_BUFIO_MIN_BUFFERS 8
37
38 #define DM_BUFIO_MEMORY_PERCENT 2
39 #define DM_BUFIO_VMALLOC_PERCENT 25
40 #define DM_BUFIO_WRITEBACK_RATIO 3
41 #define DM_BUFIO_LOW_WATERMARK_RATIO 16
42
43 /*
44 * The number of bytes of cached data to keep around.
45 */
46 #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
47
48 /*
49 * Align buffer writes to this boundary.
50 * Tests show that SSDs have the highest IOPS when using 4k writes.
51 */
52 #define DM_BUFIO_WRITE_ALIGN 4096
53
54 /*
55 * dm_buffer->list_mode
56 */
57 #define LIST_CLEAN 0
58 #define LIST_DIRTY 1
59 #define LIST_SIZE 2
60
61 #define SCAN_RESCHED_CYCLE 16
62
63 /*--------------------------------------------------------------*/
64
65 /*
66 * Rather than use an LRU list, we use a clock algorithm where entries
67 * are held in a circular list. When an entry is 'hit' a reference bit
68 * is set. The least recently used entry is approximated by running a
69 * cursor around the list selecting unreferenced entries. Referenced
70 * entries have their reference bit cleared as the cursor passes them.
71 */
72 struct lru_entry {
73 struct list_head list;
74 atomic_t referenced;
75 };
76
77 struct lru_iter {
78 struct lru *lru;
79 struct list_head list;
80 struct lru_entry *stop;
81 struct lru_entry *e;
82 };
83
84 struct lru {
85 struct list_head *cursor;
86 unsigned long count;
87
88 struct list_head iterators;
89 };
90
91 /*--------------*/
92
93 static void lru_init(struct lru *lru)
94 {
95 lru->cursor = NULL;
96 lru->count = 0;
97 INIT_LIST_HEAD(&lru->iterators);
98 }
99
100 static void lru_destroy(struct lru *lru)
101 {
102 WARN_ON_ONCE(lru->cursor);
103 WARN_ON_ONCE(!list_empty(&lru->iterators));
104 }
105
106 /*
107 * Insert a new entry into the lru.
108 */
109 static void lru_insert(struct lru *lru, struct lru_entry *le)
110 {
111 /*
112 * Don't be tempted to set this to 1; it makes the lru
113 * aspect perform poorly.
114 */
115 atomic_set(&le->referenced, 0);
116
117 if (lru->cursor) {
118 list_add_tail(&le->list, lru->cursor);
119 } else {
120 INIT_LIST_HEAD(&le->list);
121 lru->cursor = &le->list;
122 }
123 lru->count++;
124 }
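/*
 * For illustration: list_add_tail() links the new entry immediately
 * before lru->cursor, so a freshly inserted entry sits at the far end
 * of the clock and is the last entry the in-progress sweep examines.
 * Its referenced bit starts at 0, so it only earns a "second chance"
 * once it is actually hit via lru_reference().
 */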
125
126 /*--------------*/
127
128 /*
129 * Convert a list_head pointer to an lru_entry pointer.
130 */
131 static inline struct lru_entry *to_le(struct list_head *l)
132 {
133 return container_of(l, struct lru_entry, list);
134 }
135
136 /*
137 * Initialize an lru_iter and add it to the list of cursors in the lru.
138 */
139 static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
140 {
141 it->lru = lru;
142 it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
143 it->e = lru->cursor ? to_le(lru->cursor) : NULL;
144 list_add(&it->list, &lru->iterators);
145 }
146
147 /*
148 * Remove an lru_iter from the list of cursors in the lru.
149 */
150 static inline void lru_iter_end(struct lru_iter *it)
151 {
152 list_del(&it->list);
153 }
154
155 /* Predicate function type to be used with lru_iter_next */
156 typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
157
158 /*
159 * Advance the cursor to the next entry that passes the
160 * predicate, and return that entry. Returns NULL if the
161 * iteration is complete.
162 */
163 static struct lru_entry *lru_iter_next(struct lru_iter *it,
164 iter_predicate pred, void *context)
165 {
166 struct lru_entry *e;
167
168 while (it->e) {
169 e = it->e;
170
171 /* advance the cursor */
172 if (it->e == it->stop)
173 it->e = NULL;
174 else
175 it->e = to_le(it->e->list.next);
176
177 if (pred(e, context))
178 return e;
179 }
180
181 return NULL;
182 }
183
184 /*
185 * Invalidate a specific lru_entry and update all cursors in
186 * the lru accordingly.
187 */
188 static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
189 {
190 struct lru_iter *it;
191
192 list_for_each_entry(it, &lru->iterators, list) {
193 /* Move it->e forwards if necessary. */
194 if (it->e == e) {
195 it->e = to_le(it->e->list.next);
196 if (it->e == e)
197 it->e = NULL;
198 }
199
200 /* Move it->stop backwards if necessary. */
201 if (it->stop == e) {
202 it->stop = to_le(it->stop->list.prev);
203 if (it->stop == e)
204 it->stop = NULL;
205 }
206 }
207 }
208
209 /*--------------*/
210
211 /*
212 * Remove a specific entry from the lru.
213 */
214 static void lru_remove(struct lru *lru, struct lru_entry *le)
215 {
216 lru_iter_invalidate(lru, le);
217 if (lru->count == 1) {
218 lru->cursor = NULL;
219 } else {
220 if (lru->cursor == &le->list)
221 lru->cursor = lru->cursor->next;
222 list_del(&le->list);
223 }
224 lru->count--;
225 }
226
227 /*
228 * Mark as referenced.
229 */
230 static inline void lru_reference(struct lru_entry *le)
231 {
232 atomic_set(&le->referenced, 1);
233 }
234
235 /*--------------*/
236
237 /*
238 * Remove the (approximately) least recently used entry that passes the predicate.
239 * Returns NULL on failure.
240 */
241 enum evict_result {
242 ER_EVICT,
243 ER_DONT_EVICT,
244 ER_STOP, /* stop looking for something to evict */
245 };
246
247 typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
248
249 static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
250 {
251 unsigned long tested = 0;
252 struct list_head *h = lru->cursor;
253 struct lru_entry *le;
254
255 if (!h)
256 return NULL;
257 /*
258 * In the worst case we have to loop around twice. Once to clear
259 * the reference flags, and then again to discover the predicate
260 * fails for all entries.
261 */
262 while (tested < lru->count) {
263 le = container_of(h, struct lru_entry, list);
264
265 if (atomic_read(&le->referenced)) {
266 atomic_set(&le->referenced, 0);
267 } else {
268 tested++;
269 switch (pred(le, context)) {
270 case ER_EVICT:
271 /*
272 * Adjust the cursor, so we start the next
273 * search from here.
274 */
275 lru->cursor = le->list.next;
276 lru_remove(lru, le);
277 return le;
278
279 case ER_DONT_EVICT:
280 break;
281
282 case ER_STOP:
283 lru->cursor = le->list.next;
284 return NULL;
285 }
286 }
287
288 h = h->next;
289
290 if (!no_sleep)
291 cond_resched();
292 }
293
294 return NULL;
295 }
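/*
 * Worked example of the two-pass worst case mentioned above: if every
 * entry has its referenced bit set, the first trip around the clock only
 * clears those bits (nothing is counted in 'tested'); only on the second
 * trip do entries become candidates and get passed to the predicate.
 */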
296
297 /*--------------------------------------------------------------*/
298
299 /*
300 * Buffer state bits.
301 */
302 #define B_READING 0
303 #define B_WRITING 1
304 #define B_DIRTY 2
305
306 /*
307 * Describes how the block was allocated:
308 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
309 * See the comment at alloc_buffer_data.
310 */
311 enum data_mode {
312 DATA_MODE_SLAB = 0,
313 DATA_MODE_GET_FREE_PAGES = 1,
314 DATA_MODE_VMALLOC = 2,
315 DATA_MODE_LIMIT = 3
316 };
317
318 struct dm_buffer {
319 /* protected by the locks in dm_buffer_cache */
320 struct rb_node node;
321
322 /* immutable, so don't need protecting */
323 sector_t block;
324 void *data;
325 unsigned char data_mode; /* DATA_MODE_* */
326
327 /*
328 * These two fields are used in isolation, so do not need
329 * a surrounding lock.
330 */
331 atomic_t hold_count;
332 unsigned long last_accessed;
333
334 /*
335 * Everything else is protected by the mutex in
336 * dm_bufio_client
337 */
338 unsigned long state;
339 struct lru_entry lru;
340 unsigned char list_mode; /* LIST_* */
341 blk_status_t read_error;
342 blk_status_t write_error;
343 unsigned int dirty_start;
344 unsigned int dirty_end;
345 unsigned int write_start;
346 unsigned int write_end;
347 struct list_head write_list;
348 struct dm_bufio_client *c;
349 void (*end_io)(struct dm_buffer *b, blk_status_t bs);
350 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
351 #define MAX_STACK 10
352 unsigned int stack_len;
353 unsigned long stack_entries[MAX_STACK];
354 #endif
355 };
356
357 /*--------------------------------------------------------------*/
358
359 /*
360 * The buffer cache manages buffers, particularly:
361 * - inc/dec of holder count
362 * - setting the last_accessed field
363 * - maintains clean/dirty state along with lru
364 * - selecting buffers that match predicates
365 *
366 * It does *not* handle:
367 * - allocation/freeing of buffers.
368 * - IO
369 * - Eviction or cache sizing.
370 *
371 * cache_get() and cache_put() are threadsafe, you do not need to
372 * protect these calls with a surrounding mutex. All the other
373 * methods are not threadsafe; they do use locking primitives, but
374 * only enough to ensure get/put are threadsafe.
375 */
376
377 struct buffer_tree {
378 union {
379 struct rw_semaphore lock;
380 rwlock_t spinlock;
381 } u;
382 struct rb_root root;
383 } ____cacheline_aligned_in_smp;
384
385 struct dm_buffer_cache {
386 struct lru lru[LIST_SIZE];
387 /*
388 * We spread entries across multiple trees to reduce contention
389 * on the locks.
390 */
391 unsigned int num_locks;
392 bool no_sleep;
393 struct buffer_tree trees[];
394 };
395
396 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
397
398 static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
399 {
400 return dm_hash_locks_index(block, num_locks);
401 }
402
403 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
404 {
405 if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
406 read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
407 else
408 down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
409 }
410
411 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
412 {
413 if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
414 read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
415 else
416 up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
417 }
418
419 static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
420 {
421 if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
422 write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
423 else
424 down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
425 }
426
427 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
428 {
429 if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
430 write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
431 else
432 up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
433 }
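/*
 * Each block hashes to one of bc->num_locks trees via cache_index()
 * (dm_hash_locks_index()), so operations on blocks that land in
 * different trees never contend with each other. Clients created with
 * no_sleep use a spinning rwlock taken with the _bh variants instead of
 * a sleeping rw_semaphore.
 */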
434
435 /*
436 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
437 * This struct helps avoid redundant drop and gets of the same lock.
438 */
439 struct lock_history {
440 struct dm_buffer_cache *cache;
441 bool write;
442 unsigned int previous;
443 unsigned int no_previous;
444 };
445
446 static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
447 {
448 lh->cache = cache;
449 lh->write = write;
450 lh->no_previous = cache->num_locks;
451 lh->previous = lh->no_previous;
452 }
453
454 static void __lh_lock(struct lock_history *lh, unsigned int index)
455 {
456 if (lh->write) {
457 if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
458 write_lock_bh(&lh->cache->trees[index].u.spinlock);
459 else
460 down_write(&lh->cache->trees[index].u.lock);
461 } else {
462 if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
463 read_lock_bh(&lh->cache->trees[index].u.spinlock);
464 else
465 down_read(&lh->cache->trees[index].u.lock);
466 }
467 }
468
469 static void __lh_unlock(struct lock_history *lh, unsigned int index)
470 {
471 if (lh->write) {
472 if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
473 write_unlock_bh(&lh->cache->trees[index].u.spinlock);
474 else
475 up_write(&lh->cache->trees[index].u.lock);
476 } else {
477 if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
478 read_unlock_bh(&lh->cache->trees[index].u.spinlock);
479 else
480 up_read(&lh->cache->trees[index].u.lock);
481 }
482 }
483
484 /*
485 * Make sure you call this since it will unlock the final lock.
486 */
487 static void lh_exit(struct lock_history *lh)
488 {
489 if (lh->previous != lh->no_previous) {
490 __lh_unlock(lh, lh->previous);
491 lh->previous = lh->no_previous;
492 }
493 }
494
495 /*
496 * Named 'next' because there is no corresponding
497 * 'up/unlock' call since it's done automatically.
498 */
499 static void lh_next(struct lock_history *lh, sector_t b)
500 {
501 unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */
502
503 if (lh->previous != lh->no_previous) {
504 if (lh->previous != index) {
505 __lh_unlock(lh, lh->previous);
506 __lh_lock(lh, index);
507 lh->previous = index;
508 }
509 } else {
510 __lh_lock(lh, index);
511 lh->previous = index;
512 }
513 }
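/*
 * Typical usage, as in cache_evict() and cache_mark_many() below
 * (a minimal sketch, not a complete sequence):
 *
 *	struct lock_history lh;
 *
 *	lh_init(&lh, bc, true);
 *	...
 *	lh_next(&lh, b->block);	- locks b's tree, dropping the previously
 *				  held tree lock only if it differs
 *	...
 *	lh_exit(&lh);		- drops whatever lock is still held
 */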
514
515 static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
516 {
517 return container_of(le, struct dm_buffer, lru);
518 }
519
520 static struct dm_buffer *list_to_buffer(struct list_head *l)
521 {
522 struct lru_entry *le = list_entry(l, struct lru_entry, list);
523
524 return le_to_buffer(le);
525 }
526
527 static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
528 {
529 unsigned int i;
530
531 bc->num_locks = num_locks;
532 bc->no_sleep = no_sleep;
533
534 for (i = 0; i < bc->num_locks; i++) {
535 if (no_sleep)
536 rwlock_init(&bc->trees[i].u.spinlock);
537 else
538 init_rwsem(&bc->trees[i].u.lock);
539 bc->trees[i].root = RB_ROOT;
540 }
541
542 lru_init(&bc->lru[LIST_CLEAN]);
543 lru_init(&bc->lru[LIST_DIRTY]);
544 }
545
546 static void cache_destroy(struct dm_buffer_cache *bc)
547 {
548 unsigned int i;
549
550 for (i = 0; i < bc->num_locks; i++)
551 WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
552
553 lru_destroy(&bc->lru[LIST_CLEAN]);
554 lru_destroy(&bc->lru[LIST_DIRTY]);
555 }
556
557 /*--------------*/
558
559 /*
560 * not threadsafe, or racy, depending on how you look at it
561 */
562 static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
563 {
564 return bc->lru[list_mode].count;
565 }
566
567 static inline unsigned long cache_total(struct dm_buffer_cache *bc)
568 {
569 return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
570 }
571
572 /*--------------*/
573
574 /*
575 * Gets a specific buffer, indexed by block.
576 * If the buffer is found then its holder count will be incremented and
577 * lru_reference will be called.
578 *
579 * threadsafe
580 */
581 static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
582 {
583 struct rb_node *n = root->rb_node;
584 struct dm_buffer *b;
585
586 while (n) {
587 b = container_of(n, struct dm_buffer, node);
588
589 if (b->block == block)
590 return b;
591
592 n = block < b->block ? n->rb_left : n->rb_right;
593 }
594
595 return NULL;
596 }
597
598 static void __cache_inc_buffer(struct dm_buffer *b)
599 {
600 atomic_inc(&b->hold_count);
601 WRITE_ONCE(b->last_accessed, jiffies);
602 }
603
604 static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
605 {
606 struct dm_buffer *b;
607
608 cache_read_lock(bc, block);
609 b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
610 if (b) {
611 lru_reference(&b->lru);
612 __cache_inc_buffer(b);
613 }
614 cache_read_unlock(bc, block);
615
616 return b;
617 }
618
619 /*--------------*/
620
621 /*
622 * Returns true if the hold count hits zero.
623 * threadsafe
624 */
625 static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
626 {
627 bool r;
628
629 cache_read_lock(bc, b->block);
630 BUG_ON(!atomic_read(&b->hold_count));
631 r = atomic_dec_and_test(&b->hold_count);
632 cache_read_unlock(bc, b->block);
633
634 return r;
635 }
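/*
 * As noted above, cache_get() and cache_put() are the only threadsafe
 * methods in the buffer cache. A minimal usage sketch:
 *
 *	b = cache_get(bc, block);
 *	if (b) {
 *		... use b ...
 *		cache_put(bc, b);
 *	}
 */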
636
637 /*--------------*/
638
639 typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);
640
641 /*
642 * Evicts a buffer based on a predicate. The oldest buffer that
643 * matches the predicate will be selected. In addition to matching
644 * the predicate, the selected buffer must have a hold_count of zero.
645 */
646 struct evict_wrapper {
647 struct lock_history *lh;
648 b_predicate pred;
649 void *context;
650 };
651
652 /*
653 * Wraps the buffer predicate turning it into an lru predicate. Adds
654 * extra test for hold_count.
655 */
656 static enum evict_result __evict_pred(struct lru_entry *le, void *context)
657 {
658 struct evict_wrapper *w = context;
659 struct dm_buffer *b = le_to_buffer(le);
660
661 lh_next(w->lh, b->block);
662
663 if (atomic_read(&b->hold_count))
664 return ER_DONT_EVICT;
665
666 return w->pred(b, w->context);
667 }
668
669 static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
670 b_predicate pred, void *context,
671 struct lock_history *lh)
672 {
673 struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
674 struct lru_entry *le;
675 struct dm_buffer *b;
676
677 le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep);
678 if (!le)
679 return NULL;
680
681 b = le_to_buffer(le);
682 /* __evict_pred will have locked the appropriate tree. */
683 rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
684
685 return b;
686 }
687
688 static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
689 b_predicate pred, void *context)
690 {
691 struct dm_buffer *b;
692 struct lock_history lh;
693
694 lh_init(&lh, bc, true);
695 b = __cache_evict(bc, list_mode, pred, context, &lh);
696 lh_exit(&lh);
697
698 return b;
699 }
700
701 /*--------------*/
702
703 /*
704 * Mark a buffer as clean or dirty. Not threadsafe.
705 */
706 static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
707 {
708 cache_write_lock(bc, b->block);
709 if (list_mode != b->list_mode) {
710 lru_remove(&bc->lru[b->list_mode], &b->lru);
711 b->list_mode = list_mode;
712 lru_insert(&bc->lru[b->list_mode], &b->lru);
713 }
714 cache_write_unlock(bc, b->block);
715 }
716
717 /*--------------*/
718
719 /*
720 * Runs through the lru associated with 'old_mode'; buffers that match the
721 * predicate are moved to 'new_mode'. Not threadsafe.
722 */
723 static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
724 b_predicate pred, void *context, struct lock_history *lh)
725 {
726 struct lru_entry *le;
727 struct dm_buffer *b;
728 struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
729
730 while (true) {
731 le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep);
732 if (!le)
733 break;
734
735 b = le_to_buffer(le);
736 b->list_mode = new_mode;
737 lru_insert(&bc->lru[b->list_mode], &b->lru);
738 }
739 }
740
741 static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
742 b_predicate pred, void *context)
743 {
744 struct lock_history lh;
745
746 lh_init(&lh, bc, true);
747 __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
748 lh_exit(&lh);
749 }
750
751 /*--------------*/
752
753 /*
754 * Iterates through all clean or dirty entries calling a function for each
755 * entry. The callback may terminate the iteration early. Not threadsafe.
756 */
757
758 /*
759 * Iterator functions should return one of these actions to indicate
760 * how the iteration should proceed.
761 */
762 enum it_action {
763 IT_NEXT,
764 IT_COMPLETE,
765 };
766
767 typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
768
769 static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
770 iter_fn fn, void *context, struct lock_history *lh)
771 {
772 struct lru *lru = &bc->lru[list_mode];
773 struct lru_entry *le, *first;
774
775 if (!lru->cursor)
776 return;
777
778 first = le = to_le(lru->cursor);
779 do {
780 struct dm_buffer *b = le_to_buffer(le);
781
782 lh_next(lh, b->block);
783
784 switch (fn(b, context)) {
785 case IT_NEXT:
786 break;
787
788 case IT_COMPLETE:
789 return;
790 }
791 cond_resched();
792
793 le = to_le(le->list.next);
794 } while (le != first);
795 }
796
797 static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
798 iter_fn fn, void *context)
799 {
800 struct lock_history lh;
801
802 lh_init(&lh, bc, false);
803 __cache_iterate(bc, list_mode, fn, context, &lh);
804 lh_exit(&lh);
805 }
806
807 /*--------------*/
808
809 /*
810 * Passes ownership of the buffer to the cache. Returns false if the
811 * buffer was already present (in which case ownership does not pass).
812 * e.g. because of a race with another thread.
813 *
814 * Holder count should be 1 on insertion.
815 *
816 * Not threadsafe.
817 */
818 static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
819 {
820 struct rb_node **new = &root->rb_node, *parent = NULL;
821 struct dm_buffer *found;
822
823 while (*new) {
824 found = container_of(*new, struct dm_buffer, node);
825
826 if (found->block == b->block)
827 return false;
828
829 parent = *new;
830 new = b->block < found->block ?
831 &found->node.rb_left : &found->node.rb_right;
832 }
833
834 rb_link_node(&b->node, parent, new);
835 rb_insert_color(&b->node, root);
836
837 return true;
838 }
839
840 static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
841 {
842 bool r;
843
844 if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
845 return false;
846
847 cache_write_lock(bc, b->block);
848 BUG_ON(atomic_read(&b->hold_count) != 1);
849 r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
850 if (r)
851 lru_insert(&bc->lru[b->list_mode], &b->lru);
852 cache_write_unlock(bc, b->block);
853
854 return r;
855 }
856
857 /*--------------*/
858
859 /*
860 * Removes buffer from cache, ownership of the buffer passes back to the caller.
861 * Fails unless the hold_count is one (i.e. the caller holds the only reference).
862 *
863 * Not threadsafe.
864 */
865 static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
866 {
867 bool r;
868
869 cache_write_lock(bc, b->block);
870
871 if (atomic_read(&b->hold_count) != 1) {
872 r = false;
873 } else {
874 r = true;
875 rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
876 lru_remove(&bc->lru[b->list_mode], &b->lru);
877 }
878
879 cache_write_unlock(bc, b->block);
880
881 return r;
882 }
883
884 /*--------------*/
885
886 typedef void (*b_release)(struct dm_buffer *);
887
888 static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
889 {
890 struct rb_node *n = root->rb_node;
891 struct dm_buffer *b;
892 struct dm_buffer *best = NULL;
893
894 while (n) {
895 b = container_of(n, struct dm_buffer, node);
896
897 if (b->block == block)
898 return b;
899
900 if (block <= b->block) {
901 n = n->rb_left;
902 best = b;
903 } else {
904 n = n->rb_right;
905 }
906 }
907
908 return best;
909 }
910
911 static void __remove_range(struct dm_buffer_cache *bc,
912 struct rb_root *root,
913 sector_t begin, sector_t end,
914 b_predicate pred, b_release release)
915 {
916 struct dm_buffer *b;
917
918 while (true) {
919 cond_resched();
920
921 b = __find_next(root, begin);
922 if (!b || (b->block >= end))
923 break;
924
925 begin = b->block + 1;
926
927 if (atomic_read(&b->hold_count))
928 continue;
929
930 if (pred(b, NULL) == ER_EVICT) {
931 rb_erase(&b->node, root);
932 lru_remove(&bc->lru[b->list_mode], &b->lru);
933 release(b);
934 }
935 }
936 }
937
938 static void cache_remove_range(struct dm_buffer_cache *bc,
939 sector_t begin, sector_t end,
940 b_predicate pred, b_release release)
941 {
942 unsigned int i;
943
944 BUG_ON(bc->no_sleep);
945 for (i = 0; i < bc->num_locks; i++) {
946 down_write(&bc->trees[i].u.lock);
947 __remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
948 up_write(&bc->trees[i].u.lock);
949 }
950 }
951
952 /*----------------------------------------------------------------*/
953
954 /*
955 * Linking of buffers:
956 * All buffers are linked to buffer_cache with their node field.
957 *
958 * Clean buffers that are not being written (B_WRITING not set)
959 * are linked to lru[LIST_CLEAN] with their lru_list field.
960 *
961 * Dirty and clean buffers that are being written are linked to
962 * lru[LIST_DIRTY] with their lru_list field. When the write
963 * finishes, the buffer cannot be relinked immediately (because we
964 * are in an interrupt context and relinking requires process
965 * context), so some clean-not-writing buffers can be held on
966 * dirty_lru too. They are later added to lru in the process
967 * context.
968 */
969 struct dm_bufio_client {
970 struct block_device *bdev;
971 unsigned int block_size;
972 s8 sectors_per_block_bits;
973
974 bool no_sleep;
975 struct mutex lock;
976 spinlock_t spinlock;
977
978 int async_write_error;
979
980 void (*alloc_callback)(struct dm_buffer *buf);
981 void (*write_callback)(struct dm_buffer *buf);
982 struct kmem_cache *slab_buffer;
983 struct kmem_cache *slab_cache;
984 struct dm_io_client *dm_io;
985
986 struct list_head reserved_buffers;
987 unsigned int need_reserved_buffers;
988
989 unsigned int minimum_buffers;
990
991 sector_t start;
992
993 struct shrinker *shrinker;
994 struct work_struct shrink_work;
995 atomic_long_t need_shrink;
996
997 wait_queue_head_t free_buffer_wait;
998
999 struct list_head client_list;
1000
1001 /*
1002 * Used by global_cleanup to sort the clients list.
1003 */
1004 unsigned long oldest_buffer;
1005
1006 struct dm_buffer_cache cache; /* must be last member */
1007 };
1008
1009 /*----------------------------------------------------------------*/
1010
1011 #define dm_bufio_in_request() (!!current->bio_list)
1012
1013 static void dm_bufio_lock(struct dm_bufio_client *c)
1014 {
1015 if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1016 spin_lock_bh(&c->spinlock);
1017 else
1018 mutex_lock_nested(&c->lock, dm_bufio_in_request());
1019 }
1020
1021 static void dm_bufio_unlock(struct dm_bufio_client *c)
1022 {
1023 if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1024 spin_unlock_bh(&c->spinlock);
1025 else
1026 mutex_unlock(&c->lock);
1027 }
1028
1029 /*----------------------------------------------------------------*/
1030
1031 /*
1032 * Default cache size: available memory divided by the ratio.
1033 */
1034 static unsigned long dm_bufio_default_cache_size;
1035
1036 /*
1037 * Total cache size set by the user.
1038 */
1039 static unsigned long dm_bufio_cache_size;
1040
1041 /*
1042 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1043 * at any time. If it disagrees, the user has changed cache size.
1044 */
1045 static unsigned long dm_bufio_cache_size_latch;
1046
1047 static DEFINE_SPINLOCK(global_spinlock);
1048
1049 static unsigned int dm_bufio_max_age; /* No longer does anything */
1050
1051 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
1052
1053 static unsigned long dm_bufio_peak_allocated;
1054 static unsigned long dm_bufio_allocated_kmem_cache;
1055 static unsigned long dm_bufio_allocated_get_free_pages;
1056 static unsigned long dm_bufio_allocated_vmalloc;
1057 static unsigned long dm_bufio_current_allocated;
1058
1059 /*----------------------------------------------------------------*/
1060
1061 /*
1062 * The current number of clients.
1063 */
1064 static int dm_bufio_client_count;
1065
1066 /*
1067 * The list of all clients.
1068 */
1069 static LIST_HEAD(dm_bufio_all_clients);
1070
1071 /*
1072 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1073 */
1074 static DEFINE_MUTEX(dm_bufio_clients_lock);
1075
1076 static struct workqueue_struct *dm_bufio_wq;
1077 static struct work_struct dm_bufio_replacement_work;
1078
1079
1080 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1081 static void buffer_record_stack(struct dm_buffer *b)
1082 {
1083 b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
1084 }
1085 #endif
1086
1087 /*----------------------------------------------------------------*/
1088
1089 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
1090 {
1091 unsigned char data_mode;
1092 long diff;
1093
1094 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
1095 &dm_bufio_allocated_kmem_cache,
1096 &dm_bufio_allocated_get_free_pages,
1097 &dm_bufio_allocated_vmalloc,
1098 };
1099
1100 data_mode = b->data_mode;
1101 diff = (long)b->c->block_size;
1102 if (unlink)
1103 diff = -diff;
1104
1105 spin_lock(&global_spinlock);
1106
1107 *class_ptr[data_mode] += diff;
1108
1109 dm_bufio_current_allocated += diff;
1110
1111 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
1112 dm_bufio_peak_allocated = dm_bufio_current_allocated;
1113
1114 if (!unlink) {
1115 if (dm_bufio_current_allocated > dm_bufio_cache_size)
1116 queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
1117 }
1118
1119 spin_unlock(&global_spinlock);
1120 }
1121
1122 /*
1123 * Change the number of clients and recalculate per-client limit.
1124 */
1125 static void __cache_size_refresh(void)
1126 {
1127 if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1128 return;
1129 if (WARN_ON(dm_bufio_client_count < 0))
1130 return;
1131
1132 dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
1133
1134 /*
1135 * Use default if set to 0 and report the actual cache size used.
1136 */
1137 if (!dm_bufio_cache_size_latch) {
1138 (void)cmpxchg(&dm_bufio_cache_size, 0,
1139 dm_bufio_default_cache_size);
1140 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
1141 }
1142 }
1143
1144 /*
1145 * Allocating buffer data.
1146 *
1147 * Small buffers are allocated with kmem_cache, to use space optimally.
1148 *
1149 * For large buffers, we choose between get_free_pages and vmalloc.
1150 * Each has advantages and disadvantages.
1151 *
1152 * __get_free_pages can randomly fail if the memory is fragmented.
1153 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1154 * as low as 128M) so using it for caching is not appropriate.
1155 *
1156 * If the allocation may fail we use __get_free_pages. Memory fragmentation
1157 * won't have a fatal effect here, but it just causes flushes of some other
1158 * buffers and more I/O will be performed. Don't use __get_free_pages if it
1159 * always fails (i.e. order > MAX_PAGE_ORDER).
1160 *
1161 * If the allocation shouldn't fail we use __vmalloc. This is only for the
1162 * initial reserve allocation, so there's no risk of wasting all vmalloc
1163 * space.
1164 */
1165 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
1166 unsigned char *data_mode)
1167 {
1168 if (unlikely(c->slab_cache != NULL)) {
1169 *data_mode = DATA_MODE_SLAB;
1170 return kmem_cache_alloc(c->slab_cache, gfp_mask);
1171 }
1172
1173 if (c->block_size <= KMALLOC_MAX_SIZE &&
1174 gfp_mask & __GFP_NORETRY) {
1175 *data_mode = DATA_MODE_GET_FREE_PAGES;
1176 return (void *)__get_free_pages(gfp_mask,
1177 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1178 }
1179
1180 *data_mode = DATA_MODE_VMALLOC;
1181
1182 return __vmalloc(c->block_size, gfp_mask);
1183 }
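/*
 * A couple of illustrative cases for the policy above: a client whose
 * block size is served by c->slab_cache always takes the DATA_MODE_SLAB
 * path; a cache-growth allocation (which passes __GFP_NORETRY) for a
 * block that fits under KMALLOC_MAX_SIZE uses __get_free_pages(); an
 * allocation without __GFP_NORETRY, or one whose block is too large for
 * the page allocator, falls through to __vmalloc().
 */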
1184
1185 /*
1186 * Free buffer's data.
1187 */
1188 static void free_buffer_data(struct dm_bufio_client *c,
1189 void *data, unsigned char data_mode)
1190 {
1191 switch (data_mode) {
1192 case DATA_MODE_SLAB:
1193 kmem_cache_free(c->slab_cache, data);
1194 break;
1195
1196 case DATA_MODE_GET_FREE_PAGES:
1197 free_pages((unsigned long)data,
1198 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1199 break;
1200
1201 case DATA_MODE_VMALLOC:
1202 vfree(data);
1203 break;
1204
1205 default:
1206 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1207 data_mode);
1208 BUG();
1209 }
1210 }
1211
1212 /*
1213 * Allocate buffer and its data.
1214 */
1215 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
1216 {
1217 struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
1218
1219 if (!b)
1220 return NULL;
1221
1222 b->c = c;
1223
1224 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
1225 if (!b->data) {
1226 kmem_cache_free(c->slab_buffer, b);
1227 return NULL;
1228 }
1229 adjust_total_allocated(b, false);
1230
1231 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1232 b->stack_len = 0;
1233 #endif
1234 return b;
1235 }
1236
1237 /*
1238 * Free buffer and its data.
1239 */
1240 static void free_buffer(struct dm_buffer *b)
1241 {
1242 struct dm_bufio_client *c = b->c;
1243
1244 adjust_total_allocated(b, true);
1245 free_buffer_data(c, b->data, b->data_mode);
1246 kmem_cache_free(c->slab_buffer, b);
1247 }
1248
1249 /*
1250 *--------------------------------------------------------------------------
1251 * Submit I/O on the buffer.
1252 *
1253 * Bio interface is faster but it has some problems:
1254 * the vector list is limited (increasing this limit increases
1255 * memory-consumption per buffer, so it is not viable);
1256 *
1257 * the memory must be direct-mapped, not vmalloced;
1258 *
1259 * If the buffer is small enough (fits in a single bio vec) and it is not
1260 * vmalloced, try using the bio interface.
1261 *
1262 * If the buffer is big, if it is vmalloced or if the underlying device
1263 * rejects the bio because it is too large, use dm-io layer to do the I/O.
1264 * The dm-io layer splits the I/O into multiple requests, avoiding the above
1265 * shortcomings.
1266 *--------------------------------------------------------------------------
1267 */
1268
1269 /*
1270 * dm-io completion routine. It just calls b->end_io, pretending that
1271 * the request was handled directly with the bio interface.
1272 */
1273 static void dmio_complete(unsigned long error, void *context)
1274 {
1275 struct dm_buffer *b = context;
1276
1277 b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
1278 }
1279
1280 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
1281 unsigned int n_sectors, unsigned int offset,
1282 unsigned short ioprio)
1283 {
1284 int r;
1285 struct dm_io_request io_req = {
1286 .bi_opf = op,
1287 .notify.fn = dmio_complete,
1288 .notify.context = b,
1289 .client = b->c->dm_io,
1290 };
1291 struct dm_io_region region = {
1292 .bdev = b->c->bdev,
1293 .sector = sector,
1294 .count = n_sectors,
1295 };
1296
1297 if (b->data_mode != DATA_MODE_VMALLOC) {
1298 io_req.mem.type = DM_IO_KMEM;
1299 io_req.mem.ptr.addr = (char *)b->data + offset;
1300 } else {
1301 io_req.mem.type = DM_IO_VMA;
1302 io_req.mem.ptr.vma = (char *)b->data + offset;
1303 }
1304
1305 r = dm_io(&io_req, 1, &region, NULL, ioprio);
1306 if (unlikely(r))
1307 b->end_io(b, errno_to_blk_status(r));
1308 }
1309
1310 static void bio_complete(struct bio *bio)
1311 {
1312 struct dm_buffer *b = bio->bi_private;
1313 blk_status_t status = bio->bi_status;
1314
1315 bio_uninit(bio);
1316 kfree(bio);
1317 b->end_io(b, status);
1318 }
1319
1320 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
1321 unsigned int n_sectors, unsigned int offset,
1322 unsigned short ioprio)
1323 {
1324 struct bio *bio;
1325 char *ptr;
1326 unsigned int len;
1327
1328 bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
1329 if (!bio) {
1330 use_dmio(b, op, sector, n_sectors, offset, ioprio);
1331 return;
1332 }
1333 bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
1334 bio->bi_iter.bi_sector = sector;
1335 bio->bi_end_io = bio_complete;
1336 bio->bi_private = b;
1337 bio->bi_ioprio = ioprio;
1338
1339 ptr = (char *)b->data + offset;
1340 len = n_sectors << SECTOR_SHIFT;
1341
1342 __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
1343
1344 submit_bio(bio);
1345 }
1346
1347 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
1348 {
1349 sector_t sector;
1350
1351 if (likely(c->sectors_per_block_bits >= 0))
1352 sector = block << c->sectors_per_block_bits;
1353 else
1354 sector = block * (c->block_size >> SECTOR_SHIFT);
1355 sector += c->start;
1356
1357 return sector;
1358 }
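/*
 * Example of the conversion above, assuming a 4096-byte block size:
 * sectors_per_block_bits is 3 (4096 bytes = 8 sectors = 1 << 3), so
 * block 10 maps to sector (10 << 3) + c->start = 80 + c->start. A block
 * size that is not a power of two has sectors_per_block_bits < 0 and
 * takes the multiplication path instead.
 */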
1359
1360 static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio,
1361 void (*end_io)(struct dm_buffer *, blk_status_t))
1362 {
1363 unsigned int n_sectors;
1364 sector_t sector;
1365 unsigned int offset, end;
1366
1367 b->end_io = end_io;
1368
1369 sector = block_to_sector(b->c, b->block);
1370
1371 if (op != REQ_OP_WRITE) {
1372 n_sectors = b->c->block_size >> SECTOR_SHIFT;
1373 offset = 0;
1374 } else {
1375 if (b->c->write_callback)
1376 b->c->write_callback(b);
1377 offset = b->write_start;
1378 end = b->write_end;
1379 offset &= -DM_BUFIO_WRITE_ALIGN;
1380 end += DM_BUFIO_WRITE_ALIGN - 1;
1381 end &= -DM_BUFIO_WRITE_ALIGN;
1382 if (unlikely(end > b->c->block_size))
1383 end = b->c->block_size;
1384
1385 sector += offset >> SECTOR_SHIFT;
1386 n_sectors = (end - offset) >> SECTOR_SHIFT;
1387 }
1388
1389 if (b->data_mode != DATA_MODE_VMALLOC)
1390 use_bio(b, op, sector, n_sectors, offset, ioprio);
1391 else
1392 use_dmio(b, op, sector, n_sectors, offset, ioprio);
1393 }
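/*
 * Example of the write rounding above, assuming an 8192-byte block with
 * dirty bytes [100, 5000): offset is rounded down to 0, end is rounded
 * up to 8192, so the write covers the whole block: sector offset 0 and
 * (8192 - 0) >> SECTOR_SHIFT = 16 sectors.
 */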
1394
1395 /*
1396 *--------------------------------------------------------------
1397 * Writing dirty buffers
1398 *--------------------------------------------------------------
1399 */
1400
1401 /*
1402 * The endio routine for write.
1403 *
1404 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
1405 * it.
1406 */
1407 static void write_endio(struct dm_buffer *b, blk_status_t status)
1408 {
1409 b->write_error = status;
1410 if (unlikely(status)) {
1411 struct dm_bufio_client *c = b->c;
1412
1413 (void)cmpxchg(&c->async_write_error, 0,
1414 blk_status_to_errno(status));
1415 }
1416
1417 BUG_ON(!test_bit(B_WRITING, &b->state));
1418
1419 smp_mb__before_atomic();
1420 clear_bit(B_WRITING, &b->state);
1421 smp_mb__after_atomic();
1422
1423 wake_up_bit(&b->state, B_WRITING);
1424 }
1425
1426 /*
1427 * Initiate a write on a dirty buffer, but don't wait for it.
1428 *
1429 * - If the buffer is not dirty, exit.
1430 * - If there is a previous write already going on, wait for it to finish (we can't
1431 * have two writes on the same buffer simultaneously).
1432 * - Submit our write and don't wait on it. We set B_WRITING indicating
1433 * that there is a write in progress.
1434 */
1435 static void __write_dirty_buffer(struct dm_buffer *b,
1436 struct list_head *write_list)
1437 {
1438 if (!test_bit(B_DIRTY, &b->state))
1439 return;
1440
1441 clear_bit(B_DIRTY, &b->state);
1442 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1443
1444 b->write_start = b->dirty_start;
1445 b->write_end = b->dirty_end;
1446
1447 if (!write_list)
1448 submit_io(b, REQ_OP_WRITE, IOPRIO_DEFAULT, write_endio);
1449 else
1450 list_add_tail(&b->write_list, write_list);
1451 }
1452
1453 static void __flush_write_list(struct list_head *write_list)
1454 {
1455 struct blk_plug plug;
1456
1457 blk_start_plug(&plug);
1458 while (!list_empty(write_list)) {
1459 struct dm_buffer *b =
1460 list_entry(write_list->next, struct dm_buffer, write_list);
1461 list_del(&b->write_list);
1462 submit_io(b, REQ_OP_WRITE, IOPRIO_DEFAULT, write_endio);
1463 cond_resched();
1464 }
1465 blk_finish_plug(&plug);
1466 }
1467
1468 /*
1469 * Wait until any activity on the buffer finishes. Possibly write the
1470 * buffer if it is dirty. When this function finishes, there is no I/O
1471 * running on the buffer and the buffer is not dirty.
1472 */
1473 static void __make_buffer_clean(struct dm_buffer *b)
1474 {
1475 BUG_ON(atomic_read(&b->hold_count));
1476
1477 /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1478 if (!smp_load_acquire(&b->state)) /* fast case */
1479 return;
1480
1481 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1482 __write_dirty_buffer(b, NULL);
1483 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1484 }
1485
1486 static enum evict_result is_clean(struct dm_buffer *b, void *context)
1487 {
1488 struct dm_bufio_client *c = context;
1489
1490 /* These should never happen */
1491 if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
1492 return ER_DONT_EVICT;
1493 if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
1494 return ER_DONT_EVICT;
1495 if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
1496 return ER_DONT_EVICT;
1497
1498 if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
1499 unlikely(test_bit(B_READING, &b->state)))
1500 return ER_DONT_EVICT;
1501
1502 return ER_EVICT;
1503 }
1504
1505 static enum evict_result is_dirty(struct dm_buffer *b, void *context)
1506 {
1507 /* These should never happen */
1508 if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1509 return ER_DONT_EVICT;
1510 if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
1511 return ER_DONT_EVICT;
1512
1513 return ER_EVICT;
1514 }
1515
1516 /*
1517 * Find some buffer that is not held by anybody, clean it, unlink it and
1518 * return it.
1519 */
1520 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
1521 {
1522 struct dm_buffer *b;
1523
1524 b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
1525 if (b) {
1526 /* this also waits for pending reads */
1527 __make_buffer_clean(b);
1528 return b;
1529 }
1530
1531 if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1532 return NULL;
1533
1534 b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
1535 if (b) {
1536 __make_buffer_clean(b);
1537 return b;
1538 }
1539
1540 return NULL;
1541 }
1542
1543 /*
1544 * Wait until some other threads free some buffer or release hold count on
1545 * some buffer.
1546 *
1547 * This function is entered with c->lock held, drops it and regains it
1548 * before exiting.
1549 */
1550 static void __wait_for_free_buffer(struct dm_bufio_client *c)
1551 {
1552 DECLARE_WAITQUEUE(wait, current);
1553
1554 add_wait_queue(&c->free_buffer_wait, &wait);
1555 set_current_state(TASK_UNINTERRUPTIBLE);
1556 dm_bufio_unlock(c);
1557
1558 /*
1559 * It's possible to miss a wake up event since we don't always
1560 * hold c->lock when wake_up is called. So we have a timeout here,
1561 * just in case.
1562 */
1563 io_schedule_timeout(5 * HZ);
1564
1565 remove_wait_queue(&c->free_buffer_wait, &wait);
1566
1567 dm_bufio_lock(c);
1568 }
1569
1570 enum new_flag {
1571 NF_FRESH = 0,
1572 NF_READ = 1,
1573 NF_GET = 2,
1574 NF_PREFETCH = 3
1575 };
1576
1577 /*
1578 * Allocate a new buffer. If the allocation is not possible, wait until
1579 * some other thread frees a buffer.
1580 *
1581 * May drop the lock and regain it.
1582 */
1583 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
1584 {
1585 struct dm_buffer *b;
1586 bool tried_noio_alloc = false;
1587
1588 /*
1589 * dm-bufio is resistant to allocation failures (it just keeps
1590 * one buffer reserved in case all the allocations fail).
1591 * So set flags to not try too hard:
1592 * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1593 * mutex and wait ourselves.
1594 * __GFP_NORETRY: don't retry and rather return failure
1595 * __GFP_NOMEMALLOC: don't use emergency reserves
1596 * __GFP_NOWARN: don't print a warning in case of failure
1597 *
1598 * For debugging, if we set the cache size to 1, no new buffers will
1599 * be allocated.
1600 */
1601 while (1) {
1602 if (dm_bufio_cache_size_latch != 1) {
1603 b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1604 if (b)
1605 return b;
1606 }
1607
1608 if (nf == NF_PREFETCH)
1609 return NULL;
1610
1611 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
1612 dm_bufio_unlock(c);
1613 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1614 dm_bufio_lock(c);
1615 if (b)
1616 return b;
1617 tried_noio_alloc = true;
1618 }
1619
1620 if (!list_empty(&c->reserved_buffers)) {
1621 b = list_to_buffer(c->reserved_buffers.next);
1622 list_del(&b->lru.list);
1623 c->need_reserved_buffers++;
1624
1625 return b;
1626 }
1627
1628 b = __get_unclaimed_buffer(c);
1629 if (b)
1630 return b;
1631
1632 __wait_for_free_buffer(c);
1633 }
1634 }
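/*
 * To summarise the loop above, allocation falls back in this order:
 * a GFP_NOWAIT attempt under the client lock; for NF_PREFETCH, give up;
 * one GFP_NOIO attempt with the lock temporarily dropped; a buffer taken
 * from c->reserved_buffers; an evicted unclaimed buffer; and finally
 * __wait_for_free_buffer() before retrying from the top.
 */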
1635
1636 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
1637 {
1638 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1639
1640 if (!b)
1641 return NULL;
1642
1643 if (c->alloc_callback)
1644 c->alloc_callback(b);
1645
1646 return b;
1647 }
1648
1649 /*
1650 * Free a buffer and wake other threads waiting for free buffers.
1651 */
1652 static void __free_buffer_wake(struct dm_buffer *b)
1653 {
1654 struct dm_bufio_client *c = b->c;
1655
1656 b->block = -1;
1657 if (!c->need_reserved_buffers)
1658 free_buffer(b);
1659 else {
1660 list_add(&b->lru.list, &c->reserved_buffers);
1661 c->need_reserved_buffers--;
1662 }
1663
1664 /*
1665 * We hold the bufio lock here, so no one can add entries to the
1666 * wait queue anyway.
1667 */
1668 if (unlikely(waitqueue_active(&c->free_buffer_wait)))
1669 wake_up(&c->free_buffer_wait);
1670 }
1671
1672 static enum evict_result cleaned(struct dm_buffer *b, void *context)
1673 {
1674 if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1675 return ER_DONT_EVICT; /* should never happen */
1676
1677 if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
1678 return ER_DONT_EVICT;
1679 else
1680 return ER_EVICT;
1681 }
1682
1683 static void __move_clean_buffers(struct dm_bufio_client *c)
1684 {
1685 cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
1686 }
1687
1688 struct write_context {
1689 int no_wait;
1690 struct list_head *write_list;
1691 };
1692
1693 static enum it_action write_one(struct dm_buffer *b, void *context)
1694 {
1695 struct write_context *wc = context;
1696
1697 if (wc->no_wait && test_bit(B_WRITING, &b->state))
1698 return IT_COMPLETE;
1699
1700 __write_dirty_buffer(b, wc->write_list);
1701 return IT_NEXT;
1702 }
1703
1704 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
1705 struct list_head *write_list)
1706 {
1707 struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
1708
1709 __move_clean_buffers(c);
1710 cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
1711 }
1712
1713 /*
1714 * Check if we're over the writeback watermark.
1715 * If dirty buffers outnumber clean buffers by more than
1716 * DM_BUFIO_WRITEBACK_RATIO to one, start writing them back asynchronously.
1717 */
1718 static void __check_watermark(struct dm_bufio_client *c,
1719 struct list_head *write_list)
1720 {
1721 if (cache_count(&c->cache, LIST_DIRTY) >
1722 cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
1723 __write_dirty_buffers_async(c, 1, write_list);
1724 }
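/*
 * For example, with DM_BUFIO_WRITEBACK_RATIO of 3, a client holding
 * 100 clean buffers starts asynchronous writeback of dirty buffers once
 * it has more than 300 of them.
 */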
1725
1726 /*
1727 *--------------------------------------------------------------
1728 * Getting a buffer
1729 *--------------------------------------------------------------
1730 */
1731
1732 static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1733 {
1734 /*
1735 * Relying on waitqueue_active() is racy, but we sleep
1736 * with schedule_timeout anyway.
1737 */
1738 if (cache_put(&c->cache, b) &&
1739 unlikely(waitqueue_active(&c->free_buffer_wait)))
1740 wake_up(&c->free_buffer_wait);
1741 }
1742
1743 /*
1744 * This assumes you have already checked the cache to see if the buffer
1745 * is already present (it will recheck after dropping the lock for allocation).
1746 */
1747 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1748 enum new_flag nf, int *need_submit,
1749 struct list_head *write_list)
1750 {
1751 struct dm_buffer *b, *new_b = NULL;
1752
1753 *need_submit = 0;
1754
1755 /* This can't be called with NF_GET */
1756 if (WARN_ON_ONCE(nf == NF_GET))
1757 return NULL;
1758
1759 new_b = __alloc_buffer_wait(c, nf);
1760 if (!new_b)
1761 return NULL;
1762
1763 /*
1764 * We've had a period where the mutex was unlocked, so need to
1765 * recheck the buffer tree.
1766 */
1767 b = cache_get(&c->cache, block);
1768 if (b) {
1769 __free_buffer_wake(new_b);
1770 goto found_buffer;
1771 }
1772
1773 __check_watermark(c, write_list);
1774
1775 b = new_b;
1776 atomic_set(&b->hold_count, 1);
1777 WRITE_ONCE(b->last_accessed, jiffies);
1778 b->block = block;
1779 b->read_error = 0;
1780 b->write_error = 0;
1781 b->list_mode = LIST_CLEAN;
1782
1783 if (nf == NF_FRESH)
1784 b->state = 0;
1785 else {
1786 b->state = 1 << B_READING;
1787 *need_submit = 1;
1788 }
1789
1790 /*
1791 * We mustn't insert into the cache until the B_READING state
1792 * is set. Otherwise another thread could get it and use
1793 * it before it had been read.
1794 */
1795 cache_insert(&c->cache, b);
1796
1797 return b;
1798
1799 found_buffer:
1800 if (nf == NF_PREFETCH) {
1801 cache_put_and_wake(c, b);
1802 return NULL;
1803 }
1804
1805 /*
1806 * Note: it is essential that we don't wait for the buffer to be
1807 * read if dm_bufio_get function is used. Both dm_bufio_get and
1808 * dm_bufio_prefetch can be used in the driver request routine.
1809 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1810 * the same buffer, it would deadlock if we waited.
1811 */
1812 if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1813 cache_put_and_wake(c, b);
1814 return NULL;
1815 }
1816
1817 return b;
1818 }
1819
1820 /*
1821 * The endio routine for reading: set the error, clear the bit and wake up
1822 * anyone waiting on the buffer.
1823 */
1824 static void read_endio(struct dm_buffer *b, blk_status_t status)
1825 {
1826 b->read_error = status;
1827
1828 BUG_ON(!test_bit(B_READING, &b->state));
1829
1830 smp_mb__before_atomic();
1831 clear_bit(B_READING, &b->state);
1832 smp_mb__after_atomic();
1833
1834 wake_up_bit(&b->state, B_READING);
1835 }
1836
1837 /*
1838 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
1839 * functions is similar except that dm_bufio_new doesn't read the
1840 * buffer from the disk (assuming that the caller overwrites all the data
1841 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1842 */
1843 static void *new_read(struct dm_bufio_client *c, sector_t block,
1844 enum new_flag nf, struct dm_buffer **bp,
1845 unsigned short ioprio)
1846 {
1847 int need_submit = 0;
1848 struct dm_buffer *b;
1849
1850 LIST_HEAD(write_list);
1851
1852 *bp = NULL;
1853
1854 /*
1855 * Fast path, hopefully the block is already in the cache. No need
1856 * to get the client lock for this.
1857 */
1858 b = cache_get(&c->cache, block);
1859 if (b) {
1860 if (nf == NF_PREFETCH) {
1861 cache_put_and_wake(c, b);
1862 return NULL;
1863 }
1864
1865 /*
1866 * Note: it is essential that we don't wait for the buffer to be
1867 * read if dm_bufio_get function is used. Both dm_bufio_get and
1868 * dm_bufio_prefetch can be used in the driver request routine.
1869 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1870 * the same buffer, it would deadlock if we waited.
1871 */
1872 if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1873 cache_put_and_wake(c, b);
1874 return NULL;
1875 }
1876 }
1877
1878 if (!b) {
1879 if (nf == NF_GET)
1880 return NULL;
1881
1882 dm_bufio_lock(c);
1883 b = __bufio_new(c, block, nf, &need_submit, &write_list);
1884 dm_bufio_unlock(c);
1885 }
1886
1887 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1888 if (b && (atomic_read(&b->hold_count) == 1))
1889 buffer_record_stack(b);
1890 #endif
1891
1892 __flush_write_list(&write_list);
1893
1894 if (!b)
1895 return NULL;
1896
1897 if (need_submit)
1898 submit_io(b, REQ_OP_READ, ioprio, read_endio);
1899
1900 if (nf != NF_GET) /* we already tested this condition above */
1901 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1902
1903 if (b->read_error) {
1904 int error = blk_status_to_errno(b->read_error);
1905
1906 dm_bufio_release(b);
1907
1908 return ERR_PTR(error);
1909 }
1910
1911 *bp = b;
1912
1913 return b->data;
1914 }
1915
1916 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1917 struct dm_buffer **bp)
1918 {
1919 return new_read(c, block, NF_GET, bp, IOPRIO_DEFAULT);
1920 }
1921 EXPORT_SYMBOL_GPL(dm_bufio_get);
1922
1923 static void *__dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1924 struct dm_buffer **bp, unsigned short ioprio)
1925 {
1926 if (WARN_ON_ONCE(dm_bufio_in_request()))
1927 return ERR_PTR(-EINVAL);
1928
1929 return new_read(c, block, NF_READ, bp, ioprio);
1930 }
1931
1932 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1933 struct dm_buffer **bp)
1934 {
1935 return __dm_bufio_read(c, block, bp, IOPRIO_DEFAULT);
1936 }
1937 EXPORT_SYMBOL_GPL(dm_bufio_read);
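/*
 * A minimal usage sketch (illustrative only, not called from this file),
 * assuming the caller already owns a dm_bufio_client "c" and a block number:
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_read(c, block, &bp);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... inspect the block contents through "data" ...
 *	dm_bufio_release(bp);
 *
 * dm_bufio_get() follows the same calling convention but never issues I/O:
 * it returns NULL when the block is not already cached and readable.
 */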
1938
1939 void *dm_bufio_read_with_ioprio(struct dm_bufio_client *c, sector_t block,
1940 struct dm_buffer **bp, unsigned short ioprio)
1941 {
1942 return __dm_bufio_read(c, block, bp, ioprio);
1943 }
1944 EXPORT_SYMBOL_GPL(dm_bufio_read_with_ioprio);
1945
1946 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1947 struct dm_buffer **bp)
1948 {
1949 if (WARN_ON_ONCE(dm_bufio_in_request()))
1950 return ERR_PTR(-EINVAL);
1951
1952 return new_read(c, block, NF_FRESH, bp, IOPRIO_DEFAULT);
1953 }
1954 EXPORT_SYMBOL_GPL(dm_bufio_new);
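/*
 * Sketch of the "new" path described above (illustrative only): the caller
 * overwrites the whole block, so no read is issued, and then marks it dirty
 * so it gets written back later.
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_new(c, block, &bp);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(c));
 *	dm_bufio_mark_buffer_dirty(bp);
 *	dm_bufio_release(bp);
 */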
1955
1956 static void __dm_bufio_prefetch(struct dm_bufio_client *c,
1957 sector_t block, unsigned int n_blocks,
1958 unsigned short ioprio)
1959 {
1960 struct blk_plug plug;
1961
1962 LIST_HEAD(write_list);
1963
1964 if (WARN_ON_ONCE(dm_bufio_in_request()))
1965 return; /* should never happen */
1966
1967 blk_start_plug(&plug);
1968
1969 for (; n_blocks--; block++) {
1970 int need_submit;
1971 struct dm_buffer *b;
1972
1973 b = cache_get(&c->cache, block);
1974 if (b) {
1975 /* already in cache */
1976 cache_put_and_wake(c, b);
1977 continue;
1978 }
1979
1980 dm_bufio_lock(c);
1981 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1982 &write_list);
1983 if (unlikely(!list_empty(&write_list))) {
1984 dm_bufio_unlock(c);
1985 blk_finish_plug(&plug);
1986 __flush_write_list(&write_list);
1987 blk_start_plug(&plug);
1988 dm_bufio_lock(c);
1989 }
1990 if (unlikely(b != NULL)) {
1991 dm_bufio_unlock(c);
1992
1993 if (need_submit)
1994 submit_io(b, REQ_OP_READ, ioprio, read_endio);
1995 dm_bufio_release(b);
1996
1997 cond_resched();
1998
1999 if (!n_blocks)
2000 goto flush_plug;
2001 dm_bufio_lock(c);
2002 }
2003 dm_bufio_unlock(c);
2004 }
2005
2006 flush_plug:
2007 blk_finish_plug(&plug);
2008 }
2009
2010 void dm_bufio_prefetch(struct dm_bufio_client *c, sector_t block, unsigned int n_blocks)
2011 {
2012 return __dm_bufio_prefetch(c, block, n_blocks, IOPRIO_DEFAULT);
2013 }
2014 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
2015
2016 void dm_bufio_prefetch_with_ioprio(struct dm_bufio_client *c, sector_t block,
2017 unsigned int n_blocks, unsigned short ioprio)
2018 {
2019 return __dm_bufio_prefetch(c, block, n_blocks, ioprio);
2020 }
2021 EXPORT_SYMBOL_GPL(dm_bufio_prefetch_with_ioprio);
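/*
 * Hypothetical prefetch pattern (not used in this file): warm the cache for
 * a range of blocks so the subsequent reads can be merged by the block layer
 * and overlap with the caller's own work. The count of 16 is an arbitrary
 * example.
 *
 *	dm_bufio_prefetch(c, first_block, 16);
 *	for (i = 0; i < 16; i++) {
 *		void *data = dm_bufio_read(c, first_block + i, &bp);
 *
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *		... consume the block ...
 *		dm_bufio_release(bp);
 *	}
 */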
2022
2023 void dm_bufio_release(struct dm_buffer *b)
2024 {
2025 struct dm_bufio_client *c = b->c;
2026
2027 /*
2028 * If there were errors on the buffer, and the buffer is not
2029 * to be written, free the buffer. There is no point in caching
2030 * an invalid buffer.
2031 */
2032 if ((b->read_error || b->write_error) &&
2033 !test_bit_acquire(B_READING, &b->state) &&
2034 !test_bit(B_WRITING, &b->state) &&
2035 !test_bit(B_DIRTY, &b->state)) {
2036 dm_bufio_lock(c);
2037
2038 /* cache remove can fail if there are other holders */
2039 if (cache_remove(&c->cache, b)) {
2040 __free_buffer_wake(b);
2041 dm_bufio_unlock(c);
2042 return;
2043 }
2044
2045 dm_bufio_unlock(c);
2046 }
2047
2048 cache_put_and_wake(c, b);
2049 }
2050 EXPORT_SYMBOL_GPL(dm_bufio_release);
2051
2052 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2053 unsigned int start, unsigned int end)
2054 {
2055 struct dm_bufio_client *c = b->c;
2056
2057 BUG_ON(start >= end);
2058 BUG_ON(end > b->c->block_size);
2059
2060 dm_bufio_lock(c);
2061
2062 BUG_ON(test_bit(B_READING, &b->state));
2063
2064 if (!test_and_set_bit(B_DIRTY, &b->state)) {
2065 b->dirty_start = start;
2066 b->dirty_end = end;
2067 cache_mark(&c->cache, b, LIST_DIRTY);
2068 } else {
2069 if (start < b->dirty_start)
2070 b->dirty_start = start;
2071 if (end > b->dirty_end)
2072 b->dirty_end = end;
2073 }
2074
2075 dm_bufio_unlock(c);
2076 }
2077 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2078
2079 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
2080 {
2081 dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
2082 }
2083 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
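/*
 * Illustrative dirtying patterns (the 512..1024 range is an assumption):
 * marking the whole buffer is the common case; when only a small range
 * changed, the partial variant lets writeback trim the I/O to that range,
 * rounded to the DM_BUFIO_WRITE_ALIGN boundary.
 *
 *	dm_bufio_mark_buffer_dirty(bp);
 *	dm_bufio_mark_partial_buffer_dirty(bp, 512, 1024);
 */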
2084
2085 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
2086 {
2087 LIST_HEAD(write_list);
2088
2089 if (WARN_ON_ONCE(dm_bufio_in_request()))
2090 return; /* should never happen */
2091
2092 dm_bufio_lock(c);
2093 __write_dirty_buffers_async(c, 0, &write_list);
2094 dm_bufio_unlock(c);
2095 __flush_write_list(&write_list);
2096 }
2097 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2098
2099 /*
2100 * For performance, it is essential that the buffers are written asynchronously
2101 * and simultaneously (so that the block layer can merge the writes) and then
2102 * waited upon.
2103 *
2104 * Finally, we flush the hardware disk cache.
2105 */
2106 static bool is_writing(struct lru_entry *e, void *context)
2107 {
2108 struct dm_buffer *b = le_to_buffer(e);
2109
2110 return test_bit(B_WRITING, &b->state);
2111 }
2112
2113 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
2114 {
2115 int a, f;
2116 unsigned long nr_buffers;
2117 struct lru_entry *e;
2118 struct lru_iter it;
2119
2120 LIST_HEAD(write_list);
2121
2122 dm_bufio_lock(c);
2123 __write_dirty_buffers_async(c, 0, &write_list);
2124 dm_bufio_unlock(c);
2125 __flush_write_list(&write_list);
2126 dm_bufio_lock(c);
2127
2128 nr_buffers = cache_count(&c->cache, LIST_DIRTY);
2129 lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
2130 while ((e = lru_iter_next(&it, is_writing, c))) {
2131 struct dm_buffer *b = le_to_buffer(e);
2132 __cache_inc_buffer(b);
2133
2134 BUG_ON(test_bit(B_READING, &b->state));
2135
2136 if (nr_buffers) {
2137 nr_buffers--;
2138 dm_bufio_unlock(c);
2139 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2140 dm_bufio_lock(c);
2141 } else {
2142 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2143 }
2144
2145 if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
2146 cache_mark(&c->cache, b, LIST_CLEAN);
2147
2148 cache_put_and_wake(c, b);
2149
2150 cond_resched();
2151 }
2152 lru_iter_end(&it);
2153
2154 wake_up(&c->free_buffer_wait);
2155 dm_bufio_unlock(c);
2156
2157 a = xchg(&c->async_write_error, 0);
2158 f = dm_bufio_issue_flush(c);
2159 if (a)
2160 return a;
2161
2162 return f;
2163 }
2164 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
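/*
 * Illustrative commit sequence (not part of this file's logic), assuming the
 * caller has already dirtied some buffers:
 *
 *	int r = dm_bufio_write_dirty_buffers(c);
 *
 *	if (r)
 *		return r;	... write or flush error ...
 *
 * On success all previously dirtied buffers have been written and waited
 * upon, and the device cache has been flushed, so the data is stable on the
 * medium.
 */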
2165
2166 /*
2167 * Use dm-io to send an empty flush request to the device.
2168 */
2169 int dm_bufio_issue_flush(struct dm_bufio_client *c)
2170 {
2171 struct dm_io_request io_req = {
2172 .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
2173 .mem.type = DM_IO_KMEM,
2174 .mem.ptr.addr = NULL,
2175 .client = c->dm_io,
2176 };
2177 struct dm_io_region io_reg = {
2178 .bdev = c->bdev,
2179 .sector = 0,
2180 .count = 0,
2181 };
2182
2183 if (WARN_ON_ONCE(dm_bufio_in_request()))
2184 return -EINVAL;
2185
2186 return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
2187 }
2188 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2189
2190 /*
2191 * Use dm-io to send a discard request to the device.
2192 */
2193 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2194 {
2195 struct dm_io_request io_req = {
2196 .bi_opf = REQ_OP_DISCARD | REQ_SYNC,
2197 .mem.type = DM_IO_KMEM,
2198 .mem.ptr.addr = NULL,
2199 .client = c->dm_io,
2200 };
2201 struct dm_io_region io_reg = {
2202 .bdev = c->bdev,
2203 .sector = block_to_sector(c, block),
2204 .count = block_to_sector(c, count),
2205 };
2206
2207 if (WARN_ON_ONCE(dm_bufio_in_request()))
2208 return -EINVAL; /* discards are optional */
2209
2210 return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
2211 }
2212 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2213
2214 static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2215 {
2216 struct dm_buffer *b;
2217
2218 b = cache_get(&c->cache, block);
2219 if (b) {
2220 if (likely(!smp_load_acquire(&b->state))) {
2221 if (cache_remove(&c->cache, b))
2222 __free_buffer_wake(b);
2223 else
2224 cache_put_and_wake(c, b);
2225 } else {
2226 cache_put_and_wake(c, b);
2227 }
2228 }
2229
2230 return b ? true : false;
2231 }
2232
2233 /*
2234 * Free the given buffer.
2235 *
2236 * This is just a hint; if the buffer is in use or dirty, this function
2237 * does nothing.
2238 */
2239 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
2240 {
2241 dm_bufio_lock(c);
2242 forget_buffer(c, block);
2243 dm_bufio_unlock(c);
2244 }
2245 EXPORT_SYMBOL_GPL(dm_bufio_forget);
2246
2247 static enum evict_result idle(struct dm_buffer *b, void *context)
2248 {
2249 return b->state ? ER_DONT_EVICT : ER_EVICT;
2250 }
2251
2252 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
2253 {
2254 dm_bufio_lock(c);
2255 cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
2256 dm_bufio_unlock(c);
2257 }
2258 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
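/*
 * Illustrative hint (not used here): a target that has invalidated a range,
 * for example after dm_bufio_issue_discard(), may ask bufio to drop any idle
 * cached copies of those blocks:
 *
 *	dm_bufio_forget_buffers(c, block, n_blocks);
 */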
2259
2260 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
2261 {
2262 c->minimum_buffers = n;
2263 }
2264 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2265
2266 unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
2267 {
2268 return c->block_size;
2269 }
2270 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2271
2272 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2273 {
2274 sector_t s = bdev_nr_sectors(c->bdev);
2275
2276 if (s >= c->start)
2277 s -= c->start;
2278 else
2279 s = 0;
2280 if (likely(c->sectors_per_block_bits >= 0))
2281 s >>= c->sectors_per_block_bits;
2282 else
2283 sector_div(s, c->block_size >> SECTOR_SHIFT);
2284 return s;
2285 }
2286 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
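/*
 * Worked example of the conversion above (values are illustrative): with a
 * 4096-byte block size, sectors_per_block_bits = __ffs(4096) - SECTOR_SHIFT
 * = 12 - 9 = 3, so a device with 1000000 usable 512-byte sectors reports
 * 1000000 >> 3 = 125000 blocks. Non-power-of-two block sizes take the
 * sector_div() path instead.
 */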
2287
2288 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
2289 {
2290 return c->dm_io;
2291 }
2292 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2293
2294 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
2295 {
2296 return b->block;
2297 }
2298 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2299
2300 void *dm_bufio_get_block_data(struct dm_buffer *b)
2301 {
2302 return b->data;
2303 }
2304 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2305
2306 void *dm_bufio_get_aux_data(struct dm_buffer *b)
2307 {
2308 return b + 1;
2309 }
2310 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
2311
2312 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
2313 {
2314 return b->c;
2315 }
2316 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2317
2318 static enum it_action warn_leak(struct dm_buffer *b, void *context)
2319 {
2320 bool *warned = context;
2321
2322 WARN_ON(!(*warned));
2323 *warned = true;
2324 DMERR("leaked buffer %llx, hold count %u, list %d",
2325 (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2326 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2327 stack_trace_print(b->stack_entries, b->stack_len, 1);
2328 /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2329 atomic_set(&b->hold_count, 0);
2330 #endif
2331 return IT_NEXT;
2332 }
2333
2334 static void drop_buffers(struct dm_bufio_client *c)
2335 {
2336 int i;
2337 struct dm_buffer *b;
2338
2339 if (WARN_ON(dm_bufio_in_request()))
2340 return; /* should never happen */
2341
2342 /*
2343 * An optimization so that the buffers are not written one-by-one.
2344 */
2345 dm_bufio_write_dirty_buffers_async(c);
2346
2347 dm_bufio_lock(c);
2348
2349 while ((b = __get_unclaimed_buffer(c)))
2350 __free_buffer_wake(b);
2351
2352 for (i = 0; i < LIST_SIZE; i++) {
2353 bool warned = false;
2354
2355 cache_iterate(&c->cache, i, warn_leak, &warned);
2356 }
2357
2358 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2359 while ((b = __get_unclaimed_buffer(c)))
2360 __free_buffer_wake(b);
2361 #endif
2362
2363 for (i = 0; i < LIST_SIZE; i++)
2364 WARN_ON(cache_count(&c->cache, i));
2365
2366 dm_bufio_unlock(c);
2367 }
2368
2369 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2370 {
2371 unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2372
2373 if (likely(c->sectors_per_block_bits >= 0))
2374 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2375 else
2376 retain_bytes /= c->block_size;
2377
2378 return retain_bytes;
2379 }
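/*
 * Worked example (illustrative): with the default retain limit of 256 KiB
 * and 4 KiB blocks (sectors_per_block_bits == 3), the retain target is
 * 262144 >> (3 + SECTOR_SHIFT) = 262144 >> 12 = 64 buffers.
 */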
2380
2381 static void __scan(struct dm_bufio_client *c)
2382 {
2383 int l;
2384 struct dm_buffer *b;
2385 unsigned long freed = 0;
2386 unsigned long retain_target = get_retain_buffers(c);
2387 unsigned long count = cache_total(&c->cache);
2388
2389 for (l = 0; l < LIST_SIZE; l++) {
2390 while (true) {
2391 if (count - freed <= retain_target)
2392 atomic_long_set(&c->need_shrink, 0);
2393 if (!atomic_long_read(&c->need_shrink))
2394 break;
2395
2396 b = cache_evict(&c->cache, l,
2397 l == LIST_CLEAN ? is_clean : is_dirty, c);
2398 if (!b)
2399 break;
2400
2401 __make_buffer_clean(b);
2402 __free_buffer_wake(b);
2403
2404 atomic_long_dec(&c->need_shrink);
2405 freed++;
2406
2407 if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) {
2408 dm_bufio_unlock(c);
2409 cond_resched();
2410 dm_bufio_lock(c);
2411 }
2412 }
2413 }
2414 }
2415
2416 static void shrink_work(struct work_struct *w)
2417 {
2418 struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
2419
2420 dm_bufio_lock(c);
2421 __scan(c);
2422 dm_bufio_unlock(c);
2423 }
2424
2425 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2426 {
2427 struct dm_bufio_client *c;
2428
2429 c = shrink->private_data;
2430 atomic_long_add(sc->nr_to_scan, &c->need_shrink);
2431 queue_work(dm_bufio_wq, &c->shrink_work);
2432
2433 return sc->nr_to_scan;
2434 }
2435
2436 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2437 {
2438 struct dm_bufio_client *c = shrink->private_data;
2439 unsigned long count = cache_total(&c->cache);
2440 unsigned long retain_target = get_retain_buffers(c);
2441 unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
2442
2443 if (unlikely(count < retain_target))
2444 count = 0;
2445 else
2446 count -= retain_target;
2447
2448 if (unlikely(count < queued_for_cleanup))
2449 count = 0;
2450 else
2451 count -= queued_for_cleanup;
2452
2453 return count;
2454 }
2455
2456 /*
2457 * Create the buffering interface
2458 */
2459 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
2460 unsigned int reserved_buffers, unsigned int aux_size,
2461 void (*alloc_callback)(struct dm_buffer *),
2462 void (*write_callback)(struct dm_buffer *),
2463 unsigned int flags)
2464 {
2465 int r;
2466 unsigned int num_locks;
2467 struct dm_bufio_client *c;
2468 char slab_name[64];
2469 static atomic_t seqno = ATOMIC_INIT(0);
2470
2471 if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
2472 DMERR("%s: block size not specified or is not multiple of 512b", __func__);
2473 r = -EINVAL;
2474 goto bad_client;
2475 }
2476
2477 num_locks = dm_num_hash_locks();
2478 c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
2479 if (!c) {
2480 r = -ENOMEM;
2481 goto bad_client;
2482 }
2483 cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);
2484
2485 c->bdev = bdev;
2486 c->block_size = block_size;
2487 if (is_power_of_2(block_size))
2488 c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
2489 else
2490 c->sectors_per_block_bits = -1;
2491
2492 c->alloc_callback = alloc_callback;
2493 c->write_callback = write_callback;
2494
2495 if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
2496 c->no_sleep = true;
2497 static_branch_inc(&no_sleep_enabled);
2498 }
2499
2500 mutex_init(&c->lock);
2501 spin_lock_init(&c->spinlock);
2502 INIT_LIST_HEAD(&c->reserved_buffers);
2503 c->need_reserved_buffers = reserved_buffers;
2504
2505 dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
2506
2507 init_waitqueue_head(&c->free_buffer_wait);
2508 c->async_write_error = 0;
2509
2510 c->dm_io = dm_io_client_create();
2511 if (IS_ERR(c->dm_io)) {
2512 r = PTR_ERR(c->dm_io);
2513 goto bad_dm_io;
2514 }
2515
2516 if (block_size <= KMALLOC_MAX_SIZE &&
2517 (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
2518 unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
2519
2520 snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
2521 block_size, atomic_inc_return(&seqno));
2522 c->slab_cache = kmem_cache_create(slab_name, block_size, align,
2523 SLAB_RECLAIM_ACCOUNT, NULL);
2524 if (!c->slab_cache) {
2525 r = -ENOMEM;
2526 goto bad;
2527 }
2528 }
2529 if (aux_size)
2530 snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
2531 aux_size, atomic_inc_return(&seqno));
2532 else
2533 snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
2534 atomic_inc_return(&seqno));
2535 c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
2536 0, SLAB_RECLAIM_ACCOUNT, NULL);
2537 if (!c->slab_buffer) {
2538 r = -ENOMEM;
2539 goto bad;
2540 }
2541
2542 while (c->need_reserved_buffers) {
2543 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
2544
2545 if (!b) {
2546 r = -ENOMEM;
2547 goto bad;
2548 }
2549 __free_buffer_wake(b);
2550 }
2551
2552 INIT_WORK(&c->shrink_work, shrink_work);
2553 atomic_long_set(&c->need_shrink, 0);
2554
2555 c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)",
2556 MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2557 if (!c->shrinker) {
2558 r = -ENOMEM;
2559 goto bad;
2560 }
2561
2562 c->shrinker->count_objects = dm_bufio_shrink_count;
2563 c->shrinker->scan_objects = dm_bufio_shrink_scan;
2564 c->shrinker->seeks = 1;
2565 c->shrinker->batch = 0;
2566 c->shrinker->private_data = c;
2567
2568 shrinker_register(c->shrinker);
2569
2570 mutex_lock(&dm_bufio_clients_lock);
2571 dm_bufio_client_count++;
2572 list_add(&c->client_list, &dm_bufio_all_clients);
2573 __cache_size_refresh();
2574 mutex_unlock(&dm_bufio_clients_lock);
2575
2576 return c;
2577
2578 bad:
2579 while (!list_empty(&c->reserved_buffers)) {
2580 struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2581
2582 list_del(&b->lru.list);
2583 free_buffer(b);
2584 }
2585 kmem_cache_destroy(c->slab_cache);
2586 kmem_cache_destroy(c->slab_buffer);
2587 dm_io_client_destroy(c->dm_io);
2588 bad_dm_io:
2589 mutex_destroy(&c->lock);
2590 if (c->no_sleep)
2591 static_branch_dec(&no_sleep_enabled);
2592 kfree(c);
2593 bad_client:
2594 return ERR_PTR(r);
2595 }
2596 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
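/*
 * Illustrative client lifecycle (the 4 KiB block size and single reserved
 * buffer are assumptions, not requirements):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	... dm_bufio_read()/dm_bufio_new() against "c" ...
 *	dm_bufio_client_destroy(c);
 */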
2597
2598 /*
2599 * Free the buffering interface.
2600 * It is required that there are no references on any buffers.
2601 */
2602 void dm_bufio_client_destroy(struct dm_bufio_client *c)
2603 {
2604 unsigned int i;
2605
2606 drop_buffers(c);
2607
2608 shrinker_free(c->shrinker);
2609 flush_work(&c->shrink_work);
2610
2611 mutex_lock(&dm_bufio_clients_lock);
2612
2613 list_del(&c->client_list);
2614 dm_bufio_client_count--;
2615 __cache_size_refresh();
2616
2617 mutex_unlock(&dm_bufio_clients_lock);
2618
2619 WARN_ON(c->need_reserved_buffers);
2620
2621 while (!list_empty(&c->reserved_buffers)) {
2622 struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2623
2624 list_del(&b->lru.list);
2625 free_buffer(b);
2626 }
2627
2628 for (i = 0; i < LIST_SIZE; i++)
2629 if (cache_count(&c->cache, i))
2630 DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2631
2632 for (i = 0; i < LIST_SIZE; i++)
2633 WARN_ON(cache_count(&c->cache, i));
2634
2635 cache_destroy(&c->cache);
2636 kmem_cache_destroy(c->slab_cache);
2637 kmem_cache_destroy(c->slab_buffer);
2638 dm_io_client_destroy(c->dm_io);
2639 mutex_destroy(&c->lock);
2640 if (c->no_sleep)
2641 static_branch_dec(&no_sleep_enabled);
2642 kfree(c);
2643 }
2644 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2645
2646 void dm_bufio_client_reset(struct dm_bufio_client *c)
2647 {
2648 drop_buffers(c);
2649 flush_work(&c->shrink_work);
2650 }
2651 EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2652
2653 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2654 {
2655 c->start = start;
2656 }
2657 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
2658
2659 /*--------------------------------------------------------------*/
2660
2661 /*
2662 * Global cleanup tries to evict the oldest buffers from across _all_
2663 * the clients. It does this by repeatedly evicting a few buffers from
2664 * the client that holds the oldest buffer. It's approximate, but hopefully
2665 * good enough.
2666 */
2667 static struct dm_bufio_client *__pop_client(void)
2668 {
2669 struct list_head *h;
2670
2671 if (list_empty(&dm_bufio_all_clients))
2672 return NULL;
2673
2674 h = dm_bufio_all_clients.next;
2675 list_del(h);
2676 return container_of(h, struct dm_bufio_client, client_list);
2677 }
2678
2679 /*
2680 * Inserts the client in the global client list based on its
2681 * 'oldest_buffer' field.
2682 */
2683 static void __insert_client(struct dm_bufio_client *new_client)
2684 {
2685 struct dm_bufio_client *c;
2686 struct list_head *h = dm_bufio_all_clients.next;
2687
2688 while (h != &dm_bufio_all_clients) {
2689 c = container_of(h, struct dm_bufio_client, client_list);
2690 if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2691 break;
2692 h = h->next;
2693 }
2694
2695 list_add_tail(&new_client->client_list, h);
2696 }
2697
2698 static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
2699 {
2700 /* In no-sleep mode, we cannot wait on IO. */
2701 if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) {
2702 if (test_bit_acquire(B_READING, &b->state) ||
2703 test_bit(B_WRITING, &b->state) ||
2704 test_bit(B_DIRTY, &b->state))
2705 return ER_DONT_EVICT;
2706 }
2707 return ER_EVICT;
2708 }
2709
2710 static unsigned long __evict_a_few(unsigned long nr_buffers)
2711 {
2712 struct dm_bufio_client *c;
2713 unsigned long oldest_buffer = jiffies;
2714 unsigned long last_accessed;
2715 unsigned long count;
2716 struct dm_buffer *b;
2717
2718 c = __pop_client();
2719 if (!c)
2720 return 0;
2721
2722 dm_bufio_lock(c);
2723
2724 for (count = 0; count < nr_buffers; count++) {
2725 b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL);
2726 if (!b)
2727 break;
2728
2729 last_accessed = READ_ONCE(b->last_accessed);
2730 if (time_after_eq(oldest_buffer, last_accessed))
2731 oldest_buffer = last_accessed;
2732
2733 __make_buffer_clean(b);
2734 __free_buffer_wake(b);
2735
2736 if (need_resched()) {
2737 dm_bufio_unlock(c);
2738 cond_resched();
2739 dm_bufio_lock(c);
2740 }
2741 }
2742
2743 dm_bufio_unlock(c);
2744
2745 if (count)
2746 c->oldest_buffer = oldest_buffer;
2747 __insert_client(c);
2748
2749 return count;
2750 }
2751
2752 static void check_watermarks(void)
2753 {
2754 LIST_HEAD(write_list);
2755 struct dm_bufio_client *c;
2756
2757 mutex_lock(&dm_bufio_clients_lock);
2758 list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2759 dm_bufio_lock(c);
2760 __check_watermark(c, &write_list);
2761 dm_bufio_unlock(c);
2762 }
2763 mutex_unlock(&dm_bufio_clients_lock);
2764
2765 __flush_write_list(&write_list);
2766 }
2767
2768 static void evict_old(void)
2769 {
2770 unsigned long threshold = dm_bufio_cache_size -
2771 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2772
2773 mutex_lock(&dm_bufio_clients_lock);
2774 while (dm_bufio_current_allocated > threshold) {
2775 if (!__evict_a_few(64))
2776 break;
2777 cond_resched();
2778 }
2779 mutex_unlock(&dm_bufio_clients_lock);
2780 }
2781
2782 static void do_global_cleanup(struct work_struct *w)
2783 {
2784 check_watermarks();
2785 evict_old();
2786 }
2787
2788 /*
2789 *--------------------------------------------------------------
2790 * Module setup
2791 *--------------------------------------------------------------
2792 */
2793
2794 /*
2795 * This is called only once for the whole dm_bufio module.
2796 * It initializes the memory limit.
2797 */
2798 static int __init dm_bufio_init(void)
2799 {
2800 __u64 mem;
2801
2802 dm_bufio_allocated_kmem_cache = 0;
2803 dm_bufio_allocated_get_free_pages = 0;
2804 dm_bufio_allocated_vmalloc = 0;
2805 dm_bufio_current_allocated = 0;
2806
2807 mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2808 DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2809
2810 if (mem > ULONG_MAX)
2811 mem = ULONG_MAX;
2812
2813 #ifdef CONFIG_MMU
2814 if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2815 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2816 #endif
2817
2818 dm_bufio_default_cache_size = mem;
2819
2820 mutex_lock(&dm_bufio_clients_lock);
2821 __cache_size_refresh();
2822 mutex_unlock(&dm_bufio_clients_lock);
2823
2824 dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2825 if (!dm_bufio_wq)
2826 return -ENOMEM;
2827
2828 INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2829
2830 return 0;
2831 }
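/*
 * Example of the limit computed above (numbers are illustrative): on a
 * machine with 8 GiB of low memory, 2% gives roughly 164 MiB. Under
 * CONFIG_MMU the result is additionally capped at 25% of the vmalloc
 * address space, which mainly matters on 32-bit systems.
 */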
2832
2833 /*
2834 * This is called once when unloading the dm_bufio module.
2835 */
2836 static void __exit dm_bufio_exit(void)
2837 {
2838 int bug = 0;
2839
2840 destroy_workqueue(dm_bufio_wq);
2841
2842 if (dm_bufio_client_count) {
2843 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2844 __func__, dm_bufio_client_count);
2845 bug = 1;
2846 }
2847
2848 if (dm_bufio_current_allocated) {
2849 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2850 __func__, dm_bufio_current_allocated);
2851 bug = 1;
2852 }
2853
2854 if (dm_bufio_allocated_get_free_pages) {
2855 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2856 __func__, dm_bufio_allocated_get_free_pages);
2857 bug = 1;
2858 }
2859
2860 if (dm_bufio_allocated_vmalloc) {
2861 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2862 __func__, dm_bufio_allocated_vmalloc);
2863 bug = 1;
2864 }
2865
2866 WARN_ON(bug); /* leaks are not worth crashing the system */
2867 }
2868
2869 module_init(dm_bufio_init)
2870 module_exit(dm_bufio_exit)
2871
2872 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
2873 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2874
2875 module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
2876 MODULE_PARM_DESC(max_age_seconds, "No longer does anything");
2877
2878 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
2879 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2880
2881 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
2882 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2883
2884 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
2885 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2886
2887 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
2888 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2889
2890 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
2891 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2892
2893 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
2894 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2895
2896 MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
2897 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2898 MODULE_LICENSE("GPL");
2899