1 /*
2  * Copyright (C) 2009-2011 Red Hat, Inc.
3  *
4  * Author: Mikulas Patocka <mpatocka@redhat.com>
5  *
6  * This file is released under the GPL.
7  */
8 
9 #include <linux/dm-bufio.h>
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/slab.h>
14 #include <linux/sched/mm.h>
15 #include <linux/jiffies.h>
16 #include <linux/vmalloc.h>
17 #include <linux/shrinker.h>
18 #include <linux/module.h>
19 #include <linux/rbtree.h>
20 #include <linux/stacktrace.h>
21 
22 #include <trace/hooks/mm.h>
23 
24 #define DM_MSG_PREFIX "bufio"
25 
26 /*
27  * Memory management policy:
28  *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
29  *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
30  *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
31  *	Start background writeback when the number of dirty buffers exceeds
32  *	DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
33  */
34 #define DM_BUFIO_MIN_BUFFERS		8
35 
36 #define DM_BUFIO_MEMORY_PERCENT		2
37 #define DM_BUFIO_VMALLOC_PERCENT	25
38 #define DM_BUFIO_WRITEBACK_RATIO	3
39 #define DM_BUFIO_LOW_WATERMARK_RATIO	16
40 
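/*
 * Worked example of the intent behind the percentages above (a sketch only;
 * the exact default is computed at module init): on a machine with 8 GiB of
 * RAM, DM_BUFIO_MEMORY_PERCENT caps the cache at roughly 2% of 8 GiB, about
 * 164 MiB, unless 25% of the available vmalloc space is smaller, in which
 * case that lower value wins.
 */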
41 /*
42  * Check buffer ages in this interval (seconds)
43  */
44 #define DM_BUFIO_WORK_TIMER_SECS	30
45 
46 /*
47  * Free buffers when they are older than this (seconds)
48  */
49 #define DM_BUFIO_DEFAULT_AGE_SECS	300
50 
51 /*
52  * The nr of bytes of cached data to keep around.
53  */
54 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
55 
56 /*
57  * Align buffer writes to this boundary.
58  * Tests show that SSDs have the highest IOPS when using 4k writes.
59  */
60 #define DM_BUFIO_WRITE_ALIGN		4096
61 
62 /*
63  * dm_buffer->list_mode
64  */
65 #define LIST_CLEAN	0
66 #define LIST_DIRTY	1
67 #define LIST_SIZE	2
68 
69 /*
70  * Linking of buffers:
71  *	All buffers are linked to buffer_tree with their node field.
72  *
73  *	Clean buffers that are not being written (B_WRITING not set)
74  *	are linked to lru[LIST_CLEAN] with their lru_list field.
75  *
76  *	Dirty and clean buffers that are being written are linked to
77  *	lru[LIST_DIRTY] with their lru_list field. When the write
78  *	finishes, the buffer cannot be relinked immediately (because we
79  *	are in an interrupt context and relinking requires process
80  *	context), so some clean-not-writing buffers can be held on
81  *	dirty_lru too.  They are later added to lru in the process
82  *	context.
83  */
84 struct dm_bufio_client {
85 	struct mutex lock;
86 
87 	struct list_head lru[LIST_SIZE];
88 	unsigned long n_buffers[LIST_SIZE];
89 
90 	struct block_device *bdev;
91 	unsigned block_size;
92 	s8 sectors_per_block_bits;
93 	void (*alloc_callback)(struct dm_buffer *);
94 	void (*write_callback)(struct dm_buffer *);
95 
96 	struct kmem_cache *slab_buffer;
97 	struct kmem_cache *slab_cache;
98 	struct dm_io_client *dm_io;
99 
100 	struct list_head reserved_buffers;
101 	unsigned need_reserved_buffers;
102 
103 	unsigned minimum_buffers;
104 
105 	struct rb_root buffer_tree;
106 	wait_queue_head_t free_buffer_wait;
107 
108 	sector_t start;
109 
110 	int async_write_error;
111 
112 	struct list_head client_list;
113 
114 	struct shrinker shrinker;
115 	struct work_struct shrink_work;
116 	atomic_long_t need_shrink;
117 };
118 
119 /*
120  * Buffer state bits.
121  */
122 #define B_READING	0
123 #define B_WRITING	1
124 #define B_DIRTY		2
125 
126 /*
127  * Describes how the block was allocated:
128  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
129  * See the comment at alloc_buffer_data.
130  */
131 enum data_mode {
132 	DATA_MODE_SLAB = 0,
133 	DATA_MODE_GET_FREE_PAGES = 1,
134 	DATA_MODE_VMALLOC = 2,
135 	DATA_MODE_LIMIT = 3
136 };
137 
138 struct dm_buffer {
139 	struct rb_node node;
140 	struct list_head lru_list;
141 	struct list_head global_list;
142 	sector_t block;
143 	void *data;
144 	unsigned char data_mode;		/* DATA_MODE_* */
145 	unsigned char list_mode;		/* LIST_* */
146 	blk_status_t read_error;
147 	blk_status_t write_error;
148 	unsigned accessed;
149 	unsigned hold_count;
150 	unsigned long state;
151 	unsigned long last_accessed;
152 	unsigned dirty_start;
153 	unsigned dirty_end;
154 	unsigned write_start;
155 	unsigned write_end;
156 	struct dm_bufio_client *c;
157 	struct list_head write_list;
158 	void (*end_io)(struct dm_buffer *, blk_status_t);
159 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
160 #define MAX_STACK 10
161 	unsigned int stack_len;
162 	unsigned long stack_entries[MAX_STACK];
163 #endif
164 };
165 
166 /*----------------------------------------------------------------*/
167 
168 #define dm_bufio_in_request()	(!!current->bio_list)
169 
170 static void dm_bufio_lock(struct dm_bufio_client *c)
171 {
172 	mutex_lock_nested(&c->lock, dm_bufio_in_request());
173 }
174 
175 static int dm_bufio_trylock(struct dm_bufio_client *c)
176 {
177 	return mutex_trylock(&c->lock);
178 }
179 
180 static void dm_bufio_unlock(struct dm_bufio_client *c)
181 {
182 	mutex_unlock(&c->lock);
183 }
184 
185 /*----------------------------------------------------------------*/
186 
187 /*
188  * Default cache size: available memory divided by the ratio.
189  */
190 static unsigned long dm_bufio_default_cache_size;
191 
192 /*
193  * Total cache size set by the user.
194  */
195 static unsigned long dm_bufio_cache_size;
196 
197 /*
198  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
199  * at any time.  If it disagrees, the user has changed cache size.
200  */
201 static unsigned long dm_bufio_cache_size_latch;
202 
203 static DEFINE_SPINLOCK(global_spinlock);
204 
205 static LIST_HEAD(global_queue);
206 
207 static unsigned long global_num = 0;
208 
209 /*
210  * Buffers are freed after this timeout
211  */
212 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
213 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
214 
215 static unsigned long dm_bufio_peak_allocated;
216 static unsigned long dm_bufio_allocated_kmem_cache;
217 static unsigned long dm_bufio_allocated_get_free_pages;
218 static unsigned long dm_bufio_allocated_vmalloc;
219 static unsigned long dm_bufio_current_allocated;
220 
221 /*----------------------------------------------------------------*/
222 
223 /*
224  * The current number of clients.
225  */
226 static int dm_bufio_client_count;
227 
228 /*
229  * The list of all clients.
230  */
231 static LIST_HEAD(dm_bufio_all_clients);
232 
233 /*
234  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
235  */
236 static DEFINE_MUTEX(dm_bufio_clients_lock);
237 
238 static struct workqueue_struct *dm_bufio_wq;
239 static struct delayed_work dm_bufio_cleanup_old_work;
240 static struct work_struct dm_bufio_replacement_work;
241 
242 
243 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
244 static void buffer_record_stack(struct dm_buffer *b)
245 {
246 	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
247 }
248 #endif
249 
250 /*----------------------------------------------------------------
251  * A red/black tree acts as an index for all the buffers.
252  *--------------------------------------------------------------*/
253 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
254 {
255 	struct rb_node *n = c->buffer_tree.rb_node;
256 	struct dm_buffer *b;
257 
258 	while (n) {
259 		b = container_of(n, struct dm_buffer, node);
260 
261 		if (b->block == block)
262 			return b;
263 
264 		n = block < b->block ? n->rb_left : n->rb_right;
265 	}
266 
267 	return NULL;
268 }
269 
270 static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
271 {
272 	struct rb_node *n = c->buffer_tree.rb_node;
273 	struct dm_buffer *b;
274 	struct dm_buffer *best = NULL;
275 
276 	while (n) {
277 		b = container_of(n, struct dm_buffer, node);
278 
279 		if (b->block == block)
280 			return b;
281 
282 		if (block <= b->block) {
283 			n = n->rb_left;
284 			best = b;
285 		} else {
286 			n = n->rb_right;
287 		}
288 	}
289 
290 	return best;
291 }
292 
293 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
294 {
295 	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
296 	struct dm_buffer *found;
297 
298 	while (*new) {
299 		found = container_of(*new, struct dm_buffer, node);
300 
301 		if (found->block == b->block) {
302 			BUG_ON(found != b);
303 			return;
304 		}
305 
306 		parent = *new;
307 		new = b->block < found->block ?
308 			&found->node.rb_left : &found->node.rb_right;
309 	}
310 
311 	rb_link_node(&b->node, parent, new);
312 	rb_insert_color(&b->node, &c->buffer_tree);
313 }
314 
315 static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
316 {
317 	rb_erase(&b->node, &c->buffer_tree);
318 }
319 
320 /*----------------------------------------------------------------*/
321 
322 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
323 {
324 	unsigned char data_mode;
325 	long diff;
326 
327 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
328 		&dm_bufio_allocated_kmem_cache,
329 		&dm_bufio_allocated_get_free_pages,
330 		&dm_bufio_allocated_vmalloc,
331 	};
332 
333 	data_mode = b->data_mode;
334 	diff = (long)b->c->block_size;
335 	if (unlink)
336 		diff = -diff;
337 
338 	spin_lock(&global_spinlock);
339 
340 	*class_ptr[data_mode] += diff;
341 
342 	dm_bufio_current_allocated += diff;
343 
344 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
345 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
346 
347 	b->accessed = 1;
348 
349 	if (!unlink) {
350 		list_add(&b->global_list, &global_queue);
351 		global_num++;
352 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
353 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
354 	} else {
355 		list_del(&b->global_list);
356 		global_num--;
357 	}
358 
359 	spin_unlock(&global_spinlock);
360 }
361 
362 /*
363  * Change the number of clients and recalculate per-client limit.
364  */
365 static void __cache_size_refresh(void)
366 {
367 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
368 	BUG_ON(dm_bufio_client_count < 0);
369 
370 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
371 
372 	/*
373 	 * Use default if set to 0 and report the actual cache size used.
374 	 */
375 	if (!dm_bufio_cache_size_latch) {
376 		(void)cmpxchg(&dm_bufio_cache_size, 0,
377 			      dm_bufio_default_cache_size);
378 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
379 	}
380 }
381 
382 /*
383  * Allocating buffer data.
384  *
385  * Small buffers are allocated with kmem_cache, to use space optimally.
386  *
387  * For large buffers, we choose between get_free_pages and vmalloc.
388  * Each has advantages and disadvantages.
389  *
390  * __get_free_pages can randomly fail if the memory is fragmented.
391  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
392  * as low as 128M) so using it for caching is not appropriate.
393  *
394  * If the allocation may fail we use __get_free_pages. Memory fragmentation
395  * won't have a fatal effect here, but it just causes flushes of some other
396  * buffers and more I/O will be performed. Don't use __get_free_pages if it
397  * always fails (i.e. order >= MAX_ORDER).
398  *
399  * If the allocation shouldn't fail we use __vmalloc. This is only for the
400  * initial reserve allocation, so there's no risk of wasting all vmalloc
401  * space.
402  */
403 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
404 			       unsigned char *data_mode)
405 {
406 	if (unlikely(c->slab_cache != NULL)) {
407 		*data_mode = DATA_MODE_SLAB;
408 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
409 	}
410 
411 	if (c->block_size <= KMALLOC_MAX_SIZE &&
412 	    gfp_mask & __GFP_NORETRY) {
413 		*data_mode = DATA_MODE_GET_FREE_PAGES;
414 		return (void *)__get_free_pages(gfp_mask,
415 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
416 	}
417 
418 	*data_mode = DATA_MODE_VMALLOC;
419 
420 	/*
421 	 * __vmalloc allocates the data pages and auxiliary structures with
422 	 * gfp_flags that were specified, but pagetables are always allocated
423 	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
424 	 *
425 	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
426 	 * all allocations done by this process (including pagetables) are done
427 	 * as if GFP_NOIO was specified.
428 	 */
429 	if (gfp_mask & __GFP_NORETRY) {
430 		unsigned noio_flag = memalloc_noio_save();
431 		void *ptr = __vmalloc(c->block_size, gfp_mask);
432 
433 		memalloc_noio_restore(noio_flag);
434 		return ptr;
435 	}
436 
437 	return __vmalloc(c->block_size, gfp_mask);
438 }
439 
440 /*
441  * Free buffer's data.
442  */
443 static void free_buffer_data(struct dm_bufio_client *c,
444 			     void *data, unsigned char data_mode)
445 {
446 	switch (data_mode) {
447 	case DATA_MODE_SLAB:
448 		kmem_cache_free(c->slab_cache, data);
449 		break;
450 
451 	case DATA_MODE_GET_FREE_PAGES:
452 		free_pages((unsigned long)data,
453 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
454 		break;
455 
456 	case DATA_MODE_VMALLOC:
457 		vfree(data);
458 		break;
459 
460 	default:
461 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
462 		       data_mode);
463 		BUG();
464 	}
465 }
466 
467 /*
468  * Allocate buffer and its data.
469  */
470 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
471 {
472 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
473 
474 	if (!b)
475 		return NULL;
476 
477 	b->c = c;
478 
479 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
480 	if (!b->data) {
481 		kmem_cache_free(c->slab_buffer, b);
482 		return NULL;
483 	}
484 
485 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
486 	b->stack_len = 0;
487 #endif
488 	return b;
489 }
490 
491 /*
492  * Free buffer and its data.
493  */
494 static void free_buffer(struct dm_buffer *b)
495 {
496 	struct dm_bufio_client *c = b->c;
497 
498 	free_buffer_data(c, b->data, b->data_mode);
499 	kmem_cache_free(c->slab_buffer, b);
500 }
501 
502 /*
503  * Link buffer to the buffer tree and clean or dirty queue.
504  */
505 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
506 {
507 	struct dm_bufio_client *c = b->c;
508 
509 	c->n_buffers[dirty]++;
510 	b->block = block;
511 	b->list_mode = dirty;
512 	list_add(&b->lru_list, &c->lru[dirty]);
513 	__insert(b->c, b);
514 	b->last_accessed = jiffies;
515 
516 	adjust_total_allocated(b, false);
517 }
518 
519 /*
520  * Unlink buffer from the buffer tree and dirty or clean queue.
521  */
522 static void __unlink_buffer(struct dm_buffer *b)
523 {
524 	struct dm_bufio_client *c = b->c;
525 
526 	BUG_ON(!c->n_buffers[b->list_mode]);
527 
528 	c->n_buffers[b->list_mode]--;
529 	__remove(b->c, b);
530 	list_del(&b->lru_list);
531 
532 	adjust_total_allocated(b, true);
533 }
534 
535 /*
536  * Place the buffer to the head of dirty or clean LRU queue.
537  */
538 static void __relink_lru(struct dm_buffer *b, int dirty)
539 {
540 	struct dm_bufio_client *c = b->c;
541 
542 	b->accessed = 1;
543 
544 	BUG_ON(!c->n_buffers[b->list_mode]);
545 
546 	c->n_buffers[b->list_mode]--;
547 	c->n_buffers[dirty]++;
548 	b->list_mode = dirty;
549 	list_move(&b->lru_list, &c->lru[dirty]);
550 	b->last_accessed = jiffies;
551 }
552 
553 /*----------------------------------------------------------------
554  * Submit I/O on the buffer.
555  *
556  * Bio interface is faster but it has some problems:
557  *	the vector list is limited (increasing this limit increases
558  *	memory-consumption per buffer, so it is not viable);
559  *
560  *	the memory must be direct-mapped, not vmalloced;
561  *
562  * If the buffer is not vmalloced, try using the bio interface.
563  *
564  * If the buffer is vmalloced, if the bio allocation fails or if the
565  * underlying device rejects the bio because it is too large, use the
566  * dm-io layer to do the I/O. The dm-io layer splits the I/O into
567  * multiple requests, avoiding the above shortcomings.
568  *
569  *--------------------------------------------------------------*/
570 
571 /*
572  * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
573  * that the request was handled directly with bio interface.
574  */
575 static void dmio_complete(unsigned long error, void *context)
576 {
577 	struct dm_buffer *b = context;
578 
579 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
580 }
581 
582 static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
583 		     unsigned n_sectors, unsigned offset)
584 {
585 	int r;
586 	struct dm_io_request io_req = {
587 		.bi_op = rw,
588 		.bi_op_flags = 0,
589 		.notify.fn = dmio_complete,
590 		.notify.context = b,
591 		.client = b->c->dm_io,
592 	};
593 	struct dm_io_region region = {
594 		.bdev = b->c->bdev,
595 		.sector = sector,
596 		.count = n_sectors,
597 	};
598 
599 	if (b->data_mode != DATA_MODE_VMALLOC) {
600 		io_req.mem.type = DM_IO_KMEM;
601 		io_req.mem.ptr.addr = (char *)b->data + offset;
602 	} else {
603 		io_req.mem.type = DM_IO_VMA;
604 		io_req.mem.ptr.vma = (char *)b->data + offset;
605 	}
606 
607 	r = dm_io(&io_req, 1, &region, NULL);
608 	if (unlikely(r))
609 		b->end_io(b, errno_to_blk_status(r));
610 }
611 
612 static void bio_complete(struct bio *bio)
613 {
614 	struct dm_buffer *b = bio->bi_private;
615 	blk_status_t status = bio->bi_status;
616 	bio_put(bio);
617 	b->end_io(b, status);
618 }
619 
620 static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
621 		    unsigned n_sectors, unsigned offset)
622 {
623 	struct bio *bio;
624 	char *ptr;
625 	unsigned vec_size, len;
626 
627 	vec_size = b->c->block_size >> PAGE_SHIFT;
628 	if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
629 		vec_size += 2;
630 
631 	bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size);
632 	if (!bio) {
633 dmio:
634 		use_dmio(b, rw, sector, n_sectors, offset);
635 		return;
636 	}
637 
638 	bio->bi_iter.bi_sector = sector;
639 	bio_set_dev(bio, b->c->bdev);
640 	bio_set_op_attrs(bio, rw, 0);
641 	bio->bi_end_io = bio_complete;
642 	bio->bi_private = b;
643 
644 	ptr = (char *)b->data + offset;
645 	len = n_sectors << SECTOR_SHIFT;
646 
647 	do {
648 		unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
649 		if (!bio_add_page(bio, virt_to_page(ptr), this_step,
650 				  offset_in_page(ptr))) {
651 			bio_put(bio);
652 			goto dmio;
653 		}
654 
655 		len -= this_step;
656 		ptr += this_step;
657 	} while (len > 0);
658 
659 	submit_bio(bio);
660 }
661 
662 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
663 {
664 	sector_t sector;
665 
666 	if (likely(c->sectors_per_block_bits >= 0))
667 		sector = block << c->sectors_per_block_bits;
668 	else
669 		sector = block * (c->block_size >> SECTOR_SHIFT);
670 	sector += c->start;
671 
672 	return sector;
673 }
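/*
 * Example of the conversion above (illustrative numbers): with a 4096-byte
 * block size, sectors_per_block_bits is 3, so block 10 maps to sector
 * (10 << 3) + c->start = 80 + c->start.  With a non-power-of-2 block size
 * such as 3072 bytes, the division-free path is unusable and the
 * multiplication path gives 10 * 6 + c->start instead.
 */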
674 
675 static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
676 {
677 	unsigned n_sectors;
678 	sector_t sector;
679 	unsigned offset, end;
680 
681 	b->end_io = end_io;
682 
683 	sector = block_to_sector(b->c, b->block);
684 
685 	if (rw != REQ_OP_WRITE) {
686 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
687 		offset = 0;
688 	} else {
689 		if (b->c->write_callback)
690 			b->c->write_callback(b);
691 		offset = b->write_start;
692 		end = b->write_end;
693 		offset &= -DM_BUFIO_WRITE_ALIGN;
694 		end += DM_BUFIO_WRITE_ALIGN - 1;
695 		end &= -DM_BUFIO_WRITE_ALIGN;
696 		if (unlikely(end > b->c->block_size))
697 			end = b->c->block_size;
698 
699 		sector += offset >> SECTOR_SHIFT;
700 		n_sectors = (end - offset) >> SECTOR_SHIFT;
701 	}
702 
703 	if (b->data_mode != DATA_MODE_VMALLOC)
704 		use_bio(b, rw, sector, n_sectors, offset);
705 	else
706 		use_dmio(b, rw, sector, n_sectors, offset);
707 }
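/*
 * Example of the write alignment above (illustrative numbers): if a 4096-byte
 * buffer is dirty in the byte range [100, 700), offset is rounded down to 0
 * and end is rounded up to 4096, so the whole block is written as a single
 * DM_BUFIO_WRITE_ALIGN-aligned 8-sector I/O starting at the buffer's first
 * sector.
 */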
708 
709 /*----------------------------------------------------------------
710  * Writing dirty buffers
711  *--------------------------------------------------------------*/
712 
713 /*
714  * The endio routine for write.
715  *
716  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
717  * it.
718  */
719 static void write_endio(struct dm_buffer *b, blk_status_t status)
720 {
721 	b->write_error = status;
722 	if (unlikely(status)) {
723 		struct dm_bufio_client *c = b->c;
724 
725 		(void)cmpxchg(&c->async_write_error, 0,
726 				blk_status_to_errno(status));
727 	}
728 
729 	BUG_ON(!test_bit(B_WRITING, &b->state));
730 
731 	smp_mb__before_atomic();
732 	clear_bit(B_WRITING, &b->state);
733 	smp_mb__after_atomic();
734 
735 	wake_up_bit(&b->state, B_WRITING);
736 }
737 
738 /*
739  * Initiate a write on a dirty buffer, but don't wait for it.
740  *
741  * - If the buffer is not dirty, exit.
742  * - If there is some previous write going on, wait for it to finish (we can't
743  *   have two writes on the same buffer simultaneously).
744  * - Submit our write and don't wait on it. We set B_WRITING indicating
745  *   that there is a write in progress.
746  */
747 static void __write_dirty_buffer(struct dm_buffer *b,
748 				 struct list_head *write_list)
749 {
750 	if (!test_bit(B_DIRTY, &b->state))
751 		return;
752 
753 	clear_bit(B_DIRTY, &b->state);
754 	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
755 
756 	b->write_start = b->dirty_start;
757 	b->write_end = b->dirty_end;
758 
759 	if (!write_list)
760 		submit_io(b, REQ_OP_WRITE, write_endio);
761 	else
762 		list_add_tail(&b->write_list, write_list);
763 }
764 
765 static void __flush_write_list(struct list_head *write_list)
766 {
767 	struct blk_plug plug;
768 	blk_start_plug(&plug);
769 	while (!list_empty(write_list)) {
770 		struct dm_buffer *b =
771 			list_entry(write_list->next, struct dm_buffer, write_list);
772 		list_del(&b->write_list);
773 		submit_io(b, REQ_OP_WRITE, write_endio);
774 		cond_resched();
775 	}
776 	blk_finish_plug(&plug);
777 }
778 
779 /*
780  * Wait until any activity on the buffer finishes.  Possibly write the
781  * buffer if it is dirty.  When this function finishes, there is no I/O
782  * running on the buffer and the buffer is not dirty.
783  */
784 static void __make_buffer_clean(struct dm_buffer *b)
785 {
786 	BUG_ON(b->hold_count);
787 
788 	if (!b->state)	/* fast case */
789 		return;
790 
791 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
792 	__write_dirty_buffer(b, NULL);
793 	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
794 }
795 
796 /*
797  * Find some buffer that is not held by anybody, clean it, unlink it and
798  * return it.
799  */
800 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
801 {
802 	struct dm_buffer *b;
803 
804 	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
805 		BUG_ON(test_bit(B_WRITING, &b->state));
806 		BUG_ON(test_bit(B_DIRTY, &b->state));
807 
808 		if (!b->hold_count) {
809 			__make_buffer_clean(b);
810 			__unlink_buffer(b);
811 			return b;
812 		}
813 		cond_resched();
814 	}
815 
816 	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
817 		BUG_ON(test_bit(B_READING, &b->state));
818 
819 		if (!b->hold_count) {
820 			__make_buffer_clean(b);
821 			__unlink_buffer(b);
822 			return b;
823 		}
824 		cond_resched();
825 	}
826 
827 	return NULL;
828 }
829 
830 /*
831  * Wait until some other thread frees a buffer or releases its hold count
832  * on some buffer.
833  *
834  * This function is entered with c->lock held, drops it and regains it
835  * before exiting.
836  */
837 static void __wait_for_free_buffer(struct dm_bufio_client *c)
838 {
839 	DECLARE_WAITQUEUE(wait, current);
840 
841 	add_wait_queue(&c->free_buffer_wait, &wait);
842 	set_current_state(TASK_UNINTERRUPTIBLE);
843 	dm_bufio_unlock(c);
844 
845 	io_schedule();
846 
847 	remove_wait_queue(&c->free_buffer_wait, &wait);
848 
849 	dm_bufio_lock(c);
850 }
851 
852 enum new_flag {
853 	NF_FRESH = 0,
854 	NF_READ = 1,
855 	NF_GET = 2,
856 	NF_PREFETCH = 3
857 };
858 
859 /*
860  * Allocate a new buffer. If the allocation is not possible, wait until
861  * some other thread frees a buffer.
862  *
863  * May drop the lock and regain it.
864  */
865 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
866 {
867 	struct dm_buffer *b;
868 	bool tried_noio_alloc = false;
869 
870 	/*
871 	 * dm-bufio is resistant to allocation failures (it just keeps
872 	 * one buffer reserved in case all the allocations fail).
873 	 * So set flags to not try too hard:
874 	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
875 	 *		    mutex and wait ourselves.
876 	 *	__GFP_NORETRY: don't retry and rather return failure
877 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
878 	 *	__GFP_NOWARN: don't print a warning in case of failure
879 	 *
880 	 * For debugging, if we set the cache size to 1, no new buffers will
881 	 * be allocated.
882 	 */
883 	while (1) {
884 		if (dm_bufio_cache_size_latch != 1) {
885 			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
886 			if (b)
887 				return b;
888 		}
889 
890 		if (nf == NF_PREFETCH)
891 			return NULL;
892 
893 		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
894 			dm_bufio_unlock(c);
895 			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
896 			dm_bufio_lock(c);
897 			if (b)
898 				return b;
899 			tried_noio_alloc = true;
900 		}
901 
902 		if (!list_empty(&c->reserved_buffers)) {
903 			b = list_entry(c->reserved_buffers.next,
904 				       struct dm_buffer, lru_list);
905 			list_del(&b->lru_list);
906 			c->need_reserved_buffers++;
907 
908 			return b;
909 		}
910 
911 		b = __get_unclaimed_buffer(c);
912 		if (b)
913 			return b;
914 
915 		__wait_for_free_buffer(c);
916 	}
917 }
918 
919 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
920 {
921 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
922 
923 	if (!b)
924 		return NULL;
925 
926 	if (c->alloc_callback)
927 		c->alloc_callback(b);
928 
929 	return b;
930 }
931 
932 /*
933  * Free a buffer and wake other threads waiting for free buffers.
934  */
935 static void __free_buffer_wake(struct dm_buffer *b)
936 {
937 	struct dm_bufio_client *c = b->c;
938 
939 	if (!c->need_reserved_buffers)
940 		free_buffer(b);
941 	else {
942 		list_add(&b->lru_list, &c->reserved_buffers);
943 		c->need_reserved_buffers--;
944 	}
945 
946 	wake_up(&c->free_buffer_wait);
947 }
948 
949 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
950 					struct list_head *write_list)
951 {
952 	struct dm_buffer *b, *tmp;
953 
954 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
955 		BUG_ON(test_bit(B_READING, &b->state));
956 
957 		if (!test_bit(B_DIRTY, &b->state) &&
958 		    !test_bit(B_WRITING, &b->state)) {
959 			__relink_lru(b, LIST_CLEAN);
960 			continue;
961 		}
962 
963 		if (no_wait && test_bit(B_WRITING, &b->state))
964 			return;
965 
966 		__write_dirty_buffer(b, write_list);
967 		cond_resched();
968 	}
969 }
970 
971 /*
972  * Check if we're over watermark.
973  * If the number of dirty buffers gets too high relative to the clean ones
974  * (see DM_BUFIO_WRITEBACK_RATIO), start writing them back asynchronously.
975  */
976 static void __check_watermark(struct dm_bufio_client *c,
977 			      struct list_head *write_list)
978 {
979 	if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
980 		__write_dirty_buffers_async(c, 1, write_list);
981 }
982 
983 /*----------------------------------------------------------------
984  * Getting a buffer
985  *--------------------------------------------------------------*/
986 
987 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
988 				     enum new_flag nf, int *need_submit,
989 				     struct list_head *write_list)
990 {
991 	struct dm_buffer *b, *new_b = NULL;
992 
993 	*need_submit = 0;
994 
995 	b = __find(c, block);
996 	if (b)
997 		goto found_buffer;
998 
999 	if (nf == NF_GET)
1000 		return NULL;
1001 
1002 	new_b = __alloc_buffer_wait(c, nf);
1003 	if (!new_b)
1004 		return NULL;
1005 
1006 	/*
1007 	 * We've had a period where the mutex was unlocked, so need to
1008 	 * recheck the buffer tree.
1009 	 */
1010 	b = __find(c, block);
1011 	if (b) {
1012 		__free_buffer_wake(new_b);
1013 		goto found_buffer;
1014 	}
1015 
1016 	__check_watermark(c, write_list);
1017 
1018 	b = new_b;
1019 	b->hold_count = 1;
1020 	b->read_error = 0;
1021 	b->write_error = 0;
1022 	__link_buffer(b, block, LIST_CLEAN);
1023 
1024 	if (nf == NF_FRESH) {
1025 		b->state = 0;
1026 		return b;
1027 	}
1028 
1029 	b->state = 1 << B_READING;
1030 	*need_submit = 1;
1031 
1032 	return b;
1033 
1034 found_buffer:
1035 	if (nf == NF_PREFETCH)
1036 		return NULL;
1037 	/*
1038 	 * Note: it is essential that we don't wait for the buffer to be
1039 	 * read if dm_bufio_get function is used. Both dm_bufio_get and
1040 	 * dm_bufio_prefetch can be used in the driver request routine.
1041 	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1042 	 * the same buffer, it would deadlock if we waited.
1043 	 */
1044 	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1045 		return NULL;
1046 
1047 	b->hold_count++;
1048 	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1049 		     test_bit(B_WRITING, &b->state));
1050 	return b;
1051 }
1052 
1053 /*
1054  * The endio routine for reading: set the error, clear the bit and wake up
1055  * anyone waiting on the buffer.
1056  */
1057 static void read_endio(struct dm_buffer *b, blk_status_t status)
1058 {
1059 	b->read_error = status;
1060 
1061 	BUG_ON(!test_bit(B_READING, &b->state));
1062 
1063 	smp_mb__before_atomic();
1064 	clear_bit(B_READING, &b->state);
1065 	smp_mb__after_atomic();
1066 
1067 	wake_up_bit(&b->state, B_READING);
1068 }
1069 
1070 /*
1071  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1072  * functions is similar except that dm_bufio_new doesn't read the
1073  * buffer from the disk (assuming that the caller overwrites all the data
1074  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1075  */
1076 static void *new_read(struct dm_bufio_client *c, sector_t block,
1077 		      enum new_flag nf, struct dm_buffer **bp)
1078 {
1079 	int need_submit;
1080 	struct dm_buffer *b;
1081 
1082 	LIST_HEAD(write_list);
1083 
1084 	dm_bufio_lock(c);
1085 	b = __bufio_new(c, block, nf, &need_submit, &write_list);
1086 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1087 	if (b && b->hold_count == 1)
1088 		buffer_record_stack(b);
1089 #endif
1090 	dm_bufio_unlock(c);
1091 
1092 	__flush_write_list(&write_list);
1093 
1094 	if (!b)
1095 		return NULL;
1096 
1097 	if (need_submit)
1098 		submit_io(b, REQ_OP_READ, read_endio);
1099 
1100 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1101 
1102 	if (b->read_error) {
1103 		int error = blk_status_to_errno(b->read_error);
1104 
1105 		dm_bufio_release(b);
1106 
1107 		return ERR_PTR(error);
1108 	}
1109 
1110 	*bp = b;
1111 
1112 	return b->data;
1113 }
1114 
1115 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1116 		   struct dm_buffer **bp)
1117 {
1118 	return new_read(c, block, NF_GET, bp);
1119 }
1120 EXPORT_SYMBOL_GPL(dm_bufio_get);
1121 
1122 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1123 		    struct dm_buffer **bp)
1124 {
1125 	BUG_ON(dm_bufio_in_request());
1126 
1127 	return new_read(c, block, NF_READ, bp);
1128 }
1129 EXPORT_SYMBOL_GPL(dm_bufio_read);
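/*
 * Typical read-side usage (a sketch; "tgt" and the block number are
 * hypothetical, error handling trimmed):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(tgt->bufio, 123, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memcpy(tgt->scratch, data, dm_bufio_get_block_size(tgt->bufio));
 *	dm_bufio_release(buf);
 */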
1130 
1131 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1132 		   struct dm_buffer **bp)
1133 {
1134 	BUG_ON(dm_bufio_in_request());
1135 
1136 	return new_read(c, block, NF_FRESH, bp);
1137 }
1138 EXPORT_SYMBOL_GPL(dm_bufio_new);
1139 
1140 void dm_bufio_prefetch(struct dm_bufio_client *c,
1141 		       sector_t block, unsigned n_blocks)
1142 {
1143 	struct blk_plug plug;
1144 
1145 	LIST_HEAD(write_list);
1146 
1147 	BUG_ON(dm_bufio_in_request());
1148 
1149 	blk_start_plug(&plug);
1150 	dm_bufio_lock(c);
1151 
1152 	for (; n_blocks--; block++) {
1153 		int need_submit;
1154 		struct dm_buffer *b;
1155 		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1156 				&write_list);
1157 		if (unlikely(!list_empty(&write_list))) {
1158 			dm_bufio_unlock(c);
1159 			blk_finish_plug(&plug);
1160 			__flush_write_list(&write_list);
1161 			blk_start_plug(&plug);
1162 			dm_bufio_lock(c);
1163 		}
1164 		if (unlikely(b != NULL)) {
1165 			dm_bufio_unlock(c);
1166 
1167 			if (need_submit)
1168 				submit_io(b, REQ_OP_READ, read_endio);
1169 			dm_bufio_release(b);
1170 
1171 			cond_resched();
1172 
1173 			if (!n_blocks)
1174 				goto flush_plug;
1175 			dm_bufio_lock(c);
1176 		}
1177 	}
1178 
1179 	dm_bufio_unlock(c);
1180 
1181 flush_plug:
1182 	blk_finish_plug(&plug);
1183 }
1184 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
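/*
 * Prefetch usage sketch (hypothetical caller): issue the reads for a range of
 * blocks early, then pick the data up later with dm_bufio_read(), which will
 * find the blocks already cached or in flight:
 *
 *	dm_bufio_prefetch(tgt->bufio, first_block, 16);
 *	...
 *	data = dm_bufio_read(tgt->bufio, first_block, &buf);
 */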
1185 
1186 void dm_bufio_release(struct dm_buffer *b)
1187 {
1188 	struct dm_bufio_client *c = b->c;
1189 
1190 	dm_bufio_lock(c);
1191 
1192 	BUG_ON(!b->hold_count);
1193 
1194 	b->hold_count--;
1195 	if (!b->hold_count) {
1196 		wake_up(&c->free_buffer_wait);
1197 
1198 		/*
1199 		 * If there were errors on the buffer, and the buffer is not
1200 		 * to be written, free the buffer. There is no point in caching
1201 		 * an invalid buffer.
1202 		 */
1203 		if ((b->read_error || b->write_error) &&
1204 		    !test_bit(B_READING, &b->state) &&
1205 		    !test_bit(B_WRITING, &b->state) &&
1206 		    !test_bit(B_DIRTY, &b->state)) {
1207 			__unlink_buffer(b);
1208 			__free_buffer_wake(b);
1209 		}
1210 	}
1211 
1212 	dm_bufio_unlock(c);
1213 }
1214 EXPORT_SYMBOL_GPL(dm_bufio_release);
1215 
1216 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1217 					unsigned start, unsigned end)
1218 {
1219 	struct dm_bufio_client *c = b->c;
1220 
1221 	BUG_ON(start >= end);
1222 	BUG_ON(end > b->c->block_size);
1223 
1224 	dm_bufio_lock(c);
1225 
1226 	BUG_ON(test_bit(B_READING, &b->state));
1227 
1228 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
1229 		b->dirty_start = start;
1230 		b->dirty_end = end;
1231 		__relink_lru(b, LIST_DIRTY);
1232 	} else {
1233 		if (start < b->dirty_start)
1234 			b->dirty_start = start;
1235 		if (end > b->dirty_end)
1236 			b->dirty_end = end;
1237 	}
1238 
1239 	dm_bufio_unlock(c);
1240 }
1241 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1242 
1243 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1244 {
1245 	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1246 }
1247 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
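/*
 * Write-side usage sketch (hypothetical caller; "buf" comes from a previous
 * dm_bufio_read or dm_bufio_new, "payload" is illustrative): modify the cached
 * data in place, mark only the touched byte range dirty, and flush later:
 *
 *	memcpy((char *)dm_bufio_get_block_data(buf) + 64, payload, 128);
 *	dm_bufio_mark_partial_buffer_dirty(buf, 64, 64 + 128);
 *	...
 *	r = dm_bufio_write_dirty_buffers(tgt->bufio);
 */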
1248 
1249 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1250 {
1251 	LIST_HEAD(write_list);
1252 
1253 	BUG_ON(dm_bufio_in_request());
1254 
1255 	dm_bufio_lock(c);
1256 	__write_dirty_buffers_async(c, 0, &write_list);
1257 	dm_bufio_unlock(c);
1258 	__flush_write_list(&write_list);
1259 }
1260 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1261 
1262 /*
1263  * For performance, it is essential that the buffers are written asynchronously
1264  * and simultaneously (so that the block layer can merge the writes) and then
1265  * waited upon.
1266  *
1267  * Finally, we flush hardware disk cache.
1268  */
1269 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1270 {
1271 	int a, f;
1272 	unsigned long buffers_processed = 0;
1273 	struct dm_buffer *b, *tmp;
1274 
1275 	LIST_HEAD(write_list);
1276 
1277 	dm_bufio_lock(c);
1278 	__write_dirty_buffers_async(c, 0, &write_list);
1279 	dm_bufio_unlock(c);
1280 	__flush_write_list(&write_list);
1281 	dm_bufio_lock(c);
1282 
1283 again:
1284 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1285 		int dropped_lock = 0;
1286 
1287 		if (buffers_processed < c->n_buffers[LIST_DIRTY])
1288 			buffers_processed++;
1289 
1290 		BUG_ON(test_bit(B_READING, &b->state));
1291 
1292 		if (test_bit(B_WRITING, &b->state)) {
1293 			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1294 				dropped_lock = 1;
1295 				b->hold_count++;
1296 				dm_bufio_unlock(c);
1297 				wait_on_bit_io(&b->state, B_WRITING,
1298 					       TASK_UNINTERRUPTIBLE);
1299 				dm_bufio_lock(c);
1300 				b->hold_count--;
1301 			} else
1302 				wait_on_bit_io(&b->state, B_WRITING,
1303 					       TASK_UNINTERRUPTIBLE);
1304 		}
1305 
1306 		if (!test_bit(B_DIRTY, &b->state) &&
1307 		    !test_bit(B_WRITING, &b->state))
1308 			__relink_lru(b, LIST_CLEAN);
1309 
1310 		cond_resched();
1311 
1312 		/*
1313 		 * If we dropped the lock, the list is no longer consistent,
1314 		 * so we must restart the search.
1315 		 *
1316 		 * In the most common case, the buffer just processed is
1317 		 * relinked to the clean list, so we won't loop scanning the
1318 		 * same buffer again and again.
1319 		 *
1320 		 * This may livelock if there is another thread simultaneously
1321 		 * dirtying buffers, so we count the number of buffers walked
1322 		 * and if it exceeds the total number of buffers, it means that
1323 		 * someone is doing some writes simultaneously with us.  In
1324 		 * this case, stop, dropping the lock.
1325 		 */
1326 		if (dropped_lock)
1327 			goto again;
1328 	}
1329 	wake_up(&c->free_buffer_wait);
1330 	dm_bufio_unlock(c);
1331 
1332 	a = xchg(&c->async_write_error, 0);
1333 	f = dm_bufio_issue_flush(c);
1334 	if (a)
1335 		return a;
1336 
1337 	return f;
1338 }
1339 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1340 
1341 /*
1342  * Use dm-io to send an empty barrier to flush the device.
1343  */
1344 int dm_bufio_issue_flush(struct dm_bufio_client *c)
1345 {
1346 	struct dm_io_request io_req = {
1347 		.bi_op = REQ_OP_WRITE,
1348 		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1349 		.mem.type = DM_IO_KMEM,
1350 		.mem.ptr.addr = NULL,
1351 		.client = c->dm_io,
1352 	};
1353 	struct dm_io_region io_reg = {
1354 		.bdev = c->bdev,
1355 		.sector = 0,
1356 		.count = 0,
1357 	};
1358 
1359 	BUG_ON(dm_bufio_in_request());
1360 
1361 	return dm_io(&io_req, 1, &io_reg, NULL);
1362 }
1363 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1364 
1365 /*
1366  * Use dm-io to send a discard request to flush the device.
1367  */
1368 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1369 {
1370 	struct dm_io_request io_req = {
1371 		.bi_op = REQ_OP_DISCARD,
1372 		.bi_op_flags = REQ_SYNC,
1373 		.mem.type = DM_IO_KMEM,
1374 		.mem.ptr.addr = NULL,
1375 		.client = c->dm_io,
1376 	};
1377 	struct dm_io_region io_reg = {
1378 		.bdev = c->bdev,
1379 		.sector = block_to_sector(c, block),
1380 		.count = block_to_sector(c, count),
1381 	};
1382 
1383 	BUG_ON(dm_bufio_in_request());
1384 
1385 	return dm_io(&io_req, 1, &io_reg, NULL);
1386 }
1387 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1388 
1389 /*
1390  * We first delete any other buffer that may be at that new location.
1391  *
1392  * Then, we write the buffer to the original location if it was dirty.
1393  *
1394  * Then, if we are the only one who is holding the buffer, relink the buffer
1395  * in the buffer tree for the new location.
1396  *
1397  * If there was someone else holding the buffer, we write it to the new
1398  * location but not relink it, because that other user needs to have the buffer
1399  * at the same place.
1400  */
1401 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1402 {
1403 	struct dm_bufio_client *c = b->c;
1404 	struct dm_buffer *new;
1405 
1406 	BUG_ON(dm_bufio_in_request());
1407 
1408 	dm_bufio_lock(c);
1409 
1410 retry:
1411 	new = __find(c, new_block);
1412 	if (new) {
1413 		if (new->hold_count) {
1414 			__wait_for_free_buffer(c);
1415 			goto retry;
1416 		}
1417 
1418 		/*
1419 		 * FIXME: Is there any point waiting for a write that's going
1420 		 * to be overwritten in a bit?
1421 		 */
1422 		__make_buffer_clean(new);
1423 		__unlink_buffer(new);
1424 		__free_buffer_wake(new);
1425 	}
1426 
1427 	BUG_ON(!b->hold_count);
1428 	BUG_ON(test_bit(B_READING, &b->state));
1429 
1430 	__write_dirty_buffer(b, NULL);
1431 	if (b->hold_count == 1) {
1432 		wait_on_bit_io(&b->state, B_WRITING,
1433 			       TASK_UNINTERRUPTIBLE);
1434 		set_bit(B_DIRTY, &b->state);
1435 		b->dirty_start = 0;
1436 		b->dirty_end = c->block_size;
1437 		__unlink_buffer(b);
1438 		__link_buffer(b, new_block, LIST_DIRTY);
1439 	} else {
1440 		sector_t old_block;
1441 		wait_on_bit_lock_io(&b->state, B_WRITING,
1442 				    TASK_UNINTERRUPTIBLE);
1443 		/*
1444 		 * Relink buffer to "new_block" so that write_callback
1445 		 * sees "new_block" as a block number.
1446 		 * After the write, link the buffer back to old_block.
1447 		 * All this must be done in bufio lock, so that block number
1448 		 * change isn't visible to other threads.
1449 		 */
1450 		old_block = b->block;
1451 		__unlink_buffer(b);
1452 		__link_buffer(b, new_block, b->list_mode);
1453 		submit_io(b, REQ_OP_WRITE, write_endio);
1454 		wait_on_bit_io(&b->state, B_WRITING,
1455 			       TASK_UNINTERRUPTIBLE);
1456 		__unlink_buffer(b);
1457 		__link_buffer(b, old_block, b->list_mode);
1458 	}
1459 
1460 	dm_bufio_unlock(c);
1461 	dm_bufio_release(b);
1462 }
1463 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1464 
1465 static void forget_buffer_locked(struct dm_buffer *b)
1466 {
1467 	if (likely(!b->hold_count) && likely(!b->state)) {
1468 		__unlink_buffer(b);
1469 		__free_buffer_wake(b);
1470 	}
1471 }
1472 
1473 /*
1474  * Free the given buffer.
1475  *
1476  * This is just a hint, if the buffer is in use or dirty, this function
1477  * does nothing.
1478  */
1479 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1480 {
1481 	struct dm_buffer *b;
1482 
1483 	dm_bufio_lock(c);
1484 
1485 	b = __find(c, block);
1486 	if (b)
1487 		forget_buffer_locked(b);
1488 
1489 	dm_bufio_unlock(c);
1490 }
1491 EXPORT_SYMBOL_GPL(dm_bufio_forget);
1492 
1493 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1494 {
1495 	struct dm_buffer *b;
1496 	sector_t end_block = block + n_blocks;
1497 
1498 	while (block < end_block) {
1499 		dm_bufio_lock(c);
1500 
1501 		b = __find_next(c, block);
1502 		if (b) {
1503 			block = b->block + 1;
1504 			forget_buffer_locked(b);
1505 		}
1506 
1507 		dm_bufio_unlock(c);
1508 
1509 		if (!b)
1510 			break;
1511 	}
1512 
1513 }
1514 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1515 
1516 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1517 {
1518 	c->minimum_buffers = n;
1519 }
1520 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1521 
1522 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1523 {
1524 	return c->block_size;
1525 }
1526 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1527 
1528 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1529 {
1530 	sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
1531 	if (s >= c->start)
1532 		s -= c->start;
1533 	else
1534 		s = 0;
1535 	if (likely(c->sectors_per_block_bits >= 0))
1536 		s >>= c->sectors_per_block_bits;
1537 	else
1538 		sector_div(s, c->block_size >> SECTOR_SHIFT);
1539 	return s;
1540 }
1541 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1542 
1543 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
1544 {
1545 	return c->dm_io;
1546 }
1547 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
1548 
1549 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1550 {
1551 	return b->block;
1552 }
1553 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1554 
1555 void *dm_bufio_get_block_data(struct dm_buffer *b)
1556 {
1557 	return b->data;
1558 }
1559 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1560 
1561 void *dm_bufio_get_aux_data(struct dm_buffer *b)
1562 {
1563 	return b + 1;
1564 }
1565 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1566 
1567 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1568 {
1569 	return b->c;
1570 }
1571 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1572 
1573 static void drop_buffers(struct dm_bufio_client *c)
1574 {
1575 	struct dm_buffer *b;
1576 	int i;
1577 	bool warned = false;
1578 
1579 	BUG_ON(dm_bufio_in_request());
1580 
1581 	/*
1582 	 * An optimization so that the buffers are not written one-by-one.
1583 	 */
1584 	dm_bufio_write_dirty_buffers_async(c);
1585 
1586 	dm_bufio_lock(c);
1587 
1588 	while ((b = __get_unclaimed_buffer(c)))
1589 		__free_buffer_wake(b);
1590 
1591 	for (i = 0; i < LIST_SIZE; i++)
1592 		list_for_each_entry(b, &c->lru[i], lru_list) {
1593 			WARN_ON(!warned);
1594 			warned = true;
1595 			DMERR("leaked buffer %llx, hold count %u, list %d",
1596 			      (unsigned long long)b->block, b->hold_count, i);
1597 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1598 			stack_trace_print(b->stack_entries, b->stack_len, 1);
1599 			/* mark unclaimed to avoid BUG_ON below */
1600 			b->hold_count = 0;
1601 #endif
1602 		}
1603 
1604 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1605 	while ((b = __get_unclaimed_buffer(c)))
1606 		__free_buffer_wake(b);
1607 #endif
1608 
1609 	for (i = 0; i < LIST_SIZE; i++)
1610 		BUG_ON(!list_empty(&c->lru[i]));
1611 
1612 	dm_bufio_unlock(c);
1613 }
1614 
1615 /*
1616  * We may not be able to evict this buffer if I/O is pending or the client
1617  * is still using it.  The caller is expected to know the buffer is too old.
1618  *
1619  * And if GFP_NOFS is used, we must not do any I/O because we hold
1620  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1621  * rerouted to different bufio client.
1622  */
1623 static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1624 {
1625 	if (!(gfp & __GFP_FS)) {
1626 		if (test_bit(B_READING, &b->state) ||
1627 		    test_bit(B_WRITING, &b->state) ||
1628 		    test_bit(B_DIRTY, &b->state))
1629 			return false;
1630 	}
1631 
1632 	if (b->hold_count)
1633 		return false;
1634 
1635 	__make_buffer_clean(b);
1636 	__unlink_buffer(b);
1637 	__free_buffer_wake(b);
1638 
1639 	return true;
1640 }
1641 
1642 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1643 {
1644 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1645 	if (likely(c->sectors_per_block_bits >= 0))
1646 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1647 	else
1648 		retain_bytes /= c->block_size;
1649 	return retain_bytes;
1650 }
1651 
1652 static void __scan(struct dm_bufio_client *c)
1653 {
1654 	int l;
1655 	struct dm_buffer *b, *tmp;
1656 	unsigned long freed = 0;
1657 	unsigned long count = c->n_buffers[LIST_CLEAN] +
1658 			      c->n_buffers[LIST_DIRTY];
1659 	unsigned long retain_target = get_retain_buffers(c);
1660 
1661 	for (l = 0; l < LIST_SIZE; l++) {
1662 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1663 			if (count - freed <= retain_target)
1664 				atomic_long_set(&c->need_shrink, 0);
1665 			if (!atomic_long_read(&c->need_shrink))
1666 				return;
1667 			if (__try_evict_buffer(b, GFP_KERNEL)) {
1668 				atomic_long_dec(&c->need_shrink);
1669 				freed++;
1670 			}
1671 			cond_resched();
1672 		}
1673 	}
1674 }
1675 
1676 static void shrink_work(struct work_struct *w)
1677 {
1678 	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
1679 
1680 	dm_bufio_lock(c);
1681 	__scan(c);
1682 	dm_bufio_unlock(c);
1683 }
1684 
1685 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1686 {
1687 	struct dm_bufio_client *c;
1688 	bool bypass = false;
1689 
1690 	trace_android_vh_dm_bufio_shrink_scan_bypass(
1691 			dm_bufio_current_allocated,
1692 			&bypass);
1693 	if (bypass)
1694 		return 0;
1695 
1696 	c = container_of(shrink, struct dm_bufio_client, shrinker);
1697 	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
1698 	queue_work(dm_bufio_wq, &c->shrink_work);
1699 
1700 	return sc->nr_to_scan;
1701 }
1702 
1703 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1704 {
1705 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1706 	unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1707 			      READ_ONCE(c->n_buffers[LIST_DIRTY]);
1708 	unsigned long retain_target = get_retain_buffers(c);
1709 	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
1710 
1711 	if (unlikely(count < retain_target))
1712 		count = 0;
1713 	else
1714 		count -= retain_target;
1715 
1716 	if (unlikely(count < queued_for_cleanup))
1717 		count = 0;
1718 	else
1719 		count -= queued_for_cleanup;
1720 
1721 	return count;
1722 }
1723 
1724 /*
1725  * Create the buffering interface
1726  */
1727 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1728 					       unsigned reserved_buffers, unsigned aux_size,
1729 					       void (*alloc_callback)(struct dm_buffer *),
1730 					       void (*write_callback)(struct dm_buffer *))
1731 {
1732 	int r;
1733 	struct dm_bufio_client *c;
1734 	unsigned i;
1735 	char slab_name[27];
1736 
1737 	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1738 		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1739 		r = -EINVAL;
1740 		goto bad_client;
1741 	}
1742 
1743 	c = kzalloc(sizeof(*c), GFP_KERNEL);
1744 	if (!c) {
1745 		r = -ENOMEM;
1746 		goto bad_client;
1747 	}
1748 	c->buffer_tree = RB_ROOT;
1749 
1750 	c->bdev = bdev;
1751 	c->block_size = block_size;
1752 	if (is_power_of_2(block_size))
1753 		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1754 	else
1755 		c->sectors_per_block_bits = -1;
1756 
1757 	c->alloc_callback = alloc_callback;
1758 	c->write_callback = write_callback;
1759 
1760 	for (i = 0; i < LIST_SIZE; i++) {
1761 		INIT_LIST_HEAD(&c->lru[i]);
1762 		c->n_buffers[i] = 0;
1763 	}
1764 
1765 	mutex_init(&c->lock);
1766 	INIT_LIST_HEAD(&c->reserved_buffers);
1767 	c->need_reserved_buffers = reserved_buffers;
1768 
1769 	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1770 
1771 	init_waitqueue_head(&c->free_buffer_wait);
1772 	c->async_write_error = 0;
1773 
1774 	c->dm_io = dm_io_client_create();
1775 	if (IS_ERR(c->dm_io)) {
1776 		r = PTR_ERR(c->dm_io);
1777 		goto bad_dm_io;
1778 	}
1779 
1780 	if (block_size <= KMALLOC_MAX_SIZE &&
1781 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1782 		unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1783 		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1784 		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1785 						  SLAB_RECLAIM_ACCOUNT, NULL);
1786 		if (!c->slab_cache) {
1787 			r = -ENOMEM;
1788 			goto bad;
1789 		}
1790 	}
1791 	if (aux_size)
1792 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1793 	else
1794 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1795 	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1796 					   0, SLAB_RECLAIM_ACCOUNT, NULL);
1797 	if (!c->slab_buffer) {
1798 		r = -ENOMEM;
1799 		goto bad;
1800 	}
1801 
1802 	while (c->need_reserved_buffers) {
1803 		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1804 
1805 		if (!b) {
1806 			r = -ENOMEM;
1807 			goto bad;
1808 		}
1809 		__free_buffer_wake(b);
1810 	}
1811 
1812 	INIT_WORK(&c->shrink_work, shrink_work);
1813 	atomic_long_set(&c->need_shrink, 0);
1814 
1815 	c->shrinker.count_objects = dm_bufio_shrink_count;
1816 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
1817 	c->shrinker.seeks = 1;
1818 	c->shrinker.batch = 0;
1819 	r = register_shrinker(&c->shrinker);
1820 	if (r)
1821 		goto bad;
1822 
1823 	mutex_lock(&dm_bufio_clients_lock);
1824 	dm_bufio_client_count++;
1825 	list_add(&c->client_list, &dm_bufio_all_clients);
1826 	__cache_size_refresh();
1827 	mutex_unlock(&dm_bufio_clients_lock);
1828 
1829 	return c;
1830 
1831 bad:
1832 	while (!list_empty(&c->reserved_buffers)) {
1833 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1834 						 struct dm_buffer, lru_list);
1835 		list_del(&b->lru_list);
1836 		free_buffer(b);
1837 	}
1838 	kmem_cache_destroy(c->slab_cache);
1839 	kmem_cache_destroy(c->slab_buffer);
1840 	dm_io_client_destroy(c->dm_io);
1841 bad_dm_io:
1842 	mutex_destroy(&c->lock);
1843 	kfree(c);
1844 bad_client:
1845 	return ERR_PTR(r);
1846 }
1847 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
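/*
 * Client setup sketch (hypothetical target; no alloc/write callbacks, one
 * reserved buffer, no per-buffer aux data):
 *
 *	tgt->bufio = dm_bufio_client_create(tgt->dev->bdev, 4096, 1, 0,
 *					    NULL, NULL);
 *	if (IS_ERR(tgt->bufio))
 *		return PTR_ERR(tgt->bufio);
 *	...
 *	dm_bufio_client_destroy(tgt->bufio);
 */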
1848 
1849 /*
1850  * Free the buffering interface.
1851  * It is required that there are no references on any buffers.
1852  */
1853 void dm_bufio_client_destroy(struct dm_bufio_client *c)
1854 {
1855 	unsigned i;
1856 
1857 	drop_buffers(c);
1858 
1859 	unregister_shrinker(&c->shrinker);
1860 	flush_work(&c->shrink_work);
1861 
1862 	mutex_lock(&dm_bufio_clients_lock);
1863 
1864 	list_del(&c->client_list);
1865 	dm_bufio_client_count--;
1866 	__cache_size_refresh();
1867 
1868 	mutex_unlock(&dm_bufio_clients_lock);
1869 
1870 	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1871 	BUG_ON(c->need_reserved_buffers);
1872 
1873 	while (!list_empty(&c->reserved_buffers)) {
1874 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1875 						 struct dm_buffer, lru_list);
1876 		list_del(&b->lru_list);
1877 		free_buffer(b);
1878 	}
1879 
1880 	for (i = 0; i < LIST_SIZE; i++)
1881 		if (c->n_buffers[i])
1882 			DMERR("leaked buffer count %d: %lu", i, c->n_buffers[i]);
1883 
1884 	for (i = 0; i < LIST_SIZE; i++)
1885 		BUG_ON(c->n_buffers[i]);
1886 
1887 	kmem_cache_destroy(c->slab_cache);
1888 	kmem_cache_destroy(c->slab_buffer);
1889 	dm_io_client_destroy(c->dm_io);
1890 	mutex_destroy(&c->lock);
1891 	kfree(c);
1892 }
1893 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
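/*
 * Illustrative sketch (hypothetical helper, not in the original source):
 * a typical read-modify-write cycle between client creation and destruction.
 * Every reference obtained with dm_bufio_read()/dm_bufio_new() must be
 * dropped with dm_bufio_release() before dm_bufio_client_destroy() is
 * called, otherwise the BUG_ON checks above trip on leaked buffers.
 */
static int __maybe_unused example_bufio_zero_prefix(struct dm_bufio_client *c,
						    sector_t block)
{
	struct dm_buffer *b;
	void *data;

	data = dm_bufio_read(c, block, &b);
	if (IS_ERR(data))
		return PTR_ERR(data);

	/* Modify the cached copy and mark the buffer for writeback. */
	memset(data, 0, 16);
	dm_bufio_mark_buffer_dirty(b);
	dm_bufio_release(b);

	/* Write out all dirty buffers owned by this client. */
	return dm_bufio_write_dirty_buffers(c);
}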
1894 
1895 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1896 {
1897 	c->start = start;
1898 }
1899 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1900 
1901 static unsigned get_max_age_hz(void)
1902 {
1903 	unsigned max_age = READ_ONCE(dm_bufio_max_age);
1904 
1905 	if (max_age > UINT_MAX / HZ)
1906 		max_age = UINT_MAX / HZ;
1907 
1908 	return max_age * HZ;
1909 }
1910 
1911 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1912 {
1913 	return time_after_eq(jiffies, b->last_accessed + age_hz);
1914 }
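/*
 * Worked example (added for illustration, not in the original source): with
 * HZ == 250 and a maximum age of 300 seconds (DM_BUFIO_DEFAULT_AGE_SECS),
 * get_max_age_hz() returns 250 * 300 == 75000 jiffies, and older_than()
 * reports true for any buffer whose last_accessed timestamp is at least
 * 75000 jiffies in the past.  __evict_old_buffers() below walks the clean
 * LRU from its oldest end and frees such buffers until the total buffer
 * count drops to the retain target (get_retain_buffers()) or it reaches a
 * buffer that is still young enough to keep.
 */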
1915 
1916 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1917 {
1918 	struct dm_buffer *b, *tmp;
1919 	unsigned long retain_target = get_retain_buffers(c);
1920 	unsigned long count;
1921 	LIST_HEAD(write_list);
1922 
1923 	dm_bufio_lock(c);
1924 
1925 	__check_watermark(c, &write_list);
1926 	if (unlikely(!list_empty(&write_list))) {
1927 		dm_bufio_unlock(c);
1928 		__flush_write_list(&write_list);
1929 		dm_bufio_lock(c);
1930 	}
1931 
1932 	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1933 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1934 		if (count <= retain_target)
1935 			break;
1936 
1937 		if (!older_than(b, age_hz))
1938 			break;
1939 
1940 		if (__try_evict_buffer(b, 0))
1941 			count--;
1942 
1943 		cond_resched();
1944 	}
1945 
1946 	dm_bufio_unlock(c);
1947 }
1948 
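/*
 * Descriptive note (added commentary): do_global_cleanup() evicts buffers
 * from the tail (least recently used end) of the global queue until
 * dm_bufio_current_allocated falls to roughly 15/16 of dm_bufio_cache_size.
 * A buffer with its accessed bit set gets a second chance: the bit is
 * cleared and the buffer moves back to the head of the queue; after 16
 * consecutive second-chance moves the spinlock is dropped briefly.  Evicting
 * a buffer requires its client's mutex, which must not be taken under
 * global_spinlock, so the code trylocks first and, if that fails, drops the
 * spinlock, sleeps in dm_bufio_lock() and restarts the iteration.  The loops
 * counter (twice global_num) bounds the total amount of work per invocation.
 */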
1949 static void do_global_cleanup(struct work_struct *w)
1950 {
1951 	struct dm_bufio_client *locked_client = NULL;
1952 	struct dm_bufio_client *current_client;
1953 	struct dm_buffer *b;
1954 	unsigned spinlock_hold_count;
1955 	unsigned long threshold = dm_bufio_cache_size -
1956 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1957 	unsigned long loops = global_num * 2;
1958 
1959 	mutex_lock(&dm_bufio_clients_lock);
1960 
1961 	while (1) {
1962 		cond_resched();
1963 
1964 		spin_lock(&global_spinlock);
1965 		if (unlikely(dm_bufio_current_allocated <= threshold))
1966 			break;
1967 
1968 		spinlock_hold_count = 0;
1969 get_next:
1970 		if (!loops--)
1971 			break;
1972 		if (unlikely(list_empty(&global_queue)))
1973 			break;
1974 		b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1975 
1976 		if (b->accessed) {
1977 			b->accessed = 0;
1978 			list_move(&b->global_list, &global_queue);
1979 			if (likely(++spinlock_hold_count < 16))
1980 				goto get_next;
1981 			spin_unlock(&global_spinlock);
1982 			continue;
1983 		}
1984 
1985 		current_client = b->c;
1986 		if (unlikely(current_client != locked_client)) {
1987 			if (locked_client)
1988 				dm_bufio_unlock(locked_client);
1989 
1990 			if (!dm_bufio_trylock(current_client)) {
1991 				spin_unlock(&global_spinlock);
1992 				dm_bufio_lock(current_client);
1993 				locked_client = current_client;
1994 				continue;
1995 			}
1996 
1997 			locked_client = current_client;
1998 		}
1999 
2000 		spin_unlock(&global_spinlock);
2001 
2002 		if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
2003 			spin_lock(&global_spinlock);
2004 			list_move(&b->global_list, &global_queue);
2005 			spin_unlock(&global_spinlock);
2006 		}
2007 	}
2008 
2009 	spin_unlock(&global_spinlock);
2010 
2011 	if (locked_client)
2012 		dm_bufio_unlock(locked_client);
2013 
2014 	mutex_unlock(&dm_bufio_clients_lock);
2015 }
2016 
2017 static void cleanup_old_buffers(void)
2018 {
2019 	unsigned long max_age_hz = get_max_age_hz();
2020 	struct dm_bufio_client *c;
2021 	bool bypass = false;
2022 
2023 	trace_android_vh_cleanup_old_buffers_bypass(
2024 				dm_bufio_current_allocated,
2025 				&max_age_hz,
2026 				&bypass);
2027 	if (bypass)
2028 		return;
2029 
2030 	mutex_lock(&dm_bufio_clients_lock);
2031 
2032 	__cache_size_refresh();
2033 
2034 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2035 		__evict_old_buffers(c, max_age_hz);
2036 
2037 	mutex_unlock(&dm_bufio_clients_lock);
2038 }
2039 
2040 static void work_fn(struct work_struct *w)
2041 {
2042 	cleanup_old_buffers();
2043 
2044 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2045 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2046 }
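/*
 * Added note: work_fn() requeues itself, so cleanup_old_buffers() runs every
 * DM_BUFIO_WORK_TIMER_SECS (30 seconds) for as long as the module is loaded;
 * dm_bufio_exit() cancels the delayed work before destroying the workqueue.
 */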
2047 
2048 /*----------------------------------------------------------------
2049  * Module setup
2050  *--------------------------------------------------------------*/
2051 
2052 /*
2053  * This is called only once for the whole dm_bufio module.
2054  * It initializes the memory limit and starts the periodic cleanup work.
2055  */
2056 static int __init dm_bufio_init(void)
2057 {
2058 	__u64 mem;
2059 
2060 	dm_bufio_allocated_kmem_cache = 0;
2061 	dm_bufio_allocated_get_free_pages = 0;
2062 	dm_bufio_allocated_vmalloc = 0;
2063 	dm_bufio_current_allocated = 0;
2064 
2065 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2066 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2067 
2068 	if (mem > ULONG_MAX)
2069 		mem = ULONG_MAX;
2070 
2071 #ifdef CONFIG_MMU
2072 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2073 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2074 #endif
2075 
2076 	dm_bufio_default_cache_size = mem;
2077 
2078 	mutex_lock(&dm_bufio_clients_lock);
2079 	__cache_size_refresh();
2080 	mutex_unlock(&dm_bufio_clients_lock);
2081 
2082 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2083 	if (!dm_bufio_wq)
2084 		return -ENOMEM;
2085 
2086 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2087 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2088 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2089 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2090 
2091 	return 0;
2092 }
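/*
 * Worked example (added for illustration): on a 64-bit machine with 8 GiB of
 * low memory and 4 KiB pages, totalram_pages() - totalhigh_pages() is
 * 2097152 pages, so with DM_BUFIO_MEMORY_PERCENT == 2 the limit becomes
 * mult_frac(2097152, 2, 100) << PAGE_SHIFT, i.e. 41943 pages, roughly
 * 164 MiB.  The vmalloc clamp under CONFIG_MMU only matters where
 * VMALLOC_TOTAL is small relative to RAM, e.g. on 32-bit systems.
 */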
2093 
2094 /*
2095  * This is called once when unloading the dm_bufio module.
2096  */
2097 static void __exit dm_bufio_exit(void)
2098 {
2099 	int bug = 0;
2100 
2101 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2102 	flush_workqueue(dm_bufio_wq);
2103 	destroy_workqueue(dm_bufio_wq);
2104 
2105 	if (dm_bufio_client_count) {
2106 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2107 			__func__, dm_bufio_client_count);
2108 		bug = 1;
2109 	}
2110 
2111 	if (dm_bufio_current_allocated) {
2112 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2113 			__func__, dm_bufio_current_allocated);
2114 		bug = 1;
2115 	}
2116 
2117 	if (dm_bufio_allocated_get_free_pages) {
2118 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2119 		       __func__, dm_bufio_allocated_get_free_pages);
2120 		bug = 1;
2121 	}
2122 
2123 	if (dm_bufio_allocated_vmalloc) {
2124 		DMCRIT("%s: dm_bufio_allocated_vmalloc leaked: %lu",
2125 		       __func__, dm_bufio_allocated_vmalloc);
2126 		bug = 1;
2127 	}
2128 
2129 	BUG_ON(bug);
2130 }
2131 
2132 module_init(dm_bufio_init)
2133 module_exit(dm_bufio_exit)
2134 
2135 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2136 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2137 
2138 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2139 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2140 
2141 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2142 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2143 
2144 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2145 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2146 
2147 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2148 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2149 
2150 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2151 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2152 
2153 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2154 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2155 
2156 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2157 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
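/*
 * Usage note (added commentary): module_param_named() also exposes these
 * parameters under /sys/module/dm_bufio/parameters/ at runtime.  For
 * example, writing to max_cache_size_bytes (mode S_IRUGO | S_IWUSR) changes
 * dm_bufio_cache_size on a live system, and the read-only statistics such as
 * current_allocated_bytes can be read from the same directory.
 */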
2158 
2159 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2160 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2161 MODULE_LICENSE("GPL");
2162