/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <trace/events/bcache.h>

/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
				bcache_flash_devs_sectors_dirty(c);
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

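	/*
	 * The PD controller below works in sectors/s: the proportional term
	 * is the error (dirty - target) scaled by
	 * writeback_rate_update_seconds and divided by
	 * writeback_rate_p_term_inverse; the derivative term is the
	 * EWMA-smoothed change in dirty sectors per second, scaled by
	 * writeback_rate_d_term / writeback_rate_p_term_inverse.
	 *
	 * Rough illustration only, using the defaults from
	 * bch_cached_dev_writeback_init() (5s updates, p_term_inverse 6000):
	 * being 1,200,000 sectors over target contributes about
	 * 1,200,000 * 5 / 6000 = 1000 sectors/s of rate increase at the next
	 * update, before the derivative term is added.
	 */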
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
	int64_t proportional = dirty - target;
	int64_t change;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale to sectors per second */

	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);

	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);

	change = proportional + derivative;

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);

	dc->writeback_rate_proportional = proportional;
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

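/*
 * Ask the rate limiter how long to sleep before issuing the next chunk of
 * writeback; the result feeds schedule_timeout_interruptible() in
 * read_dirty(), so it is a delay in jiffies. A zero return means no
 * throttling (device detaching, or writeback_percent set to 0).
 */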
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

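/*
 * Per-extent writeback state: the owning cached_dev, a closure tracking the
 * read-then-write sequence, and a bio whose bio_vecs live inline right after
 * this struct (see the kzalloc() sizing in read_dirty()).
 */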
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
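		/*
		 * The buckets just pinned presumably can't be reused or
		 * invalidated until the btree insert below completes;
		 * passing &w->key as the replace key makes that insert
		 * conditional, so a write that raced with this writeback is
		 * reported as a collision instead of being marked clean.
		 */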

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}

static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    bio->bi_error, "reading dirty data from cache");

	dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}

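/*
 * Writeback of one key runs as a closure-driven pipeline: read_dirty()
 * allocates a dirty_io and reads the extent from the cache device
 * (read_dirty_submit/read_dirty_endio), write_dirty() reuses the same bio to
 * write it to the backing device, and write_dirty_finish() clears the dirty
 * bit in the btree and releases the in_flight semaphore.
 */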
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {
		try_to_freeze();

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}
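
	/*
	 * The if (0) block above is only reached via the gotos: on an
	 * allocation failure the key is dropped from the writeback keybuf
	 * and the loop exits. The key itself stays dirty in the btree, so a
	 * later refill will find it again (hence the XXX above about
	 * spinning if allocations keep failing).
	 */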

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}
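
/*
 * bcache_dev_sectors_dirty_add() maintains a dirty-sector count per
 * backing-device stripe plus the full_dirty_stripes bitmap;
 * refill_full_stripes() below scans that bitmap so that, on devices where
 * partial stripe writes are expensive, writeback can prefer stripes that are
 * completely dirty.
 */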

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

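/*
 * Refill the writeback keybuf only from stripes that are completely dirty:
 * walk the full_dirty_stripes bitmap starting at the last scanned position,
 * wrapping around once, and stop early once the keybuf's freelist is
 * exhausted.
 */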
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * Make sure the keybuf position is inside the range for this disk -
	 * at bringup we might not be attached yet, so this disk's inode
	 * number may not be initialized.
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end, start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}

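/*
 * Main writeback loop: refill the keybuf with dirty keys, write them out via
 * read_dirty(), and once a full scan of the keyspace leaves the keybuf empty,
 * mark the backing device clean in its superblock (or stop entirely if the
 * device is detaching).
 */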
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		set_current_state(TASK_INTERRUPTIBLE);
		/*
		 * If the bcache device is detaching, skip the sleep here and
		 * continue to perform writeback. Otherwise, if there is no
		 * dirty data on the cache, or there is dirty data but
		 * writeback is disabled, the writeback thread should sleep
		 * here and wait for someone to wake it up.
		 */
		if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		    (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
			up_write(&dc->writeback_lock);

			if (kthread_should_stop()) {
				set_current_state(TASK_RUNNING);
				return 0;
			}

			try_to_freeze();
			schedule();
			continue;
		}
		set_current_state(TASK_RUNNING);

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
			/*
			 * If the bcache device is detaching via the sysfs
			 * interface, the writeback thread should stop once
			 * there is no dirty data left on the cache. The
			 * BCACHE_DEV_DETACHING flag is set in
			 * bch_cached_dev_detach().
			 */
			if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) {
				up_write(&dc->writeback_lock);
				break;
			}
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

/* Init */

struct sectors_dirty_init {
	struct btree_op	op;
	unsigned	inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
						struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

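/*
 * Walk the btree once when the device is brought up to rebuild the in-memory
 * per-stripe dirty counters from keys already marked dirty on disk, and seed
 * sectors_dirty_last so the rate controller's derivative term starts from the
 * current value.
 */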
void bch_sectors_dirty_init(struct bcache_device *d)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;

	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);

	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term	= 30;
	dc->writeback_rate_p_term_inverse = 6000;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
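
/*
 * The defaults above are only starting points: bcache exposes the writeback_*
 * knobs (writeback_percent, writeback_rate, writeback_delay, the rate
 * controller terms) through sysfs, so they can be tuned at runtime.
 * writeback_rate is in sectors per second; see the "Scale to sectors per
 * second" step in __update_writeback_rate().
 */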

int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}
562