1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  *
4  * Uses a block device as cache for other block devices; optimized for SSDs.
5  * All allocation is done in buckets, which should match the erase block size
6  * of the device.
7  *
8  * Buckets containing cached data are kept on a heap sorted by priority;
9  * bucket priority is increased on cache hit, and periodically all the buckets
10  * on the heap have their priority scaled down. This currently is just used as
11  * an LRU but in the future should allow for more intelligent heuristics.
12  *
13  * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14  * counter. Garbage collection is used to remove stale pointers.
15  *
16  * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17  * as keys are inserted we only sort the pages that have not yet been written.
18  * When garbage collection is run, we resort the entire node.
19  *
20  * All configuration is done via sysfs; see Documentation/bcache.txt.
21  */
22 
23 #include "bcache.h"
24 #include "btree.h"
25 #include "debug.h"
26 #include "extents.h"
27 
28 #include <linux/slab.h>
29 #include <linux/bitops.h>
30 #include <linux/freezer.h>
31 #include <linux/hash.h>
32 #include <linux/kthread.h>
33 #include <linux/prefetch.h>
34 #include <linux/random.h>
35 #include <linux/rcupdate.h>
36 #include <trace/events/bcache.h>
37 
38 /*
39  * Todo:
40  * register_bcache: Return errors out to userspace correctly
41  *
42  * Writeback: don't undirty key until after a cache flush
43  *
44  * Create an iterator for key pointers
45  *
46  * On btree write error, mark bucket such that it won't be freed from the cache
47  *
48  * Journalling:
49  *   Check for bad keys in replay
50  *   Propagate barriers
51  *   Refcount journal entries in journal_replay
52  *
53  * Garbage collection:
54  *   Finish incremental gc
55  *   Gc should free old UUIDs, data for invalid UUIDs
56  *
57  * Provide a way to list backing device UUIDs we have data cached for, and
58  * probably how long it's been since we've seen them, and a way to invalidate
59  * dirty data for devices that will never be attached again
60  *
61  * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
62  * that based on that and how much dirty data we have we can keep writeback
63  * from being starved
64  *
65  * Add a tracepoint or somesuch to watch for writeback starvation
66  *
67  * When btree depth > 1 and splitting an interior node, we have to make sure
68  * alloc_bucket() cannot fail. This should be true but is not completely
69  * obvious.
70  *
71  * Plugging?
72  *
73  * If data write is less than hard sector size of ssd, round up offset in open
74  * bucket to the next whole sector
75  *
76  * Superblock needs to be fleshed out for multiple cache devices
77  *
78  * Add a sysfs tunable for the number of writeback IOs in flight
79  *
80  * Add a sysfs tunable for the number of open data buckets
81  *
82  * IO tracking: Can we track when one process is doing io on behalf of another?
83  * IO tracking: Don't use just an average, weigh more recent stuff higher
84  *
85  * Test module load/unload
86  */
87 
88 #define MAX_NEED_GC		64
89 #define MAX_SAVE_PRIO		72
90 
91 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
92 
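/*
 * PTR_HASH() derives a per-node identity from a key's first pointer (bucket
 * bits plus generation); mca_hash()/mca_find() below use it to index and look
 * up nodes in the in-memory btree node hash table.
 */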
93 #define PTR_HASH(c, k)							\
94 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
95 
96 #define insert_lock(s, b)	((b)->level <= (s)->lock)
97 
98 /*
99  * These macros are for recursing down the btree - they handle the details of
100  * locking and looking up nodes in the cache for you. They're best treated as
101  * mere syntax when reading code that uses them.
102  *
103  * op->lock determines whether we take a read or a write lock at a given depth.
104  * If you've got a read lock and find that you need a write lock (i.e. you're
105  * going to have to split), set op->lock and return -EINTR; btree_root() will
106  * call you again and you'll have the correct lock.
107  */
108 
109 /**
110  * btree - recurse down the btree on a specified key
111  * @fn:		function to call, which will be passed the child node
112  * @key:	key to recurse on
113  * @b:		parent btree node
114  * @op:		pointer to struct btree_op
115  */
116 #define btree(fn, key, b, op, ...)					\
117 ({									\
118 	int _r, l = (b)->level - 1;					\
119 	bool _w = l <= (op)->lock;					\
120 	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
121 						  _w, b);		\
122 	if (!IS_ERR(_child)) {						\
123 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
124 		rw_unlock(_w, _child);					\
125 	} else								\
126 		_r = PTR_ERR(_child);					\
127 	_r;								\
128 })
129 
130 /**
131  * btree_root - call a function on the root of the btree
132  * @fn:		function to call, which will be passed the child node
133  * @c:		cache set
134  * @op:		pointer to struct btree_op
135  */
136 #define btree_root(fn, c, op, ...)					\
137 ({									\
138 	int _r = -EINTR;						\
139 	do {								\
140 		struct btree *_b = (c)->root;				\
141 		bool _w = insert_lock(op, _b);				\
142 		rw_lock(_w, _b, _b->level);				\
143 		if (_b == (c)->root &&					\
144 		    _w == insert_lock(op, _b)) {			\
145 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
146 		}							\
147 		rw_unlock(_w, _b);					\
148 		bch_cannibalize_unlock(c);				\
149 		if (_r == -EINTR)					\
150 			schedule();					\
151 	} while (_r == -EINTR);						\
152 									\
153 	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
154 	_r;								\
155 })
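
/*
 * Usage sketch (illustrative): these macros are invoked with the suffix of a
 * bch_btree_* function. For example, bch_btree_check() below calls
 *
 *	btree_root(check_recurse, c, &op);
 *
 * and bch_btree_check_recurse() recurses into a child with
 *
 *	btree(check_recurse, p, b, op);
 *
 * which expands into a locked call to bch_btree_check_recurse() on the child
 * node the key points to.
 */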
156 
157 static inline struct bset *write_block(struct btree *b)
158 {
159 	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
160 }
161 
162 static void bch_btree_init_next(struct btree *b)
163 {
164 	/* If not a leaf node, always sort */
165 	if (b->level && b->keys.nsets)
166 		bch_btree_sort(&b->keys, &b->c->sort);
167 	else
168 		bch_btree_sort_lazy(&b->keys, &b->c->sort);
169 
170 	if (b->written < btree_blocks(b))
171 		bch_bset_init_next(&b->keys, write_block(b),
172 				   bset_magic(&b->c->sb));
173 
174 }
175 
176 /* Btree key manipulation */
177 
178 void bkey_put(struct cache_set *c, struct bkey *k)
179 {
180 	unsigned i;
181 
182 	for (i = 0; i < KEY_PTRS(k); i++)
183 		if (ptr_available(c, k, i))
184 			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
185 }
186 
187 /* Btree IO */
188 
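/*
 * Note: the checksum is seeded with the node's first pointer and covers the
 * bset starting 8 bytes in, i.e. it skips struct bset's leading csum field so
 * the checksum never covers itself.
 */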
189 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
190 {
191 	uint64_t crc = b->key.ptr[0];
192 	void *data = (void *) i + 8, *end = bset_bkey_last(i);
193 
194 	crc = bch_crc64_update(crc, data, end - data);
195 	return crc ^ 0xffffffffffffffffULL;
196 }
197 
198 void bch_btree_node_read_done(struct btree *b)
199 {
200 	const char *err = "bad btree header";
201 	struct bset *i = btree_bset_first(b);
202 	struct btree_iter *iter;
203 
204 	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
205 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
206 	iter->used = 0;
207 
208 #ifdef CONFIG_BCACHE_DEBUG
209 	iter->b = &b->keys;
210 #endif
211 
212 	if (!i->seq)
213 		goto err;
214 
215 	for (;
216 	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
217 	     i = write_block(b)) {
218 		err = "unsupported bset version";
219 		if (i->version > BCACHE_BSET_VERSION)
220 			goto err;
221 
222 		err = "bad btree header";
223 		if (b->written + set_blocks(i, block_bytes(b->c)) >
224 		    btree_blocks(b))
225 			goto err;
226 
227 		err = "bad magic";
228 		if (i->magic != bset_magic(&b->c->sb))
229 			goto err;
230 
231 		err = "bad checksum";
232 		switch (i->version) {
233 		case 0:
234 			if (i->csum != csum_set(i))
235 				goto err;
236 			break;
237 		case BCACHE_BSET_VERSION:
238 			if (i->csum != btree_csum_set(b, i))
239 				goto err;
240 			break;
241 		}
242 
243 		err = "empty set";
244 		if (i != b->keys.set[0].data && !i->keys)
245 			goto err;
246 
247 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
248 
249 		b->written += set_blocks(i, block_bytes(b->c));
250 	}
251 
252 	err = "corrupted btree";
253 	for (i = write_block(b);
254 	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
255 	     i = ((void *) i) + block_bytes(b->c))
256 		if (i->seq == b->keys.set[0].data->seq)
257 			goto err;
258 
259 	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
260 
261 	i = b->keys.set[0].data;
262 	err = "short btree key";
263 	if (b->keys.set[0].size &&
264 	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
265 		goto err;
266 
267 	if (b->written < btree_blocks(b))
268 		bch_bset_init_next(&b->keys, write_block(b),
269 				   bset_magic(&b->c->sb));
270 out:
271 	mempool_free(iter, b->c->fill_iter);
272 	return;
273 err:
274 	set_btree_node_io_error(b);
275 	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
276 			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
277 			    bset_block_offset(b, i), i->keys);
278 	goto out;
279 }
280 
281 static void btree_node_read_endio(struct bio *bio)
282 {
283 	struct closure *cl = bio->bi_private;
284 	closure_put(cl);
285 }
286 
287 static void bch_btree_node_read(struct btree *b)
288 {
289 	uint64_t start_time = local_clock();
290 	struct closure cl;
291 	struct bio *bio;
292 
293 	trace_bcache_btree_read(b);
294 
295 	closure_init_stack(&cl);
296 
297 	bio = bch_bbio_alloc(b->c);
298 	bio->bi_rw	= REQ_META|READ_SYNC;
299 	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
300 	bio->bi_end_io	= btree_node_read_endio;
301 	bio->bi_private	= &cl;
302 
303 	bch_bio_map(bio, b->keys.set[0].data);
304 
305 	bch_submit_bbio(bio, b->c, &b->key, 0);
306 	closure_sync(&cl);
307 
308 	if (bio->bi_error)
309 		set_btree_node_io_error(b);
310 
311 	bch_bbio_free(bio, b->c);
312 
313 	if (btree_node_io_error(b))
314 		goto err;
315 
316 	bch_btree_node_read_done(b);
317 	bch_time_stats_update(&b->c->btree_read_time, start_time);
318 
319 	return;
320 err:
321 	bch_cache_set_error(b->c, "io error reading bucket %zu",
322 			    PTR_BUCKET_NR(b->c, &b->key, 0));
323 }
324 
325 static void btree_complete_write(struct btree *b, struct btree_write *w)
326 {
327 	if (w->prio_blocked &&
328 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
329 		wake_up_allocators(b->c);
330 
331 	if (w->journal) {
332 		atomic_dec_bug(w->journal);
333 		__closure_wake_up(&b->c->journal.wait);
334 	}
335 
336 	w->prio_blocked	= 0;
337 	w->journal	= NULL;
338 }
339 
340 static void btree_node_write_unlock(struct closure *cl)
341 {
342 	struct btree *b = container_of(cl, struct btree, io);
343 
344 	up(&b->io_mutex);
345 }
346 
347 static void __btree_node_write_done(struct closure *cl)
348 {
349 	struct btree *b = container_of(cl, struct btree, io);
350 	struct btree_write *w = btree_prev_write(b);
351 
352 	bch_bbio_free(b->bio, b->c);
353 	b->bio = NULL;
354 	btree_complete_write(b, w);
355 
356 	if (btree_node_dirty(b))
357 		schedule_delayed_work(&b->work, 30 * HZ);
358 
359 	closure_return_with_destructor(cl, btree_node_write_unlock);
360 }
361 
362 static void btree_node_write_done(struct closure *cl)
363 {
364 	struct btree *b = container_of(cl, struct btree, io);
365 	struct bio_vec *bv;
366 	int n;
367 
368 	bio_for_each_segment_all(bv, b->bio, n)
369 		__free_page(bv->bv_page);
370 
371 	__btree_node_write_done(cl);
372 }
373 
374 static void btree_node_write_endio(struct bio *bio)
375 {
376 	struct closure *cl = bio->bi_private;
377 	struct btree *b = container_of(cl, struct btree, io);
378 
379 	if (bio->bi_error)
380 		set_btree_node_io_error(b);
381 
382 	bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
383 	closure_put(cl);
384 }
385 
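/*
 * Submit the current bset for writing. If bio_alloc_pages() succeeds, the
 * bset is copied into bounce pages and the write completes asynchronously
 * (btree_node_write_done() frees those pages); otherwise the bio is pointed
 * straight at the bset memory and we wait for the IO before continuing.
 */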
386 static void do_btree_node_write(struct btree *b)
387 {
388 	struct closure *cl = &b->io;
389 	struct bset *i = btree_bset_last(b);
390 	BKEY_PADDED(key) k;
391 
392 	i->version	= BCACHE_BSET_VERSION;
393 	i->csum		= btree_csum_set(b, i);
394 
395 	BUG_ON(b->bio);
396 	b->bio = bch_bbio_alloc(b->c);
397 
398 	b->bio->bi_end_io	= btree_node_write_endio;
399 	b->bio->bi_private	= cl;
400 	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
401 	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
402 	bch_bio_map(b->bio, i);
403 
404 	/*
405 	 * If we're appending to a leaf node, we don't technically need FUA -
406 	 * this write just needs to be persisted before the next journal write,
407 	 * which will be marked FLUSH|FUA.
408 	 *
409 	 * Similarly if we're writing a new btree root - the pointer is going to
410 	 * be in the next journal entry.
411 	 *
412 	 * But if we're writing a new btree node (that isn't a root) or
413 	 * appending to a non leaf btree node, we need either FUA or a flush
414 	 * when we write the parent with the new pointer. FUA is cheaper than a
415 	 * flush, and writes appending to leaf nodes aren't blocking anything so
416 	 * just make all btree node writes FUA to keep things sane.
417 	 */
418 
419 	bkey_copy(&k.key, &b->key);
420 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
421 		       bset_sector_offset(&b->keys, i));
422 
423 	if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
424 		int j;
425 		struct bio_vec *bv;
426 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
427 
428 		bio_for_each_segment_all(bv, b->bio, j)
429 			memcpy(page_address(bv->bv_page),
430 			       base + j * PAGE_SIZE, PAGE_SIZE);
431 
432 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
433 
434 		continue_at(cl, btree_node_write_done, NULL);
435 	} else {
436 		b->bio->bi_vcnt = 0;
437 		bch_bio_map(b->bio, i);
438 
439 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
440 
441 		closure_sync(cl);
442 		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
443 	}
444 }
445 
446 void __bch_btree_node_write(struct btree *b, struct closure *parent)
447 {
448 	struct bset *i = btree_bset_last(b);
449 
450 	lockdep_assert_held(&b->write_lock);
451 
452 	trace_bcache_btree_write(b);
453 
454 	BUG_ON(current->bio_list);
455 	BUG_ON(b->written >= btree_blocks(b));
456 	BUG_ON(b->written && !i->keys);
457 	BUG_ON(btree_bset_first(b)->seq != i->seq);
458 	bch_check_keys(&b->keys, "writing");
459 
460 	cancel_delayed_work(&b->work);
461 
462 	/* If caller isn't waiting for write, parent refcount is cache set */
463 	down(&b->io_mutex);
464 	closure_init(&b->io, parent ?: &b->c->cl);
465 
466 	clear_bit(BTREE_NODE_dirty,	 &b->flags);
467 	change_bit(BTREE_NODE_write_idx, &b->flags);
468 
469 	do_btree_node_write(b);
470 
471 	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
472 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
473 
474 	b->written += set_blocks(i, block_bytes(b->c));
475 }
476 
477 void bch_btree_node_write(struct btree *b, struct closure *parent)
478 {
479 	unsigned nsets = b->keys.nsets;
480 
481 	lockdep_assert_held(&b->lock);
482 
483 	__bch_btree_node_write(b, parent);
484 
485 	/*
486 	 * do verify if there was more than one set initially (i.e. we did a
487 	 * sort) and we sorted down to a single set:
488 	 */
489 	if (nsets && !b->keys.nsets)
490 		bch_btree_verify(b);
491 
492 	bch_btree_init_next(b);
493 }
494 
495 static void bch_btree_node_write_sync(struct btree *b)
496 {
497 	struct closure cl;
498 
499 	closure_init_stack(&cl);
500 
501 	mutex_lock(&b->write_lock);
502 	bch_btree_node_write(b, &cl);
503 	mutex_unlock(&b->write_lock);
504 
505 	closure_sync(&cl);
506 }
507 
508 static void btree_node_write_work(struct work_struct *w)
509 {
510 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
511 
512 	mutex_lock(&b->write_lock);
513 	if (btree_node_dirty(b))
514 		__bch_btree_node_write(b, NULL);
515 	mutex_unlock(&b->write_lock);
516 }
517 
518 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
519 {
520 	struct bset *i = btree_bset_last(b);
521 	struct btree_write *w = btree_current_write(b);
522 
523 	lockdep_assert_held(&b->write_lock);
524 
525 	BUG_ON(!b->written);
526 	BUG_ON(!i->keys);
527 
528 	if (!btree_node_dirty(b))
529 		schedule_delayed_work(&b->work, 30 * HZ);
530 
531 	set_btree_node_dirty(b);
532 
533 	if (journal_ref) {
534 		if (w->journal &&
535 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
536 			atomic_dec_bug(w->journal);
537 			w->journal = NULL;
538 		}
539 
540 		if (!w->journal) {
541 			w->journal = journal_ref;
542 			atomic_inc(w->journal);
543 		}
544 	}
545 
546 	/* Force write if set is too big */
547 	if (set_bytes(i) > PAGE_SIZE - 48 &&
548 	    !current->bio_list)
549 		bch_btree_node_write(b, NULL);
550 }
551 
552 /*
553  * Btree in memory cache - allocation/freeing
554  * mca -> memory cache
555  */
556 
557 #define mca_reserve(c)	(((c->root && c->root->level)		\
558 			  ? c->root->level : 1) * 8 + 16)
559 #define mca_can_free(c)						\
560 	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
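
/*
 * mca_reserve() is the number of cached btree nodes we always keep around:
 * 8 per level of the btree (at least one level) plus 16. mca_can_free() is
 * how many nodes above that reserve the shrinker may reclaim.
 */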
561 
562 static void mca_data_free(struct btree *b)
563 {
564 	BUG_ON(b->io_mutex.count != 1);
565 
566 	bch_btree_keys_free(&b->keys);
567 
568 	b->c->btree_cache_used--;
569 	list_move(&b->list, &b->c->btree_cache_freed);
570 }
571 
572 static void mca_bucket_free(struct btree *b)
573 {
574 	BUG_ON(btree_node_dirty(b));
575 
576 	b->key.ptr[0] = 0;
577 	hlist_del_init_rcu(&b->hash);
578 	list_move(&b->list, &b->c->btree_cache_freeable);
579 }
580 
581 static unsigned btree_order(struct bkey *k)
582 {
583 	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
584 }
585 
586 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
587 {
588 	if (!bch_btree_keys_alloc(&b->keys,
589 				  max_t(unsigned,
590 					ilog2(b->c->btree_pages),
591 					btree_order(k)),
592 				  gfp)) {
593 		b->c->btree_cache_used++;
594 		list_move(&b->list, &b->c->btree_cache);
595 	} else {
596 		list_move(&b->list, &b->c->btree_cache_freed);
597 	}
598 }
599 
600 static struct btree *mca_bucket_alloc(struct cache_set *c,
601 				      struct bkey *k, gfp_t gfp)
602 {
603 	struct btree *b = kzalloc(sizeof(struct btree), gfp);
604 	if (!b)
605 		return NULL;
606 
607 	init_rwsem(&b->lock);
608 	lockdep_set_novalidate_class(&b->lock);
609 	mutex_init(&b->write_lock);
610 	lockdep_set_novalidate_class(&b->write_lock);
611 	INIT_LIST_HEAD(&b->list);
612 	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
613 	b->c = c;
614 	sema_init(&b->io_mutex, 1);
615 
616 	mca_data_alloc(b, k, gfp);
617 	return b;
618 }
619 
620 static int mca_reap(struct btree *b, unsigned min_order, bool flush)
621 {
622 	struct closure cl;
623 
624 	closure_init_stack(&cl);
625 	lockdep_assert_held(&b->c->bucket_lock);
626 
627 	if (!down_write_trylock(&b->lock))
628 		return -ENOMEM;
629 
630 	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
631 
632 	if (b->keys.page_order < min_order)
633 		goto out_unlock;
634 
635 	if (!flush) {
636 		if (btree_node_dirty(b))
637 			goto out_unlock;
638 
639 		if (down_trylock(&b->io_mutex))
640 			goto out_unlock;
641 		up(&b->io_mutex);
642 	}
643 
644 	mutex_lock(&b->write_lock);
645 	if (btree_node_dirty(b))
646 		__bch_btree_node_write(b, &cl);
647 	mutex_unlock(&b->write_lock);
648 
649 	closure_sync(&cl);
650 
651 	/* wait for any in flight btree write */
652 	down(&b->io_mutex);
653 	up(&b->io_mutex);
654 
655 	return 0;
656 out_unlock:
657 	rw_unlock(true, b);
658 	return -ENOMEM;
659 }
660 
661 static unsigned long bch_mca_scan(struct shrinker *shrink,
662 				  struct shrink_control *sc)
663 {
664 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
665 	struct btree *b, *t;
666 	unsigned long i, nr = sc->nr_to_scan;
667 	unsigned long freed = 0;
668 
669 	if (c->shrinker_disabled)
670 		return SHRINK_STOP;
671 
672 	if (c->btree_cache_alloc_lock)
673 		return SHRINK_STOP;
674 
675 	/* Return -1 if we can't do anything right now */
676 	if (sc->gfp_mask & __GFP_IO)
677 		mutex_lock(&c->bucket_lock);
678 	else if (!mutex_trylock(&c->bucket_lock))
679 		return -1;
680 
681 	/*
682 	 * It's _really_ critical that we don't free too many btree nodes - we
683 	 * have to always leave ourselves a reserve. The reserve is how we
684 	 * guarantee that allocating memory for a new btree node can always
685 	 * succeed, so that inserting keys into the btree can always succeed and
686 	 * IO can always make forward progress:
687 	 */
688 	nr /= c->btree_pages;
689 	if (nr == 0)
690 		nr = 1;
691 	nr = min_t(unsigned long, nr, mca_can_free(c));
692 
693 	i = 0;
694 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
695 		if (freed >= nr)
696 			break;
697 
698 		if (++i > 3 &&
699 		    !mca_reap(b, 0, false)) {
700 			mca_data_free(b);
701 			rw_unlock(true, b);
702 			freed++;
703 		}
704 	}
705 
706 	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
707 		if (list_empty(&c->btree_cache))
708 			goto out;
709 
710 		b = list_first_entry(&c->btree_cache, struct btree, list);
711 		list_rotate_left(&c->btree_cache);
712 
713 		if (!b->accessed &&
714 		    !mca_reap(b, 0, false)) {
715 			mca_bucket_free(b);
716 			mca_data_free(b);
717 			rw_unlock(true, b);
718 			freed++;
719 		} else
720 			b->accessed = 0;
721 	}
722 out:
723 	mutex_unlock(&c->bucket_lock);
724 	return freed;
725 }
726 
727 static unsigned long bch_mca_count(struct shrinker *shrink,
728 				   struct shrink_control *sc)
729 {
730 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
731 
732 	if (c->shrinker_disabled)
733 		return 0;
734 
735 	if (c->btree_cache_alloc_lock)
736 		return 0;
737 
738 	return mca_can_free(c) * c->btree_pages;
739 }
740 
741 void bch_btree_cache_free(struct cache_set *c)
742 {
743 	struct btree *b;
744 	struct closure cl;
745 	closure_init_stack(&cl);
746 
747 	if (c->shrink.list.next)
748 		unregister_shrinker(&c->shrink);
749 
750 	mutex_lock(&c->bucket_lock);
751 
752 #ifdef CONFIG_BCACHE_DEBUG
753 	if (c->verify_data)
754 		list_move(&c->verify_data->list, &c->btree_cache);
755 
756 	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
757 #endif
758 
759 	list_splice(&c->btree_cache_freeable,
760 		    &c->btree_cache);
761 
762 	while (!list_empty(&c->btree_cache)) {
763 		b = list_first_entry(&c->btree_cache, struct btree, list);
764 
765 		if (btree_node_dirty(b))
766 			btree_complete_write(b, btree_current_write(b));
767 		clear_bit(BTREE_NODE_dirty, &b->flags);
768 
769 		mca_data_free(b);
770 	}
771 
772 	while (!list_empty(&c->btree_cache_freed)) {
773 		b = list_first_entry(&c->btree_cache_freed,
774 				     struct btree, list);
775 		list_del(&b->list);
776 		cancel_delayed_work_sync(&b->work);
777 		kfree(b);
778 	}
779 
780 	mutex_unlock(&c->bucket_lock);
781 }
782 
783 int bch_btree_cache_alloc(struct cache_set *c)
784 {
785 	unsigned i;
786 
787 	for (i = 0; i < mca_reserve(c); i++)
788 		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
789 			return -ENOMEM;
790 
791 	list_splice_init(&c->btree_cache,
792 			 &c->btree_cache_freeable);
793 
794 #ifdef CONFIG_BCACHE_DEBUG
795 	mutex_init(&c->verify_lock);
796 
797 	c->verify_ondisk = (void *)
798 		__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
799 
800 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
801 
802 	if (c->verify_data &&
803 	    c->verify_data->keys.set->data)
804 		list_del_init(&c->verify_data->list);
805 	else
806 		c->verify_data = NULL;
807 #endif
808 
809 	c->shrink.count_objects = bch_mca_count;
810 	c->shrink.scan_objects = bch_mca_scan;
811 	c->shrink.seeks = 4;
812 	c->shrink.batch = c->btree_pages * 2;
813 
814 	if (register_shrinker(&c->shrink))
815 		pr_warn("bcache: %s: could not register shrinker",
816 				__func__);
817 
818 	return 0;
819 }
820 
821 /* Btree in memory cache - hash table */
822 
823 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
824 {
825 	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
826 }
827 
828 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
829 {
830 	struct btree *b;
831 
832 	rcu_read_lock();
833 	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
834 		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
835 			goto out;
836 	b = NULL;
837 out:
838 	rcu_read_unlock();
839 	return b;
840 }
841 
842 static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
843 {
844 	spin_lock(&c->btree_cannibalize_lock);
845 	if (likely(c->btree_cache_alloc_lock == NULL)) {
846 		c->btree_cache_alloc_lock = current;
847 	} else if (c->btree_cache_alloc_lock != current) {
848 		if (op)
849 			prepare_to_wait(&c->btree_cache_wait, &op->wait,
850 					TASK_UNINTERRUPTIBLE);
851 		spin_unlock(&c->btree_cannibalize_lock);
852 		return -EINTR;
853 	}
854 	spin_unlock(&c->btree_cannibalize_lock);
855 
856 	return 0;
857 }
858 
859 static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
860 				     struct bkey *k)
861 {
862 	struct btree *b;
863 
864 	trace_bcache_btree_cache_cannibalize(c);
865 
866 	if (mca_cannibalize_lock(c, op))
867 		return ERR_PTR(-EINTR);
868 
869 	list_for_each_entry_reverse(b, &c->btree_cache, list)
870 		if (!mca_reap(b, btree_order(k), false))
871 			return b;
872 
873 	list_for_each_entry_reverse(b, &c->btree_cache, list)
874 		if (!mca_reap(b, btree_order(k), true))
875 			return b;
876 
877 	WARN(1, "btree cache cannibalize failed\n");
878 	return ERR_PTR(-ENOMEM);
879 }
880 
881 /*
882  * We can only have one thread cannibalizing other cached btree nodes at a time,
883  * or we'll deadlock. We use an open coded mutex to ensure that, which
884  * cannibalize_bucket() will take. This means every time we unlock the root of
885  * the btree, we need to release this lock if we have it held.
886  */
887 static void bch_cannibalize_unlock(struct cache_set *c)
888 {
889 	spin_lock(&c->btree_cannibalize_lock);
890 	if (c->btree_cache_alloc_lock == current) {
891 		c->btree_cache_alloc_lock = NULL;
892 		wake_up(&c->btree_cache_wait);
893 	}
894 	spin_unlock(&c->btree_cannibalize_lock);
895 }
896 
897 static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
898 			       struct bkey *k, int level)
899 {
900 	struct btree *b;
901 
902 	BUG_ON(current->bio_list);
903 
904 	lockdep_assert_held(&c->bucket_lock);
905 
906 	if (mca_find(c, k))
907 		return NULL;
908 
909 	/* btree_free() doesn't free memory; it sticks the node on the end of
910 	 * the list. Check if there are any freed nodes there:
911 	 */
912 	list_for_each_entry(b, &c->btree_cache_freeable, list)
913 		if (!mca_reap(b, btree_order(k), false))
914 			goto out;
915 
916 	/* We never free struct btree itself, just the memory that holds the on
917 	 * disk node. Check the freed list before allocating a new one:
918 	 */
919 	list_for_each_entry(b, &c->btree_cache_freed, list)
920 		if (!mca_reap(b, 0, false)) {
921 			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
922 			if (!b->keys.set[0].data)
923 				goto err;
924 			else
925 				goto out;
926 		}
927 
928 	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
929 	if (!b)
930 		goto err;
931 
932 	BUG_ON(!down_write_trylock(&b->lock));
933 	if (!b->keys.set->data)
934 		goto err;
935 out:
936 	BUG_ON(b->io_mutex.count != 1);
937 
938 	bkey_copy(&b->key, k);
939 	list_move(&b->list, &c->btree_cache);
940 	hlist_del_init_rcu(&b->hash);
941 	hlist_add_head_rcu(&b->hash, mca_hash(c, k));
942 
943 	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
944 	b->parent	= (void *) ~0UL;
945 	b->flags	= 0;
946 	b->written	= 0;
947 	b->level	= level;
948 
949 	if (!b->level)
950 		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
951 				    &b->c->expensive_debug_checks);
952 	else
953 		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
954 				    &b->c->expensive_debug_checks);
955 
956 	return b;
957 err:
958 	if (b)
959 		rw_unlock(true, b);
960 
961 	b = mca_cannibalize(c, op, k);
962 	if (!IS_ERR(b))
963 		goto out;
964 
965 	return b;
966 }
967 
968 /**
969  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
970  * in from disk if necessary.
971  *
972  * If IO is necessary and running under generic_make_request, returns -EAGAIN.
973  *
974  * The btree node will have either a read or a write lock held, depending on
975  * level and op->lock.
976  */
977 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
978 				 struct bkey *k, int level, bool write,
979 				 struct btree *parent)
980 {
981 	int i = 0;
982 	struct btree *b;
983 
984 	BUG_ON(level < 0);
985 retry:
986 	b = mca_find(c, k);
987 
988 	if (!b) {
989 		if (current->bio_list)
990 			return ERR_PTR(-EAGAIN);
991 
992 		mutex_lock(&c->bucket_lock);
993 		b = mca_alloc(c, op, k, level);
994 		mutex_unlock(&c->bucket_lock);
995 
996 		if (!b)
997 			goto retry;
998 		if (IS_ERR(b))
999 			return b;
1000 
1001 		bch_btree_node_read(b);
1002 
1003 		if (!write)
1004 			downgrade_write(&b->lock);
1005 	} else {
1006 		rw_lock(write, b, level);
1007 		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1008 			rw_unlock(write, b);
1009 			goto retry;
1010 		}
1011 		BUG_ON(b->level != level);
1012 	}
1013 
1014 	b->parent = parent;
1015 	b->accessed = 1;
1016 
1017 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1018 		prefetch(b->keys.set[i].tree);
1019 		prefetch(b->keys.set[i].data);
1020 	}
1021 
1022 	for (; i <= b->keys.nsets; i++)
1023 		prefetch(b->keys.set[i].data);
1024 
1025 	if (btree_node_io_error(b)) {
1026 		rw_unlock(write, b);
1027 		return ERR_PTR(-EIO);
1028 	}
1029 
1030 	BUG_ON(!b->written);
1031 
1032 	return b;
1033 }
1034 
1035 static void btree_node_prefetch(struct btree *parent, struct bkey *k)
1036 {
1037 	struct btree *b;
1038 
1039 	mutex_lock(&parent->c->bucket_lock);
1040 	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
1041 	mutex_unlock(&parent->c->bucket_lock);
1042 
1043 	if (!IS_ERR_OR_NULL(b)) {
1044 		b->parent = parent;
1045 		bch_btree_node_read(b);
1046 		rw_unlock(true, b);
1047 	}
1048 }
1049 
1050 /* Btree alloc */
1051 
1052 static void btree_node_free(struct btree *b)
1053 {
1054 	trace_bcache_btree_node_free(b);
1055 
1056 	BUG_ON(b == b->c->root);
1057 
1058 	mutex_lock(&b->write_lock);
1059 
1060 	if (btree_node_dirty(b))
1061 		btree_complete_write(b, btree_current_write(b));
1062 	clear_bit(BTREE_NODE_dirty, &b->flags);
1063 
1064 	mutex_unlock(&b->write_lock);
1065 
1066 	cancel_delayed_work(&b->work);
1067 
1068 	mutex_lock(&b->c->bucket_lock);
1069 	bch_bucket_free(b->c, &b->key);
1070 	mca_bucket_free(b);
1071 	mutex_unlock(&b->c->bucket_lock);
1072 }
1073 
1074 struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1075 				     int level, bool wait,
1076 				     struct btree *parent)
1077 {
1078 	BKEY_PADDED(key) k;
1079 	struct btree *b = ERR_PTR(-EAGAIN);
1080 
1081 	mutex_lock(&c->bucket_lock);
1082 retry:
1083 	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1084 		goto err;
1085 
1086 	bkey_put(c, &k.key);
1087 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1088 
1089 	b = mca_alloc(c, op, &k.key, level);
1090 	if (IS_ERR(b))
1091 		goto err_free;
1092 
1093 	if (!b) {
1094 		cache_bug(c,
1095 			"Tried to allocate bucket that was in btree cache");
1096 		goto retry;
1097 	}
1098 
1099 	b->accessed = 1;
1100 	b->parent = parent;
1101 	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1102 
1103 	mutex_unlock(&c->bucket_lock);
1104 
1105 	trace_bcache_btree_node_alloc(b);
1106 	return b;
1107 err_free:
1108 	bch_bucket_free(c, &k.key);
1109 err:
1110 	mutex_unlock(&c->bucket_lock);
1111 
1112 	trace_bcache_btree_node_alloc_fail(c);
1113 	return b;
1114 }
1115 
1116 static struct btree *bch_btree_node_alloc(struct cache_set *c,
1117 					  struct btree_op *op, int level,
1118 					  struct btree *parent)
1119 {
1120 	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
1121 }
1122 
1123 static struct btree *btree_node_alloc_replacement(struct btree *b,
1124 						  struct btree_op *op)
1125 {
1126 	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1127 	if (!IS_ERR_OR_NULL(n)) {
1128 		mutex_lock(&n->write_lock);
1129 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1130 		bkey_copy_key(&n->key, &b->key);
1131 		mutex_unlock(&n->write_lock);
1132 	}
1133 
1134 	return n;
1135 }
1136 
1137 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1138 {
1139 	unsigned i;
1140 
1141 	mutex_lock(&b->c->bucket_lock);
1142 
1143 	atomic_inc(&b->c->prio_blocked);
1144 
1145 	bkey_copy(k, &b->key);
1146 	bkey_copy_key(k, &ZERO_KEY);
1147 
1148 	for (i = 0; i < KEY_PTRS(k); i++)
1149 		SET_PTR_GEN(k, i,
1150 			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1151 					PTR_BUCKET(b->c, &b->key, i)));
1152 
1153 	mutex_unlock(&b->c->bucket_lock);
1154 }
1155 
1156 static int btree_check_reserve(struct btree *b, struct btree_op *op)
1157 {
1158 	struct cache_set *c = b->c;
1159 	struct cache *ca;
1160 	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
1161 
1162 	mutex_lock(&c->bucket_lock);
1163 
1164 	for_each_cache(ca, c, i)
1165 		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1166 			if (op)
1167 				prepare_to_wait(&c->btree_cache_wait, &op->wait,
1168 						TASK_UNINTERRUPTIBLE);
1169 			mutex_unlock(&c->bucket_lock);
1170 			return -EINTR;
1171 		}
1172 
1173 	mutex_unlock(&c->bucket_lock);
1174 
1175 	return mca_cannibalize_lock(b->c, op);
1176 }
1177 
1178 /* Garbage collection */
1179 
1180 static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
1181 				    struct bkey *k)
1182 {
1183 	uint8_t stale = 0;
1184 	unsigned i;
1185 	struct bucket *g;
1186 
1187 	/*
1188 	 * ptr_invalid() can't return true for the keys that mark btree nodes as
1189 	 * freed, but since ptr_bad() returns true we'll never actually use them
1190 	 * for anything and thus we don't want to mark their pointers here
1191 	 */
1192 	if (!bkey_cmp(k, &ZERO_KEY))
1193 		return stale;
1194 
1195 	for (i = 0; i < KEY_PTRS(k); i++) {
1196 		if (!ptr_available(c, k, i))
1197 			continue;
1198 
1199 		g = PTR_BUCKET(c, k, i);
1200 
1201 		if (gen_after(g->last_gc, PTR_GEN(k, i)))
1202 			g->last_gc = PTR_GEN(k, i);
1203 
1204 		if (ptr_stale(c, k, i)) {
1205 			stale = max(stale, ptr_stale(c, k, i));
1206 			continue;
1207 		}
1208 
1209 		cache_bug_on(GC_MARK(g) &&
1210 			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1211 			     c, "inconsistent ptrs: mark = %llu, level = %i",
1212 			     GC_MARK(g), level);
1213 
1214 		if (level)
1215 			SET_GC_MARK(g, GC_MARK_METADATA);
1216 		else if (KEY_DIRTY(k))
1217 			SET_GC_MARK(g, GC_MARK_DIRTY);
1218 		else if (!GC_MARK(g))
1219 			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
1220 
1221 		/* guard against overflow */
1222 		SET_GC_SECTORS_USED(g, min_t(unsigned,
1223 					     GC_SECTORS_USED(g) + KEY_SIZE(k),
1224 					     MAX_GC_SECTORS_USED));
1225 
1226 		BUG_ON(!GC_SECTORS_USED(g));
1227 	}
1228 
1229 	return stale;
1230 }
1231 
1232 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
1233 
1234 void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1235 {
1236 	unsigned i;
1237 
1238 	for (i = 0; i < KEY_PTRS(k); i++)
1239 		if (ptr_available(c, k, i) &&
1240 		    !ptr_stale(c, k, i)) {
1241 			struct bucket *b = PTR_BUCKET(c, k, i);
1242 
1243 			b->gen = PTR_GEN(k, i);
1244 
1245 			if (level && bkey_cmp(k, &ZERO_KEY))
1246 				b->prio = BTREE_PRIO;
1247 			else if (!level && b->prio == BTREE_PRIO)
1248 				b->prio = INITIAL_PRIO;
1249 		}
1250 
1251 	__bch_btree_mark_key(c, level, k);
1252 }
1253 
1254 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1255 {
1256 	uint8_t stale = 0;
1257 	unsigned keys = 0, good_keys = 0;
1258 	struct bkey *k;
1259 	struct btree_iter iter;
1260 	struct bset_tree *t;
1261 
1262 	gc->nodes++;
1263 
1264 	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1265 		stale = max(stale, btree_mark_key(b, k));
1266 		keys++;
1267 
1268 		if (bch_ptr_bad(&b->keys, k))
1269 			continue;
1270 
1271 		gc->key_bytes += bkey_u64s(k);
1272 		gc->nkeys++;
1273 		good_keys++;
1274 
1275 		gc->data += KEY_SIZE(k);
1276 	}
1277 
1278 	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1279 		btree_bug_on(t->size &&
1280 			     bset_written(&b->keys, t) &&
1281 			     bkey_cmp(&b->key, &t->end) < 0,
1282 			     b, "found short btree key in gc");
1283 
1284 	if (b->c->gc_always_rewrite)
1285 		return true;
1286 
1287 	if (stale > 10)
1288 		return true;
1289 
1290 	if ((keys - good_keys) * 2 > keys)
1291 		return true;
1292 
1293 	return false;
1294 }
1295 
1296 #define GC_MERGE_NODES	4U
1297 
1298 struct gc_merge_info {
1299 	struct btree	*b;
1300 	unsigned	keys;
1301 };
1302 
1303 static int bch_btree_insert_node(struct btree *, struct btree_op *,
1304 				 struct keylist *, atomic_t *, struct bkey *);
1305 
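/*
 * Try to merge up to GC_MERGE_NODES adjacent child nodes: if the keys from
 * `nodes` nodes fit into `nodes - 1` replacement nodes at roughly 2/3
 * fullness, rewrite them and free the emptied node. Success invalidates the
 * caller's iterator, so -EINTR is returned to force a restart.
 */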
1306 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1307 			     struct gc_stat *gc, struct gc_merge_info *r)
1308 {
1309 	unsigned i, nodes = 0, keys = 0, blocks;
1310 	struct btree *new_nodes[GC_MERGE_NODES];
1311 	struct keylist keylist;
1312 	struct closure cl;
1313 	struct bkey *k;
1314 
1315 	bch_keylist_init(&keylist);
1316 
1317 	if (btree_check_reserve(b, NULL))
1318 		return 0;
1319 
1320 	memset(new_nodes, 0, sizeof(new_nodes));
1321 	closure_init_stack(&cl);
1322 
1323 	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1324 		keys += r[nodes++].keys;
1325 
1326 	blocks = btree_default_blocks(b->c) * 2 / 3;
1327 
1328 	if (nodes < 2 ||
1329 	    __set_blocks(b->keys.set[0].data, keys,
1330 			 block_bytes(b->c)) > blocks * (nodes - 1))
1331 		return 0;
1332 
1333 	for (i = 0; i < nodes; i++) {
1334 		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
1335 		if (IS_ERR_OR_NULL(new_nodes[i]))
1336 			goto out_nocoalesce;
1337 	}
1338 
1339 	/*
1340 	 * We have to check the reserve here, after we've allocated our new
1341 	 * nodes, to make sure the insert below will succeed - we also check
1342 	 * before as an optimization to potentially avoid a bunch of expensive
1343 	 * allocs/sorts
1344 	 */
1345 	if (btree_check_reserve(b, NULL))
1346 		goto out_nocoalesce;
1347 
1348 	for (i = 0; i < nodes; i++)
1349 		mutex_lock(&new_nodes[i]->write_lock);
1350 
1351 	for (i = nodes - 1; i > 0; --i) {
1352 		struct bset *n1 = btree_bset_first(new_nodes[i]);
1353 		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1354 		struct bkey *k, *last = NULL;
1355 
1356 		keys = 0;
1357 
1358 		if (i > 1) {
1359 			for (k = n2->start;
1360 			     k < bset_bkey_last(n2);
1361 			     k = bkey_next(k)) {
1362 				if (__set_blocks(n1, n1->keys + keys +
1363 						 bkey_u64s(k),
1364 						 block_bytes(b->c)) > blocks)
1365 					break;
1366 
1367 				last = k;
1368 				keys += bkey_u64s(k);
1369 			}
1370 		} else {
1371 			/*
1372 			 * Last node we're not getting rid of - we're getting
1373 			 * rid of the node at r[0]. Have to try and fit all of
1374 			 * the remaining keys into this node; we can't ensure
1375 			 * they will always fit due to rounding and variable
1376 			 * length keys (shouldn't be possible in practice,
1377 			 * though)
1378 			 */
1379 			if (__set_blocks(n1, n1->keys + n2->keys,
1380 					 block_bytes(b->c)) >
1381 			    btree_blocks(new_nodes[i]))
1382 				goto out_unlock_nocoalesce;
1383 
1384 			keys = n2->keys;
1385 			/* Take the key of the node we're getting rid of */
1386 			last = &r->b->key;
1387 		}
1388 
1389 		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1390 		       btree_blocks(new_nodes[i]));
1391 
1392 		if (last)
1393 			bkey_copy_key(&new_nodes[i]->key, last);
1394 
1395 		memcpy(bset_bkey_last(n1),
1396 		       n2->start,
1397 		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1398 
1399 		n1->keys += keys;
1400 		r[i].keys = n1->keys;
1401 
1402 		memmove(n2->start,
1403 			bset_bkey_idx(n2, keys),
1404 			(void *) bset_bkey_last(n2) -
1405 			(void *) bset_bkey_idx(n2, keys));
1406 
1407 		n2->keys -= keys;
1408 
1409 		if (__bch_keylist_realloc(&keylist,
1410 					  bkey_u64s(&new_nodes[i]->key)))
1411 			goto out_unlock_nocoalesce;
1412 
1413 		bch_btree_node_write(new_nodes[i], &cl);
1414 		bch_keylist_add(&keylist, &new_nodes[i]->key);
1415 	}
1416 
1417 	for (i = 0; i < nodes; i++)
1418 		mutex_unlock(&new_nodes[i]->write_lock);
1419 
1420 	closure_sync(&cl);
1421 
1422 	/* We emptied out this node */
1423 	BUG_ON(btree_bset_first(new_nodes[0])->keys);
1424 	btree_node_free(new_nodes[0]);
1425 	rw_unlock(true, new_nodes[0]);
1426 	new_nodes[0] = NULL;
1427 
1428 	for (i = 0; i < nodes; i++) {
1429 		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
1430 			goto out_nocoalesce;
1431 
1432 		make_btree_freeing_key(r[i].b, keylist.top);
1433 		bch_keylist_push(&keylist);
1434 	}
1435 
1436 	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
1437 	BUG_ON(!bch_keylist_empty(&keylist));
1438 
1439 	for (i = 0; i < nodes; i++) {
1440 		btree_node_free(r[i].b);
1441 		rw_unlock(true, r[i].b);
1442 
1443 		r[i].b = new_nodes[i];
1444 	}
1445 
1446 	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1447 	r[nodes - 1].b = ERR_PTR(-EINTR);
1448 
1449 	trace_bcache_btree_gc_coalesce(nodes);
1450 	gc->nodes--;
1451 
1452 	bch_keylist_free(&keylist);
1453 
1454 	/* Invalidated our iterator */
1455 	return -EINTR;
1456 
1457 out_unlock_nocoalesce:
1458 	for (i = 0; i < nodes; i++)
1459 		mutex_unlock(&new_nodes[i]->write_lock);
1460 
1461 out_nocoalesce:
1462 	closure_sync(&cl);
1463 	bch_keylist_free(&keylist);
1464 
1465 	while ((k = bch_keylist_pop(&keylist)))
1466 		if (!bkey_cmp(k, &ZERO_KEY))
1467 			atomic_dec(&b->c->prio_blocked);
1468 
1469 	for (i = 0; i < nodes; i++)
1470 		if (!IS_ERR_OR_NULL(new_nodes[i])) {
1471 			btree_node_free(new_nodes[i]);
1472 			rw_unlock(true, new_nodes[i]);
1473 		}
1474 	return 0;
1475 }
1476 
1477 static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
1478 				 struct btree *replace)
1479 {
1480 	struct keylist keys;
1481 	struct btree *n;
1482 
1483 	if (btree_check_reserve(b, NULL))
1484 		return 0;
1485 
1486 	n = btree_node_alloc_replacement(replace, NULL);
1487 
1488 	/* recheck reserve after allocating replacement node */
1489 	if (btree_check_reserve(b, NULL)) {
1490 		btree_node_free(n);
1491 		rw_unlock(true, n);
1492 		return 0;
1493 	}
1494 
1495 	bch_btree_node_write_sync(n);
1496 
1497 	bch_keylist_init(&keys);
1498 	bch_keylist_add(&keys, &n->key);
1499 
1500 	make_btree_freeing_key(replace, keys.top);
1501 	bch_keylist_push(&keys);
1502 
1503 	bch_btree_insert_node(b, op, &keys, NULL, NULL);
1504 	BUG_ON(!bch_keylist_empty(&keys));
1505 
1506 	btree_node_free(replace);
1507 	rw_unlock(true, n);
1508 
1509 	/* Invalidated our iterator */
1510 	return -EINTR;
1511 }
1512 
1513 static unsigned btree_gc_count_keys(struct btree *b)
1514 {
1515 	struct bkey *k;
1516 	struct btree_iter iter;
1517 	unsigned ret = 0;
1518 
1519 	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1520 		ret += bkey_u64s(k);
1521 
1522 	return ret;
1523 }
1524 
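/*
 * Walk the children of an interior node for gc, keeping a sliding window of
 * the last GC_MERGE_NODES children in r[] so adjacent nodes can be coalesced.
 * Progress is recorded in c->gc_done, so when the iterator is invalidated
 * (-EINTR) or we need to reschedule (-EAGAIN) the caller can resume from
 * where gc left off.
 */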
1525 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1526 			    struct closure *writes, struct gc_stat *gc)
1527 {
1528 	int ret = 0;
1529 	bool should_rewrite;
1530 	struct bkey *k;
1531 	struct btree_iter iter;
1532 	struct gc_merge_info r[GC_MERGE_NODES];
1533 	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
1534 
1535 	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1536 
1537 	for (i = r; i < r + ARRAY_SIZE(r); i++)
1538 		i->b = ERR_PTR(-EINTR);
1539 
1540 	while (1) {
1541 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1542 		if (k) {
1543 			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1544 						  true, b);
1545 			if (IS_ERR(r->b)) {
1546 				ret = PTR_ERR(r->b);
1547 				break;
1548 			}
1549 
1550 			r->keys = btree_gc_count_keys(r->b);
1551 
1552 			ret = btree_gc_coalesce(b, op, gc, r);
1553 			if (ret)
1554 				break;
1555 		}
1556 
1557 		if (!last->b)
1558 			break;
1559 
1560 		if (!IS_ERR(last->b)) {
1561 			should_rewrite = btree_gc_mark_node(last->b, gc);
1562 			if (should_rewrite) {
1563 				ret = btree_gc_rewrite_node(b, op, last->b);
1564 				if (ret)
1565 					break;
1566 			}
1567 
1568 			if (last->b->level) {
1569 				ret = btree_gc_recurse(last->b, op, writes, gc);
1570 				if (ret)
1571 					break;
1572 			}
1573 
1574 			bkey_copy_key(&b->c->gc_done, &last->b->key);
1575 
1576 			/*
1577 			 * Must flush leaf nodes before gc ends, since replace
1578 			 * operations aren't journalled
1579 			 */
1580 			mutex_lock(&last->b->write_lock);
1581 			if (btree_node_dirty(last->b))
1582 				bch_btree_node_write(last->b, writes);
1583 			mutex_unlock(&last->b->write_lock);
1584 			rw_unlock(true, last->b);
1585 		}
1586 
1587 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1588 		r->b = NULL;
1589 
1590 		if (need_resched()) {
1591 			ret = -EAGAIN;
1592 			break;
1593 		}
1594 	}
1595 
1596 	for (i = r; i < r + ARRAY_SIZE(r); i++)
1597 		if (!IS_ERR_OR_NULL(i->b)) {
1598 			mutex_lock(&i->b->write_lock);
1599 			if (btree_node_dirty(i->b))
1600 				bch_btree_node_write(i->b, writes);
1601 			mutex_unlock(&i->b->write_lock);
1602 			rw_unlock(true, i->b);
1603 		}
1604 
1605 	return ret;
1606 }
1607 
1608 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1609 			     struct closure *writes, struct gc_stat *gc)
1610 {
1611 	struct btree *n = NULL;
1612 	int ret = 0;
1613 	bool should_rewrite;
1614 
1615 	should_rewrite = btree_gc_mark_node(b, gc);
1616 	if (should_rewrite) {
1617 		n = btree_node_alloc_replacement(b, NULL);
1618 
1619 		if (!IS_ERR_OR_NULL(n)) {
1620 			bch_btree_node_write_sync(n);
1621 
1622 			bch_btree_set_root(n);
1623 			btree_node_free(b);
1624 			rw_unlock(true, n);
1625 
1626 			return -EINTR;
1627 		}
1628 	}
1629 
1630 	__bch_btree_mark_key(b->c, b->level + 1, &b->key);
1631 
1632 	if (b->level) {
1633 		ret = btree_gc_recurse(b, op, writes, gc);
1634 		if (ret)
1635 			return ret;
1636 	}
1637 
1638 	bkey_copy_key(&b->c->gc_done, &b->key);
1639 
1640 	return ret;
1641 }
1642 
1643 static void btree_gc_start(struct cache_set *c)
1644 {
1645 	struct cache *ca;
1646 	struct bucket *b;
1647 	unsigned i;
1648 
1649 	if (!c->gc_mark_valid)
1650 		return;
1651 
1652 	mutex_lock(&c->bucket_lock);
1653 
1654 	c->gc_mark_valid = 0;
1655 	c->gc_done = ZERO_KEY;
1656 
1657 	for_each_cache(ca, c, i)
1658 		for_each_bucket(b, ca) {
1659 			b->last_gc = b->gen;
1660 			if (!atomic_read(&b->pin)) {
1661 				SET_GC_MARK(b, 0);
1662 				SET_GC_SECTORS_USED(b, 0);
1663 			}
1664 		}
1665 
1666 	mutex_unlock(&c->bucket_lock);
1667 }
1668 
1669 static size_t bch_btree_gc_finish(struct cache_set *c)
1670 {
1671 	size_t available = 0;
1672 	struct bucket *b;
1673 	struct cache *ca;
1674 	unsigned i;
1675 
1676 	mutex_lock(&c->bucket_lock);
1677 
1678 	set_gc_sectors(c);
1679 	c->gc_mark_valid = 1;
1680 	c->need_gc	= 0;
1681 
1682 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1683 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1684 			    GC_MARK_METADATA);
1685 
1686 	/* don't reclaim buckets to which writeback keys point */
1687 	rcu_read_lock();
1688 	for (i = 0; i < c->nr_uuids; i++) {
1689 		struct bcache_device *d = c->devices[i];
1690 		struct cached_dev *dc;
1691 		struct keybuf_key *w, *n;
1692 		unsigned j;
1693 
1694 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1695 			continue;
1696 		dc = container_of(d, struct cached_dev, disk);
1697 
1698 		spin_lock(&dc->writeback_keys.lock);
1699 		rbtree_postorder_for_each_entry_safe(w, n,
1700 					&dc->writeback_keys.keys, node)
1701 			for (j = 0; j < KEY_PTRS(&w->key); j++)
1702 				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1703 					    GC_MARK_DIRTY);
1704 		spin_unlock(&dc->writeback_keys.lock);
1705 	}
1706 	rcu_read_unlock();
1707 
1708 	for_each_cache(ca, c, i) {
1709 		uint64_t *i;
1710 
1711 		ca->invalidate_needs_gc = 0;
1712 
1713 		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1714 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1715 
1716 		for (i = ca->prio_buckets;
1717 		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1718 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1719 
1720 		for_each_bucket(b, ca) {
1721 			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
1722 
1723 			if (atomic_read(&b->pin))
1724 				continue;
1725 
1726 			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1727 
1728 			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1729 				available++;
1730 		}
1731 	}
1732 
1733 	mutex_unlock(&c->bucket_lock);
1734 	return available;
1735 }
1736 
1737 static void bch_btree_gc(struct cache_set *c)
1738 {
1739 	int ret;
1740 	unsigned long available;
1741 	struct gc_stat stats;
1742 	struct closure writes;
1743 	struct btree_op op;
1744 	uint64_t start_time = local_clock();
1745 
1746 	trace_bcache_gc_start(c);
1747 
1748 	memset(&stats, 0, sizeof(struct gc_stat));
1749 	closure_init_stack(&writes);
1750 	bch_btree_op_init(&op, SHRT_MAX);
1751 
1752 	btree_gc_start(c);
1753 
1754 	do {
1755 		ret = btree_root(gc_root, c, &op, &writes, &stats);
1756 		closure_sync(&writes);
1757 		cond_resched();
1758 
1759 		if (ret && ret != -EAGAIN)
1760 			pr_warn("gc failed!");
1761 	} while (ret);
1762 
1763 	available = bch_btree_gc_finish(c);
1764 	wake_up_allocators(c);
1765 
1766 	bch_time_stats_update(&c->btree_gc_time, start_time);
1767 
1768 	stats.key_bytes *= sizeof(uint64_t);
1769 	stats.data	<<= 9;
1770 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
1771 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1772 
1773 	trace_bcache_gc_end(c);
1774 
1775 	bch_moving_gc(c);
1776 }
1777 
1778 static bool gc_should_run(struct cache_set *c)
1779 {
1780 	struct cache *ca;
1781 	unsigned i;
1782 
1783 	for_each_cache(ca, c, i)
1784 		if (ca->invalidate_needs_gc)
1785 			return true;
1786 
1787 	if (atomic_read(&c->sectors_to_gc) < 0)
1788 		return true;
1789 
1790 	return false;
1791 }
1792 
1793 static int bch_gc_thread(void *arg)
1794 {
1795 	struct cache_set *c = arg;
1796 
1797 	while (1) {
1798 		wait_event_interruptible(c->gc_wait,
1799 			   kthread_should_stop() || gc_should_run(c));
1800 
1801 		if (kthread_should_stop())
1802 			break;
1803 
1804 		set_gc_sectors(c);
1805 		bch_btree_gc(c);
1806 	}
1807 
1808 	return 0;
1809 }
1810 
1811 int bch_gc_thread_start(struct cache_set *c)
1812 {
1813 	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1814 	if (IS_ERR(c->gc_thread))
1815 		return PTR_ERR(c->gc_thread);
1816 
1817 	return 0;
1818 }
1819 
1820 /* Initial partial gc */
1821 
1822 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1823 {
1824 	int ret = 0;
1825 	struct bkey *k, *p = NULL;
1826 	struct btree_iter iter;
1827 
1828 	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1829 		bch_initial_mark_key(b->c, b->level, k);
1830 
1831 	bch_initial_mark_key(b->c, b->level + 1, &b->key);
1832 
1833 	if (b->level) {
1834 		bch_btree_iter_init(&b->keys, &iter, NULL);
1835 
1836 		do {
1837 			k = bch_btree_iter_next_filter(&iter, &b->keys,
1838 						       bch_ptr_bad);
1839 			if (k)
1840 				btree_node_prefetch(b, k);
1841 
1842 			if (p)
1843 				ret = btree(check_recurse, p, b, op);
1844 
1845 			p = k;
1846 		} while (p && !ret);
1847 	}
1848 
1849 	return ret;
1850 }
1851 
1852 int bch_btree_check(struct cache_set *c)
1853 {
1854 	struct btree_op op;
1855 
1856 	bch_btree_op_init(&op, SHRT_MAX);
1857 
1858 	return btree_root(check_recurse, c, &op);
1859 }
1860 
1861 void bch_initial_gc_finish(struct cache_set *c)
1862 {
1863 	struct cache *ca;
1864 	struct bucket *b;
1865 	unsigned i;
1866 
1867 	bch_btree_gc_finish(c);
1868 
1869 	mutex_lock(&c->bucket_lock);
1870 
1871 	/*
1872 	 * We need to put some unused buckets directly on the prio freelist in
1873 	 * order to get the allocator thread started - it needs freed buckets in
1874 	 * order to rewrite the prios and gens, and it needs to rewrite prios
1875 	 * and gens in order to free buckets.
1876 	 *
1877 	 * This is only safe for buckets that have no live data in them, which
1878 	 * there should always be some of.
1879 	 */
1880 	for_each_cache(ca, c, i) {
1881 		for_each_bucket(b, ca) {
1882 			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
1883 			    fifo_full(&ca->free[RESERVE_BTREE]))
1884 				break;
1885 
1886 			if (bch_can_invalidate_bucket(ca, b) &&
1887 			    !GC_MARK(b)) {
1888 				__bch_invalidate_one_bucket(ca, b);
1889 				if (!fifo_push(&ca->free[RESERVE_PRIO],
1890 				   b - ca->buckets))
1891 					fifo_push(&ca->free[RESERVE_BTREE],
1892 						  b - ca->buckets);
1893 			}
1894 		}
1895 	}
1896 
1897 	mutex_unlock(&c->bucket_lock);
1898 }
1899 
1900 /* Btree insertion */
1901 
1902 static bool btree_insert_key(struct btree *b, struct bkey *k,
1903 			     struct bkey *replace_key)
1904 {
1905 	unsigned status;
1906 
1907 	BUG_ON(bkey_cmp(k, &b->key) > 0);
1908 
1909 	status = bch_btree_insert_key(&b->keys, k, replace_key);
1910 	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1911 		bch_check_keys(&b->keys, "%u for %s", status,
1912 			       replace_key ? "replace" : "insert");
1913 
1914 		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1915 					      status);
1916 		return true;
1917 	} else
1918 		return false;
1919 }
1920 
1921 static size_t insert_u64s_remaining(struct btree *b)
1922 {
1923 	long ret = bch_btree_keys_u64s_remaining(&b->keys);
1924 
1925 	/*
1926 	 * Might land in the middle of an existing extent and have to split it
1927 	 */
1928 	if (b->keys.ops->is_extents)
1929 		ret -= KEY_MAX_U64S;
1930 
1931 	return max(ret, 0L);
1932 }

static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(&b->keys);

	while (!bch_keylist_empty(insert_keys)) {
		struct bkey *k = insert_keys->keys;

		if (bkey_u64s(k) > insert_u64s_remaining(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
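			/*
			 * Key straddles the end of this node: insert the
			 * part that fits here, trim the front off the
			 * original, and leave the rest for the next node.
			 */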
			BKEY_PADDED(key) temp;

			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	if (!ret)
		op->insert_collision = true;

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(&b->keys) < oldsize);
	return ret;
}

static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	if (btree_check_reserve(b, op)) {
		if (!b->level)
			return -EINTR;
		else
			WARN(1, "insufficient reserve for split\n");
	}

	n1 = btree_node_alloc_replacement(b, op);
	if (IS_ERR(n1))
		goto err;

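	/*
	 * Only split if the compacted replacement would still fill more than
	 * 4/5 of the node; otherwise the single replacement node n1 will do.
	 */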
	split = set_blocks(btree_bset_first(n1),
			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned keys = 0;

		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);

		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
			if (IS_ERR(n3))
				goto err_free2;
		}

		mutex_lock(&n1->write_lock);
		mutex_lock(&n2->write_lock);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

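		/*
		 * Walk forward to roughly 3/5 of the set - both counts are
		 * in units of u64s - to find the split point for n1.
		 */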
		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
							keys));

		bkey_copy_key(&n1->key,
			      bset_bkey_idx(btree_bset_first(n1), keys));
		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));

		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
		btree_bset_first(n1)->keys = keys;

		memcpy(btree_bset_first(n2)->start,
		       bset_bkey_last(btree_bset_first(n1)),
		       btree_bset_first(n2)->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		mutex_unlock(&n2->write_lock);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);

		mutex_lock(&n1->write_lock);
		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);
	mutex_unlock(&n1->write_lock);

	if (n3) {
		/* Depth increases, make a new root */
		mutex_lock(&n3->write_lock);
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);
		mutex_unlock(&n3->write_lock);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);
	} else {
		/* Split a non-root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	btree_node_free(b);
	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	bkey_put(b->c, &n2->key);
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	bkey_put(b->c, &n1->key);
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	WARN(1, "bcache: btree split failed (level %u)", b->level);

	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	return -ENOMEM;
}

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	struct closure cl;

	BUG_ON(b->level && replace_key);

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);

	if (write_block(b) != btree_bset_last(b) &&
	    b->keys.last_set_unwritten)
		bch_btree_init_next(b); /* just wrote a set */

	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
		mutex_unlock(&b->write_lock);
		goto split;
	}

	BUG_ON(write_block(b) != btree_bset_last(b));

	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
		if (!b->level)
			bch_btree_leaf_dirty(b, journal_ref);
		else
			bch_btree_node_write(b, &cl);
	}

	mutex_unlock(&b->write_lock);

	/* wait for btree node write if necessary, after unlock */
	closure_sync(&cl);

	return 0;
split:
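	/*
	 * The node was full. If we are in the middle of submitting bios
	 * (current->bio_list is set), splitting now could deadlock - the
	 * allocation may have to wait on IO queued behind us - so back off
	 * with -EAGAIN. If we do not hold locks far enough up the tree for
	 * a split, bump op->lock and return -EINTR so the caller retries
	 * from the root. Only when neither applies is it safe to split.
	 */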
	if (current->bio_list) {
		op->lock = b->c->root->level + 1;
		return -EAGAIN;
	} else if (op->lock <= b->c->root->level) {
		op->lock = b->c->root->level + 1;
		return -EINTR;
	} else {
		/* Invalidated all iterators */
		int ret = btree_split(b, op, insert_keys, replace_key);

		if (bch_keylist_empty(insert_keys))
			return 0;
		else if (!ret)
			return -EINTR;
		return ret;
	}
}
2165 
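/*
 * Insert a placeholder "check key" covering @check_key's range: its pointer
 * is random bytes tagged with PTR_CHECK_DEV, so it can never match real
 * cached data. A later insert that uses this key as its replace_key will
 * fail if anything else touched the range in the meantime, letting the
 * cache-miss path detect races.
 */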
int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1) {
			op->lock = b->level;
			goto out;
		}
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}

struct btree_insert_op {
	struct btree_op	op;
	struct keylist	*keys;
	atomic_t	*journal_ref;
	struct bkey	*replace_key;
};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
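	/*
	 * Propagate errors while keys are still pending so the caller can
	 * retry; if every key was consumed anyway, this node is done.
	 */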
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}

int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys		= keys;
	op.journal_ref	= journal_ref;
	op.replace_key	= replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}
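
/*
 * Example (a sketch, not a caller in this file): a typical user fills a
 * keylist and inserts it under a journal reference, roughly:
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	bch_keylist_add(&keys, &some_key);
 *	ret = bch_btree_insert(c, &keys, journal_ref, NULL);
 *
 * where some_key and journal_ref come from the caller. -ESRCH here means
 * the insert hit a collision (a replace_key that no longer matched), not a
 * hard failure.
 */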

void bch_btree_set_root(struct btree *b)
{
	unsigned i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

/* Map across nodes or keys */

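/*
 * The map functions walk the btree in sorted key order starting from @from,
 * invoking @fn on each node (or key). @fn returns MAP_CONTINUE to keep
 * walking; any other return value (e.g. MAP_DONE) stops the walk and is
 * propagated back to the caller.
 */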
static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(&b->keys, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

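/*
 * A keybuf holds a batch of keys found by scanning the btree - writeback and
 * moving gc use them to queue up work - in an rb tree. Because overlapping
 * keys compare equal, RB_INSERT() rejects a key that overlaps an existing
 * one, and searches find whichever key overlaps the search key.
 */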
static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op	op;
	unsigned	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) > 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;

		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

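/*
 * Remove all keys in @buf overlapping [start, end) that are not currently in
 * flight; returns true if any overlapping key had ->private set (i.e. was
 * claimed by bch_keybuf_next()) and so was left in place.
 */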
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;

	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;

	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}