/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
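/*
 * Editorial note: __dm_written_to_disk(), __dm_bless_for_disk() and
 * __dm_unbless_for_disk() are, to the best of our reading, sparse
 * context annotations from dm-block-manager.h.  They let static
 * analysis check that every value "blessed" for on-disk use is
 * consumed exactly once, and they compile away in a normal build.
 */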
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* Assumes no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
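
/*
 * Illustrative example (not from the original source): with sorted,
 * unique keys { 10, 20, 30 },
 *
 *	lower_bound(n, 20) == 1    exact match
 *	lower_bound(n, 25) == 1    largest key <= 25
 *	lower_bound(n, 5)  == -1   no key <= 5
 *	upper_bound(n, 25) == 2    smallest key >= 25
 *	upper_bound(n, 35) == 3    == nr_entries, i.e. out of range
 *
 * Callers must therefore range-check the returned index against
 * nr_entries before using it.
 */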

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
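
/*
 * Illustrative example (not from the original source): inserting key 20
 * at index 1 into a leaf holding keys { 10, 30 } shifts the second
 * entry up and yields { 10, 20, 30 }; the key array and the value array
 * are shifted in step by the two array_insert() calls above.
 */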

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
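
/*
 * Worked example (illustrative, not from the original source): assuming
 * a 4096 byte block and a 32 byte struct node_header, 8 byte values
 * give elt_size = 16, total = (4096 - 32) / 16 = 254, n = 84, so each
 * node holds at most 3 * 84 = 252 entries.
 */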

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
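
/*
 * Usage sketch (illustrative, not from the original source): a client
 * typically creates an empty tree once,
 *
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 *
 * and then threads the root through subsequent calls, since each
 * insert/remove on the shadowed tree hands back a new root.
 */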

/*----------------------------------------------------------------*/

/*
 * Deletion is conceptually recursive, but since kernel stack space is
 * limited we manage an explicit stack of frames on the heap instead.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference count and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_NOIO);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
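
/*
 * Editorial note: dm_btree_del() walks the whole tree iteratively using
 * the explicit spine above, so its stack usage is bounded by
 * MAX_SPINE_DEPTH rather than by the tree height times the frame size
 * of a recursive helper.  Shared subtrees are not descended into; their
 * reference counts are simply decremented in push_frame().
 */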

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
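
/*
 * Usage sketch (illustrative, not from the original source): for a tree
 * built with info->levels == 2, callers pass one key per level and a
 * buffer sized for info->value_type.size:
 *
 *	uint64_t keys[2] = { dev_id, block_nr };
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(&info, root, keys, &value_le);
 *	if (r == -ENODATA)
 *		... key not mapped ...
 *
 * The dev_id/block_nr naming here mirrors how dm-thin keys its mapping
 * tree, but any per-level 64 bit keys work.
 */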

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the
 * node's contents across.  Assumes there is a parent node, and that it
 * has room for another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	  +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent.
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to left and right now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
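
/*
 * Usage sketch (illustrative, not from the original source): the
 * *inserted flag distinguishes a fresh insert from an overwrite, which
 * callers typically use to keep their own counters in sync:
 *
 *	int inserted;
 *	__le64 value_le = cpu_to_le64(data_block);
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert_notify(&info, root, keys, &value_le,
 *				   &root, &inserted);
 *	if (!r && inserted)
 *		nr_mapped++;
 *
 * data_block and nr_mapped are hypothetical caller-side names.
 */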

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited
 * stack space.  Also this only works for single-level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
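
/*
 * Usage sketch (illustrative, not from the original source): walking a
 * single-level tree with a callback that receives each key and a
 * pointer to the little-endian value stored in the leaf:
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;	// *keys is the entry's key; leaf points
 *				// at info->value_type.size bytes
 *		return 0;	// non-zero would abort the walk
 *	}
 *
 *	unsigned count = 0;
 *	r = dm_btree_walk(&info, root, count_entry, &count);
 *
 * count_entry() is a hypothetical callback name.
 */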