1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21 #include "check-integrity.h"
22 #include "locking.h"
23 
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26 
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29 
30 #define LEAK_DEBUG 0
31 #if LEAK_DEBUG
32 static DEFINE_SPINLOCK(leak_lock);
33 #endif
34 
35 #define BUFFER_LRU_MAX 64
36 
37 struct tree_entry {
38 	u64 start;
39 	u64 end;
40 	struct rb_node rb_node;
41 };
42 
43 struct extent_page_data {
44 	struct bio *bio;
45 	struct extent_io_tree *tree;
46 	get_extent_t *get_extent;
47 
48 	/* tells writepage not to lock the state bits for this range
49 	 * it still does the unlocking
50 	 */
51 	unsigned int extent_locked:1;
52 
53 	/* tells the submit_bio code to use a WRITE_SYNC */
54 	unsigned int sync_io:1;
55 };
56 
57 static noinline void flush_write_bio(void *data);
58 static inline struct btrfs_fs_info *
59 tree_fs_info(struct extent_io_tree *tree)
60 {
61 	return btrfs_sb(tree->mapping->host->i_sb);
62 }
63 
64 int __init extent_io_init(void)
65 {
66 	extent_state_cache = kmem_cache_create("extent_state",
67 			sizeof(struct extent_state), 0,
68 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
69 	if (!extent_state_cache)
70 		return -ENOMEM;
71 
72 	extent_buffer_cache = kmem_cache_create("extent_buffers",
73 			sizeof(struct extent_buffer), 0,
74 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
75 	if (!extent_buffer_cache)
76 		goto free_state_cache;
77 	return 0;
78 
79 free_state_cache:
80 	kmem_cache_destroy(extent_state_cache);
81 	return -ENOMEM;
82 }
83 
84 void extent_io_exit(void)
85 {
86 	struct extent_state *state;
87 	struct extent_buffer *eb;
88 
89 	while (!list_empty(&states)) {
90 		state = list_entry(states.next, struct extent_state, leak_list);
91 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
92 		       "state %lu in tree %p refs %d\n",
93 		       (unsigned long long)state->start,
94 		       (unsigned long long)state->end,
95 		       state->state, state->tree, atomic_read(&state->refs));
96 		list_del(&state->leak_list);
97 		kmem_cache_free(extent_state_cache, state);
98 
99 	}
100 
101 	while (!list_empty(&buffers)) {
102 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
103 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
104 		       "refs %d\n", (unsigned long long)eb->start,
105 		       eb->len, atomic_read(&eb->refs));
106 		list_del(&eb->leak_list);
107 		kmem_cache_free(extent_buffer_cache, eb);
108 	}
109 	if (extent_state_cache)
110 		kmem_cache_destroy(extent_state_cache);
111 	if (extent_buffer_cache)
112 		kmem_cache_destroy(extent_buffer_cache);
113 }
114 
115 void extent_io_tree_init(struct extent_io_tree *tree,
116 			 struct address_space *mapping)
117 {
118 	tree->state = RB_ROOT;
119 	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
120 	tree->ops = NULL;
121 	tree->dirty_bytes = 0;
122 	spin_lock_init(&tree->lock);
123 	spin_lock_init(&tree->buffer_lock);
124 	tree->mapping = mapping;
125 }
126 
127 static struct extent_state *alloc_extent_state(gfp_t mask)
128 {
129 	struct extent_state *state;
130 #if LEAK_DEBUG
131 	unsigned long flags;
132 #endif
133 
134 	state = kmem_cache_alloc(extent_state_cache, mask);
135 	if (!state)
136 		return state;
137 	state->state = 0;
138 	state->private = 0;
139 	state->tree = NULL;
140 #if LEAK_DEBUG
141 	spin_lock_irqsave(&leak_lock, flags);
142 	list_add(&state->leak_list, &states);
143 	spin_unlock_irqrestore(&leak_lock, flags);
144 #endif
145 	atomic_set(&state->refs, 1);
146 	init_waitqueue_head(&state->wq);
147 	trace_alloc_extent_state(state, mask, _RET_IP_);
148 	return state;
149 }
150 
151 void free_extent_state(struct extent_state *state)
152 {
153 	if (!state)
154 		return;
155 	if (atomic_dec_and_test(&state->refs)) {
156 #if LEAK_DEBUG
157 		unsigned long flags;
158 #endif
159 		WARN_ON(state->tree);
160 #if LEAK_DEBUG
161 		spin_lock_irqsave(&leak_lock, flags);
162 		list_del(&state->leak_list);
163 		spin_unlock_irqrestore(&leak_lock, flags);
164 #endif
165 		trace_free_extent_state(state, _RET_IP_);
166 		kmem_cache_free(extent_state_cache, state);
167 	}
168 }
169 
170 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
171 				   struct rb_node *node)
172 {
173 	struct rb_node **p = &root->rb_node;
174 	struct rb_node *parent = NULL;
175 	struct tree_entry *entry;
176 
177 	while (*p) {
178 		parent = *p;
179 		entry = rb_entry(parent, struct tree_entry, rb_node);
180 
181 		if (offset < entry->start)
182 			p = &(*p)->rb_left;
183 		else if (offset > entry->end)
184 			p = &(*p)->rb_right;
185 		else
186 			return parent;
187 	}
188 
189 	entry = rb_entry(node, struct tree_entry, rb_node);
190 	rb_link_node(node, parent, p);
191 	rb_insert_color(node, root);
192 	return NULL;
193 }
194 
195 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
196 				     struct rb_node **prev_ret,
197 				     struct rb_node **next_ret)
198 {
199 	struct rb_root *root = &tree->state;
200 	struct rb_node *n = root->rb_node;
201 	struct rb_node *prev = NULL;
202 	struct rb_node *orig_prev = NULL;
203 	struct tree_entry *entry;
204 	struct tree_entry *prev_entry = NULL;
205 
206 	while (n) {
207 		entry = rb_entry(n, struct tree_entry, rb_node);
208 		prev = n;
209 		prev_entry = entry;
210 
211 		if (offset < entry->start)
212 			n = n->rb_left;
213 		else if (offset > entry->end)
214 			n = n->rb_right;
215 		else
216 			return n;
217 	}
218 
219 	if (prev_ret) {
220 		orig_prev = prev;
221 		while (prev && offset > prev_entry->end) {
222 			prev = rb_next(prev);
223 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
224 		}
225 		*prev_ret = prev;
226 		prev = orig_prev;
227 	}
228 
229 	if (next_ret) {
230 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
231 		while (prev && offset < prev_entry->start) {
232 			prev = rb_prev(prev);
233 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
234 		}
235 		*next_ret = prev;
236 	}
237 	return NULL;
238 }
239 
240 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
241 					  u64 offset)
242 {
243 	struct rb_node *prev = NULL;
244 	struct rb_node *ret;
245 
246 	ret = __etree_search(tree, offset, &prev, NULL);
247 	if (!ret)
248 		return prev;
249 	return ret;
250 }
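
tree_search() therefore returns either the node whose [start, end] contains the offset, or the first node that begins after it. A minimal userspace model of that contract (illustrative only; the names and values below are made up, none of this is kernel code):

/* Simplified model of tree_search()'s contract: given sorted,
 * non-overlapping inclusive ranges, return the first range whose
 * 'end' is >= offset -- i.e. the range containing 'offset', or the
 * next one after it, or NULL if nothing ends at or after it. */
#include <stdio.h>

struct range { unsigned long long start, end; };

static const struct range *model_tree_search(const struct range *r,
					     int n, unsigned long long off)
{
	int i;

	for (i = 0; i < n; i++)
		if (r[i].end >= off)
			return &r[i];
	return NULL;			/* nothing ends at or after 'off' */
}

int main(void)
{
	static const struct range tree[] = {
		{ 0, 4095 }, { 8192, 12287 }, { 16384, 20479 },
	};
	const struct range *hit = model_tree_search(tree, 3, 5000);

	if (hit)	/* prints [8192, 12287]: the next range after 5000 */
		printf("found [%llu, %llu]\n", hit->start, hit->end);
	return 0;
}
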
251 
252 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
253 		     struct extent_state *other)
254 {
255 	if (tree->ops && tree->ops->merge_extent_hook)
256 		tree->ops->merge_extent_hook(tree->mapping->host, new,
257 					     other);
258 }
259 
260 /*
261  * utility function to look for merge candidates inside a given range.
262  * Any extents with matching state are merged together into a single
262  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
264  * are not merged because the end_io handlers need to be able to do
265  * operations on them without sleeping (or doing allocations/splits).
266  *
267  * This should be called with the tree lock held.
268  */
269 static void merge_state(struct extent_io_tree *tree,
270 		        struct extent_state *state)
271 {
272 	struct extent_state *other;
273 	struct rb_node *other_node;
274 
275 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
276 		return;
277 
278 	other_node = rb_prev(&state->rb_node);
279 	if (other_node) {
280 		other = rb_entry(other_node, struct extent_state, rb_node);
281 		if (other->end == state->start - 1 &&
282 		    other->state == state->state) {
283 			merge_cb(tree, state, other);
284 			state->start = other->start;
285 			other->tree = NULL;
286 			rb_erase(&other->rb_node, &tree->state);
287 			free_extent_state(other);
288 		}
289 	}
290 	other_node = rb_next(&state->rb_node);
291 	if (other_node) {
292 		other = rb_entry(other_node, struct extent_state, rb_node);
293 		if (other->start == state->end + 1 &&
294 		    other->state == state->state) {
295 			merge_cb(tree, state, other);
296 			state->end = other->end;
297 			other->tree = NULL;
298 			rb_erase(&other->rb_node, &tree->state);
299 			free_extent_state(other);
300 		}
301 	}
302 }
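
As the comment above notes, neighbouring states coalesce only when they are byte-adjacent and carry identical state bits. A small userspace sketch of that merge test (illustrative only, not kernel code; the bit value is arbitrary):

/* Model of the merge rule: two extent states coalesce only when they
 * are byte-adjacent and carry exactly the same state bits (the real
 * code also refuses to merge when the "do not merge" bits are set). */
#include <stdio.h>

struct st { unsigned long long start, end; unsigned long bits; };

static int can_merge_prev(const struct st *prev, const struct st *cur)
{
	return prev->end == cur->start - 1 && prev->bits == cur->bits;
}

int main(void)
{
	struct st a = { 0, 4095, 0x1 };
	struct st b = { 4096, 8191, 0x1 };	/* adjacent, same bits */
	struct st c = { 8193, 12287, 0x1 };	/* 1-byte gap: no merge */

	printf("a+b merge: %d\n", can_merge_prev(&a, &b));	/* 1 */
	printf("b+c merge: %d\n", can_merge_prev(&b, &c));	/* 0 */
	return 0;
}
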
303 
304 static void set_state_cb(struct extent_io_tree *tree,
305 			 struct extent_state *state, int *bits)
306 {
307 	if (tree->ops && tree->ops->set_bit_hook)
308 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
309 }
310 
311 static void clear_state_cb(struct extent_io_tree *tree,
312 			   struct extent_state *state, int *bits)
313 {
314 	if (tree->ops && tree->ops->clear_bit_hook)
315 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
316 }
317 
318 static void set_state_bits(struct extent_io_tree *tree,
319 			   struct extent_state *state, int *bits);
320 
321 /*
322  * insert an extent_state struct into the tree.  'bits' are set on the
323  * struct before it is inserted.
324  *
325  * This may return -EEXIST if the extent is already there, in which case the
326  * state struct is freed.
327  *
328  * The tree lock is not taken internally.  This is a utility function and
329  * probably isn't what you want to call (see set/clear_extent_bit).
330  */
331 static int insert_state(struct extent_io_tree *tree,
332 			struct extent_state *state, u64 start, u64 end,
333 			int *bits)
334 {
335 	struct rb_node *node;
336 
337 	if (end < start) {
338 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
339 		       (unsigned long long)end,
340 		       (unsigned long long)start);
341 		WARN_ON(1);
342 	}
343 	state->start = start;
344 	state->end = end;
345 
346 	set_state_bits(tree, state, bits);
347 
348 	node = tree_insert(&tree->state, end, &state->rb_node);
349 	if (node) {
350 		struct extent_state *found;
351 		found = rb_entry(node, struct extent_state, rb_node);
352 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
353 		       "%llu %llu\n", (unsigned long long)found->start,
354 		       (unsigned long long)found->end,
355 		       (unsigned long long)start, (unsigned long long)end);
356 		return -EEXIST;
357 	}
358 	state->tree = tree;
359 	merge_state(tree, state);
360 	return 0;
361 }
362 
363 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
364 		     u64 split)
365 {
366 	if (tree->ops && tree->ops->split_extent_hook)
367 		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
368 }
369 
370 /*
371  * split a given extent state struct in two, inserting the preallocated
372  * struct 'prealloc' as the newly created second half.  'split' indicates an
373  * offset inside 'orig' where it should be split.
374  *
375  * Before calling,
376  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
377  * are two extent state structs in the tree:
378  * prealloc: [orig->start, split - 1]
379  * orig: [ split, orig->end ]
380  *
381  * The tree locks are not taken by this function. They need to be held
382  * by the caller.
383  */
384 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
385 		       struct extent_state *prealloc, u64 split)
386 {
387 	struct rb_node *node;
388 
389 	split_cb(tree, orig, split);
390 
391 	prealloc->start = orig->start;
392 	prealloc->end = split - 1;
393 	prealloc->state = orig->state;
394 	orig->start = split;
395 
396 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
397 	if (node) {
398 		free_extent_state(prealloc);
399 		return -EEXIST;
400 	}
401 	prealloc->tree = tree;
402 	return 0;
403 }
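
The split arithmetic is simple but easy to get off by one: the split point itself belongs to the second half. A tiny standalone demonstration (illustrative only, not kernel code):

/* Model of split_state()'s arithmetic: splitting [start, end] at
 * 'split' yields a first half [start, split - 1] and leaves the
 * original state as [split, end]. */
#include <stdio.h>

int main(void)
{
	unsigned long long start = 0, end = 16383, split = 4096;
	unsigned long long first_start = start, first_end = split - 1;
	unsigned long long second_start = split, second_end = end;

	printf("prealloc: [%llu, %llu]\n", first_start, first_end);
	printf("orig:     [%llu, %llu]\n", second_start, second_end);
	/* Both halves stay inclusive and together cover [start, end]
	 * with no overlap and no gap. */
	return 0;
}
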
404 
405 static struct extent_state *next_state(struct extent_state *state)
406 {
407 	struct rb_node *next = rb_next(&state->rb_node);
408 	if (next)
409 		return rb_entry(next, struct extent_state, rb_node);
410 	else
411 		return NULL;
412 }
413 
414 /*
415  * utility function to clear some bits in an extent state struct.
416  * it will optionally wake up anyone waiting on this state (wake == 1).
417  *
418  * If no bits are set on the state struct after clearing things, the
419  * struct is freed and removed from the tree
420  */
421 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
422 					    struct extent_state *state,
423 					    int *bits, int wake)
424 {
425 	struct extent_state *next;
426 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
427 
428 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
429 		u64 range = state->end - state->start + 1;
430 		WARN_ON(range > tree->dirty_bytes);
431 		tree->dirty_bytes -= range;
432 	}
433 	clear_state_cb(tree, state, bits);
434 	state->state &= ~bits_to_clear;
435 	if (wake)
436 		wake_up(&state->wq);
437 	if (state->state == 0) {
438 		next = next_state(state);
439 		if (state->tree) {
440 			rb_erase(&state->rb_node, &tree->state);
441 			state->tree = NULL;
442 			free_extent_state(state);
443 		} else {
444 			WARN_ON(1);
445 		}
446 	} else {
447 		merge_state(tree, state);
448 		next = next_state(state);
449 	}
450 	return next;
451 }
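
Note that the caller's mask is filtered through ~EXTENT_CTLBITS before anything is cleared, so control bits never leave state->state by this path. A userspace sketch of that masking; the MODEL_* values are placeholders, not the real EXTENT_* definitions:

/* Model of the masking in clear_state_bit(): control bits are masked
 * out of the caller's request before clearing, so they survive. */
#include <stdio.h>

#define MODEL_DIRTY	(1UL << 0)
#define MODEL_LOCKED	(1UL << 1)
#define MODEL_CTLBITS	(1UL << 7)	/* stands in for EXTENT_CTLBITS */

int main(void)
{
	unsigned long state = MODEL_DIRTY | MODEL_LOCKED | MODEL_CTLBITS;
	unsigned long bits = MODEL_DIRTY | MODEL_CTLBITS;  /* caller asks for both */
	unsigned long bits_to_clear = bits & ~MODEL_CTLBITS;

	state &= ~bits_to_clear;
	/* LOCKED and the control bit survive; only DIRTY is gone. */
	printf("state after clear: %#lx\n", state);
	return 0;
}
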
452 
453 static struct extent_state *
454 alloc_extent_state_atomic(struct extent_state *prealloc)
455 {
456 	if (!prealloc)
457 		prealloc = alloc_extent_state(GFP_ATOMIC);
458 
459 	return prealloc;
460 }
461 
462 void extent_io_tree_panic(struct extent_io_tree *tree, int err)
463 {
464 	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
465 		    "Extent tree was modified by another "
466 		    "thread while locked.");
467 }
468 
469 /*
470  * clear some bits on a range in the tree.  This may require splitting
471  * or inserting elements in the tree, so the gfp mask is used to
472  * indicate which allocations or sleeping are allowed.
473  *
474  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
475  * the given range from the tree regardless of state (ie for truncate).
476  *
477  * the range [start, end] is inclusive.
478  *
479  * This takes the tree lock, and returns 0 on success and < 0 on error.
480  */
481 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
482 		     int bits, int wake, int delete,
483 		     struct extent_state **cached_state,
484 		     gfp_t mask)
485 {
486 	struct extent_state *state;
487 	struct extent_state *cached;
488 	struct extent_state *prealloc = NULL;
489 	struct rb_node *node;
490 	u64 last_end;
491 	int err;
492 	int clear = 0;
493 
494 	if (delete)
495 		bits |= ~EXTENT_CTLBITS;
496 	bits |= EXTENT_FIRST_DELALLOC;
497 
498 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
499 		clear = 1;
500 again:
501 	if (!prealloc && (mask & __GFP_WAIT)) {
502 		prealloc = alloc_extent_state(mask);
503 		if (!prealloc)
504 			return -ENOMEM;
505 	}
506 
507 	spin_lock(&tree->lock);
508 	if (cached_state) {
509 		cached = *cached_state;
510 
511 		if (clear) {
512 			*cached_state = NULL;
513 			cached_state = NULL;
514 		}
515 
516 		if (cached && cached->tree && cached->start <= start &&
517 		    cached->end > start) {
518 			if (clear)
519 				atomic_dec(&cached->refs);
520 			state = cached;
521 			goto hit_next;
522 		}
523 		if (clear)
524 			free_extent_state(cached);
525 	}
526 	/*
527 	 * this search will find the extents that end after
528 	 * our range starts
529 	 */
530 	node = tree_search(tree, start);
531 	if (!node)
532 		goto out;
533 	state = rb_entry(node, struct extent_state, rb_node);
534 hit_next:
535 	if (state->start > end)
536 		goto out;
537 	WARN_ON(state->end < start);
538 	last_end = state->end;
539 
540 	/* the state doesn't have the wanted bits, go ahead */
541 	if (!(state->state & bits)) {
542 		state = next_state(state);
543 		goto next;
544 	}
545 
546 	/*
547 	 *     | ---- desired range ---- |
548 	 *  | state | or
549 	 *  | ------------- state -------------- |
550 	 *
551 	 * We need to split the extent we found, and may flip
552 	 * bits on second half.
553 	 *
554 	 * If the extent we found extends past our range, we
555 	 * just split and search again.  It'll get split again
556 	 * the next time though.
557 	 *
558 	 * If the extent we found is inside our range, we clear
559 	 * the desired bit on it.
560 	 */
561 
562 	if (state->start < start) {
563 		prealloc = alloc_extent_state_atomic(prealloc);
564 		BUG_ON(!prealloc);
565 		err = split_state(tree, state, prealloc, start);
566 		if (err)
567 			extent_io_tree_panic(tree, err);
568 
569 		prealloc = NULL;
570 		if (err)
571 			goto out;
572 		if (state->end <= end) {
573 			clear_state_bit(tree, state, &bits, wake);
574 			if (last_end == (u64)-1)
575 				goto out;
576 			start = last_end + 1;
577 		}
578 		goto search_again;
579 	}
580 	/*
581 	 * | ---- desired range ---- |
582 	 *                        | state |
583 	 * We need to split the extent, and clear the bit
584 	 * on the first half
585 	 */
586 	if (state->start <= end && state->end > end) {
587 		prealloc = alloc_extent_state_atomic(prealloc);
588 		BUG_ON(!prealloc);
589 		err = split_state(tree, state, prealloc, end + 1);
590 		if (err)
591 			extent_io_tree_panic(tree, err);
592 
593 		if (wake)
594 			wake_up(&state->wq);
595 
596 		clear_state_bit(tree, prealloc, &bits, wake);
597 
598 		prealloc = NULL;
599 		goto out;
600 	}
601 
602 	state = clear_state_bit(tree, state, &bits, wake);
603 next:
604 	if (last_end == (u64)-1)
605 		goto out;
606 	start = last_end + 1;
607 	if (start <= end && state && !need_resched())
608 		goto hit_next;
609 	goto search_again;
610 
611 out:
612 	spin_unlock(&tree->lock);
613 	if (prealloc)
614 		free_extent_state(prealloc);
615 
616 	return 0;
617 
618 search_again:
619 	if (start > end)
620 		goto out;
621 	spin_unlock(&tree->lock);
622 	if (mask & __GFP_WAIT)
623 		cond_resched();
624 	goto again;
625 }
626 
627 static void wait_on_state(struct extent_io_tree *tree,
628 			  struct extent_state *state)
629 		__releases(tree->lock)
630 		__acquires(tree->lock)
631 {
632 	DEFINE_WAIT(wait);
633 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
634 	spin_unlock(&tree->lock);
635 	schedule();
636 	spin_lock(&tree->lock);
637 	finish_wait(&state->wq, &wait);
638 }
639 
640 /*
641  * waits for one or more bits to clear on a range in the state tree.
642  * The range [start, end] is inclusive.
643  * The tree lock is taken by this function
644  */
645 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
646 {
647 	struct extent_state *state;
648 	struct rb_node *node;
649 
650 	spin_lock(&tree->lock);
651 again:
652 	while (1) {
653 		/*
654 		 * this search will find all the extents that end after
655 		 * our range starts
656 		 */
657 		node = tree_search(tree, start);
658 		if (!node)
659 			break;
660 
661 		state = rb_entry(node, struct extent_state, rb_node);
662 
663 		if (state->start > end)
664 			goto out;
665 
666 		if (state->state & bits) {
667 			start = state->start;
668 			atomic_inc(&state->refs);
669 			wait_on_state(tree, state);
670 			free_extent_state(state);
671 			goto again;
672 		}
673 		start = state->end + 1;
674 
675 		if (start > end)
676 			break;
677 
678 		cond_resched_lock(&tree->lock);
679 	}
680 out:
681 	spin_unlock(&tree->lock);
682 }
683 
684 static void set_state_bits(struct extent_io_tree *tree,
685 			   struct extent_state *state,
686 			   int *bits)
687 {
688 	int bits_to_set = *bits & ~EXTENT_CTLBITS;
689 
690 	set_state_cb(tree, state, bits);
691 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
692 		u64 range = state->end - state->start + 1;
693 		tree->dirty_bytes += range;
694 	}
695 	state->state |= bits_to_set;
696 }
697 
698 static void cache_state(struct extent_state *state,
699 			struct extent_state **cached_ptr)
700 {
701 	if (cached_ptr && !(*cached_ptr)) {
702 		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
703 			*cached_ptr = state;
704 			atomic_inc(&state->refs);
705 		}
706 	}
707 }
708 
709 static void uncache_state(struct extent_state **cached_ptr)
710 {
711 	if (cached_ptr && (*cached_ptr)) {
712 		struct extent_state *state = *cached_ptr;
713 		*cached_ptr = NULL;
714 		free_extent_state(state);
715 	}
716 }
717 
718 /*
719  * set some bits on a range in the tree.  This may require allocations or
720  * sleeping, so the gfp mask is used to indicate what is allowed.
721  *
722  * If any of the exclusive bits are set, this will fail with -EEXIST if some
723  * part of the range already has the desired bits set.  The start of the
724  * existing range is returned in failed_start in this case.
725  *
726  * [start, end] is inclusive.  This takes the tree lock.
727  */
728 
729 static int __must_check
730 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
731 		 int bits, int exclusive_bits, u64 *failed_start,
732 		 struct extent_state **cached_state, gfp_t mask)
733 {
734 	struct extent_state *state;
735 	struct extent_state *prealloc = NULL;
736 	struct rb_node *node;
737 	int err = 0;
738 	u64 last_start;
739 	u64 last_end;
740 
741 	bits |= EXTENT_FIRST_DELALLOC;
742 again:
743 	if (!prealloc && (mask & __GFP_WAIT)) {
744 		prealloc = alloc_extent_state(mask);
745 		BUG_ON(!prealloc);
746 	}
747 
748 	spin_lock(&tree->lock);
749 	if (cached_state && *cached_state) {
750 		state = *cached_state;
751 		if (state->start <= start && state->end > start &&
752 		    state->tree) {
753 			node = &state->rb_node;
754 			goto hit_next;
755 		}
756 	}
757 	/*
758 	 * this search will find all the extents that end after
759 	 * our range starts.
760 	 */
761 	node = tree_search(tree, start);
762 	if (!node) {
763 		prealloc = alloc_extent_state_atomic(prealloc);
764 		BUG_ON(!prealloc);
765 		err = insert_state(tree, prealloc, start, end, &bits);
766 		if (err)
767 			extent_io_tree_panic(tree, err);
768 
769 		prealloc = NULL;
770 		goto out;
771 	}
772 	state = rb_entry(node, struct extent_state, rb_node);
773 hit_next:
774 	last_start = state->start;
775 	last_end = state->end;
776 
777 	/*
778 	 * | ---- desired range ---- |
779 	 * | state |
780 	 *
781 	 * Just lock what we found and keep going
782 	 */
783 	if (state->start == start && state->end <= end) {
784 		struct rb_node *next_node;
785 		if (state->state & exclusive_bits) {
786 			*failed_start = state->start;
787 			err = -EEXIST;
788 			goto out;
789 		}
790 
791 		set_state_bits(tree, state, &bits);
792 
793 		cache_state(state, cached_state);
794 		merge_state(tree, state);
795 		if (last_end == (u64)-1)
796 			goto out;
797 
798 		start = last_end + 1;
799 		next_node = rb_next(&state->rb_node);
800 		if (next_node && start < end && prealloc && !need_resched()) {
801 			state = rb_entry(next_node, struct extent_state,
802 					 rb_node);
803 			if (state->start == start)
804 				goto hit_next;
805 		}
806 		goto search_again;
807 	}
808 
809 	/*
810 	 *     | ---- desired range ---- |
811 	 * | state |
812 	 *   or
813 	 * | ------------- state -------------- |
814 	 *
815 	 * We need to split the extent we found, and may flip bits on
816 	 * second half.
817 	 *
818 	 * If the extent we found extends past our
819 	 * range, we just split and search again.  It'll get split
820 	 * again the next time though.
821 	 *
822 	 * If the extent we found is inside our range, we set the
823 	 * desired bit on it.
824 	 */
825 	if (state->start < start) {
826 		if (state->state & exclusive_bits) {
827 			*failed_start = start;
828 			err = -EEXIST;
829 			goto out;
830 		}
831 
832 		prealloc = alloc_extent_state_atomic(prealloc);
833 		BUG_ON(!prealloc);
834 		err = split_state(tree, state, prealloc, start);
835 		if (err)
836 			extent_io_tree_panic(tree, err);
837 
838 		prealloc = NULL;
839 		if (err)
840 			goto out;
841 		if (state->end <= end) {
842 			set_state_bits(tree, state, &bits);
843 			cache_state(state, cached_state);
844 			merge_state(tree, state);
845 			if (last_end == (u64)-1)
846 				goto out;
847 			start = last_end + 1;
848 		}
849 		goto search_again;
850 	}
851 	/*
852 	 * | ---- desired range ---- |
853 	 *     | state | or               | state |
854 	 *
855 	 * There's a hole, we need to insert something in it and
856 	 * ignore the extent we found.
857 	 */
858 	if (state->start > start) {
859 		u64 this_end;
860 		if (end < last_start)
861 			this_end = end;
862 		else
863 			this_end = last_start - 1;
864 
865 		prealloc = alloc_extent_state_atomic(prealloc);
866 		BUG_ON(!prealloc);
867 
868 		/*
869 		 * Avoid freeing 'prealloc' if it can be merged with
870 		 * the later extent.
871 		 */
872 		err = insert_state(tree, prealloc, start, this_end,
873 				   &bits);
874 		if (err)
875 			extent_io_tree_panic(tree, err);
876 
877 		cache_state(prealloc, cached_state);
878 		prealloc = NULL;
879 		start = this_end + 1;
880 		goto search_again;
881 	}
882 	/*
883 	 * | ---- desired range ---- |
884 	 *                        | state |
885 	 * We need to split the extent, and set the bit
886 	 * on the first half
887 	 */
888 	if (state->start <= end && state->end > end) {
889 		if (state->state & exclusive_bits) {
890 			*failed_start = start;
891 			err = -EEXIST;
892 			goto out;
893 		}
894 
895 		prealloc = alloc_extent_state_atomic(prealloc);
896 		BUG_ON(!prealloc);
897 		err = split_state(tree, state, prealloc, end + 1);
898 		if (err)
899 			extent_io_tree_panic(tree, err);
900 
901 		set_state_bits(tree, prealloc, &bits);
902 		cache_state(prealloc, cached_state);
903 		merge_state(tree, prealloc);
904 		prealloc = NULL;
905 		goto out;
906 	}
907 
908 	goto search_again;
909 
910 out:
911 	spin_unlock(&tree->lock);
912 	if (prealloc)
913 		free_extent_state(prealloc);
914 
915 	return err;
916 
917 search_again:
918 	if (start > end)
919 		goto out;
920 	spin_unlock(&tree->lock);
921 	if (mask & __GFP_WAIT)
922 		cond_resched();
923 	goto again;
924 }
925 
926 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
927 		   u64 *failed_start, struct extent_state **cached_state,
928 		   gfp_t mask)
929 {
930 	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
931 				cached_state, mask);
932 }
933 
934 
935 /**
936  * convert_extent_bit - convert all bits in a given range from one bit to another
937  * @tree:	the io tree to search
938  * @start:	the start offset in bytes
939  * @end:	the end offset in bytes (inclusive)
940  * @bits:	the bits to set in this range
941  * @clear_bits:	the bits to clear in this range
942  * @mask:	the allocation mask
943  *
944  * This will go through and set bits for the given range.  If any states exist
945  * already in this range they are set with the given bit and cleared of the
946  * clear_bits.  This is only meant to be used by things that are mergeable, ie
947  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
948  * boundary bits like LOCK.
949  */
950 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
951 		       int bits, int clear_bits, gfp_t mask)
952 {
953 	struct extent_state *state;
954 	struct extent_state *prealloc = NULL;
955 	struct rb_node *node;
956 	int err = 0;
957 	u64 last_start;
958 	u64 last_end;
959 
960 again:
961 	if (!prealloc && (mask & __GFP_WAIT)) {
962 		prealloc = alloc_extent_state(mask);
963 		if (!prealloc)
964 			return -ENOMEM;
965 	}
966 
967 	spin_lock(&tree->lock);
968 	/*
969 	 * this search will find all the extents that end after
970 	 * our range starts.
971 	 */
972 	node = tree_search(tree, start);
973 	if (!node) {
974 		prealloc = alloc_extent_state_atomic(prealloc);
975 		if (!prealloc) {
976 			err = -ENOMEM;
977 			goto out;
978 		}
979 		err = insert_state(tree, prealloc, start, end, &bits);
980 		prealloc = NULL;
981 		if (err)
982 			extent_io_tree_panic(tree, err);
983 		goto out;
984 	}
985 	state = rb_entry(node, struct extent_state, rb_node);
986 hit_next:
987 	last_start = state->start;
988 	last_end = state->end;
989 
990 	/*
991 	 * | ---- desired range ---- |
992 	 * | state |
993 	 *
994 	 * Just lock what we found and keep going
995 	 */
996 	if (state->start == start && state->end <= end) {
997 		struct rb_node *next_node;
998 
999 		set_state_bits(tree, state, &bits);
1000 		clear_state_bit(tree, state, &clear_bits, 0);
1001 		if (last_end == (u64)-1)
1002 			goto out;
1003 
1004 		start = last_end + 1;
1005 		next_node = rb_next(&state->rb_node);
1006 		if (next_node && start < end && prealloc && !need_resched()) {
1007 			state = rb_entry(next_node, struct extent_state,
1008 					 rb_node);
1009 			if (state->start == start)
1010 				goto hit_next;
1011 		}
1012 		goto search_again;
1013 	}
1014 
1015 	/*
1016 	 *     | ---- desired range ---- |
1017 	 * | state |
1018 	 *   or
1019 	 * | ------------- state -------------- |
1020 	 *
1021 	 * We need to split the extent we found, and may flip bits on
1022 	 * second half.
1023 	 *
1024 	 * If the extent we found extends past our
1025 	 * range, we just split and search again.  It'll get split
1026 	 * again the next time though.
1027 	 *
1028 	 * If the extent we found is inside our range, we set the
1029 	 * desired bit on it.
1030 	 */
1031 	if (state->start < start) {
1032 		prealloc = alloc_extent_state_atomic(prealloc);
1033 		if (!prealloc) {
1034 			err = -ENOMEM;
1035 			goto out;
1036 		}
1037 		err = split_state(tree, state, prealloc, start);
1038 		if (err)
1039 			extent_io_tree_panic(tree, err);
1040 		prealloc = NULL;
1041 		if (err)
1042 			goto out;
1043 		if (state->end <= end) {
1044 			set_state_bits(tree, state, &bits);
1045 			clear_state_bit(tree, state, &clear_bits, 0);
1046 			if (last_end == (u64)-1)
1047 				goto out;
1048 			start = last_end + 1;
1049 		}
1050 		goto search_again;
1051 	}
1052 	/*
1053 	 * | ---- desired range ---- |
1054 	 *     | state | or               | state |
1055 	 *
1056 	 * There's a hole, we need to insert something in it and
1057 	 * ignore the extent we found.
1058 	 */
1059 	if (state->start > start) {
1060 		u64 this_end;
1061 		if (end < last_start)
1062 			this_end = end;
1063 		else
1064 			this_end = last_start - 1;
1065 
1066 		prealloc = alloc_extent_state_atomic(prealloc);
1067 		if (!prealloc) {
1068 			err = -ENOMEM;
1069 			goto out;
1070 		}
1071 
1072 		/*
1073 		 * Avoid freeing 'prealloc' if it can be merged with
1074 		 * the later extent.
1075 		 */
1076 		err = insert_state(tree, prealloc, start, this_end,
1077 				   &bits);
1078 		if (err)
1079 			extent_io_tree_panic(tree, err);
1080 		prealloc = NULL;
1081 		start = this_end + 1;
1082 		goto search_again;
1083 	}
1084 	/*
1085 	 * | ---- desired range ---- |
1086 	 *                        | state |
1087 	 * We need to split the extent, and set the bit
1088 	 * on the first half
1089 	 */
1090 	if (state->start <= end && state->end > end) {
1091 		prealloc = alloc_extent_state_atomic(prealloc);
1092 		if (!prealloc) {
1093 			err = -ENOMEM;
1094 			goto out;
1095 		}
1096 
1097 		err = split_state(tree, state, prealloc, end + 1);
1098 		if (err)
1099 			extent_io_tree_panic(tree, err);
1100 
1101 		set_state_bits(tree, prealloc, &bits);
1102 		clear_state_bit(tree, prealloc, &clear_bits, 0);
1103 		prealloc = NULL;
1104 		goto out;
1105 	}
1106 
1107 	goto search_again;
1108 
1109 out:
1110 	spin_unlock(&tree->lock);
1111 	if (prealloc)
1112 		free_extent_state(prealloc);
1113 
1114 	return err;
1115 
1116 search_again:
1117 	if (start > end)
1118 		goto out;
1119 	spin_unlock(&tree->lock);
1120 	if (mask & __GFP_WAIT)
1121 		cond_resched();
1122 	goto again;
1123 }
1124 
1125 /* wrappers around set/clear extent bit */
1126 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1127 		     gfp_t mask)
1128 {
1129 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1130 			      NULL, mask);
1131 }
1132 
1133 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1134 		    int bits, gfp_t mask)
1135 {
1136 	return set_extent_bit(tree, start, end, bits, NULL,
1137 			      NULL, mask);
1138 }
1139 
1140 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1141 		      int bits, gfp_t mask)
1142 {
1143 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1144 }
1145 
1146 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1147 			struct extent_state **cached_state, gfp_t mask)
1148 {
1149 	return set_extent_bit(tree, start, end,
1150 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1151 			      NULL, cached_state, mask);
1152 }
1153 
1154 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1155 		       gfp_t mask)
1156 {
1157 	return clear_extent_bit(tree, start, end,
1158 				EXTENT_DIRTY | EXTENT_DELALLOC |
1159 				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1160 }
1161 
1162 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1163 		     gfp_t mask)
1164 {
1165 	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1166 			      NULL, mask);
1167 }
1168 
1169 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1170 			struct extent_state **cached_state, gfp_t mask)
1171 {
1172 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1173 			      cached_state, mask);
1174 }
1175 
1176 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
1177 				 u64 end, struct extent_state **cached_state,
1178 				 gfp_t mask)
1179 {
1180 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1181 				cached_state, mask);
1182 }
1183 
1184 /*
1185  * either insert or lock state struct between start and end use mask to tell
1186  * us if waiting is desired.
1187  */
1188 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1189 		     int bits, struct extent_state **cached_state)
1190 {
1191 	int err;
1192 	u64 failed_start;
1193 	while (1) {
1194 		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1195 				       EXTENT_LOCKED, &failed_start,
1196 				       cached_state, GFP_NOFS);
1197 		if (err == -EEXIST) {
1198 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1199 			start = failed_start;
1200 		} else
1201 			break;
1202 		WARN_ON(start > end);
1203 	}
1204 	return err;
1205 }
1206 
1207 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1208 {
1209 	return lock_extent_bits(tree, start, end, 0, NULL);
1210 }
1211 
1212 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1213 {
1214 	int err;
1215 	u64 failed_start;
1216 
1217 	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1218 			       &failed_start, NULL, GFP_NOFS);
1219 	if (err == -EEXIST) {
1220 		if (failed_start > start)
1221 			clear_extent_bit(tree, start, failed_start - 1,
1222 					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1223 		return 0;
1224 	}
1225 	return 1;
1226 }
1227 
1228 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1229 			 struct extent_state **cached, gfp_t mask)
1230 {
1231 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1232 				mask);
1233 }
1234 
1235 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1236 {
1237 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1238 				GFP_NOFS);
1239 }
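
These locking helpers are used in matched pairs by the rest of btrfs. A hedged, illustrative call sequence (a fragment, not a standalone program; it assumes the kernel context of this file with 'tree', 'start' and 'end' already valid):

/* Illustrative fragment only -- assumes the btrfs kernel context of
 * this file, with 'tree', 'start' and 'end' set up by the caller. */
lock_extent(tree, start, end);     /* waits until [start, end] is exclusively locked */
/* ... operate on the pages covering [start, end] ... */
unlock_extent(tree, start, end);   /* clears EXTENT_LOCKED and wakes any waiters */
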
1240 
1241 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1242 {
1243 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1244 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1245 	struct page *page;
1246 
1247 	while (index <= end_index) {
1248 		page = find_get_page(inode->i_mapping, index);
1249 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1250 		clear_page_dirty_for_io(page);
1251 		page_cache_release(page);
1252 		index++;
1253 	}
1254 	return 0;
1255 }
1256 
1257 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1258 {
1259 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1260 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1261 	struct page *page;
1262 
1263 	while (index <= end_index) {
1264 		page = find_get_page(inode->i_mapping, index);
1265 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1266 		account_page_redirty(page);
1267 		__set_page_dirty_nobuffers(page);
1268 		page_cache_release(page);
1269 		index++;
1270 	}
1271 	return 0;
1272 }
1273 
1274 /*
1275  * helper function to set both pages and extents in the tree writeback
1276  */
1277 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1278 {
1279 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1280 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1281 	struct page *page;
1282 
1283 	while (index <= end_index) {
1284 		page = find_get_page(tree->mapping, index);
1285 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1286 		set_page_writeback(page);
1287 		page_cache_release(page);
1288 		index++;
1289 	}
1290 	return 0;
1291 }
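
All three page-walking helpers above derive their bounds the same way: an inclusive byte range [start, end] covers page indices start >> PAGE_CACHE_SHIFT through end >> PAGE_CACHE_SHIFT. A standalone sketch of that arithmetic, assuming 4096-byte pages purely for illustration:

/* Model of the page-walk bounds used above; PAGE_SHIFT of 12 (4K
 * pages) is an assumption for the demo, not taken from this file. */
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12

int main(void)
{
	unsigned long long start = 5000, end = 20000;
	unsigned long index = start >> MODEL_PAGE_SHIFT;
	unsigned long end_index = end >> MODEL_PAGE_SHIFT;

	/* Pages 1..4 are touched even though neither endpoint is aligned. */
	printf("pages %lu..%lu\n", index, end_index);
	return 0;
}
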
1292 
1293 /* find the first state struct with 'bits' set after 'start', and
1294  * return it.  tree->lock must be held.  NULL will be returned if
1295  * nothing was found after 'start'
1296  */
1297 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1298 						 u64 start, int bits)
1299 {
1300 	struct rb_node *node;
1301 	struct extent_state *state;
1302 
1303 	/*
1304 	 * this search will find all the extents that end after
1305 	 * our range starts.
1306 	 */
1307 	node = tree_search(tree, start);
1308 	if (!node)
1309 		goto out;
1310 
1311 	while (1) {
1312 		state = rb_entry(node, struct extent_state, rb_node);
1313 		if (state->end >= start && (state->state & bits))
1314 			return state;
1315 
1316 		node = rb_next(node);
1317 		if (!node)
1318 			break;
1319 	}
1320 out:
1321 	return NULL;
1322 }
1323 
1324 /*
1325  * find the first offset in the io tree with 'bits' set. zero is
1326  * returned if we find something, and *start_ret and *end_ret are
1327  * set to reflect the state struct that was found.
1328  *
1329  * If nothing was found, 1 is returned, < 0 on error
1330  */
1331 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1332 			  u64 *start_ret, u64 *end_ret, int bits)
1333 {
1334 	struct extent_state *state;
1335 	int ret = 1;
1336 
1337 	spin_lock(&tree->lock);
1338 	state = find_first_extent_bit_state(tree, start, bits);
1339 	if (state) {
1340 		*start_ret = state->start;
1341 		*end_ret = state->end;
1342 		ret = 0;
1343 	}
1344 	spin_unlock(&tree->lock);
1345 	return ret;
1346 }
1347 
1348 /*
1349  * find a contiguous range of bytes in the file marked as delalloc, not
1350  * more than 'max_bytes'.  start and end are used to return the range,
1351  *
1352  * 1 is returned if we find something, 0 if nothing was in the tree
1353  */
1354 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1355 					u64 *start, u64 *end, u64 max_bytes,
1356 					struct extent_state **cached_state)
1357 {
1358 	struct rb_node *node;
1359 	struct extent_state *state;
1360 	u64 cur_start = *start;
1361 	u64 found = 0;
1362 	u64 total_bytes = 0;
1363 
1364 	spin_lock(&tree->lock);
1365 
1366 	/*
1367 	 * this search will find all the extents that end after
1368 	 * our range starts.
1369 	 */
1370 	node = tree_search(tree, cur_start);
1371 	if (!node) {
1372 		if (!found)
1373 			*end = (u64)-1;
1374 		goto out;
1375 	}
1376 
1377 	while (1) {
1378 		state = rb_entry(node, struct extent_state, rb_node);
1379 		if (found && (state->start != cur_start ||
1380 			      (state->state & EXTENT_BOUNDARY))) {
1381 			goto out;
1382 		}
1383 		if (!(state->state & EXTENT_DELALLOC)) {
1384 			if (!found)
1385 				*end = state->end;
1386 			goto out;
1387 		}
1388 		if (!found) {
1389 			*start = state->start;
1390 			*cached_state = state;
1391 			atomic_inc(&state->refs);
1392 		}
1393 		found++;
1394 		*end = state->end;
1395 		cur_start = state->end + 1;
1396 		node = rb_next(node);
1397 		if (!node)
1398 			break;
1399 		total_bytes += state->end - state->start + 1;
1400 		if (total_bytes >= max_bytes)
1401 			break;
1402 	}
1403 out:
1404 	spin_unlock(&tree->lock);
1405 	return found;
1406 }
1407 
1408 static noinline void __unlock_for_delalloc(struct inode *inode,
1409 					   struct page *locked_page,
1410 					   u64 start, u64 end)
1411 {
1412 	int ret;
1413 	struct page *pages[16];
1414 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1415 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1416 	unsigned long nr_pages = end_index - index + 1;
1417 	int i;
1418 
1419 	if (index == locked_page->index && end_index == index)
1420 		return;
1421 
1422 	while (nr_pages > 0) {
1423 		ret = find_get_pages_contig(inode->i_mapping, index,
1424 				     min_t(unsigned long, nr_pages,
1425 				     ARRAY_SIZE(pages)), pages);
1426 		for (i = 0; i < ret; i++) {
1427 			if (pages[i] != locked_page)
1428 				unlock_page(pages[i]);
1429 			page_cache_release(pages[i]);
1430 		}
1431 		nr_pages -= ret;
1432 		index += ret;
1433 		cond_resched();
1434 	}
1435 }
1436 
1437 static noinline int lock_delalloc_pages(struct inode *inode,
1438 					struct page *locked_page,
1439 					u64 delalloc_start,
1440 					u64 delalloc_end)
1441 {
1442 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1443 	unsigned long start_index = index;
1444 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1445 	unsigned long pages_locked = 0;
1446 	struct page *pages[16];
1447 	unsigned long nrpages;
1448 	int ret;
1449 	int i;
1450 
1451 	/* the caller is responsible for locking the start index */
1452 	if (index == locked_page->index && index == end_index)
1453 		return 0;
1454 
1455 	/* skip the page at the start index */
1456 	nrpages = end_index - index + 1;
1457 	while (nrpages > 0) {
1458 		ret = find_get_pages_contig(inode->i_mapping, index,
1459 				     min_t(unsigned long,
1460 				     nrpages, ARRAY_SIZE(pages)), pages);
1461 		if (ret == 0) {
1462 			ret = -EAGAIN;
1463 			goto done;
1464 		}
1465 		/* now we have an array of pages, lock them all */
1466 		for (i = 0; i < ret; i++) {
1467 			/*
1468 			 * the caller is taking responsibility for
1469 			 * locked_page
1470 			 */
1471 			if (pages[i] != locked_page) {
1472 				lock_page(pages[i]);
1473 				if (!PageDirty(pages[i]) ||
1474 				    pages[i]->mapping != inode->i_mapping) {
1475 					ret = -EAGAIN;
1476 					unlock_page(pages[i]);
1477 					page_cache_release(pages[i]);
1478 					goto done;
1479 				}
1480 			}
1481 			page_cache_release(pages[i]);
1482 			pages_locked++;
1483 		}
1484 		nrpages -= ret;
1485 		index += ret;
1486 		cond_resched();
1487 	}
1488 	ret = 0;
1489 done:
1490 	if (ret && pages_locked) {
1491 		__unlock_for_delalloc(inode, locked_page,
1492 			      delalloc_start,
1493 			      ((u64)(start_index + pages_locked - 1)) <<
1494 			      PAGE_CACHE_SHIFT);
1495 	}
1496 	return ret;
1497 }
1498 
1499 /*
1500  * find a contiguous range of bytes in the file marked as delalloc, not
1501  * more than 'max_bytes'.  start and end are used to return the range,
1502  *
1503  * 1 is returned if we find something, 0 if nothing was in the tree
1504  */
1505 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1506 					     struct extent_io_tree *tree,
1507 					     struct page *locked_page,
1508 					     u64 *start, u64 *end,
1509 					     u64 max_bytes)
1510 {
1511 	u64 delalloc_start;
1512 	u64 delalloc_end;
1513 	u64 found;
1514 	struct extent_state *cached_state = NULL;
1515 	int ret;
1516 	int loops = 0;
1517 
1518 again:
1519 	/* step one, find a bunch of delalloc bytes starting at start */
1520 	delalloc_start = *start;
1521 	delalloc_end = 0;
1522 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1523 				    max_bytes, &cached_state);
1524 	if (!found || delalloc_end <= *start) {
1525 		*start = delalloc_start;
1526 		*end = delalloc_end;
1527 		free_extent_state(cached_state);
1528 		return found;
1529 	}
1530 
1531 	/*
1532 	 * start comes from the offset of locked_page.  We have to lock
1533 	 * pages in order, so we can't process delalloc bytes before
1534 	 * locked_page
1535 	 */
1536 	if (delalloc_start < *start)
1537 		delalloc_start = *start;
1538 
1539 	/*
1540 	 * make sure to limit the number of pages we try to lock down
1541 	 * if we're looping.
1542 	 */
1543 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1544 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1545 
1546 	/* step two, lock all the pages after the page that has start */
1547 	ret = lock_delalloc_pages(inode, locked_page,
1548 				  delalloc_start, delalloc_end);
1549 	if (ret == -EAGAIN) {
1550 		/* some of the pages are gone, lets avoid looping by
1551 		 * shortening the size of the delalloc range we're searching
1552 		 */
1553 		free_extent_state(cached_state);
1554 		if (!loops) {
1555 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1556 			max_bytes = PAGE_CACHE_SIZE - offset;
1557 			loops = 1;
1558 			goto again;
1559 		} else {
1560 			found = 0;
1561 			goto out_failed;
1562 		}
1563 	}
1564 	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1565 
1566 	/* step three, lock the state bits for the whole range */
1567 	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1568 
1569 	/* then test to make sure it is all still delalloc */
1570 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1571 			     EXTENT_DELALLOC, 1, cached_state);
1572 	if (!ret) {
1573 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1574 				     &cached_state, GFP_NOFS);
1575 		__unlock_for_delalloc(inode, locked_page,
1576 			      delalloc_start, delalloc_end);
1577 		cond_resched();
1578 		goto again;
1579 	}
1580 	free_extent_state(cached_state);
1581 	*start = delalloc_start;
1582 	*end = delalloc_end;
1583 out_failed:
1584 	return found;
1585 }
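
The -EAGAIN fallback above shrinks the search window to the remainder of the page containing *start before retrying once. A standalone sketch of that window calculation, with a 4096-byte page size assumed only for the demonstration:

/* Model of the retry-window shrink: offset into the page plus the
 * retried byte count always add up to one page. */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long start = 10000;
	unsigned long long offset = start & (MODEL_PAGE_SIZE - 1);
	unsigned long long max_bytes = MODEL_PAGE_SIZE - offset;

	/* start 10000 sits 1808 bytes into its page, so only the
	 * remaining 2288 bytes of that page are retried. */
	printf("offset %llu, retry window %llu bytes\n", offset, max_bytes);
	return 0;
}
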
1586 
1587 int extent_clear_unlock_delalloc(struct inode *inode,
1588 				struct extent_io_tree *tree,
1589 				u64 start, u64 end, struct page *locked_page,
1590 				unsigned long op)
1591 {
1592 	int ret;
1593 	struct page *pages[16];
1594 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1595 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1596 	unsigned long nr_pages = end_index - index + 1;
1597 	int i;
1598 	int clear_bits = 0;
1599 
1600 	if (op & EXTENT_CLEAR_UNLOCK)
1601 		clear_bits |= EXTENT_LOCKED;
1602 	if (op & EXTENT_CLEAR_DIRTY)
1603 		clear_bits |= EXTENT_DIRTY;
1604 
1605 	if (op & EXTENT_CLEAR_DELALLOC)
1606 		clear_bits |= EXTENT_DELALLOC;
1607 
1608 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1609 	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1610 		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1611 		    EXTENT_SET_PRIVATE2)))
1612 		return 0;
1613 
1614 	while (nr_pages > 0) {
1615 		ret = find_get_pages_contig(inode->i_mapping, index,
1616 				     min_t(unsigned long,
1617 				     nr_pages, ARRAY_SIZE(pages)), pages);
1618 		for (i = 0; i < ret; i++) {
1619 
1620 			if (op & EXTENT_SET_PRIVATE2)
1621 				SetPagePrivate2(pages[i]);
1622 
1623 			if (pages[i] == locked_page) {
1624 				page_cache_release(pages[i]);
1625 				continue;
1626 			}
1627 			if (op & EXTENT_CLEAR_DIRTY)
1628 				clear_page_dirty_for_io(pages[i]);
1629 			if (op & EXTENT_SET_WRITEBACK)
1630 				set_page_writeback(pages[i]);
1631 			if (op & EXTENT_END_WRITEBACK)
1632 				end_page_writeback(pages[i]);
1633 			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1634 				unlock_page(pages[i]);
1635 			page_cache_release(pages[i]);
1636 		}
1637 		nr_pages -= ret;
1638 		index += ret;
1639 		cond_resched();
1640 	}
1641 	return 0;
1642 }
1643 
1644 /*
1645  * count the number of bytes in the tree that have a given bit(s)
1646  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1647  * cached.  The total number found is returned.
1648  */
1649 u64 count_range_bits(struct extent_io_tree *tree,
1650 		     u64 *start, u64 search_end, u64 max_bytes,
1651 		     unsigned long bits, int contig)
1652 {
1653 	struct rb_node *node;
1654 	struct extent_state *state;
1655 	u64 cur_start = *start;
1656 	u64 total_bytes = 0;
1657 	u64 last = 0;
1658 	int found = 0;
1659 
1660 	if (search_end <= cur_start) {
1661 		WARN_ON(1);
1662 		return 0;
1663 	}
1664 
1665 	spin_lock(&tree->lock);
1666 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1667 		total_bytes = tree->dirty_bytes;
1668 		goto out;
1669 	}
1670 	/*
1671 	 * this search will find all the extents that end after
1672 	 * our range starts.
1673 	 */
1674 	node = tree_search(tree, cur_start);
1675 	if (!node)
1676 		goto out;
1677 
1678 	while (1) {
1679 		state = rb_entry(node, struct extent_state, rb_node);
1680 		if (state->start > search_end)
1681 			break;
1682 		if (contig && found && state->start > last + 1)
1683 			break;
1684 		if (state->end >= cur_start && (state->state & bits) == bits) {
1685 			total_bytes += min(search_end, state->end) + 1 -
1686 				       max(cur_start, state->start);
1687 			if (total_bytes >= max_bytes)
1688 				break;
1689 			if (!found) {
1690 				*start = max(cur_start, state->start);
1691 				found = 1;
1692 			}
1693 			last = state->end;
1694 		} else if (contig && found) {
1695 			break;
1696 		}
1697 		node = rb_next(node);
1698 		if (!node)
1699 			break;
1700 	}
1701 out:
1702 	spin_unlock(&tree->lock);
1703 	return total_bytes;
1704 }
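
The byte count accumulated per state is the size of the inclusive intersection between the search range and that state. A standalone model of the expression min(search_end, state->end) + 1 - max(cur_start, state->start), illustrative only:

/* Model of the per-state byte contribution in count_range_bits():
 * the inclusive intersection of [cur_start, search_end] and
 * [st_start, st_end] (the caller guarantees they overlap). */
#include <stdio.h>

static unsigned long long overlap(unsigned long long cur_start,
				  unsigned long long search_end,
				  unsigned long long st_start,
				  unsigned long long st_end)
{
	unsigned long long lo = cur_start > st_start ? cur_start : st_start;
	unsigned long long hi = search_end < st_end ? search_end : st_end;

	return hi + 1 - lo;
}

int main(void)
{
	/* search [1000, 9000] against a state covering [0, 4095]:
	 * the intersection is [1000, 4095] = 3096 bytes. */
	printf("%llu\n", overlap(1000, 9000, 0, 4095));
	return 0;
}
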
1705 
1706 /*
1707  * set the private field for a given byte offset in the tree.  If there isn't
1708  * an extent_state there already, this does nothing.
1709  */
1710 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1711 {
1712 	struct rb_node *node;
1713 	struct extent_state *state;
1714 	int ret = 0;
1715 
1716 	spin_lock(&tree->lock);
1717 	/*
1718 	 * this search will find all the extents that end after
1719 	 * our range starts.
1720 	 */
1721 	node = tree_search(tree, start);
1722 	if (!node) {
1723 		ret = -ENOENT;
1724 		goto out;
1725 	}
1726 	state = rb_entry(node, struct extent_state, rb_node);
1727 	if (state->start != start) {
1728 		ret = -ENOENT;
1729 		goto out;
1730 	}
1731 	state->private = private;
1732 out:
1733 	spin_unlock(&tree->lock);
1734 	return ret;
1735 }
1736 
1737 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1738 {
1739 	struct rb_node *node;
1740 	struct extent_state *state;
1741 	int ret = 0;
1742 
1743 	spin_lock(&tree->lock);
1744 	/*
1745 	 * this search will find all the extents that end after
1746 	 * our range starts.
1747 	 */
1748 	node = tree_search(tree, start);
1749 	if (!node) {
1750 		ret = -ENOENT;
1751 		goto out;
1752 	}
1753 	state = rb_entry(node, struct extent_state, rb_node);
1754 	if (state->start != start) {
1755 		ret = -ENOENT;
1756 		goto out;
1757 	}
1758 	*private = state->private;
1759 out:
1760 	spin_unlock(&tree->lock);
1761 	return ret;
1762 }
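
Together these two helpers stash and recover a single u64 keyed by the exact start offset of an extent_state (btrfs uses this, for example, to remember expected data checksums). A hedged illustrative fragment, assuming the kernel context of this file; the stored value is arbitrary:

/* Illustrative fragment only -- assumes the kernel context of this
 * file, with 'tree' and 'start' already set up.  Both helpers return
 * -ENOENT unless an extent_state begins exactly at 'start'. */
u64 val = 0;
int ret;

ret = set_state_private(tree, start, 12345ULL);
if (!ret)
	ret = get_state_private(tree, start, &val);
/* on success, val == 12345 */
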
1763 
1764 /*
1765  * searches a range in the state tree for a given mask.
1766  * If 'filled' == 1, this returns 1 only if every extent in the range
1767  * has the bits set.  Otherwise, 1 is returned if any bit in the
1768  * range is found set.
1769  */
1770 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1771 		   int bits, int filled, struct extent_state *cached)
1772 {
1773 	struct extent_state *state = NULL;
1774 	struct rb_node *node;
1775 	int bitset = 0;
1776 
1777 	spin_lock(&tree->lock);
1778 	if (cached && cached->tree && cached->start <= start &&
1779 	    cached->end > start)
1780 		node = &cached->rb_node;
1781 	else
1782 		node = tree_search(tree, start);
1783 	while (node && start <= end) {
1784 		state = rb_entry(node, struct extent_state, rb_node);
1785 
1786 		if (filled && state->start > start) {
1787 			bitset = 0;
1788 			break;
1789 		}
1790 
1791 		if (state->start > end)
1792 			break;
1793 
1794 		if (state->state & bits) {
1795 			bitset = 1;
1796 			if (!filled)
1797 				break;
1798 		} else if (filled) {
1799 			bitset = 0;
1800 			break;
1801 		}
1802 
1803 		if (state->end == (u64)-1)
1804 			break;
1805 
1806 		start = state->end + 1;
1807 		if (start > end)
1808 			break;
1809 		node = rb_next(node);
1810 		if (!node) {
1811 			if (filled)
1812 				bitset = 0;
1813 			break;
1814 		}
1815 	}
1816 	spin_unlock(&tree->lock);
1817 	return bitset;
1818 }
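
/*
 * Illustrative sketch (not part of the original file): the two 'filled'
 * modes of test_range_bit() as the page helpers below use them.  With
 * filled == 1 the whole range must be covered by states carrying the bit;
 * with filled == 0 a single state carrying the bit anywhere in the range
 * is enough.
 */
static inline int example_range_fully_uptodate(struct extent_io_tree *tree,
					       u64 start, u64 end)
{
	return test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
}

static inline int example_range_has_lock(struct extent_io_tree *tree,
					 u64 start, u64 end)
{
	return test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
}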
1819 
1820 /*
1821  * helper function to set a given page up to date if all the
1822  * extents in the tree for that page are up to date
1823  */
1824 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1825 {
1826 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1827 	u64 end = start + PAGE_CACHE_SIZE - 1;
1828 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1829 		SetPageUptodate(page);
1830 }
1831 
1832 /*
1833  * helper function to unlock a page if all the extents in the tree
1834  * for that page are unlocked
1835  */
1836 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1837 {
1838 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1839 	u64 end = start + PAGE_CACHE_SIZE - 1;
1840 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1841 		unlock_page(page);
1842 }
1843 
1844 /*
1845  * helper function to end page writeback.  The extent tree no longer tracks
1846  * per-range writeback, so this simply ends writeback on the page
1847  */
1848 static void check_page_writeback(struct extent_io_tree *tree,
1849 				 struct page *page)
1850 {
1851 	end_page_writeback(page);
1852 }
1853 
1854 /*
1855  * When IO fails, either with EIO or csum verification fails, we
1856  * try other mirrors that might have a good copy of the data.  This
1857  * io_failure_record is used to record state as we go through all the
1858  * mirrors.  If another mirror has good data, the page is set up to date
1859  * and things continue.  If a good mirror can't be found, the original
1860  * bio end_io callback is called to indicate things have failed.
1861  */
1862 struct io_failure_record {
1863 	struct page *page;
1864 	u64 start;
1865 	u64 len;
1866 	u64 logical;
1867 	unsigned long bio_flags;
1868 	int this_mirror;
1869 	int failed_mirror;
1870 	int in_validation;
1871 };
1872 
1873 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1874 				int did_repair)
1875 {
1876 	int ret;
1877 	int err = 0;
1878 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1879 
1880 	set_state_private(failure_tree, rec->start, 0);
1881 	ret = clear_extent_bits(failure_tree, rec->start,
1882 				rec->start + rec->len - 1,
1883 				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1884 	if (ret)
1885 		err = ret;
1886 
1887 	if (did_repair) {
1888 		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1889 					rec->start + rec->len - 1,
1890 					EXTENT_DAMAGED, GFP_NOFS);
1891 		if (ret && !err)
1892 			err = ret;
1893 	}
1894 
1895 	kfree(rec);
1896 	return err;
1897 }
1898 
1899 static void repair_io_failure_callback(struct bio *bio, int err)
1900 {
1901 	complete(bio->bi_private);
1902 }
1903 
1904 /*
1905  * this bypasses the standard btrfs submit functions deliberately, as
1906  * the standard behavior is to write all copies in a raid setup. here we only
1907  * want to write the one bad copy. so we do the mapping for ourselves and issue
1908  * submit_bio directly.
1909  * to avoid any synchronization issues, wait for the data after writing, which
1910  * actually prevents the read that triggered the error from finishing.
1911  * currently, there can be no more than two copies of every data bit. thus,
1912  * exactly one rewrite is required.
1913  */
1914 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1915 			u64 length, u64 logical, struct page *page,
1916 			int mirror_num)
1917 {
1918 	struct bio *bio;
1919 	struct btrfs_device *dev;
1920 	DECLARE_COMPLETION_ONSTACK(compl);
1921 	u64 map_length = 0;
1922 	u64 sector;
1923 	struct btrfs_bio *bbio = NULL;
1924 	int ret;
1925 
1926 	BUG_ON(!mirror_num);
1927 
1928 	bio = bio_alloc(GFP_NOFS, 1);
1929 	if (!bio)
1930 		return -EIO;
1931 	bio->bi_private = &compl;
1932 	bio->bi_end_io = repair_io_failure_callback;
1933 	bio->bi_size = 0;
1934 	map_length = length;
1935 
1936 	ret = btrfs_map_block(map_tree, WRITE, logical,
1937 			      &map_length, &bbio, mirror_num);
1938 	if (ret) {
1939 		bio_put(bio);
1940 		return -EIO;
1941 	}
1942 	BUG_ON(mirror_num != bbio->mirror_num);
1943 	sector = bbio->stripes[mirror_num-1].physical >> 9;
1944 	bio->bi_sector = sector;
1945 	dev = bbio->stripes[mirror_num-1].dev;
1946 	kfree(bbio);
1947 	if (!dev || !dev->bdev || !dev->writeable) {
1948 		bio_put(bio);
1949 		return -EIO;
1950 	}
1951 	bio->bi_bdev = dev->bdev;
1952 	bio_add_page(bio, page, length, start-page_offset(page));
1953 	btrfsic_submit_bio(WRITE_SYNC, bio);
1954 	wait_for_completion(&compl);
1955 
1956 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1957 		/* try to remap that extent elsewhere? */
1958 		bio_put(bio);
1959 		return -EIO;
1960 	}
1961 
1962 	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1963 			"sector %llu)\n", page->mapping->host->i_ino, start,
1964 			dev->name, sector);
1965 
1966 	bio_put(bio);
1967 	return 0;
1968 }
1969 
1970 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1971 			 int mirror_num)
1972 {
1973 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1974 	u64 start = eb->start;
1975 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1976 	int ret = 0;
1977 
1978 	for (i = 0; i < num_pages; i++) {
1979 		struct page *p = extent_buffer_page(eb, i);
1980 		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
1981 					start, p, mirror_num);
1982 		if (ret)
1983 			break;
1984 		start += PAGE_CACHE_SIZE;
1985 	}
1986 
1987 	return ret;
1988 }
1989 
1990 /*
1991  * each time an IO finishes, we do a fast check in the IO failure tree
1992  * to see if we need to process or clean up an io_failure_record
1993  */
1994 static int clean_io_failure(u64 start, struct page *page)
1995 {
1996 	u64 private;
1997 	u64 private_failure;
1998 	struct io_failure_record *failrec;
1999 	struct btrfs_mapping_tree *map_tree;
2000 	struct extent_state *state;
2001 	int num_copies;
2002 	int did_repair = 0;
2003 	int ret;
2004 	struct inode *inode = page->mapping->host;
2005 
2006 	private = 0;
2007 	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2008 				(u64)-1, 1, EXTENT_DIRTY, 0);
2009 	if (!ret)
2010 		return 0;
2011 
2012 	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2013 				&private_failure);
2014 	if (ret)
2015 		return 0;
2016 
2017 	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2018 	BUG_ON(!failrec->this_mirror);
2019 
2020 	if (failrec->in_validation) {
2021 		/* there was no real error, just free the record */
2022 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2023 			 failrec->start);
2024 		did_repair = 1;
2025 		goto out;
2026 	}
2027 
2028 	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2029 	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2030 					    failrec->start,
2031 					    EXTENT_LOCKED);
2032 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2033 
2034 	if (state && state->start == failrec->start) {
2035 		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
2036 		num_copies = btrfs_num_copies(map_tree, failrec->logical,
2037 						failrec->len);
2038 		if (num_copies > 1)  {
2039 			ret = repair_io_failure(map_tree, start, failrec->len,
2040 						failrec->logical, page,
2041 						failrec->failed_mirror);
2042 			did_repair = !ret;
2043 		}
2044 	}
2045 
2046 out:
2047 	if (!ret)
2048 		ret = free_io_failure(inode, failrec, did_repair);
2049 
2050 	return ret;
2051 }
2052 
2053 /*
2054  * this is a generic handler for readpage errors (default
2055  * readpage_io_failed_hook). if other copies exist, read those and write back
2056  * good data to the failed position. it does not attempt to remap the failed
2057  * extent elsewhere, hoping the device will be smart enough to do this as
2058  * needed
2059  */
2060 
2061 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2062 				u64 start, u64 end, int failed_mirror,
2063 				struct extent_state *state)
2064 {
2065 	struct io_failure_record *failrec = NULL;
2066 	u64 private;
2067 	struct extent_map *em;
2068 	struct inode *inode = page->mapping->host;
2069 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2070 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2071 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2072 	struct bio *bio;
2073 	int num_copies;
2074 	int ret;
2075 	int read_mode;
2076 	u64 logical;
2077 
2078 	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2079 
2080 	ret = get_state_private(failure_tree, start, &private);
2081 	if (ret) {
2082 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2083 		if (!failrec)
2084 			return -ENOMEM;
2085 		failrec->start = start;
2086 		failrec->len = end - start + 1;
2087 		failrec->this_mirror = 0;
2088 		failrec->bio_flags = 0;
2089 		failrec->in_validation = 0;
2090 
2091 		read_lock(&em_tree->lock);
2092 		em = lookup_extent_mapping(em_tree, start, failrec->len);
2093 		if (!em) {
2094 			read_unlock(&em_tree->lock);
2095 			kfree(failrec);
2096 			return -EIO;
2097 		}
2098 
2099 		if (em->start > start || em->start + em->len < start) {
2100 			free_extent_map(em);
2101 			em = NULL;
2102 		}
2103 		read_unlock(&em_tree->lock);
2104 
2105 		if (!em || IS_ERR(em)) {
2106 			kfree(failrec);
2107 			return -EIO;
2108 		}
2109 		logical = start - em->start;
2110 		logical = em->block_start + logical;
2111 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2112 			logical = em->block_start;
2113 			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2114 			extent_set_compress_type(&failrec->bio_flags,
2115 						 em->compress_type);
2116 		}
2117 		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2118 			 "len=%llu\n", logical, start, failrec->len);
2119 		failrec->logical = logical;
2120 		free_extent_map(em);
2121 
2122 		/* set the bits in the private failure tree */
2123 		ret = set_extent_bits(failure_tree, start, end,
2124 					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2125 		if (ret >= 0)
2126 			ret = set_state_private(failure_tree, start,
2127 						(u64)(unsigned long)failrec);
2128 		/* set the bits in the inode's tree */
2129 		if (ret >= 0)
2130 			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2131 						GFP_NOFS);
2132 		if (ret < 0) {
2133 			kfree(failrec);
2134 			return ret;
2135 		}
2136 	} else {
2137 		failrec = (struct io_failure_record *)(unsigned long)private;
2138 		pr_debug("bio_readpage_error: (found) logical=%llu, "
2139 			 "start=%llu, len=%llu, validation=%d\n",
2140 			 failrec->logical, failrec->start, failrec->len,
2141 			 failrec->in_validation);
2142 		/*
2143 		 * when more than two copies of the data can exist on disk, add to failrec here
2144 		 * (e.g. with a list for failed_mirror) to make
2145 		 * clean_io_failure() clean all those errors at once.
2146 		 */
2147 	}
2148 	num_copies = btrfs_num_copies(
2149 			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
2150 			      failrec->logical, failrec->len);
2151 	if (num_copies == 1) {
2152 		/*
2153 		 * we only have a single copy of the data, so don't bother with
2154 		 * all the retry and error correction code that follows. no
2155 		 * matter what the error is, it is very likely to persist.
2156 		 */
2157 		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2158 			 "state=%p, num_copies=%d, next_mirror %d, "
2159 			 "failed_mirror %d\n", state, num_copies,
2160 			 failrec->this_mirror, failed_mirror);
2161 		free_io_failure(inode, failrec, 0);
2162 		return -EIO;
2163 	}
2164 
2165 	if (!state) {
2166 		spin_lock(&tree->lock);
2167 		state = find_first_extent_bit_state(tree, failrec->start,
2168 						    EXTENT_LOCKED);
2169 		if (state && state->start != failrec->start)
2170 			state = NULL;
2171 		spin_unlock(&tree->lock);
2172 	}
2173 
2174 	/*
2175 	 * there are two goals:
2176 	 *	a) deliver good data to the caller
2177 	 *	b) correct the bad sectors on disk
2178 	 */
2179 	if (failed_bio->bi_vcnt > 1) {
2180 		/*
2181 		 * to fulfill b), we need to know the exact failing sectors, as
2182 		 * we don't want to rewrite any more than the failed ones. thus,
2183 		 * we need separate read requests for the failed bio
2184 		 *
2185 		 * if the following BUG_ON triggers, our validation request got
2186 		 * merged. we need separate requests for our algorithm to work.
2187 		 */
2188 		BUG_ON(failrec->in_validation);
2189 		failrec->in_validation = 1;
2190 		failrec->this_mirror = failed_mirror;
2191 		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2192 	} else {
2193 		/*
2194 		 * we're ready to fulfill a) and b) together. get a good copy
2195 		 * of the failed sector and if we succeed, we have set up
2196 		 * everything for repair_io_failure to do the rest for us.
2197 		 */
2198 		if (failrec->in_validation) {
2199 			BUG_ON(failrec->this_mirror != failed_mirror);
2200 			failrec->in_validation = 0;
2201 			failrec->this_mirror = 0;
2202 		}
2203 		failrec->failed_mirror = failed_mirror;
2204 		failrec->this_mirror++;
2205 		if (failrec->this_mirror == failed_mirror)
2206 			failrec->this_mirror++;
2207 		read_mode = READ_SYNC;
2208 	}
2209 
2210 	if (!state || failrec->this_mirror > num_copies) {
2211 		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2212 			 "next_mirror %d, failed_mirror %d\n", state,
2213 			 num_copies, failrec->this_mirror, failed_mirror);
2214 		free_io_failure(inode, failrec, 0);
2215 		return -EIO;
2216 	}
2217 
2218 	bio = bio_alloc(GFP_NOFS, 1);
2219 	if (!bio) {
2220 		free_io_failure(inode, failrec, 0);
2221 		return -EIO;
2222 	}
2223 	bio->bi_private = state;
2224 	bio->bi_end_io = failed_bio->bi_end_io;
2225 	bio->bi_sector = failrec->logical >> 9;
2226 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2227 	bio->bi_size = 0;
2228 
2229 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2230 
2231 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2232 		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2233 		 failrec->this_mirror, num_copies, failrec->in_validation);
2234 
2235 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2236 					 failrec->this_mirror,
2237 					 failrec->bio_flags, 0);
2238 	return ret;
2239 }
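
/*
 * Illustrative sketch (not part of the original file): how the next mirror
 * is picked above.  this_mirror counts up from 1, skips the mirror that is
 * already known to be bad, and once it exceeds num_copies there is nothing
 * left to try (0 here stands for "give up").
 */
static inline int example_next_mirror(int this_mirror, int failed_mirror,
				      int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	return this_mirror > num_copies ? 0 : this_mirror;
}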
2240 
2241 /* lots and lots of room for performance fixes in the end_bio funcs */
2242 
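/*
 * called when a writepage bio completes for [start, end] of a page.  Returns
 * 1 if the io failed hook already finished writeback for the range (so the
 * caller must not end page writeback again), 0 otherwise.
 */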
2243 int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2244 {
2245 	int uptodate = (err == 0);
2246 	struct extent_io_tree *tree;
2247 	int ret;
2248 
2249 	tree = &BTRFS_I(page->mapping->host)->io_tree;
2250 
2251 	if (tree->ops && tree->ops->writepage_end_io_hook) {
2252 		ret = tree->ops->writepage_end_io_hook(page, start,
2253 					       end, NULL, uptodate);
2254 		if (ret)
2255 			uptodate = 0;
2256 	}
2257 
2258 	if (!uptodate && tree->ops &&
2259 	    tree->ops->writepage_io_failed_hook) {
2260 		ret = tree->ops->writepage_io_failed_hook(NULL, page,
2261 						 start, end, NULL);
2262 		/* Writeback already completed */
2263 		if (ret == 0)
2264 			return 1;
2265 	}
2266 
2267 	if (!uptodate) {
2268 		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2269 		ClearPageUptodate(page);
2270 		SetPageError(page);
2271 	}
2272 	return 0;
2273 }
2274 
2275 /*
2276  * after a writepage IO is done, we need to:
2277  * clear the uptodate bits on error
2278  * clear the writeback bits in the extent tree for this IO
2279  * end_page_writeback if the page has no more pending IO
2280  *
2281  * Scheduling is not allowed, so the extent state tree is expected
2282  * to have one and only one object corresponding to this IO.
2283  */
2284 static void end_bio_extent_writepage(struct bio *bio, int err)
2285 {
2286 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2287 	struct extent_io_tree *tree;
2288 	u64 start;
2289 	u64 end;
2290 	int whole_page;
2291 
2292 	do {
2293 		struct page *page = bvec->bv_page;
2294 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2295 
2296 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2297 			 bvec->bv_offset;
2298 		end = start + bvec->bv_len - 1;
2299 
2300 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2301 			whole_page = 1;
2302 		else
2303 			whole_page = 0;
2304 
2305 		if (--bvec >= bio->bi_io_vec)
2306 			prefetchw(&bvec->bv_page->flags);
2307 
2308 		if (end_extent_writepage(page, err, start, end))
2309 			continue;
2310 
2311 		if (whole_page)
2312 			end_page_writeback(page);
2313 		else
2314 			check_page_writeback(tree, page);
2315 	} while (bvec >= bio->bi_io_vec);
2316 
2317 	bio_put(bio);
2318 }
2319 
2320 /*
2321  * after a readpage IO is done, we need to:
2322  * clear the uptodate bits on error
2323  * set the uptodate bits if things worked
2324  * set the page up to date if all extents in the tree are uptodate
2325  * clear the lock bit in the extent tree
2326  * unlock the page if there are no other extents locked for it
2327  *
2328  * Scheduling is not allowed, so the extent state tree is expected
2329  * to have one and only one object corresponding to this IO.
2330  */
2331 static void end_bio_extent_readpage(struct bio *bio, int err)
2332 {
2333 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2334 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2335 	struct bio_vec *bvec = bio->bi_io_vec;
2336 	struct extent_io_tree *tree;
2337 	u64 start;
2338 	u64 end;
2339 	int whole_page;
2340 	int mirror;
2341 	int ret;
2342 
2343 	if (err)
2344 		uptodate = 0;
2345 
2346 	do {
2347 		struct page *page = bvec->bv_page;
2348 		struct extent_state *cached = NULL;
2349 		struct extent_state *state;
2350 
2351 		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2352 			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2353 			 (long int)bio->bi_bdev);
2354 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2355 
2356 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2357 			bvec->bv_offset;
2358 		end = start + bvec->bv_len - 1;
2359 
2360 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2361 			whole_page = 1;
2362 		else
2363 			whole_page = 0;
2364 
2365 		if (++bvec <= bvec_end)
2366 			prefetchw(&bvec->bv_page->flags);
2367 
2368 		spin_lock(&tree->lock);
2369 		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2370 		if (state && state->start == start) {
2371 			/*
2372 			 * take a reference on the state, unlock will drop
2373 			 * the ref
2374 			 */
2375 			cache_state(state, &cached);
2376 		}
2377 		spin_unlock(&tree->lock);
2378 
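		/*
		 * bio->bi_bdev is (ab)used by the btrfs submission path to
		 * hand the mirror number that was actually read back to this
		 * end_io handler (the pr_debug above prints the same value).
		 */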
2379 		mirror = (int)(unsigned long)bio->bi_bdev;
2380 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2381 			ret = tree->ops->readpage_end_io_hook(page, start, end,
2382 							      state, mirror);
2383 			if (ret)
2384 				uptodate = 0;
2385 			else
2386 				clean_io_failure(start, page);
2387 		}
2388 
2389 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2390 			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2391 			if (!ret && !err &&
2392 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2393 				uptodate = 1;
2394 		} else if (!uptodate) {
2395 			/*
2396 			 * The generic bio_readpage_error handles errors the
2397 			 * following way: If possible, new read requests are
2398 			 * created and submitted and will end up in
2399 			 * end_bio_extent_readpage as well (if we're lucky, not
2400 			 * in the !uptodate case). In that case it returns 0 and
2401 			 * we just go on with the next page in our bio. If it
2402 			 * can't handle the error it will return -EIO and we
2403 			 * remain responsible for that page.
2404 			 */
2405 			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2406 			if (ret == 0) {
2407 				uptodate =
2408 					test_bit(BIO_UPTODATE, &bio->bi_flags);
2409 				if (err)
2410 					uptodate = 0;
2411 				uncache_state(&cached);
2412 				continue;
2413 			}
2414 		}
2415 
2416 		if (uptodate && tree->track_uptodate) {
2417 			set_extent_uptodate(tree, start, end, &cached,
2418 					    GFP_ATOMIC);
2419 		}
2420 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2421 
2422 		if (whole_page) {
2423 			if (uptodate) {
2424 				SetPageUptodate(page);
2425 			} else {
2426 				ClearPageUptodate(page);
2427 				SetPageError(page);
2428 			}
2429 			unlock_page(page);
2430 		} else {
2431 			if (uptodate) {
2432 				check_page_uptodate(tree, page);
2433 			} else {
2434 				ClearPageUptodate(page);
2435 				SetPageError(page);
2436 			}
2437 			check_page_locked(tree, page);
2438 		}
2439 	} while (bvec <= bvec_end);
2440 
2441 	bio_put(bio);
2442 }
2443 
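/*
 * allocate a bio for the given sector.  If the allocation fails while we are
 * in memory reclaim (PF_MEMALLOC), keep halving the number of vecs and retry
 * so that a smaller bio can still make forward progress.
 */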
2444 struct bio *
2445 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2446 		gfp_t gfp_flags)
2447 {
2448 	struct bio *bio;
2449 
2450 	bio = bio_alloc(gfp_flags, nr_vecs);
2451 
2452 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2453 		while (!bio && (nr_vecs /= 2))
2454 			bio = bio_alloc(gfp_flags, nr_vecs);
2455 	}
2456 
2457 	if (bio) {
2458 		bio->bi_size = 0;
2459 		bio->bi_bdev = bdev;
2460 		bio->bi_sector = first_sector;
2461 	}
2462 	return bio;
2463 }
2464 
2465 /*
2466  * Since writes are async, they will only return -ENOMEM.
2467  * Reads can return the full range of I/O error conditions.
2468  */
2469 static int __must_check submit_one_bio(int rw, struct bio *bio,
2470 				       int mirror_num, unsigned long bio_flags)
2471 {
2472 	int ret = 0;
2473 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2474 	struct page *page = bvec->bv_page;
2475 	struct extent_io_tree *tree = bio->bi_private;
2476 	u64 start;
2477 
2478 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2479 
2480 	bio->bi_private = NULL;
2481 
2482 	bio_get(bio);
2483 
2484 	if (tree->ops && tree->ops->submit_bio_hook)
2485 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2486 					   mirror_num, bio_flags, start);
2487 	else
2488 		btrfsic_submit_bio(rw, bio);
2489 
2490 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2491 		ret = -EOPNOTSUPP;
2492 	bio_put(bio);
2493 	return ret;
2494 }
2495 
2496 static int merge_bio(struct extent_io_tree *tree, struct page *page,
2497 		     unsigned long offset, size_t size, struct bio *bio,
2498 		     unsigned long bio_flags)
2499 {
2500 	int ret = 0;
2501 	if (tree->ops && tree->ops->merge_bio_hook)
2502 		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2503 						bio_flags);
2504 	BUG_ON(ret < 0);
2505 	return ret;
2506 
2507 }
2508 
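/*
 * try to add the page to the bio being built up in *bio_ret.  If the page is
 * not contiguous with the bio, the bio flags change, the merge hook rejects
 * it or the bio is simply full, the old bio is submitted and a new one is
 * started for this page.
 */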
2509 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2510 			      struct page *page, sector_t sector,
2511 			      size_t size, unsigned long offset,
2512 			      struct block_device *bdev,
2513 			      struct bio **bio_ret,
2514 			      unsigned long max_pages,
2515 			      bio_end_io_t end_io_func,
2516 			      int mirror_num,
2517 			      unsigned long prev_bio_flags,
2518 			      unsigned long bio_flags)
2519 {
2520 	int ret = 0;
2521 	struct bio *bio;
2522 	int nr;
2523 	int contig = 0;
2524 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2525 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2526 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2527 
2528 	if (bio_ret && *bio_ret) {
2529 		bio = *bio_ret;
2530 		if (old_compressed)
2531 			contig = bio->bi_sector == sector;
2532 		else
2533 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
2534 				sector;
2535 
2536 		if (prev_bio_flags != bio_flags || !contig ||
2537 		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
2538 		    bio_add_page(bio, page, page_size, offset) < page_size) {
2539 			ret = submit_one_bio(rw, bio, mirror_num,
2540 					     prev_bio_flags);
2541 			if (ret < 0)
2542 				return ret;
2543 			bio = NULL;
2544 		} else {
2545 			return 0;
2546 		}
2547 	}
2548 	if (this_compressed)
2549 		nr = BIO_MAX_PAGES;
2550 	else
2551 		nr = bio_get_nr_vecs(bdev);
2552 
2553 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2554 	if (!bio)
2555 		return -ENOMEM;
2556 
2557 	bio_add_page(bio, page, page_size, offset);
2558 	bio->bi_end_io = end_io_func;
2559 	bio->bi_private = tree;
2560 
2561 	if (bio_ret)
2562 		*bio_ret = bio;
2563 	else
2564 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2565 
2566 	return ret;
2567 }
2568 
2569 void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
2570 {
2571 	if (!PagePrivate(page)) {
2572 		SetPagePrivate(page);
2573 		page_cache_get(page);
2574 		set_page_private(page, (unsigned long)eb);
2575 	} else {
2576 		WARN_ON(page->private != (unsigned long)eb);
2577 	}
2578 }
2579 
2580 void set_page_extent_mapped(struct page *page)
2581 {
2582 	if (!PagePrivate(page)) {
2583 		SetPagePrivate(page);
2584 		page_cache_get(page);
2585 		set_page_private(page, EXTENT_PAGE_PRIVATE);
2586 	}
2587 }
2588 
2589 /*
2590  * basic readpage implementation.  Locked extent state structs are inserted
2591  * into the tree and are removed when the IO is done (by the end_io
2592  * handlers)
2593  * XXX JDM: This needs looking at to ensure proper page locking
2594  */
2595 static int __extent_read_full_page(struct extent_io_tree *tree,
2596 				   struct page *page,
2597 				   get_extent_t *get_extent,
2598 				   struct bio **bio, int mirror_num,
2599 				   unsigned long *bio_flags)
2600 {
2601 	struct inode *inode = page->mapping->host;
2602 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2603 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2604 	u64 end;
2605 	u64 cur = start;
2606 	u64 extent_offset;
2607 	u64 last_byte = i_size_read(inode);
2608 	u64 block_start;
2609 	u64 cur_end;
2610 	sector_t sector;
2611 	struct extent_map *em;
2612 	struct block_device *bdev;
2613 	struct btrfs_ordered_extent *ordered;
2614 	int ret;
2615 	int nr = 0;
2616 	size_t pg_offset = 0;
2617 	size_t iosize;
2618 	size_t disk_io_size;
2619 	size_t blocksize = inode->i_sb->s_blocksize;
2620 	unsigned long this_bio_flag = 0;
2621 
2622 	set_page_extent_mapped(page);
2623 
2624 	if (!PageUptodate(page)) {
2625 		if (cleancache_get_page(page) == 0) {
2626 			BUG_ON(blocksize != PAGE_SIZE);
2627 			goto out;
2628 		}
2629 	}
2630 
2631 	end = page_end;
2632 	while (1) {
2633 		lock_extent(tree, start, end);
2634 		ordered = btrfs_lookup_ordered_extent(inode, start);
2635 		if (!ordered)
2636 			break;
2637 		unlock_extent(tree, start, end);
2638 		btrfs_start_ordered_extent(inode, ordered, 1);
2639 		btrfs_put_ordered_extent(ordered);
2640 	}
2641 
2642 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2643 		char *userpage;
2644 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2645 
2646 		if (zero_offset) {
2647 			iosize = PAGE_CACHE_SIZE - zero_offset;
2648 			userpage = kmap_atomic(page);
2649 			memset(userpage + zero_offset, 0, iosize);
2650 			flush_dcache_page(page);
2651 			kunmap_atomic(userpage);
2652 		}
2653 	}
2654 	while (cur <= end) {
2655 		if (cur >= last_byte) {
2656 			char *userpage;
2657 			struct extent_state *cached = NULL;
2658 
2659 			iosize = PAGE_CACHE_SIZE - pg_offset;
2660 			userpage = kmap_atomic(page);
2661 			memset(userpage + pg_offset, 0, iosize);
2662 			flush_dcache_page(page);
2663 			kunmap_atomic(userpage);
2664 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2665 					    &cached, GFP_NOFS);
2666 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2667 					     &cached, GFP_NOFS);
2668 			break;
2669 		}
2670 		em = get_extent(inode, page, pg_offset, cur,
2671 				end - cur + 1, 0);
2672 		if (IS_ERR_OR_NULL(em)) {
2673 			SetPageError(page);
2674 			unlock_extent(tree, cur, end);
2675 			break;
2676 		}
2677 		extent_offset = cur - em->start;
2678 		BUG_ON(extent_map_end(em) <= cur);
2679 		BUG_ON(end < cur);
2680 
2681 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2682 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2683 			extent_set_compress_type(&this_bio_flag,
2684 						 em->compress_type);
2685 		}
2686 
2687 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2688 		cur_end = min(extent_map_end(em) - 1, end);
2689 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2690 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2691 			disk_io_size = em->block_len;
2692 			sector = em->block_start >> 9;
2693 		} else {
2694 			sector = (em->block_start + extent_offset) >> 9;
2695 			disk_io_size = iosize;
2696 		}
2697 		bdev = em->bdev;
2698 		block_start = em->block_start;
2699 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2700 			block_start = EXTENT_MAP_HOLE;
2701 		free_extent_map(em);
2702 		em = NULL;
2703 
2704 		/* we've found a hole, just zero and go on */
2705 		if (block_start == EXTENT_MAP_HOLE) {
2706 			char *userpage;
2707 			struct extent_state *cached = NULL;
2708 
2709 			userpage = kmap_atomic(page);
2710 			memset(userpage + pg_offset, 0, iosize);
2711 			flush_dcache_page(page);
2712 			kunmap_atomic(userpage);
2713 
2714 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2715 					    &cached, GFP_NOFS);
2716 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2717 			                     &cached, GFP_NOFS);
2718 			cur = cur + iosize;
2719 			pg_offset += iosize;
2720 			continue;
2721 		}
2722 		/* the get_extent function already copied the data into the page */
2723 		if (test_range_bit(tree, cur, cur_end,
2724 				   EXTENT_UPTODATE, 1, NULL)) {
2725 			check_page_uptodate(tree, page);
2726 			unlock_extent(tree, cur, cur + iosize - 1);
2727 			cur = cur + iosize;
2728 			pg_offset += iosize;
2729 			continue;
2730 		}
2731 		/* we have an inline extent but it didn't get marked up
2732 		 * to date.  Error out
2733 		 */
2734 		if (block_start == EXTENT_MAP_INLINE) {
2735 			SetPageError(page);
2736 			unlock_extent(tree, cur, cur + iosize - 1);
2737 			cur = cur + iosize;
2738 			pg_offset += iosize;
2739 			continue;
2740 		}
2741 
2742 		ret = 0;
2743 		if (tree->ops && tree->ops->readpage_io_hook) {
2744 			ret = tree->ops->readpage_io_hook(page, cur,
2745 							  cur + iosize - 1);
2746 		}
2747 		if (!ret) {
2748 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2749 			pnr -= page->index;
2750 			ret = submit_extent_page(READ, tree, page,
2751 					 sector, disk_io_size, pg_offset,
2752 					 bdev, bio, pnr,
2753 					 end_bio_extent_readpage, mirror_num,
2754 					 *bio_flags,
2755 					 this_bio_flag);
2756 			BUG_ON(ret == -ENOMEM);
2757 			nr++;
2758 			*bio_flags = this_bio_flag;
2759 		}
2760 		if (ret)
2761 			SetPageError(page);
2762 		cur = cur + iosize;
2763 		pg_offset += iosize;
2764 	}
2765 out:
2766 	if (!nr) {
2767 		if (!PageError(page))
2768 			SetPageUptodate(page);
2769 		unlock_page(page);
2770 	}
2771 	return 0;
2772 }
2773 
2774 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2775 			    get_extent_t *get_extent, int mirror_num)
2776 {
2777 	struct bio *bio = NULL;
2778 	unsigned long bio_flags = 0;
2779 	int ret;
2780 
2781 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2782 				      &bio_flags);
2783 	if (bio)
2784 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2785 	return ret;
2786 }
2787 
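/*
 * keep wbc->nr_to_write and the mapping's writeback_index in step with the
 * pages written so far, so that cyclic writeback resumes where this pass
 * left off.
 */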
2788 static noinline void update_nr_written(struct page *page,
2789 				      struct writeback_control *wbc,
2790 				      unsigned long nr_written)
2791 {
2792 	wbc->nr_to_write -= nr_written;
2793 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2794 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2795 		page->mapping->writeback_index = page->index + nr_written;
2796 }
2797 
2798 /*
2799  * the writepage semantics are similar to regular writepage.  extent
2800  * records are inserted to lock ranges in the tree, and as dirty areas
2801  * are found, they are marked writeback.  Then the lock bits are removed
2802  * and the end_io handler clears the writeback ranges
2803  */
2804 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2805 			      void *data)
2806 {
2807 	struct inode *inode = page->mapping->host;
2808 	struct extent_page_data *epd = data;
2809 	struct extent_io_tree *tree = epd->tree;
2810 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2811 	u64 delalloc_start;
2812 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2813 	u64 end;
2814 	u64 cur = start;
2815 	u64 extent_offset;
2816 	u64 last_byte = i_size_read(inode);
2817 	u64 block_start;
2818 	u64 iosize;
2819 	sector_t sector;
2820 	struct extent_state *cached_state = NULL;
2821 	struct extent_map *em;
2822 	struct block_device *bdev;
2823 	int ret;
2824 	int nr = 0;
2825 	size_t pg_offset = 0;
2826 	size_t blocksize;
2827 	loff_t i_size = i_size_read(inode);
2828 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2829 	u64 nr_delalloc;
2830 	u64 delalloc_end;
2831 	int page_started;
2832 	int compressed;
2833 	int write_flags;
2834 	unsigned long nr_written = 0;
2835 	bool fill_delalloc = true;
2836 
2837 	if (wbc->sync_mode == WB_SYNC_ALL)
2838 		write_flags = WRITE_SYNC;
2839 	else
2840 		write_flags = WRITE;
2841 
2842 	trace___extent_writepage(page, inode, wbc);
2843 
2844 	WARN_ON(!PageLocked(page));
2845 
2846 	ClearPageError(page);
2847 
2848 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2849 	if (page->index > end_index ||
2850 	   (page->index == end_index && !pg_offset)) {
2851 		page->mapping->a_ops->invalidatepage(page, 0);
2852 		unlock_page(page);
2853 		return 0;
2854 	}
2855 
2856 	if (page->index == end_index) {
2857 		char *userpage;
2858 
2859 		userpage = kmap_atomic(page);
2860 		memset(userpage + pg_offset, 0,
2861 		       PAGE_CACHE_SIZE - pg_offset);
2862 		kunmap_atomic(userpage);
2863 		flush_dcache_page(page);
2864 	}
2865 	pg_offset = 0;
2866 
2867 	set_page_extent_mapped(page);
2868 
2869 	if (!tree->ops || !tree->ops->fill_delalloc)
2870 		fill_delalloc = false;
2871 
2872 	delalloc_start = start;
2873 	delalloc_end = 0;
2874 	page_started = 0;
2875 	if (!epd->extent_locked && fill_delalloc) {
2876 		u64 delalloc_to_write = 0;
2877 		/*
2878 		 * make sure the wbc mapping index is at least updated
2879 		 * to this page.
2880 		 */
2881 		update_nr_written(page, wbc, 0);
2882 
2883 		while (delalloc_end < page_end) {
2884 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2885 						       page,
2886 						       &delalloc_start,
2887 						       &delalloc_end,
2888 						       128 * 1024 * 1024);
2889 			if (nr_delalloc == 0) {
2890 				delalloc_start = delalloc_end + 1;
2891 				continue;
2892 			}
2893 			ret = tree->ops->fill_delalloc(inode, page,
2894 						       delalloc_start,
2895 						       delalloc_end,
2896 						       &page_started,
2897 						       &nr_written);
2898 			/* File system has been set read-only */
2899 			if (ret) {
2900 				SetPageError(page);
2901 				goto done;
2902 			}
2903 			/*
2904 			 * delalloc_end is already one less than the total
2905 			 * length, so we don't subtract one from
2906 			 * PAGE_CACHE_SIZE
2907 			 */
2908 			delalloc_to_write += (delalloc_end - delalloc_start +
2909 					      PAGE_CACHE_SIZE) >>
2910 					      PAGE_CACHE_SHIFT;
2911 			delalloc_start = delalloc_end + 1;
2912 		}
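		/*
		 * filling delalloc may have dirtied many more pages than the
		 * caller budgeted for; raise nr_to_write (capped at a sane
		 * threshold) so this pass can make real progress instead of
		 * stopping after the original budget.
		 */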
2913 		if (wbc->nr_to_write < delalloc_to_write) {
2914 			int thresh = 8192;
2915 
2916 			if (delalloc_to_write < thresh * 2)
2917 				thresh = delalloc_to_write;
2918 			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2919 						 thresh);
2920 		}
2921 
2922 		/* did the fill delalloc function already unlock and start
2923 		 * the IO?
2924 		 */
2925 		if (page_started) {
2926 			ret = 0;
2927 			/*
2928 			 * we've unlocked the page, so we can't update
2929 			 * the mapping's writeback index, just update
2930 			 * nr_to_write.
2931 			 */
2932 			wbc->nr_to_write -= nr_written;
2933 			goto done_unlocked;
2934 		}
2935 	}
2936 	if (tree->ops && tree->ops->writepage_start_hook) {
2937 		ret = tree->ops->writepage_start_hook(page, start,
2938 						      page_end);
2939 		if (ret) {
2940 			/* Fixup worker will requeue */
2941 			if (ret == -EBUSY)
2942 				wbc->pages_skipped++;
2943 			else
2944 				redirty_page_for_writepage(wbc, page);
2945 			update_nr_written(page, wbc, nr_written);
2946 			unlock_page(page);
2947 			ret = 0;
2948 			goto done_unlocked;
2949 		}
2950 	}
2951 
2952 	/*
2953 	 * we don't want to touch the inode after unlocking the page,
2954 	 * so we update the mapping writeback index now
2955 	 */
2956 	update_nr_written(page, wbc, nr_written + 1);
2957 
2958 	end = page_end;
2959 	if (last_byte <= start) {
2960 		if (tree->ops && tree->ops->writepage_end_io_hook)
2961 			tree->ops->writepage_end_io_hook(page, start,
2962 							 page_end, NULL, 1);
2963 		goto done;
2964 	}
2965 
2966 	blocksize = inode->i_sb->s_blocksize;
2967 
2968 	while (cur <= end) {
2969 		if (cur >= last_byte) {
2970 			if (tree->ops && tree->ops->writepage_end_io_hook)
2971 				tree->ops->writepage_end_io_hook(page, cur,
2972 							 page_end, NULL, 1);
2973 			break;
2974 		}
2975 		em = epd->get_extent(inode, page, pg_offset, cur,
2976 				     end - cur + 1, 1);
2977 		if (IS_ERR_OR_NULL(em)) {
2978 			SetPageError(page);
2979 			break;
2980 		}
2981 
2982 		extent_offset = cur - em->start;
2983 		BUG_ON(extent_map_end(em) <= cur);
2984 		BUG_ON(end < cur);
2985 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2986 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2987 		sector = (em->block_start + extent_offset) >> 9;
2988 		bdev = em->bdev;
2989 		block_start = em->block_start;
2990 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2991 		free_extent_map(em);
2992 		em = NULL;
2993 
2994 		/*
2995 		 * compressed and inline extents are written through other
2996 		 * paths in the FS
2997 		 */
2998 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2999 		    block_start == EXTENT_MAP_INLINE) {
3000 			/*
3001 			 * end_io notification does not happen here for
3002 			 * compressed extents
3003 			 */
3004 			if (!compressed && tree->ops &&
3005 			    tree->ops->writepage_end_io_hook)
3006 				tree->ops->writepage_end_io_hook(page, cur,
3007 							 cur + iosize - 1,
3008 							 NULL, 1);
3009 			else if (compressed) {
3010 				/* we don't want to end_page_writeback on
3011 				 * a compressed extent.  this happens
3012 				 * elsewhere
3013 				 */
3014 				nr++;
3015 			}
3016 
3017 			cur += iosize;
3018 			pg_offset += iosize;
3019 			continue;
3020 		}
3021 		/* leave this out until we have a page_mkwrite call */
3022 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3023 				   EXTENT_DIRTY, 0, NULL)) {
3024 			cur = cur + iosize;
3025 			pg_offset += iosize;
3026 			continue;
3027 		}
3028 
3029 		if (tree->ops && tree->ops->writepage_io_hook) {
3030 			ret = tree->ops->writepage_io_hook(page, cur,
3031 						cur + iosize - 1);
3032 		} else {
3033 			ret = 0;
3034 		}
3035 		if (ret) {
3036 			SetPageError(page);
3037 		} else {
3038 			unsigned long max_nr = end_index + 1;
3039 
3040 			set_range_writeback(tree, cur, cur + iosize - 1);
3041 			if (!PageWriteback(page)) {
3042 				printk(KERN_ERR "btrfs warning page %lu not "
3043 				       "writeback, cur %llu end %llu\n",
3044 				       page->index, (unsigned long long)cur,
3045 				       (unsigned long long)end);
3046 			}
3047 
3048 			ret = submit_extent_page(write_flags, tree, page,
3049 						 sector, iosize, pg_offset,
3050 						 bdev, &epd->bio, max_nr,
3051 						 end_bio_extent_writepage,
3052 						 0, 0, 0);
3053 			if (ret)
3054 				SetPageError(page);
3055 		}
3056 		cur = cur + iosize;
3057 		pg_offset += iosize;
3058 		nr++;
3059 	}
3060 done:
3061 	if (nr == 0) {
3062 		/* make sure the mapping tag for page dirty gets cleared */
3063 		set_page_writeback(page);
3064 		end_page_writeback(page);
3065 	}
3066 	unlock_page(page);
3067 
3068 done_unlocked:
3069 
3070 	/* drop our reference on any cached states */
3071 	free_extent_state(cached_state);
3072 	return 0;
3073 }
3074 
3075 static int eb_wait(void *word)
3076 {
3077 	io_schedule();
3078 	return 0;
3079 }
3080 
3081 static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3082 {
3083 	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3084 		    TASK_UNINTERRUPTIBLE);
3085 }
3086 
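/*
 * lock the extent buffer for IO and clear its dirty bit.  Returns 1 if the
 * buffer was dirty and its pages are now locked and ready to be written out,
 * 0 if there is nothing to write (or, for non-sync IO, if the buffer is
 * still under a previous writeback).
 */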
3087 static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3088 				     struct btrfs_fs_info *fs_info,
3089 				     struct extent_page_data *epd)
3090 {
3091 	unsigned long i, num_pages;
3092 	int flush = 0;
3093 	int ret = 0;
3094 
3095 	if (!btrfs_try_tree_write_lock(eb)) {
3096 		flush = 1;
3097 		flush_write_bio(epd);
3098 		btrfs_tree_lock(eb);
3099 	}
3100 
3101 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3102 		btrfs_tree_unlock(eb);
3103 		if (!epd->sync_io)
3104 			return 0;
3105 		if (!flush) {
3106 			flush_write_bio(epd);
3107 			flush = 1;
3108 		}
3109 		while (1) {
3110 			wait_on_extent_buffer_writeback(eb);
3111 			btrfs_tree_lock(eb);
3112 			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3113 				break;
3114 			btrfs_tree_unlock(eb);
3115 		}
3116 	}
3117 
3118 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3119 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3120 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3121 		spin_lock(&fs_info->delalloc_lock);
3122 		if (fs_info->dirty_metadata_bytes >= eb->len)
3123 			fs_info->dirty_metadata_bytes -= eb->len;
3124 		else
3125 			WARN_ON(1);
3126 		spin_unlock(&fs_info->delalloc_lock);
3127 		ret = 1;
3128 	}
3129 
3130 	btrfs_tree_unlock(eb);
3131 
3132 	if (!ret)
3133 		return ret;
3134 
3135 	num_pages = num_extent_pages(eb->start, eb->len);
3136 	for (i = 0; i < num_pages; i++) {
3137 		struct page *p = extent_buffer_page(eb, i);
3138 
3139 		if (!trylock_page(p)) {
3140 			if (!flush) {
3141 				flush_write_bio(epd);
3142 				flush = 1;
3143 			}
3144 			lock_page(p);
3145 		}
3146 	}
3147 
3148 	return ret;
3149 }
3150 
3151 static void end_extent_buffer_writeback(struct extent_buffer *eb)
3152 {
3153 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3154 	smp_mb__after_clear_bit();
3155 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3156 }
3157 
3158 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3159 {
3160 	int uptodate = err == 0;
3161 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3162 	struct extent_buffer *eb;
3163 	int done;
3164 
3165 	do {
3166 		struct page *page = bvec->bv_page;
3167 
3168 		bvec--;
3169 		eb = (struct extent_buffer *)page->private;
3170 		BUG_ON(!eb);
3171 		done = atomic_dec_and_test(&eb->io_pages);
3172 
3173 		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3174 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3175 			ClearPageUptodate(page);
3176 			SetPageError(page);
3177 		}
3178 
3179 		end_page_writeback(page);
3180 
3181 		if (!done)
3182 			continue;
3183 
3184 		end_extent_buffer_writeback(eb);
3185 	} while (bvec >= bio->bi_io_vec);
3186 
3187 	bio_put(bio);
3188 
3189 }
3190 
3191 static int write_one_eb(struct extent_buffer *eb,
3192 			struct btrfs_fs_info *fs_info,
3193 			struct writeback_control *wbc,
3194 			struct extent_page_data *epd)
3195 {
3196 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3197 	u64 offset = eb->start;
3198 	unsigned long i, num_pages;
3199 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3200 	int ret;
3201 
3202 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3203 	num_pages = num_extent_pages(eb->start, eb->len);
3204 	atomic_set(&eb->io_pages, num_pages);
3205 	for (i = 0; i < num_pages; i++) {
3206 		struct page *p = extent_buffer_page(eb, i);
3207 
3208 		clear_page_dirty_for_io(p);
3209 		set_page_writeback(p);
3210 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3211 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3212 					 -1, end_bio_extent_buffer_writepage,
3213 					 0, 0, 0);
3214 		if (ret) {
3215 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3216 			SetPageError(p);
3217 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3218 				end_extent_buffer_writeback(eb);
3219 			ret = -EIO;
3220 			break;
3221 		}
3222 		offset += PAGE_CACHE_SIZE;
3223 		update_nr_written(p, wbc, 1);
3224 		unlock_page(p);
3225 	}
3226 
3227 	if (unlikely(ret)) {
3228 		for (; i < num_pages; i++) {
3229 			struct page *p = extent_buffer_page(eb, i);
3230 			unlock_page(p);
3231 		}
3232 	}
3233 
3234 	return ret;
3235 }
3236 
3237 int btree_write_cache_pages(struct address_space *mapping,
3238 				   struct writeback_control *wbc)
3239 {
3240 	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3241 	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3242 	struct extent_buffer *eb, *prev_eb = NULL;
3243 	struct extent_page_data epd = {
3244 		.bio = NULL,
3245 		.tree = tree,
3246 		.extent_locked = 0,
3247 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3248 	};
3249 	int ret = 0;
3250 	int done = 0;
3251 	int nr_to_write_done = 0;
3252 	struct pagevec pvec;
3253 	int nr_pages;
3254 	pgoff_t index;
3255 	pgoff_t end;		/* Inclusive */
3256 	int scanned = 0;
3257 	int tag;
3258 
3259 	pagevec_init(&pvec, 0);
3260 	if (wbc->range_cyclic) {
3261 		index = mapping->writeback_index; /* Start from prev offset */
3262 		end = -1;
3263 	} else {
3264 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3265 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3266 		scanned = 1;
3267 	}
3268 	if (wbc->sync_mode == WB_SYNC_ALL)
3269 		tag = PAGECACHE_TAG_TOWRITE;
3270 	else
3271 		tag = PAGECACHE_TAG_DIRTY;
3272 retry:
3273 	if (wbc->sync_mode == WB_SYNC_ALL)
3274 		tag_pages_for_writeback(mapping, index, end);
3275 	while (!done && !nr_to_write_done && (index <= end) &&
3276 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3277 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3278 		unsigned i;
3279 
3280 		scanned = 1;
3281 		for (i = 0; i < nr_pages; i++) {
3282 			struct page *page = pvec.pages[i];
3283 
3284 			if (!PagePrivate(page))
3285 				continue;
3286 
3287 			if (!wbc->range_cyclic && page->index > end) {
3288 				done = 1;
3289 				break;
3290 			}
3291 
3292 			eb = (struct extent_buffer *)page->private;
3293 			if (!eb) {
3294 				WARN_ON(1);
3295 				continue;
3296 			}
3297 
3298 			if (eb == prev_eb)
3299 				continue;
3300 
3301 			if (!atomic_inc_not_zero(&eb->refs)) {
3302 				WARN_ON(1);
3303 				continue;
3304 			}
3305 
3306 			prev_eb = eb;
3307 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3308 			if (!ret) {
3309 				free_extent_buffer(eb);
3310 				continue;
3311 			}
3312 
3313 			ret = write_one_eb(eb, fs_info, wbc, &epd);
3314 			if (ret) {
3315 				done = 1;
3316 				free_extent_buffer(eb);
3317 				break;
3318 			}
3319 			free_extent_buffer(eb);
3320 
3321 			/*
3322 			 * the filesystem may choose to bump up nr_to_write.
3323 			 * We have to make sure to honor the new nr_to_write
3324 			 * at any time
3325 			 */
3326 			nr_to_write_done = wbc->nr_to_write <= 0;
3327 		}
3328 		pagevec_release(&pvec);
3329 		cond_resched();
3330 	}
3331 	if (!scanned && !done) {
3332 		/*
3333 		 * We hit the last page and there is more work to be done: wrap
3334 		 * back to the start of the file
3335 		 */
3336 		scanned = 1;
3337 		index = 0;
3338 		goto retry;
3339 	}
3340 	flush_write_bio(&epd);
3341 	return ret;
3342 }
3343 
3344 /**
3345  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3346  * @mapping: address space structure to write
3347  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3348  * @writepage: function called for each page
3349  * @data: data passed to writepage function
3350  *
3351  * If a page is already under I/O, write_cache_pages() skips it, even
3352  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3353  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3354  * and msync() need to guarantee that all the data which was dirty at the time
3355  * the call was made get new I/O started against them.  If wbc->sync_mode is
3356  * WB_SYNC_ALL then we were called for data integrity and we must wait for
3357  * existing IO to complete.
3358  */
3359 static int extent_write_cache_pages(struct extent_io_tree *tree,
3360 			     struct address_space *mapping,
3361 			     struct writeback_control *wbc,
3362 			     writepage_t writepage, void *data,
3363 			     void (*flush_fn)(void *))
3364 {
3365 	int ret = 0;
3366 	int done = 0;
3367 	int nr_to_write_done = 0;
3368 	struct pagevec pvec;
3369 	int nr_pages;
3370 	pgoff_t index;
3371 	pgoff_t end;		/* Inclusive */
3372 	int scanned = 0;
3373 	int tag;
3374 
3375 	pagevec_init(&pvec, 0);
3376 	if (wbc->range_cyclic) {
3377 		index = mapping->writeback_index; /* Start from prev offset */
3378 		end = -1;
3379 	} else {
3380 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3381 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3382 		scanned = 1;
3383 	}
3384 	if (wbc->sync_mode == WB_SYNC_ALL)
3385 		tag = PAGECACHE_TAG_TOWRITE;
3386 	else
3387 		tag = PAGECACHE_TAG_DIRTY;
3388 retry:
3389 	if (wbc->sync_mode == WB_SYNC_ALL)
3390 		tag_pages_for_writeback(mapping, index, end);
3391 	while (!done && !nr_to_write_done && (index <= end) &&
3392 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3393 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3394 		unsigned i;
3395 
3396 		scanned = 1;
3397 		for (i = 0; i < nr_pages; i++) {
3398 			struct page *page = pvec.pages[i];
3399 
3400 			/*
3401 			 * At this point we hold neither mapping->tree_lock nor
3402 			 * lock on the page itself: the page may be truncated or
3403 			 * invalidated (changing page->mapping to NULL), or even
3404 			 * swizzled back from swapper_space to tmpfs file
3405 			 * mapping
3406 			 */
3407 			if (tree->ops &&
3408 			    tree->ops->write_cache_pages_lock_hook) {
3409 				tree->ops->write_cache_pages_lock_hook(page,
3410 							       data, flush_fn);
3411 			} else {
3412 				if (!trylock_page(page)) {
3413 					flush_fn(data);
3414 					lock_page(page);
3415 				}
3416 			}
3417 
3418 			if (unlikely(page->mapping != mapping)) {
3419 				unlock_page(page);
3420 				continue;
3421 			}
3422 
3423 			if (!wbc->range_cyclic && page->index > end) {
3424 				done = 1;
3425 				unlock_page(page);
3426 				continue;
3427 			}
3428 
3429 			if (wbc->sync_mode != WB_SYNC_NONE) {
3430 				if (PageWriteback(page))
3431 					flush_fn(data);
3432 				wait_on_page_writeback(page);
3433 			}
3434 
3435 			if (PageWriteback(page) ||
3436 			    !clear_page_dirty_for_io(page)) {
3437 				unlock_page(page);
3438 				continue;
3439 			}
3440 
3441 			ret = (*writepage)(page, wbc, data);
3442 
3443 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3444 				unlock_page(page);
3445 				ret = 0;
3446 			}
3447 			if (ret)
3448 				done = 1;
3449 
3450 			/*
3451 			 * the filesystem may choose to bump up nr_to_write.
3452 			 * We have to make sure to honor the new nr_to_write
3453 			 * at any time
3454 			 */
3455 			nr_to_write_done = wbc->nr_to_write <= 0;
3456 		}
3457 		pagevec_release(&pvec);
3458 		cond_resched();
3459 	}
3460 	if (!scanned && !done) {
3461 		/*
3462 		 * We hit the last page and there is more work to be done: wrap
3463 		 * back to the start of the file
3464 		 */
3465 		scanned = 1;
3466 		index = 0;
3467 		goto retry;
3468 	}
3469 	return ret;
3470 }
3471 
3472 static void flush_epd_write_bio(struct extent_page_data *epd)
3473 {
3474 	if (epd->bio) {
3475 		int rw = WRITE;
3476 		int ret;
3477 
3478 		if (epd->sync_io)
3479 			rw = WRITE_SYNC;
3480 
3481 		ret = submit_one_bio(rw, epd->bio, 0, 0);
3482 		BUG_ON(ret < 0); /* -ENOMEM */
3483 		epd->bio = NULL;
3484 	}
3485 }
3486 
3487 static noinline void flush_write_bio(void *data)
3488 {
3489 	struct extent_page_data *epd = data;
3490 	flush_epd_write_bio(epd);
3491 }
3492 
3493 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3494 			  get_extent_t *get_extent,
3495 			  struct writeback_control *wbc)
3496 {
3497 	int ret;
3498 	struct extent_page_data epd = {
3499 		.bio = NULL,
3500 		.tree = tree,
3501 		.get_extent = get_extent,
3502 		.extent_locked = 0,
3503 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3504 	};
3505 
3506 	ret = __extent_writepage(page, wbc, &epd);
3507 
3508 	flush_epd_write_bio(&epd);
3509 	return ret;
3510 }
3511 
3512 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3513 			      u64 start, u64 end, get_extent_t *get_extent,
3514 			      int mode)
3515 {
3516 	int ret = 0;
3517 	struct address_space *mapping = inode->i_mapping;
3518 	struct page *page;
3519 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3520 		PAGE_CACHE_SHIFT;
3521 
3522 	struct extent_page_data epd = {
3523 		.bio = NULL,
3524 		.tree = tree,
3525 		.get_extent = get_extent,
3526 		.extent_locked = 1,
3527 		.sync_io = mode == WB_SYNC_ALL,
3528 	};
3529 	struct writeback_control wbc_writepages = {
3530 		.sync_mode	= mode,
3531 		.nr_to_write	= nr_pages * 2,
3532 		.range_start	= start,
3533 		.range_end	= end + 1,
3534 	};
3535 
3536 	while (start <= end) {
3537 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3538 		if (clear_page_dirty_for_io(page))
3539 			ret = __extent_writepage(page, &wbc_writepages, &epd);
3540 		else {
3541 			if (tree->ops && tree->ops->writepage_end_io_hook)
3542 				tree->ops->writepage_end_io_hook(page, start,
3543 						 start + PAGE_CACHE_SIZE - 1,
3544 						 NULL, 1);
3545 			unlock_page(page);
3546 		}
3547 		page_cache_release(page);
3548 		start += PAGE_CACHE_SIZE;
3549 	}
3550 
3551 	flush_epd_write_bio(&epd);
3552 	return ret;
3553 }
3554 
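/*
 * writepages entry point: walk the dirty pages of the mapping via
 * extent_write_cache_pages() and submit the collected bio at the end.
 */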
3555 int extent_writepages(struct extent_io_tree *tree,
3556 		      struct address_space *mapping,
3557 		      get_extent_t *get_extent,
3558 		      struct writeback_control *wbc)
3559 {
3560 	int ret = 0;
3561 	struct extent_page_data epd = {
3562 		.bio = NULL,
3563 		.tree = tree,
3564 		.get_extent = get_extent,
3565 		.extent_locked = 0,
3566 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3567 	};
3568 
3569 	ret = extent_write_cache_pages(tree, mapping, wbc,
3570 				       __extent_writepage, &epd,
3571 				       flush_write_bio);
3572 	flush_epd_write_bio(&epd);
3573 	return ret;
3574 }
3575 
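/*
 * readahead entry point: add each page to the page cache and build up
 * one large bio across them, submitted once at the end.
 */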
3576 int extent_readpages(struct extent_io_tree *tree,
3577 		     struct address_space *mapping,
3578 		     struct list_head *pages, unsigned nr_pages,
3579 		     get_extent_t get_extent)
3580 {
3581 	struct bio *bio = NULL;
3582 	unsigned page_idx;
3583 	unsigned long bio_flags = 0;
3584 
3585 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3586 		struct page *page = list_entry(pages->prev, struct page, lru);
3587 
3588 		prefetchw(&page->flags);
3589 		list_del(&page->lru);
3590 		if (!add_to_page_cache_lru(page, mapping,
3591 					page->index, GFP_NOFS)) {
3592 			__extent_read_full_page(tree, page, get_extent,
3593 						&bio, 0, &bio_flags);
3594 		}
3595 		page_cache_release(page);
3596 	}
3597 	BUG_ON(!list_empty(pages));
3598 	if (bio)
3599 		return submit_one_bio(READ, bio, 0, bio_flags);
3600 	return 0;
3601 }
3602 
3603 /*
3604  * basic invalidatepage code, this waits on any locked or writeback
3605  * ranges corresponding to the page, and then deletes any extent state
3606  * records from the tree
3607  */
3608 int extent_invalidatepage(struct extent_io_tree *tree,
3609 			  struct page *page, unsigned long offset)
3610 {
3611 	struct extent_state *cached_state = NULL;
3612 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3613 	u64 end = start + PAGE_CACHE_SIZE - 1;
3614 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3615 
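	/*
	 * round the invalidation offset up to the next block boundary so
	 * partially invalidated blocks are left alone; e.g. assuming a 4K
	 * block size, an offset of 100 pushes start forward by 4096.
	 */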
3616 	start += (offset + blocksize - 1) & ~(blocksize - 1);
3617 	if (start > end)
3618 		return 0;
3619 
3620 	lock_extent_bits(tree, start, end, 0, &cached_state);
3621 	wait_on_page_writeback(page);
3622 	clear_extent_bit(tree, start, end,
3623 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3624 			 EXTENT_DO_ACCOUNTING,
3625 			 1, 1, &cached_state, GFP_NOFS);
3626 	return 0;
3627 }
3628 
3629 /*
3630  * a helper for releasepage, this tests for areas of the page that
3631  * are locked or under IO and drops the related state bits if it is safe
3632  * to drop the page.
3633  */
3634 int try_release_extent_state(struct extent_map_tree *map,
3635 			     struct extent_io_tree *tree, struct page *page,
3636 			     gfp_t mask)
3637 {
3638 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3639 	u64 end = start + PAGE_CACHE_SIZE - 1;
3640 	int ret = 1;
3641 
3642 	if (test_range_bit(tree, start, end,
3643 			   EXTENT_IOBITS, 0, NULL))
3644 		ret = 0;
3645 	else {
3646 		if ((mask & GFP_NOFS) == GFP_NOFS)
3647 			mask = GFP_NOFS;
3648 		/*
3649 		 * at this point we can safely clear everything except the
3650 		 * locked bit and the nodatasum bit
3651 		 */
3652 		ret = clear_extent_bit(tree, start, end,
3653 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3654 				 0, 0, NULL, mask);
3655 
3656 		/* if clear_extent_bit failed for -ENOMEM reasons,
3657 		 * we can't allow the release to continue.
3658 		 */
3659 		if (ret < 0)
3660 			ret = 0;
3661 		else
3662 			ret = 1;
3663 	}
3664 	return ret;
3665 }
3666 
3667 /*
3668  * a helper for releasepage.  As long as there are no locked extents
3669  * in the range corresponding to the page, both state records and extent
3670  * map records are removed
3671  */
3672 int try_release_extent_mapping(struct extent_map_tree *map,
3673 			       struct extent_io_tree *tree, struct page *page,
3674 			       gfp_t mask)
3675 {
3676 	struct extent_map *em;
3677 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3678 	u64 end = start + PAGE_CACHE_SIZE - 1;
3679 
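	/*
	 * for larger files (the 16MB cut-off below), and only when we are
	 * allowed to sleep, also drop any extent map records covering this
	 * page as long as they are not pinned or locked.
	 */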
3680 	if ((mask & __GFP_WAIT) &&
3681 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3682 		u64 len;
3683 		while (start <= end) {
3684 			len = end - start + 1;
3685 			write_lock(&map->lock);
3686 			em = lookup_extent_mapping(map, start, len);
3687 			if (!em) {
3688 				write_unlock(&map->lock);
3689 				break;
3690 			}
3691 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3692 			    em->start != start) {
3693 				write_unlock(&map->lock);
3694 				free_extent_map(em);
3695 				break;
3696 			}
3697 			if (!test_range_bit(tree, em->start,
3698 					    extent_map_end(em) - 1,
3699 					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3700 					    0, NULL)) {
3701 				remove_extent_mapping(map, em);
3702 				/* once for the rb tree */
3703 				free_extent_map(em);
3704 			}
3705 			start = extent_map_end(em);
3706 			write_unlock(&map->lock);
3707 
3708 			/* once for us */
3709 			free_extent_map(em);
3710 		}
3711 	}
3712 	return try_release_extent_state(map, tree, page, mask);
3713 }
3714 
3715 /*
3716  * helper function for fiemap, which doesn't want to see any holes.
3717  * This maps until we find something past 'last'
3718  */
3719 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3720 						u64 offset,
3721 						u64 last,
3722 						get_extent_t *get_extent)
3723 {
3724 	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3725 	struct extent_map *em;
3726 	u64 len;
3727 
3728 	if (offset >= last)
3729 		return NULL;
3730 
3731 	while (1) {
3732 		len = last - offset;
3733 		if (len == 0)
3734 			break;
3735 		len = (len + sectorsize - 1) & ~(sectorsize - 1);
3736 		em = get_extent(inode, NULL, 0, offset, len, 0);
3737 		if (IS_ERR_OR_NULL(em))
3738 			return em;
3739 
3740 		/* if this isn't a hole return it */
3741 		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3742 		    em->block_start != EXTENT_MAP_HOLE) {
3743 			return em;
3744 		}
3745 
3746 		/* this is a hole, advance to the next extent */
3747 		offset = extent_map_end(em);
3748 		free_extent_map(em);
3749 		if (offset >= last)
3750 			break;
3751 	}
3752 	return NULL;
3753 }
3754 
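/*
 * fiemap entry point: walk the extent maps covering [start, start + len),
 * skipping holes, and report each mapping to userspace through
 * fiemap_fill_next_extent() with the matching FIEMAP_EXTENT_* flags.
 */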
3755 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3756 		__u64 start, __u64 len, get_extent_t *get_extent)
3757 {
3758 	int ret = 0;
3759 	u64 off = start;
3760 	u64 max = start + len;
3761 	u32 flags = 0;
3762 	u32 found_type;
3763 	u64 last;
3764 	u64 last_for_get_extent = 0;
3765 	u64 disko = 0;
3766 	u64 isize = i_size_read(inode);
3767 	struct btrfs_key found_key;
3768 	struct extent_map *em = NULL;
3769 	struct extent_state *cached_state = NULL;
3770 	struct btrfs_path *path;
3771 	struct btrfs_file_extent_item *item;
3772 	int end = 0;
3773 	u64 em_start = 0;
3774 	u64 em_len = 0;
3775 	u64 em_end = 0;
3776 	unsigned long emflags;
3777 
3778 	if (len == 0)
3779 		return -EINVAL;
3780 
3781 	path = btrfs_alloc_path();
3782 	if (!path)
3783 		return -ENOMEM;
3784 	path->leave_spinning = 1;
3785 
3786 	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3787 	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3788 
3789 	/*
3790 	 * lookup the last file extent.  We're not using i_size here
3791 	 * because there might be preallocation past i_size
3792 	 */
3793 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3794 				       path, btrfs_ino(inode), -1, 0);
3795 	if (ret < 0) {
3796 		btrfs_free_path(path);
3797 		return ret;
3798 	}
3799 	WARN_ON(!ret);
3800 	path->slots[0]--;
3801 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3802 			      struct btrfs_file_extent_item);
3803 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3804 	found_type = btrfs_key_type(&found_key);
3805 
3806 	/* No extents, but there might be delalloc bits */
3807 	if (found_key.objectid != btrfs_ino(inode) ||
3808 	    found_type != BTRFS_EXTENT_DATA_KEY) {
3809 		/* have to trust i_size as the end */
3810 		last = (u64)-1;
3811 		last_for_get_extent = isize;
3812 	} else {
3813 		/*
3814 		 * remember the start of the last extent.  There are a
3815 		 * bunch of different factors that go into the length of the
3816 		 * extent, so it's much less complex to remember where it started
3817 		 */
3818 		last = found_key.offset;
3819 		last_for_get_extent = last + 1;
3820 	}
3821 	btrfs_free_path(path);
3822 
3823 	/*
3824 	 * we might have some extents allocated but more delalloc past those
3825 	 * extents.  so, we trust isize unless the start of the last extent is
3826 	 * beyond isize
3827 	 */
3828 	if (last < isize) {
3829 		last = (u64)-1;
3830 		last_for_get_extent = isize;
3831 	}
3832 
3833 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3834 			 &cached_state);
3835 
3836 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
3837 				   get_extent);
3838 	if (!em)
3839 		goto out;
3840 	if (IS_ERR(em)) {
3841 		ret = PTR_ERR(em);
3842 		goto out;
3843 	}
3844 
3845 	while (!end) {
3846 		u64 offset_in_extent;
3847 
3848 		/* break if the extent we found is outside the range */
3849 		if (em->start >= max || extent_map_end(em) < off)
3850 			break;
3851 
3852 		/*
3853 		 * get_extent may return an extent that starts before our
3854 		 * requested range.  We have to make sure the ranges
3855 		 * we return to fiemap always move forward and don't
3856 		 * overlap, so adjust the offsets here
3857 		 */
3858 		em_start = max(em->start, off);
3859 
3860 		/*
3861 		 * record the offset from the start of the extent
3862 		 * for adjusting the disk offset below
3863 		 */
3864 		offset_in_extent = em_start - em->start;
3865 		em_end = extent_map_end(em);
3866 		em_len = em_end - em_start;
3867 		emflags = em->flags;
3868 		disko = 0;
3869 		flags = 0;
3870 
3871 		/*
3872 		 * bump off for our next call to get_extent
3873 		 */
3874 		off = extent_map_end(em);
3875 		if (off >= max)
3876 			end = 1;
3877 
3878 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3879 			end = 1;
3880 			flags |= FIEMAP_EXTENT_LAST;
3881 		} else if (em->block_start == EXTENT_MAP_INLINE) {
3882 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
3883 				  FIEMAP_EXTENT_NOT_ALIGNED);
3884 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
3885 			flags |= (FIEMAP_EXTENT_DELALLOC |
3886 				  FIEMAP_EXTENT_UNKNOWN);
3887 		} else {
3888 			disko = em->block_start + offset_in_extent;
3889 		}
3890 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3891 			flags |= FIEMAP_EXTENT_ENCODED;
3892 
3893 		free_extent_map(em);
3894 		em = NULL;
3895 		if ((em_start >= last) || em_len == (u64)-1 ||
3896 		   (last == (u64)-1 && isize <= em_end)) {
3897 			flags |= FIEMAP_EXTENT_LAST;
3898 			end = 1;
3899 		}
3900 
3901 		/* now scan forward to see if this is really the last extent. */
3902 		em = get_extent_skip_holes(inode, off, last_for_get_extent,
3903 					   get_extent);
3904 		if (IS_ERR(em)) {
3905 			ret = PTR_ERR(em);
3906 			goto out;
3907 		}
3908 		if (!em) {
3909 			flags |= FIEMAP_EXTENT_LAST;
3910 			end = 1;
3911 		}
3912 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3913 					      em_len, flags);
3914 		if (ret)
3915 			goto out_free;
3916 	}
3917 out_free:
3918 	free_extent_map(em);
3919 out:
3920 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3921 			     &cached_state, GFP_NOFS);
3922 	return ret;
3923 }
3924 
3925 inline struct page *extent_buffer_page(struct extent_buffer *eb,
3926 					      unsigned long i)
3927 {
3928 	return eb->pages[i];
3929 }
3930 
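/*
 * number of pages spanned by [start, start + len); e.g. assuming 4K
 * pages, start 4095 with len 2 straddles a page boundary and counts as
 * two pages.
 */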
3931 inline unsigned long num_extent_pages(u64 start, u64 len)
3932 {
3933 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3934 		(start >> PAGE_CACHE_SHIFT);
3935 }
3936 
3937 static void __free_extent_buffer(struct extent_buffer *eb)
3938 {
3939 #if LEAK_DEBUG
3940 	unsigned long flags;
3941 	spin_lock_irqsave(&leak_lock, flags);
3942 	list_del(&eb->leak_list);
3943 	spin_unlock_irqrestore(&leak_lock, flags);
3944 #endif
3945 	if (eb->pages && eb->pages != eb->inline_pages)
3946 		kfree(eb->pages);
3947 	kmem_cache_free(extent_buffer_cache, eb);
3948 }
3949 
3950 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3951 						   u64 start,
3952 						   unsigned long len,
3953 						   gfp_t mask)
3954 {
3955 	struct extent_buffer *eb = NULL;
3956 #if LEAK_DEBUG
3957 	unsigned long flags;
3958 #endif
3959 
3960 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3961 	if (eb == NULL)
3962 		return NULL;
3963 	eb->start = start;
3964 	eb->len = len;
3965 	eb->tree = tree;
3966 	rwlock_init(&eb->lock);
3967 	atomic_set(&eb->write_locks, 0);
3968 	atomic_set(&eb->read_locks, 0);
3969 	atomic_set(&eb->blocking_readers, 0);
3970 	atomic_set(&eb->blocking_writers, 0);
3971 	atomic_set(&eb->spinning_readers, 0);
3972 	atomic_set(&eb->spinning_writers, 0);
3973 	eb->lock_nested = 0;
3974 	init_waitqueue_head(&eb->write_lock_wq);
3975 	init_waitqueue_head(&eb->read_lock_wq);
3976 
3977 #if LEAK_DEBUG
3978 	spin_lock_irqsave(&leak_lock, flags);
3979 	list_add(&eb->leak_list, &buffers);
3980 	spin_unlock_irqrestore(&leak_lock, flags);
3981 #endif
3982 	spin_lock_init(&eb->refs_lock);
3983 	atomic_set(&eb->refs, 1);
3984 	atomic_set(&eb->io_pages, 0);
3985 
3986 	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
3987 		struct page **pages;
3988 		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
3989 			PAGE_CACHE_SHIFT;
3990 		pages = kzalloc(num_pages * sizeof(struct page *), mask);
3991 		if (!pages) {
3992 			__free_extent_buffer(eb);
3993 			return NULL;
3994 		}
3995 		eb->pages = pages;
3996 	} else {
3997 		eb->pages = eb->inline_pages;
3998 	}
3999 
4000 	return eb;
4001 }
4002 
4003 static int extent_buffer_under_io(struct extent_buffer *eb)
4004 {
4005 	return (atomic_read(&eb->io_pages) ||
4006 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4007 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4008 }
4009 
4010 /*
4011  * Helper for releasing extent buffer page.
4012  */
4013 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4014 						unsigned long start_idx)
4015 {
4016 	unsigned long index;
4017 	struct page *page;
4018 
4019 	BUG_ON(extent_buffer_under_io(eb));
4020 
4021 	index = num_extent_pages(eb->start, eb->len);
4022 	if (start_idx >= index)
4023 		return;
4024 
4025 	do {
4026 		index--;
4027 		page = extent_buffer_page(eb, index);
4028 		if (page) {
4029 			spin_lock(&page->mapping->private_lock);
4030 			/*
4031 			 * We do this since we'll remove the pages after we've
4032 			 * removed the eb from the radix tree, so we could race
4033 			 * and have this page now attached to the new eb.  So
4034 			 * only clear page_private if it's still connected to
4035 			 * this eb.
4036 			 */
4037 			if (PagePrivate(page) &&
4038 			    page->private == (unsigned long)eb) {
4039 				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4040 				BUG_ON(PageDirty(page));
4041 				BUG_ON(PageWriteback(page));
4042 				/*
4043 				 * We need to make sure we haven't been attached
4044 				 * to a new eb.
4045 				 */
4046 				ClearPagePrivate(page);
4047 				set_page_private(page, 0);
4048 				/* One for the page private */
4049 				page_cache_release(page);
4050 			}
4051 			spin_unlock(&page->mapping->private_lock);
4052 
4053 			/* One for when we allocated the page */
4054 			page_cache_release(page);
4055 		}
4056 	} while (index != start_idx);
4057 }
4058 
4059 /*
4060  * Helper for releasing the extent buffer.
4061  */
4062 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4063 {
4064 	btrfs_release_extent_buffer_page(eb, 0);
4065 	__free_extent_buffer(eb);
4066 }
4067 
4068 static void check_buffer_tree_ref(struct extent_buffer *eb)
4069 {
4070 	/* the ref bit is tricky.  We have to make sure it is set
4071 	 * if we have the buffer dirty.   Otherwise the
4072 	 * code to free a buffer can end up dropping a dirty
4073 	 * page
4074 	 *
4075 	 * Once the ref bit is set, it won't go away while the
4076 	 * buffer is dirty or in writeback, and it also won't
4077 	 * go away while we have the reference count on the
4078 	 * eb bumped.
4079 	 *
4080 	 * We can't just set the ref bit without bumping the
4081 	 * ref on the eb because free_extent_buffer might
4082 	 * see the ref bit and try to clear it.  If this happens
4083 	 * free_extent_buffer might end up dropping our original
4084 	 * ref by mistake and freeing the page before we are able
4085 	 * to add one more ref.
4086 	 *
4087 	 * So bump the ref count first, then set the bit.  If someone
4088 	 * beat us to it, drop the ref we added.
4089 	 */
4090 	if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4091 		atomic_inc(&eb->refs);
4092 		if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4093 			atomic_dec(&eb->refs);
4094 	}
4095 }
4096 
4097 static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4098 {
4099 	unsigned long num_pages, i;
4100 
4101 	check_buffer_tree_ref(eb);
4102 
4103 	num_pages = num_extent_pages(eb->start, eb->len);
4104 	for (i = 0; i < num_pages; i++) {
4105 		struct page *p = extent_buffer_page(eb, i);
4106 		mark_page_accessed(p);
4107 	}
4108 }
4109 
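/*
 * find or create the extent buffer for [start, start + len).  The radix
 * tree is tried first; otherwise a new eb is allocated, its pages are
 * pinned in the page cache, and it is inserted into the radix tree
 * (falling back to an existing eb if someone else beat us to it).
 */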
4110 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4111 					  u64 start, unsigned long len)
4112 {
4113 	unsigned long num_pages = num_extent_pages(start, len);
4114 	unsigned long i;
4115 	unsigned long index = start >> PAGE_CACHE_SHIFT;
4116 	struct extent_buffer *eb;
4117 	struct extent_buffer *exists = NULL;
4118 	struct page *p;
4119 	struct address_space *mapping = tree->mapping;
4120 	int uptodate = 1;
4121 	int ret;
4122 
4123 	rcu_read_lock();
4124 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4125 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4126 		rcu_read_unlock();
4127 		mark_extent_buffer_accessed(eb);
4128 		return eb;
4129 	}
4130 	rcu_read_unlock();
4131 
4132 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4133 	if (!eb)
4134 		return NULL;
4135 
4136 	for (i = 0; i < num_pages; i++, index++) {
4137 		p = find_or_create_page(mapping, index, GFP_NOFS);
4138 		if (!p) {
4139 			WARN_ON(1);
4140 			goto free_eb;
4141 		}
4142 
4143 		spin_lock(&mapping->private_lock);
4144 		if (PagePrivate(p)) {
4145 			/*
4146 			 * We could have already allocated an eb for this page
4147 			 * and attached one so lets see if we can get a ref on
4148 			 * the existing eb, and if we can we know it's good and
4149 			 * we can just return that one, else we know we can just
4150 			 * overwrite page->private.
4151 			 */
4152 			exists = (struct extent_buffer *)p->private;
4153 			if (atomic_inc_not_zero(&exists->refs)) {
4154 				spin_unlock(&mapping->private_lock);
4155 				unlock_page(p);
4156 				page_cache_release(p);
4157 				mark_extent_buffer_accessed(exists);
4158 				goto free_eb;
4159 			}
4160 
4161 			/*
4162 			 * Do this so attach doesn't complain, and drop the
4163 			 * reference the old eb held on this page.
4164 			 */
4165 			ClearPagePrivate(p);
4166 			WARN_ON(PageDirty(p));
4167 			page_cache_release(p);
4168 		}
4169 		attach_extent_buffer_page(eb, p);
4170 		spin_unlock(&mapping->private_lock);
4171 		WARN_ON(PageDirty(p));
4172 		mark_page_accessed(p);
4173 		eb->pages[i] = p;
4174 		if (!PageUptodate(p))
4175 			uptodate = 0;
4176 
4177 		/*
4178 		 * see below about how we avoid a nasty race with release page
4179 		 * and why we unlock later
4180 		 */
4181 	}
4182 	if (uptodate)
4183 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4184 again:
4185 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4186 	if (ret)
4187 		goto free_eb;
4188 
4189 	spin_lock(&tree->buffer_lock);
4190 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4191 	if (ret == -EEXIST) {
4192 		exists = radix_tree_lookup(&tree->buffer,
4193 						start >> PAGE_CACHE_SHIFT);
4194 		if (!atomic_inc_not_zero(&exists->refs)) {
4195 			spin_unlock(&tree->buffer_lock);
4196 			radix_tree_preload_end();
4197 			exists = NULL;
4198 			goto again;
4199 		}
4200 		spin_unlock(&tree->buffer_lock);
4201 		radix_tree_preload_end();
4202 		mark_extent_buffer_accessed(exists);
4203 		goto free_eb;
4204 	}
4205 	/* add one reference for the tree */
4206 	spin_lock(&eb->refs_lock);
4207 	check_buffer_tree_ref(eb);
4208 	spin_unlock(&eb->refs_lock);
4209 	spin_unlock(&tree->buffer_lock);
4210 	radix_tree_preload_end();
4211 
4212 	/*
4213 	 * there is a race where release page may have
4214 	 * tried to find this extent buffer in the radix tree
4215 	 * but failed.  It will tell the VM it is safe to
4216 	 * reclaim the page, and it will clear the page private bit.
4217 	 * We must make sure to set the page private bit properly
4218 	 * after the extent buffer is in the radix tree so
4219 	 * it doesn't get lost
4220 	 */
4221 	SetPageChecked(eb->pages[0]);
4222 	for (i = 1; i < num_pages; i++) {
4223 		p = extent_buffer_page(eb, i);
4224 		ClearPageChecked(p);
4225 		unlock_page(p);
4226 	}
4227 	unlock_page(eb->pages[0]);
4228 	return eb;
4229 
4230 free_eb:
4231 	for (i = 0; i < num_pages; i++) {
4232 		if (eb->pages[i])
4233 			unlock_page(eb->pages[i]);
4234 	}
4235 
4236 	WARN_ON(!atomic_dec_and_test(&eb->refs));
4237 	btrfs_release_extent_buffer(eb);
4238 	return exists;
4239 }
4240 
4241 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4242 					 u64 start, unsigned long len)
4243 {
4244 	struct extent_buffer *eb;
4245 
4246 	rcu_read_lock();
4247 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4248 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4249 		rcu_read_unlock();
4250 		mark_extent_buffer_accessed(eb);
4251 		return eb;
4252 	}
4253 	rcu_read_unlock();
4254 
4255 	return NULL;
4256 }
4257 
4258 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4259 {
4260 	struct extent_buffer *eb =
4261 			container_of(head, struct extent_buffer, rcu_head);
4262 
4263 	__free_extent_buffer(eb);
4264 }
4265 
4266 /* Expects to have eb->eb_lock already held */
4267 static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4268 {
4269 	WARN_ON(atomic_read(&eb->refs) == 0);
4270 	if (atomic_dec_and_test(&eb->refs)) {
4271 		struct extent_io_tree *tree = eb->tree;
4272 
4273 		spin_unlock(&eb->refs_lock);
4274 
4275 		spin_lock(&tree->buffer_lock);
4276 		radix_tree_delete(&tree->buffer,
4277 				  eb->start >> PAGE_CACHE_SHIFT);
4278 		spin_unlock(&tree->buffer_lock);
4279 
4280 		/* Should be safe to release our pages at this point */
4281 		btrfs_release_extent_buffer_page(eb, 0);
4282 
4283 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4284 		return;
4285 	}
4286 	spin_unlock(&eb->refs_lock);
4287 }
4288 
4289 void free_extent_buffer(struct extent_buffer *eb)
4290 {
4291 	if (!eb)
4292 		return;
4293 
4294 	spin_lock(&eb->refs_lock);
4295 	if (atomic_read(&eb->refs) == 2 &&
4296 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4297 	    !extent_buffer_under_io(eb) &&
4298 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4299 		atomic_dec(&eb->refs);
4300 
4301 	/*
4302 	 * I know this is terrible, but it's temporary until we stop tracking
4303 	 * the uptodate bits and such for the extent buffers.
4304 	 */
4305 	release_extent_buffer(eb, GFP_ATOMIC);
4306 }
4307 
4308 void free_extent_buffer_stale(struct extent_buffer *eb)
4309 {
4310 	if (!eb)
4311 		return;
4312 
4313 	spin_lock(&eb->refs_lock);
4314 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4315 
4316 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4317 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4318 		atomic_dec(&eb->refs);
4319 	release_extent_buffer(eb, GFP_NOFS);
4320 }
4321 
4322 void clear_extent_buffer_dirty(struct extent_buffer *eb)
4323 {
4324 	unsigned long i;
4325 	unsigned long num_pages;
4326 	struct page *page;
4327 
4328 	num_pages = num_extent_pages(eb->start, eb->len);
4329 
4330 	for (i = 0; i < num_pages; i++) {
4331 		page = extent_buffer_page(eb, i);
4332 		if (!PageDirty(page))
4333 			continue;
4334 
4335 		lock_page(page);
4336 		WARN_ON(!PagePrivate(page));
4337 
4338 		clear_page_dirty_for_io(page);
4339 		spin_lock_irq(&page->mapping->tree_lock);
4340 		if (!PageDirty(page)) {
4341 			radix_tree_tag_clear(&page->mapping->page_tree,
4342 						page_index(page),
4343 						PAGECACHE_TAG_DIRTY);
4344 		}
4345 		spin_unlock_irq(&page->mapping->tree_lock);
4346 		ClearPageError(page);
4347 		unlock_page(page);
4348 	}
4349 	WARN_ON(atomic_read(&eb->refs) == 0);
4350 }
4351 
4352 int set_extent_buffer_dirty(struct extent_buffer *eb)
4353 {
4354 	unsigned long i;
4355 	unsigned long num_pages;
4356 	int was_dirty = 0;
4357 
4358 	check_buffer_tree_ref(eb);
4359 
4360 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4361 
4362 	num_pages = num_extent_pages(eb->start, eb->len);
4363 	WARN_ON(atomic_read(&eb->refs) == 0);
4364 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4365 
4366 	for (i = 0; i < num_pages; i++)
4367 		set_page_dirty(extent_buffer_page(eb, i));
4368 	return was_dirty;
4369 }
4370 
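/*
 * returns 1 unless [start, start + len) covers whole pages exactly,
 * i.e. both ends are page aligned and the range is at least a page.
 */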
4371 static int range_straddles_pages(u64 start, u64 len)
4372 {
4373 	if (len < PAGE_CACHE_SIZE)
4374 		return 1;
4375 	if (start & (PAGE_CACHE_SIZE - 1))
4376 		return 1;
4377 	if ((start + len) & (PAGE_CACHE_SIZE - 1))
4378 		return 1;
4379 	return 0;
4380 }
4381 
4382 int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4383 {
4384 	unsigned long i;
4385 	struct page *page;
4386 	unsigned long num_pages;
4387 
4388 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4389 	num_pages = num_extent_pages(eb->start, eb->len);
4390 	for (i = 0; i < num_pages; i++) {
4391 		page = extent_buffer_page(eb, i);
4392 		if (page)
4393 			ClearPageUptodate(page);
4394 	}
4395 	return 0;
4396 }
4397 
4398 int set_extent_buffer_uptodate(struct extent_buffer *eb)
4399 {
4400 	unsigned long i;
4401 	struct page *page;
4402 	unsigned long num_pages;
4403 
4404 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4405 	num_pages = num_extent_pages(eb->start, eb->len);
4406 	for (i = 0; i < num_pages; i++) {
4407 		page = extent_buffer_page(eb, i);
4408 		SetPageUptodate(page);
4409 	}
4410 	return 0;
4411 }
4412 
4413 int extent_range_uptodate(struct extent_io_tree *tree,
4414 			  u64 start, u64 end)
4415 {
4416 	struct page *page;
4417 	int ret;
4418 	int pg_uptodate = 1;
4419 	int uptodate;
4420 	unsigned long index;
4421 
4422 	if (range_straddles_pages(start, end - start + 1)) {
4423 		ret = test_range_bit(tree, start, end,
4424 				     EXTENT_UPTODATE, 1, NULL);
4425 		if (ret)
4426 			return 1;
4427 	}
4428 	while (start <= end) {
4429 		index = start >> PAGE_CACHE_SHIFT;
4430 		page = find_get_page(tree->mapping, index);
4431 		if (!page)
4432 			return 1;
4433 		uptodate = PageUptodate(page);
4434 		page_cache_release(page);
4435 		if (!uptodate) {
4436 			pg_uptodate = 0;
4437 			break;
4438 		}
4439 		start += PAGE_CACHE_SIZE;
4440 	}
4441 	return pg_uptodate;
4442 }
4443 
4444 int extent_buffer_uptodate(struct extent_buffer *eb)
4445 {
4446 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4447 }
4448 
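/*
 * read any pages of the eb that are not uptodate.  wait == WAIT_NONE
 * only trylocks the pages and backs off on contention; WAIT_COMPLETE
 * additionally waits for all the reads to finish before returning.
 */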
4449 int read_extent_buffer_pages(struct extent_io_tree *tree,
4450 			     struct extent_buffer *eb, u64 start, int wait,
4451 			     get_extent_t *get_extent, int mirror_num)
4452 {
4453 	unsigned long i;
4454 	unsigned long start_i;
4455 	struct page *page;
4456 	int err;
4457 	int ret = 0;
4458 	int locked_pages = 0;
4459 	int all_uptodate = 1;
4460 	unsigned long num_pages;
4461 	unsigned long num_reads = 0;
4462 	struct bio *bio = NULL;
4463 	unsigned long bio_flags = 0;
4464 
4465 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4466 		return 0;
4467 
4468 	if (start) {
4469 		WARN_ON(start < eb->start);
4470 		start_i = (start >> PAGE_CACHE_SHIFT) -
4471 			(eb->start >> PAGE_CACHE_SHIFT);
4472 	} else {
4473 		start_i = 0;
4474 	}
4475 
4476 	num_pages = num_extent_pages(eb->start, eb->len);
4477 	for (i = start_i; i < num_pages; i++) {
4478 		page = extent_buffer_page(eb, i);
4479 		if (wait == WAIT_NONE) {
4480 			if (!trylock_page(page))
4481 				goto unlock_exit;
4482 		} else {
4483 			lock_page(page);
4484 		}
4485 		locked_pages++;
4486 		if (!PageUptodate(page)) {
4487 			num_reads++;
4488 			all_uptodate = 0;
4489 		}
4490 	}
4491 	if (all_uptodate) {
4492 		if (start_i == 0)
4493 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4494 		goto unlock_exit;
4495 	}
4496 
4497 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4498 	eb->read_mirror = 0;
4499 	atomic_set(&eb->io_pages, num_reads);
4500 	for (i = start_i; i < num_pages; i++) {
4501 		page = extent_buffer_page(eb, i);
4502 		if (!PageUptodate(page)) {
4503 			ClearPageError(page);
4504 			err = __extent_read_full_page(tree, page,
4505 						      get_extent, &bio,
4506 						      mirror_num, &bio_flags);
4507 			if (err)
4508 				ret = err;
4509 		} else {
4510 			unlock_page(page);
4511 		}
4512 	}
4513 
4514 	if (bio) {
4515 		err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4516 		if (err)
4517 			return err;
4518 	}
4519 
4520 	if (ret || wait != WAIT_COMPLETE)
4521 		return ret;
4522 
4523 	for (i = start_i; i < num_pages; i++) {
4524 		page = extent_buffer_page(eb, i);
4525 		wait_on_page_locked(page);
4526 		if (!PageUptodate(page))
4527 			ret = -EIO;
4528 	}
4529 
4530 	return ret;
4531 
4532 unlock_exit:
4533 	i = start_i;
4534 	while (locked_pages > 0) {
4535 		page = extent_buffer_page(eb, i);
4536 		i++;
4537 		unlock_page(page);
4538 		locked_pages--;
4539 	}
4540 	return ret;
4541 }
4542 
4543 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4544 			unsigned long start,
4545 			unsigned long len)
4546 {
4547 	size_t cur;
4548 	size_t offset;
4549 	struct page *page;
4550 	char *kaddr;
4551 	char *dst = (char *)dstv;
4552 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4553 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4554 
4555 	WARN_ON(start > eb->len);
4556 	WARN_ON(start + len > eb->start + eb->len);
4557 
4558 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4559 
4560 	while (len > 0) {
4561 		page = extent_buffer_page(eb, i);
4562 
4563 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4564 		kaddr = page_address(page);
4565 		memcpy(dst, kaddr + offset, cur);
4566 
4567 		dst += cur;
4568 		len -= cur;
4569 		offset = 0;
4570 		i++;
4571 	}
4572 }
4573 
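/*
 * map a chunk of the eb directly; the requested [start, start + min_len)
 * must fit inside a single page or -EINVAL is returned.  *map points at
 * the mapped bytes, *map_start and *map_len describe what was mapped.
 */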
4574 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4575 			       unsigned long min_len, char **map,
4576 			       unsigned long *map_start,
4577 			       unsigned long *map_len)
4578 {
4579 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4580 	char *kaddr;
4581 	struct page *p;
4582 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4583 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4584 	unsigned long end_i = (start_offset + start + min_len - 1) >>
4585 		PAGE_CACHE_SHIFT;
4586 
4587 	if (i != end_i)
4588 		return -EINVAL;
4589 
4590 	if (i == 0) {
4591 		offset = start_offset;
4592 		*map_start = 0;
4593 	} else {
4594 		offset = 0;
4595 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4596 	}
4597 
4598 	if (start + min_len > eb->len) {
4599 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4600 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
4601 		       eb->len, start, min_len);
4602 		WARN_ON(1);
4603 		return -EINVAL;
4604 	}
4605 
4606 	p = extent_buffer_page(eb, i);
4607 	kaddr = page_address(p);
4608 	*map = kaddr + offset;
4609 	*map_len = PAGE_CACHE_SIZE - offset;
4610 	return 0;
4611 }
4612 
4613 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4614 			  unsigned long start,
4615 			  unsigned long len)
4616 {
4617 	size_t cur;
4618 	size_t offset;
4619 	struct page *page;
4620 	char *kaddr;
4621 	char *ptr = (char *)ptrv;
4622 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4623 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4624 	int ret = 0;
4625 
4626 	WARN_ON(start > eb->len);
4627 	WARN_ON(start + len > eb->start + eb->len);
4628 
4629 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4630 
4631 	while (len > 0) {
4632 		page = extent_buffer_page(eb, i);
4633 
4634 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4635 
4636 		kaddr = page_address(page);
4637 		ret = memcmp(ptr, kaddr + offset, cur);
4638 		if (ret)
4639 			break;
4640 
4641 		ptr += cur;
4642 		len -= cur;
4643 		offset = 0;
4644 		i++;
4645 	}
4646 	return ret;
4647 }
4648 
4649 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4650 			 unsigned long start, unsigned long len)
4651 {
4652 	size_t cur;
4653 	size_t offset;
4654 	struct page *page;
4655 	char *kaddr;
4656 	char *src = (char *)srcv;
4657 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4658 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4659 
4660 	WARN_ON(start > eb->len);
4661 	WARN_ON(start + len > eb->start + eb->len);
4662 
4663 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4664 
4665 	while (len > 0) {
4666 		page = extent_buffer_page(eb, i);
4667 		WARN_ON(!PageUptodate(page));
4668 
4669 		cur = min(len, PAGE_CACHE_SIZE - offset);
4670 		kaddr = page_address(page);
4671 		memcpy(kaddr + offset, src, cur);
4672 
4673 		src += cur;
4674 		len -= cur;
4675 		offset = 0;
4676 		i++;
4677 	}
4678 }
4679 
4680 void memset_extent_buffer(struct extent_buffer *eb, char c,
4681 			  unsigned long start, unsigned long len)
4682 {
4683 	size_t cur;
4684 	size_t offset;
4685 	struct page *page;
4686 	char *kaddr;
4687 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4688 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4689 
4690 	WARN_ON(start > eb->len);
4691 	WARN_ON(start + len > eb->start + eb->len);
4692 
4693 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4694 
4695 	while (len > 0) {
4696 		page = extent_buffer_page(eb, i);
4697 		WARN_ON(!PageUptodate(page));
4698 
4699 		cur = min(len, PAGE_CACHE_SIZE - offset);
4700 		kaddr = page_address(page);
4701 		memset(kaddr + offset, c, cur);
4702 
4703 		len -= cur;
4704 		offset = 0;
4705 		i++;
4706 	}
4707 }
4708 
4709 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4710 			unsigned long dst_offset, unsigned long src_offset,
4711 			unsigned long len)
4712 {
4713 	u64 dst_len = dst->len;
4714 	size_t cur;
4715 	size_t offset;
4716 	struct page *page;
4717 	char *kaddr;
4718 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4719 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4720 
4721 	WARN_ON(src->len != dst_len);
4722 
4723 	offset = (start_offset + dst_offset) &
4724 		((unsigned long)PAGE_CACHE_SIZE - 1);
4725 
4726 	while (len > 0) {
4727 		page = extent_buffer_page(dst, i);
4728 		WARN_ON(!PageUptodate(page));
4729 
4730 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4731 
4732 		kaddr = page_address(page);
4733 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
4734 
4735 		src_offset += cur;
4736 		len -= cur;
4737 		offset = 0;
4738 		i++;
4739 	}
4740 }
4741 
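/*
 * memmove-style helper for memmove_extent_buffer(): within a single page
 * overlap is handled by memmove(); across two pages the bytes are copied
 * one at a time from the end backwards.
 */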
4742 static void move_pages(struct page *dst_page, struct page *src_page,
4743 		       unsigned long dst_off, unsigned long src_off,
4744 		       unsigned long len)
4745 {
4746 	char *dst_kaddr = page_address(dst_page);
4747 	if (dst_page == src_page) {
4748 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4749 	} else {
4750 		char *src_kaddr = page_address(src_page);
4751 		char *p = dst_kaddr + dst_off + len;
4752 		char *s = src_kaddr + src_off + len;
4753 
4754 		while (len--)
4755 			*--p = *--s;
4756 	}
4757 }
4758 
4759 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4760 {
4761 	unsigned long distance = (src > dst) ? src - dst : dst - src;
4762 	return distance < len;
4763 }
4764 
4765 static void copy_pages(struct page *dst_page, struct page *src_page,
4766 		       unsigned long dst_off, unsigned long src_off,
4767 		       unsigned long len)
4768 {
4769 	char *dst_kaddr = page_address(dst_page);
4770 	char *src_kaddr;
4771 	int must_memmove = 0;
4772 
4773 	if (dst_page != src_page) {
4774 		src_kaddr = page_address(src_page);
4775 	} else {
4776 		src_kaddr = dst_kaddr;
4777 		if (areas_overlap(src_off, dst_off, len))
4778 			must_memmove = 1;
4779 	}
4780 
4781 	if (must_memmove)
4782 		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4783 	else
4784 		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4785 }
4786 
4787 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4788 			   unsigned long src_offset, unsigned long len)
4789 {
4790 	size_t cur;
4791 	size_t dst_off_in_page;
4792 	size_t src_off_in_page;
4793 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4794 	unsigned long dst_i;
4795 	unsigned long src_i;
4796 
4797 	if (src_offset + len > dst->len) {
4798 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4799 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4800 		BUG_ON(1);
4801 	}
4802 	if (dst_offset + len > dst->len) {
4803 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4804 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4805 		BUG_ON(1);
4806 	}
4807 
4808 	while (len > 0) {
4809 		dst_off_in_page = (start_offset + dst_offset) &
4810 			((unsigned long)PAGE_CACHE_SIZE - 1);
4811 		src_off_in_page = (start_offset + src_offset) &
4812 			((unsigned long)PAGE_CACHE_SIZE - 1);
4813 
4814 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4815 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4816 
4817 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4818 					       src_off_in_page));
4819 		cur = min_t(unsigned long, cur,
4820 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4821 
4822 		copy_pages(extent_buffer_page(dst, dst_i),
4823 			   extent_buffer_page(dst, src_i),
4824 			   dst_off_in_page, src_off_in_page, cur);
4825 
4826 		src_offset += cur;
4827 		dst_offset += cur;
4828 		len -= cur;
4829 	}
4830 }
4831 
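/*
 * overlap-safe copy inside one eb.  Moves where the destination starts
 * before the source are safe to copy forwards and are handed to
 * memcpy_extent_buffer(); otherwise the copy walks backwards from the
 * end of the range using move_pages().
 */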
4832 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4833 			   unsigned long src_offset, unsigned long len)
4834 {
4835 	size_t cur;
4836 	size_t dst_off_in_page;
4837 	size_t src_off_in_page;
4838 	unsigned long dst_end = dst_offset + len - 1;
4839 	unsigned long src_end = src_offset + len - 1;
4840 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4841 	unsigned long dst_i;
4842 	unsigned long src_i;
4843 
4844 	if (src_offset + len > dst->len) {
4845 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4846 		       "len %lu len %lu\n", src_offset, len, dst->len);
4847 		BUG_ON(1);
4848 	}
4849 	if (dst_offset + len > dst->len) {
4850 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4851 		       "len %lu len %lu\n", dst_offset, len, dst->len);
4852 		BUG_ON(1);
4853 	}
4854 	if (dst_offset < src_offset) {
4855 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4856 		return;
4857 	}
4858 	while (len > 0) {
4859 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4860 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4861 
4862 		dst_off_in_page = (start_offset + dst_end) &
4863 			((unsigned long)PAGE_CACHE_SIZE - 1);
4864 		src_off_in_page = (start_offset + src_end) &
4865 			((unsigned long)PAGE_CACHE_SIZE - 1);
4866 
4867 		cur = min_t(unsigned long, len, src_off_in_page + 1);
4868 		cur = min(cur, dst_off_in_page + 1);
4869 		move_pages(extent_buffer_page(dst, dst_i),
4870 			   extent_buffer_page(dst, src_i),
4871 			   dst_off_in_page - cur + 1,
4872 			   src_off_in_page - cur + 1, cur);
4873 
4874 		dst_end -= cur;
4875 		src_end -= cur;
4876 		len -= cur;
4877 	}
4878 }
4879 
4880 int try_release_extent_buffer(struct page *page, gfp_t mask)
4881 {
4882 	struct extent_buffer *eb;
4883 
4884 	/*
4885 	 * We need to make sure nobody is attaching this page to an eb right
4886 	 * now.
4887 	 */
4888 	spin_lock(&page->mapping->private_lock);
4889 	if (!PagePrivate(page)) {
4890 		spin_unlock(&page->mapping->private_lock);
4891 		return 1;
4892 	}
4893 
4894 	eb = (struct extent_buffer *)page->private;
4895 	BUG_ON(!eb);
4896 
4897 	/*
4898 	 * This is a little awful but should be OK: we need to make sure that
4899 	 * the eb doesn't disappear out from under us while we're looking at
4900 	 * this page.
4901 	 */
4902 	spin_lock(&eb->refs_lock);
4903 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4904 		spin_unlock(&eb->refs_lock);
4905 		spin_unlock(&page->mapping->private_lock);
4906 		return 0;
4907 	}
4908 	spin_unlock(&page->mapping->private_lock);
4909 
4910 	if ((mask & GFP_NOFS) == GFP_NOFS)
4911 		mask = GFP_NOFS;
4912 
4913 	/*
4914 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
4915 	 * so just return, this page will likely be freed soon anyway.
4916 	 */
4917 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4918 		spin_unlock(&eb->refs_lock);
4919 		return 0;
4920 	}
4921 	release_extent_buffer(eb, mask);
4922 
4923 	return 1;
4924 }
4925