1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/slab.h>
7 #include <linux/blkdev.h>
8 #include <linux/writeback.h>
9 #include <linux/sched/mm.h>
10 #include "messages.h"
11 #include "misc.h"
12 #include "ctree.h"
13 #include "transaction.h"
14 #include "btrfs_inode.h"
15 #include "extent_io.h"
16 #include "disk-io.h"
17 #include "compression.h"
18 #include "delalloc-space.h"
19 #include "qgroup.h"
20 #include "subpage.h"
21 #include "file.h"
22 #include "super.h"
23 
24 static struct kmem_cache *btrfs_ordered_extent_cache;
25 
26 static u64 entry_end(struct btrfs_ordered_extent *entry)
27 {
28 	if (entry->file_offset + entry->num_bytes < entry->file_offset)
29 		return (u64)-1;
30 	return entry->file_offset + entry->num_bytes;
31 }
32 
33 /*
34  * Returns NULL if the insertion worked, otherwise returns the existing
35  * node it found in the tree.
36  */
36 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
37 				   struct rb_node *node)
38 {
39 	struct rb_node **p = &root->rb_node;
40 	struct rb_node *parent = NULL;
41 	struct btrfs_ordered_extent *entry;
42 
43 	while (*p) {
44 		parent = *p;
45 		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
46 
47 		if (file_offset < entry->file_offset)
48 			p = &(*p)->rb_left;
49 		else if (file_offset >= entry_end(entry))
50 			p = &(*p)->rb_right;
51 		else
52 			return parent;
53 	}
54 
55 	rb_link_node(node, parent, p);
56 	rb_insert_color(node, root);
57 	return NULL;
58 }
59 
60 /*
61  * look for a given offset in the tree, and if it can't be found return the
62  * first lesser offset
63  */
64 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
65 				     struct rb_node **prev_ret)
66 {
67 	struct rb_node *n = root->rb_node;
68 	struct rb_node *prev = NULL;
69 	struct rb_node *test;
70 	struct btrfs_ordered_extent *entry;
71 	struct btrfs_ordered_extent *prev_entry = NULL;
72 
73 	while (n) {
74 		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
75 		prev = n;
76 		prev_entry = entry;
77 
78 		if (file_offset < entry->file_offset)
79 			n = n->rb_left;
80 		else if (file_offset >= entry_end(entry))
81 			n = n->rb_right;
82 		else
83 			return n;
84 	}
85 	if (!prev_ret)
86 		return NULL;
87 
88 	while (prev && file_offset >= entry_end(prev_entry)) {
89 		test = rb_next(prev);
90 		if (!test)
91 			break;
92 		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
93 				      rb_node);
94 		if (file_offset < entry_end(prev_entry))
95 			break;
96 
97 		prev = test;
98 	}
99 	if (prev)
100 		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
101 				      rb_node);
102 	while (prev && file_offset < entry_end(prev_entry)) {
103 		test = rb_prev(prev);
104 		if (!test)
105 			break;
106 		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
107 				      rb_node);
108 		prev = test;
109 	}
110 	*prev_ret = prev;
111 	return NULL;
112 }
113 
114 static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
115 			  u64 len)
116 {
117 	if (file_offset + len <= entry->file_offset ||
118 	    entry->file_offset + entry->num_bytes <= file_offset)
119 		return 0;
120 	return 1;
121 }
122 
123 /*
124  * Look for the first ordered struct that has this offset, otherwise
125  * the first one less than this offset.
126  */
127 static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
128 					  u64 file_offset)
129 {
130 	struct rb_root *root = &tree->tree;
131 	struct rb_node *prev = NULL;
132 	struct rb_node *ret;
133 	struct btrfs_ordered_extent *entry;
134 
135 	if (tree->last) {
136 		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
137 				 rb_node);
138 		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
139 			return tree->last;
140 	}
141 	ret = __tree_search(root, file_offset, &prev);
142 	if (!ret)
143 		ret = prev;
144 	if (ret)
145 		tree->last = ret;
146 	return ret;
147 }
148 
149 static struct btrfs_ordered_extent *alloc_ordered_extent(
150 			struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
151 			u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
152 			u64 offset, unsigned long flags, int compress_type)
153 {
154 	struct btrfs_ordered_extent *entry;
155 	int ret;
156 	u64 qgroup_rsv = 0;
157 	const bool is_nocow = (flags &
158 	       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
159 
160 	if (is_nocow) {
161 		/* For nocow write, we can release the qgroup rsv right now */
162 		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
163 		if (ret < 0)
164 			return ERR_PTR(ret);
165 	} else {
166 		/*
167 		 * The ordered extent has reserved qgroup space, release now
168 		 * and pass the reserved number for qgroup_record to free.
169 		 */
170 		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
171 		if (ret < 0)
172 			return ERR_PTR(ret);
173 	}
174 	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
175 	if (!entry) {
176 		if (!is_nocow)
177 			btrfs_qgroup_free_refroot(inode->root->fs_info,
178 						  btrfs_root_id(inode->root),
179 						  qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
180 		return ERR_PTR(-ENOMEM);
181 	}
182 
183 	entry->file_offset = file_offset;
184 	entry->num_bytes = num_bytes;
185 	entry->ram_bytes = ram_bytes;
186 	entry->disk_bytenr = disk_bytenr;
187 	entry->disk_num_bytes = disk_num_bytes;
188 	entry->offset = offset;
189 	entry->bytes_left = num_bytes;
190 	entry->inode = igrab(&inode->vfs_inode);
191 	entry->compress_type = compress_type;
192 	entry->truncated_len = (u64)-1;
193 	entry->qgroup_rsv = qgroup_rsv;
194 	entry->flags = flags;
195 	refcount_set(&entry->refs, 1);
196 	init_waitqueue_head(&entry->wait);
197 	INIT_LIST_HEAD(&entry->list);
198 	INIT_LIST_HEAD(&entry->log_list);
199 	INIT_LIST_HEAD(&entry->root_extent_list);
200 	INIT_LIST_HEAD(&entry->work_list);
201 	init_completion(&entry->completion);
202 
203 	/*
204 	 * We don't need the count_max_extents here, we can assume that all of
205 	 * that work has been done at higher layers, so this is truly the
206 	 * smallest the extent is going to get.
207 	 */
208 	spin_lock(&inode->lock);
209 	btrfs_mod_outstanding_extents(inode, 1);
210 	spin_unlock(&inode->lock);
211 
212 	return entry;
213 }
214 
215 static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
216 {
217 	struct btrfs_inode *inode = BTRFS_I(entry->inode);
218 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
219 	struct btrfs_root *root = inode->root;
220 	struct btrfs_fs_info *fs_info = root->fs_info;
221 	struct rb_node *node;
222 
223 	trace_btrfs_ordered_extent_add(inode, entry);
224 
225 	percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
226 				 fs_info->delalloc_batch);
227 
228 	/* One ref for the tree. */
229 	refcount_inc(&entry->refs);
230 
231 	spin_lock_irq(&tree->lock);
232 	node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
233 	if (node)
234 		btrfs_panic(fs_info, -EEXIST,
235 				"inconsistency in ordered tree at offset %llu",
236 				entry->file_offset);
237 	spin_unlock_irq(&tree->lock);
238 
239 	spin_lock(&root->ordered_extent_lock);
240 	list_add_tail(&entry->root_extent_list,
241 		      &root->ordered_extents);
242 	root->nr_ordered_extents++;
243 	if (root->nr_ordered_extents == 1) {
244 		spin_lock(&fs_info->ordered_root_lock);
245 		BUG_ON(!list_empty(&root->ordered_root));
246 		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
247 		spin_unlock(&fs_info->ordered_root_lock);
248 	}
249 	spin_unlock(&root->ordered_extent_lock);
250 }
251 
252 /*
253  * Add an ordered extent to the per-inode tree.
254  *
255  * @inode:           Inode that this extent is for.
256  * @file_offset:     Logical offset in file where the extent starts.
257  * @num_bytes:       Logical length of extent in file.
258  * @ram_bytes:       Full length of unencoded data.
259  * @disk_bytenr:     Offset of extent on disk.
260  * @disk_num_bytes:  Size of extent on disk.
261  * @offset:          Offset into unencoded data where file data starts.
262  * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
263  * @compress_type:   Compression algorithm used for data.
264  *
265  * Most of these parameters correspond to &struct btrfs_file_extent_item. The
266  * tree is given a single reference on the ordered extent that was inserted, and
267  * the returned pointer is given a second reference.
268  *
269  * Return: the new ordered extent or error pointer.
270  */
271 struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
272 			struct btrfs_inode *inode, u64 file_offset,
273 			u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
274 			u64 disk_num_bytes, u64 offset, unsigned long flags,
275 			int compress_type)
276 {
277 	struct btrfs_ordered_extent *entry;
278 
279 	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
280 
281 	entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
282 				     disk_bytenr, disk_num_bytes, offset, flags,
283 				     compress_type);
284 	if (!IS_ERR(entry))
285 		insert_ordered_extent(entry);
286 	return entry;
287 }
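
/*
 * Illustrative only: a minimal, hypothetical caller sketch showing how the
 * error-pointer contract documented above is typically consumed for a plain
 * COW write.  The helper name and the REGULAR/COMPRESS_NONE parameters are
 * assumptions for the example, not code from this file.
 */
static int example_record_cow_write(struct btrfs_inode *inode, u64 start,
				    u64 len, u64 disk_bytenr)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
					     disk_bytenr, len, 0,
					     1U << BTRFS_ORDERED_REGULAR,
					     BTRFS_COMPRESS_NONE);
	if (IS_ERR(ordered))
		return PTR_ERR(ordered);

	/* The caller owns one reference and must drop it when done. */
	btrfs_put_ordered_extent(ordered);
	return 0;
}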
288 
289 /*
290  * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
291  * when an ordered extent is finished.  If the list covers more than one
292  * ordered extent, it is split across multiples.
293  */
294 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
295 			   struct btrfs_ordered_sum *sum)
296 {
297 	struct btrfs_ordered_inode_tree *tree;
298 
299 	tree = &BTRFS_I(entry->inode)->ordered_tree;
300 	spin_lock_irq(&tree->lock);
301 	list_add_tail(&sum->list, &entry->list);
302 	spin_unlock_irq(&tree->lock);
303 }
304 
305 static void finish_ordered_fn(struct btrfs_work *work)
306 {
307 	struct btrfs_ordered_extent *ordered_extent;
308 
309 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
310 	btrfs_finish_ordered_io(ordered_extent);
311 }
312 
313 static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
314 				      struct page *page, u64 file_offset,
315 				      u64 len, bool uptodate)
316 {
317 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
318 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
319 
320 	lockdep_assert_held(&inode->ordered_tree.lock);
321 
322 	if (page) {
323 		ASSERT(page->mapping);
324 		ASSERT(page_offset(page) <= file_offset);
325 		ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
326 
327 		/*
328 		 * Ordered (Private2) bit indicates whether we still have
329 		 * pending io unfinished for the ordered extent.
330 		 *
331 		 * If there's no such bit, we need to skip to next range.
332 		 */
333 		if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
334 			return false;
335 		btrfs_page_clear_ordered(fs_info, page, file_offset, len);
336 	}
337 
338 	/* Now we're fine to update the accounting. */
339 	if (WARN_ON_ONCE(len > ordered->bytes_left)) {
340 		btrfs_crit(fs_info,
341 "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
342 			   inode->root->root_key.objectid, btrfs_ino(inode),
343 			   ordered->file_offset, ordered->num_bytes,
344 			   len, ordered->bytes_left);
345 		ordered->bytes_left = 0;
346 	} else {
347 		ordered->bytes_left -= len;
348 	}
349 
350 	if (!uptodate)
351 		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
352 
353 	if (ordered->bytes_left)
354 		return false;
355 
356 	/*
357 	 * All the IO of the ordered extent is finished, we need to queue
358 	 * the finish_func to be executed.
359 	 */
360 	set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
361 	cond_wake_up(&ordered->wait);
362 	refcount_inc(&ordered->refs);
363 	trace_btrfs_ordered_extent_mark_finished(inode, ordered);
364 	return true;
365 }
366 
367 static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
368 {
369 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
370 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
371 	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
372 		fs_info->endio_freespace_worker : fs_info->endio_write_workers;
373 
374 	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
375 	btrfs_queue_work(wq, &ordered->work);
376 }
377 
378 bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
379 				 struct page *page, u64 file_offset, u64 len,
380 				 bool uptodate)
381 {
382 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
383 	unsigned long flags;
384 	bool ret;
385 
386 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
387 
388 	spin_lock_irqsave(&inode->ordered_tree.lock, flags);
389 	ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
390 	spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
391 
392 	if (ret)
393 		btrfs_queue_ordered_fn(ordered);
394 	return ret;
395 }
396 
397 /*
398  * Mark all ordered extents io inside the specified range finished.
399  *
400  * @page:	 The involved page for the operation.
401  *		 For uncompressed buffered IO, the page status also needs to be
402  *		 updated to indicate whether the pending ordered io is finished.
403  *		 Can be NULL for direct IO and compressed write.
404  *		 For these cases, callers are ensured they won't execute the
405  *		 endio function twice.
406  *
407  * This function is called for endio, thus the range must have ordered
408  * extent(s) covering it.
409  */
410 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
411 				    struct page *page, u64 file_offset,
412 				    u64 num_bytes, bool uptodate)
413 {
414 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
415 	struct rb_node *node;
416 	struct btrfs_ordered_extent *entry = NULL;
417 	unsigned long flags;
418 	u64 cur = file_offset;
419 
420 	trace_btrfs_writepage_end_io_hook(inode, file_offset,
421 					  file_offset + num_bytes - 1,
422 					  uptodate);
423 
424 	spin_lock_irqsave(&tree->lock, flags);
425 	while (cur < file_offset + num_bytes) {
426 		u64 entry_end;
427 		u64 end;
428 		u32 len;
429 
430 		node = tree_search(tree, cur);
431 		/* No ordered extents at all */
432 		if (!node)
433 			break;
434 
435 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
436 		entry_end = entry->file_offset + entry->num_bytes;
437 		/*
438 		 * |<-- OE --->|  |
439 		 *		  cur
440 		 * Go to next OE.
441 		 */
442 		if (cur >= entry_end) {
443 			node = rb_next(node);
444 			/* No more ordered extents, exit */
445 			if (!node)
446 				break;
447 			entry = rb_entry(node, struct btrfs_ordered_extent,
448 					 rb_node);
449 
450 			/* Go to next ordered extent and continue */
451 			cur = entry->file_offset;
452 			continue;
453 		}
454 		/*
455 		 * |	|<--- OE --->|
456 		 * cur
457 		 * Go to the start of OE.
458 		 */
459 		if (cur < entry->file_offset) {
460 			cur = entry->file_offset;
461 			continue;
462 		}
463 
464 		/*
465 		 * Now we are definitely inside one ordered extent.
466 		 *
467 		 * |<--- OE --->|
468 		 *	|
469 		 *	cur
470 		 */
471 		end = min(entry->file_offset + entry->num_bytes,
472 			  file_offset + num_bytes) - 1;
473 		ASSERT(end + 1 - cur < U32_MAX);
474 		len = end + 1 - cur;
475 
476 		if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
477 			spin_unlock_irqrestore(&tree->lock, flags);
478 			btrfs_queue_ordered_fn(entry);
479 			spin_lock_irqsave(&tree->lock, flags);
480 		}
481 		cur += len;
482 	}
483 	spin_unlock_irqrestore(&tree->lock, flags);
484 }
485 
486 /*
487  * Finish IO for one ordered extent across a given range.  The range can only
488  * contain one ordered extent.
489  *
490  * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
491  *               search and use the ordered extent directly.
492  *               Will also be used to store the finished ordered extent.
493  * @file_offset: File offset for the finished IO
494  * @io_size:	 Length of the finish IO range
495  *
496  * Return true if the ordered extent is finished in the range, and update
497  * @cached.
498  * Return false otherwise.
499  *
500  * NOTE: The range can NOT cross multiple ordered extents.
501  * Thus caller should ensure the range doesn't cross ordered extents.
502  */
503 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
504 				    struct btrfs_ordered_extent **cached,
505 				    u64 file_offset, u64 io_size)
506 {
507 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
508 	struct rb_node *node;
509 	struct btrfs_ordered_extent *entry = NULL;
510 	unsigned long flags;
511 	bool finished = false;
512 
513 	spin_lock_irqsave(&tree->lock, flags);
514 	if (cached && *cached) {
515 		entry = *cached;
516 		goto have_entry;
517 	}
518 
519 	node = tree_search(tree, file_offset);
520 	if (!node)
521 		goto out;
522 
523 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
524 have_entry:
525 	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
526 		goto out;
527 
528 	if (io_size > entry->bytes_left)
529 		btrfs_crit(inode->root->fs_info,
530 			   "bad ordered accounting left %llu size %llu",
531 		       entry->bytes_left, io_size);
532 
533 	entry->bytes_left -= io_size;
534 
535 	if (entry->bytes_left == 0) {
536 		/*
537 		 * Ensure only one caller can set the flag and finished_ret
538 		 * accordingly
539 		 */
540 		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
541 		/* test_and_set_bit implies a barrier */
542 		cond_wake_up_nomb(&entry->wait);
543 	}
544 out:
545 	if (finished && cached && entry) {
546 		*cached = entry;
547 		refcount_inc(&entry->refs);
548 		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
549 	}
550 	spin_unlock_irqrestore(&tree->lock, flags);
551 	return finished;
552 }
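
/*
 * Illustrative only: a hedged sketch of the @cached contract documented
 * above.  When the function returns true it has taken an extra reference
 * and stored the finished extent in *cached, so the caller must drop that
 * reference.  The helper name is hypothetical.
 */
static void example_dec_test(struct btrfs_inode *inode, u64 file_offset,
			     u64 io_size)
{
	struct btrfs_ordered_extent *cached = NULL;

	if (btrfs_dec_test_ordered_pending(inode, &cached, file_offset,
					   io_size)) {
		/* The whole ordered extent finished inside this range. */
		btrfs_put_ordered_extent(cached);
	}
}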
553 
554 /*
555  * used to drop a reference on an ordered extent.  This will free
556  * the extent if the last reference is dropped
557  */
558 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
559 {
560 	struct list_head *cur;
561 	struct btrfs_ordered_sum *sum;
562 
563 	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
564 
565 	if (refcount_dec_and_test(&entry->refs)) {
566 		ASSERT(list_empty(&entry->root_extent_list));
567 		ASSERT(list_empty(&entry->log_list));
568 		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
569 		if (entry->inode)
570 			btrfs_add_delayed_iput(BTRFS_I(entry->inode));
571 		while (!list_empty(&entry->list)) {
572 			cur = entry->list.next;
573 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
574 			list_del(&sum->list);
575 			kvfree(sum);
576 		}
577 		kmem_cache_free(btrfs_ordered_extent_cache, entry);
578 	}
579 }
580 
581 /*
582  * remove an ordered extent from the tree.  No references are dropped
583  * and waiters are woken up.
584  */
585 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
586 				 struct btrfs_ordered_extent *entry)
587 {
588 	struct btrfs_ordered_inode_tree *tree;
589 	struct btrfs_root *root = btrfs_inode->root;
590 	struct btrfs_fs_info *fs_info = root->fs_info;
591 	struct rb_node *node;
592 	bool pending;
593 	bool freespace_inode;
594 
595 	/*
596 	 * If this is a free space inode the thread has not acquired the ordered
597 	 * extents lockdep map.
598 	 */
599 	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
600 
601 	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
602 	/* This is paired with btrfs_alloc_ordered_extent. */
603 	spin_lock(&btrfs_inode->lock);
604 	btrfs_mod_outstanding_extents(btrfs_inode, -1);
605 	spin_unlock(&btrfs_inode->lock);
606 	if (root != fs_info->tree_root) {
607 		u64 release;
608 
609 		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
610 			release = entry->disk_num_bytes;
611 		else
612 			release = entry->num_bytes;
613 		btrfs_delalloc_release_metadata(btrfs_inode, release,
614 						test_bit(BTRFS_ORDERED_IOERR,
615 							 &entry->flags));
616 	}
617 
618 	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
619 				 fs_info->delalloc_batch);
620 
621 	tree = &btrfs_inode->ordered_tree;
622 	spin_lock_irq(&tree->lock);
623 	node = &entry->rb_node;
624 	rb_erase(node, &tree->tree);
625 	RB_CLEAR_NODE(node);
626 	if (tree->last == node)
627 		tree->last = NULL;
628 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
629 	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
630 	spin_unlock_irq(&tree->lock);
631 
632 	/*
633 	 * The current running transaction is waiting on us, we need to let it
634 	 * know that we're complete and wake it up.
635 	 */
636 	if (pending) {
637 		struct btrfs_transaction *trans;
638 
639 		/*
640 		 * The checks for trans are just a formality, it should be set,
641 		 * but if it isn't we don't want to deref/assert under the spin
642 		 * lock, so be nice and check if trans is set, but ASSERT() so
643 		 * if it isn't set a developer will notice.
644 		 */
645 		spin_lock(&fs_info->trans_lock);
646 		trans = fs_info->running_transaction;
647 		if (trans)
648 			refcount_inc(&trans->use_count);
649 		spin_unlock(&fs_info->trans_lock);
650 
651 		ASSERT(trans || BTRFS_FS_ERROR(fs_info));
652 		if (trans) {
653 			if (atomic_dec_and_test(&trans->pending_ordered))
654 				wake_up(&trans->pending_wait);
655 			btrfs_put_transaction(trans);
656 		}
657 	}
658 
659 	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
660 
661 	spin_lock(&root->ordered_extent_lock);
662 	list_del_init(&entry->root_extent_list);
663 	root->nr_ordered_extents--;
664 
665 	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
666 
667 	if (!root->nr_ordered_extents) {
668 		spin_lock(&fs_info->ordered_root_lock);
669 		BUG_ON(list_empty(&root->ordered_root));
670 		list_del_init(&root->ordered_root);
671 		spin_unlock(&fs_info->ordered_root_lock);
672 	}
673 	spin_unlock(&root->ordered_extent_lock);
674 	wake_up(&entry->wait);
675 	if (!freespace_inode)
676 		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
677 }
678 
679 static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
680 {
681 	struct btrfs_ordered_extent *ordered;
682 
683 	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
684 	btrfs_start_ordered_extent(ordered);
685 	complete(&ordered->completion);
686 }
687 
688 /*
689  * wait for all the ordered extents in a root.  This is done when balancing
690  * space between drives.
691  */
692 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
693 			       const u64 range_start, const u64 range_len)
694 {
695 	struct btrfs_fs_info *fs_info = root->fs_info;
696 	LIST_HEAD(splice);
697 	LIST_HEAD(skipped);
698 	LIST_HEAD(works);
699 	struct btrfs_ordered_extent *ordered, *next;
700 	u64 count = 0;
701 	const u64 range_end = range_start + range_len;
702 
703 	mutex_lock(&root->ordered_extent_mutex);
704 	spin_lock(&root->ordered_extent_lock);
705 	list_splice_init(&root->ordered_extents, &splice);
706 	while (!list_empty(&splice) && nr) {
707 		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
708 					   root_extent_list);
709 
710 		if (range_end <= ordered->disk_bytenr ||
711 		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
712 			list_move_tail(&ordered->root_extent_list, &skipped);
713 			cond_resched_lock(&root->ordered_extent_lock);
714 			continue;
715 		}
716 
717 		list_move_tail(&ordered->root_extent_list,
718 			       &root->ordered_extents);
719 		refcount_inc(&ordered->refs);
720 		spin_unlock(&root->ordered_extent_lock);
721 
722 		btrfs_init_work(&ordered->flush_work,
723 				btrfs_run_ordered_extent_work, NULL, NULL);
724 		list_add_tail(&ordered->work_list, &works);
725 		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
726 
727 		cond_resched();
728 		spin_lock(&root->ordered_extent_lock);
729 		if (nr != U64_MAX)
730 			nr--;
731 		count++;
732 	}
733 	list_splice_tail(&skipped, &root->ordered_extents);
734 	list_splice_tail(&splice, &root->ordered_extents);
735 	spin_unlock(&root->ordered_extent_lock);
736 
737 	list_for_each_entry_safe(ordered, next, &works, work_list) {
738 		list_del_init(&ordered->work_list);
739 		wait_for_completion(&ordered->completion);
740 		btrfs_put_ordered_extent(ordered);
741 		cond_resched();
742 	}
743 	mutex_unlock(&root->ordered_extent_mutex);
744 
745 	return count;
746 }
747 
748 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
749 			     const u64 range_start, const u64 range_len)
750 {
751 	struct btrfs_root *root;
752 	LIST_HEAD(splice);
753 	u64 done;
754 
755 	mutex_lock(&fs_info->ordered_operations_mutex);
756 	spin_lock(&fs_info->ordered_root_lock);
757 	list_splice_init(&fs_info->ordered_roots, &splice);
758 	while (!list_empty(&splice) && nr) {
759 		root = list_first_entry(&splice, struct btrfs_root,
760 					ordered_root);
761 		root = btrfs_grab_root(root);
762 		BUG_ON(!root);
763 		list_move_tail(&root->ordered_root,
764 			       &fs_info->ordered_roots);
765 		spin_unlock(&fs_info->ordered_root_lock);
766 
767 		done = btrfs_wait_ordered_extents(root, nr,
768 						  range_start, range_len);
769 		btrfs_put_root(root);
770 
771 		spin_lock(&fs_info->ordered_root_lock);
772 		if (nr != U64_MAX) {
773 			nr -= done;
774 		}
775 	}
776 	list_splice_tail(&splice, &fs_info->ordered_roots);
777 	spin_unlock(&fs_info->ordered_root_lock);
778 	mutex_unlock(&fs_info->ordered_operations_mutex);
779 }
780 
781 /*
782  * Start IO and wait for a given ordered extent to finish.
783  *
784  * Wait on page writeback for all the pages in the extent and the IO completion
785  * code to insert metadata into the btree corresponding to the extent.
786  */
787 void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
788 {
789 	u64 start = entry->file_offset;
790 	u64 end = start + entry->num_bytes - 1;
791 	struct btrfs_inode *inode = BTRFS_I(entry->inode);
792 	bool freespace_inode;
793 
794 	trace_btrfs_ordered_extent_start(inode, entry);
795 
796 	/*
797 	 * If this is a free space inode do not take the ordered extents lockdep
798 	 * map.
799 	 */
800 	freespace_inode = btrfs_is_free_space_inode(inode);
801 
802 	/*
803 	 * pages in the range can be dirty, clean or writeback.  We
804 	 * start IO on any dirty ones so the wait doesn't stall waiting
805 	 * for the flusher thread to find them
806 	 */
807 	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
808 		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
809 
810 	if (!freespace_inode)
811 		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
812 	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
813 }
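
/*
 * Illustrative only: a hypothetical caller that forces the single ordered
 * extent covering @offset to complete before proceeding, using the lookup
 * and start helpers from this file.  The helper name is an assumption.
 */
static void example_wait_one(struct btrfs_inode *inode, u64 offset)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return;
	/* Kick off writeback if needed and wait for BTRFS_ORDERED_COMPLETE. */
	btrfs_start_ordered_extent(ordered);
	btrfs_put_ordered_extent(ordered);
}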
814 
815 /*
816  * Used to wait on ordered extents across a large range of bytes.
817  */
818 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
819 {
820 	int ret = 0;
821 	int ret_wb = 0;
822 	u64 end;
823 	u64 orig_end;
824 	struct btrfs_ordered_extent *ordered;
825 
826 	if (start + len < start) {
827 		orig_end = OFFSET_MAX;
828 	} else {
829 		orig_end = start + len - 1;
830 		if (orig_end > OFFSET_MAX)
831 			orig_end = OFFSET_MAX;
832 	}
833 
834 	/* start IO across the range first to instantiate any delalloc
835 	 * extents
836 	 */
837 	ret = btrfs_fdatawrite_range(inode, start, orig_end);
838 	if (ret)
839 		return ret;
840 
841 	/*
842 	 * If we have a writeback error don't return immediately. Wait first
843 	 * for any ordered extents that haven't completed yet. This is to make
844 	 * sure no one can dirty the same page ranges and call writepages()
845 	 * before the ordered extents complete - to avoid failures (-EEXIST)
846 	 * when adding the new ordered extents to the ordered tree.
847 	 */
848 	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
849 
850 	end = orig_end;
851 	while (1) {
852 		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
853 		if (!ordered)
854 			break;
855 		if (ordered->file_offset > orig_end) {
856 			btrfs_put_ordered_extent(ordered);
857 			break;
858 		}
859 		if (ordered->file_offset + ordered->num_bytes <= start) {
860 			btrfs_put_ordered_extent(ordered);
861 			break;
862 		}
863 		btrfs_start_ordered_extent(ordered);
864 		end = ordered->file_offset;
865 		/*
866 		 * If the ordered extent had an error save the error but don't
867 		 * exit without waiting first for all other ordered extents in
868 		 * the range to complete.
869 		 */
870 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
871 			ret = -EIO;
872 		btrfs_put_ordered_extent(ordered);
873 		if (end == 0 || end == start)
874 			break;
875 		end--;
876 	}
877 	return ret_wb ? ret_wb : ret;
878 }
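
/*
 * Illustrative only: a hedged sketch of flushing a whole file with the
 * helper above, as an fsync-like path might.  Passing (u64)-1 as the length
 * makes start + len wrap, so the range is clamped to OFFSET_MAX.  The
 * helper name is hypothetical.
 */
static int example_flush_whole_file(struct inode *inode)
{
	return btrfs_wait_ordered_range(inode, 0, (u64)-1);
}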
879 
880 /*
881  * find an ordered extent corresponding to file_offset.  return NULL if
882  * nothing is found, otherwise take a reference on the extent and return it
883  */
884 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
885 							 u64 file_offset)
886 {
887 	struct btrfs_ordered_inode_tree *tree;
888 	struct rb_node *node;
889 	struct btrfs_ordered_extent *entry = NULL;
890 	unsigned long flags;
891 
892 	tree = &inode->ordered_tree;
893 	spin_lock_irqsave(&tree->lock, flags);
894 	node = tree_search(tree, file_offset);
895 	if (!node)
896 		goto out;
897 
898 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
899 	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
900 		entry = NULL;
901 	if (entry) {
902 		refcount_inc(&entry->refs);
903 		trace_btrfs_ordered_extent_lookup(inode, entry);
904 	}
905 out:
906 	spin_unlock_irqrestore(&tree->lock, flags);
907 	return entry;
908 }
909 
910 /* Since the DIO code tries to lock a wide area we need to look for any ordered
911  * extents that exist in the range, rather than just the start of the range.
912  */
913 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
914 		struct btrfs_inode *inode, u64 file_offset, u64 len)
915 {
916 	struct btrfs_ordered_inode_tree *tree;
917 	struct rb_node *node;
918 	struct btrfs_ordered_extent *entry = NULL;
919 
920 	tree = &inode->ordered_tree;
921 	spin_lock_irq(&tree->lock);
922 	node = tree_search(tree, file_offset);
923 	if (!node) {
924 		node = tree_search(tree, file_offset + len);
925 		if (!node)
926 			goto out;
927 	}
928 
929 	while (1) {
930 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
931 		if (range_overlaps(entry, file_offset, len))
932 			break;
933 
934 		if (entry->file_offset >= file_offset + len) {
935 			entry = NULL;
936 			break;
937 		}
938 		entry = NULL;
939 		node = rb_next(node);
940 		if (!node)
941 			break;
942 	}
943 out:
944 	if (entry) {
945 		refcount_inc(&entry->refs);
946 		trace_btrfs_ordered_extent_lookup_range(inode, entry);
947 	}
948 	spin_unlock_irq(&tree->lock);
949 	return entry;
950 }
951 
952 /*
953  * Adds all ordered extents to the given list. The list ends up sorted by the
954  * file_offset of the ordered extents.
955  */
956 void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
957 					   struct list_head *list)
958 {
959 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
960 	struct rb_node *n;
961 
962 	ASSERT(inode_is_locked(&inode->vfs_inode));
963 
964 	spin_lock_irq(&tree->lock);
965 	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
966 		struct btrfs_ordered_extent *ordered;
967 
968 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
969 
970 		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
971 			continue;
972 
973 		ASSERT(list_empty(&ordered->log_list));
974 		list_add_tail(&ordered->log_list, list);
975 		refcount_inc(&ordered->refs);
976 		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
977 	}
978 	spin_unlock_irq(&tree->lock);
979 }
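
/*
 * Illustrative only: a hedged sketch of draining the list built above.
 * Each entry holds the reference taken during collection, so the consumer
 * must drop it.  The helper name is hypothetical.
 */
static void example_drain_logged_list(struct list_head *list)
{
	struct btrfs_ordered_extent *ordered, *tmp;

	list_for_each_entry_safe(ordered, tmp, list, log_list) {
		list_del_init(&ordered->log_list);
		btrfs_put_ordered_extent(ordered);
	}
}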
980 
981 /*
982  * lookup and return any extent before 'file_offset'.  NULL is returned
983  * if none is found
984  */
985 struct btrfs_ordered_extent *
986 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
987 {
988 	struct btrfs_ordered_inode_tree *tree;
989 	struct rb_node *node;
990 	struct btrfs_ordered_extent *entry = NULL;
991 
992 	tree = &inode->ordered_tree;
993 	spin_lock_irq(&tree->lock);
994 	node = tree_search(tree, file_offset);
995 	if (!node)
996 		goto out;
997 
998 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
999 	refcount_inc(&entry->refs);
1000 	trace_btrfs_ordered_extent_lookup_first(inode, entry);
1001 out:
1002 	spin_unlock_irq(&tree->lock);
1003 	return entry;
1004 }
1005 
1006 /*
1007  * Lookup the first ordered extent that overlaps the range
1008  * [@file_offset, @file_offset + @len).
1009  *
1010  * The difference between this and btrfs_lookup_first_ordered_extent() is
1011  * that this one won't return any ordered extent that does not overlap the range.
1012  * And the difference against btrfs_lookup_ordered_extent() is, this function
1013  * ensures the first ordered extent gets returned.
1014  */
1015 struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
1016 			struct btrfs_inode *inode, u64 file_offset, u64 len)
1017 {
1018 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
1019 	struct rb_node *node;
1020 	struct rb_node *cur;
1021 	struct rb_node *prev;
1022 	struct rb_node *next;
1023 	struct btrfs_ordered_extent *entry = NULL;
1024 
1025 	spin_lock_irq(&tree->lock);
1026 	node = tree->tree.rb_node;
1027 	/*
1028 	 * Here we don't want to use tree_search() which will use tree->last
1029 	 * and screw up the search order.
1030 	 * And __tree_search() can't return the adjacent ordered extents
1031 	 * either, thus here we do our own search.
1032 	 */
1033 	while (node) {
1034 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1035 
1036 		if (file_offset < entry->file_offset) {
1037 			node = node->rb_left;
1038 		} else if (file_offset >= entry_end(entry)) {
1039 			node = node->rb_right;
1040 		} else {
1041 			/*
1042 			 * Direct hit, got an ordered extent that starts at
1043 			 * @file_offset
1044 			 */
1045 			goto out;
1046 		}
1047 	}
1048 	if (!entry) {
1049 		/* Empty tree */
1050 		goto out;
1051 	}
1052 
1053 	cur = &entry->rb_node;
1054 	/* We got an entry around @file_offset, check adjacent entries */
1055 	if (entry->file_offset < file_offset) {
1056 		prev = cur;
1057 		next = rb_next(cur);
1058 	} else {
1059 		prev = rb_prev(cur);
1060 		next = cur;
1061 	}
1062 	if (prev) {
1063 		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
1064 		if (range_overlaps(entry, file_offset, len))
1065 			goto out;
1066 	}
1067 	if (next) {
1068 		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
1069 		if (range_overlaps(entry, file_offset, len))
1070 			goto out;
1071 	}
1072 	/* No ordered extent in the range */
1073 	entry = NULL;
1074 out:
1075 	if (entry) {
1076 		refcount_inc(&entry->refs);
1077 		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
1078 	}
1079 
1080 	spin_unlock_irq(&tree->lock);
1081 	return entry;
1082 }
1083 
1084 /*
1085  * Lock the passed range and ensures all pending ordered extents in it are run
1086  * to completion.
1087  *
1088  * @inode:        Inode whose ordered tree is to be searched
1089  * @start:        Beginning of range to flush
1090  * @end:          Last byte of range to lock
1091  * @cached_state: If passed, will return the extent state responsible for the
1092  *                locked range. It's the caller's responsibility to free the
1093  *                cached state.
1094  *
1095  * Always return with the given range locked, ensuring after it's called no
1096  * ordered extent can be pending.
1097  */
1098 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1099 					u64 end,
1100 					struct extent_state **cached_state)
1101 {
1102 	struct btrfs_ordered_extent *ordered;
1103 	struct extent_state *cache = NULL;
1104 	struct extent_state **cachedp = &cache;
1105 
1106 	if (cached_state)
1107 		cachedp = cached_state;
1108 
1109 	while (1) {
1110 		lock_extent(&inode->io_tree, start, end, cachedp);
1111 		ordered = btrfs_lookup_ordered_range(inode, start,
1112 						     end - start + 1);
1113 		if (!ordered) {
1114 			/*
1115 			 * If no external cached_state has been passed then
1116 			 * decrement the extra ref taken for cachedp since we
1117 			 * aren't exposing it outside of this function
1118 			 */
1119 			if (!cached_state)
1120 				refcount_dec(&cache->refs);
1121 			break;
1122 		}
1123 		unlock_extent(&inode->io_tree, start, end, cachedp);
1124 		btrfs_start_ordered_extent(ordered);
1125 		btrfs_put_ordered_extent(ordered);
1126 	}
1127 }
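
/*
 * Illustrative only: a hypothetical read-path caller.  The range comes back
 * locked with no ordered extent pending; the caller is responsible for
 * unlocking it afterwards.  The helper name is an assumption.
 */
static void example_locked_read_prep(struct btrfs_inode *inode, u64 start,
				     u64 end)
{
	struct extent_state *cached = NULL;

	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached);
	/* ... read or map the now-stable range ... */
	unlock_extent(&inode->io_tree, start, end, &cached);
}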
1128 
1129 /*
1130  * Lock the passed range and ensure all pending ordered extents in it are run
1131  * to completion in nowait mode.
1132  *
1133  * Return true if btrfs_lock_ordered_range does not return any extents,
1134  * otherwise false.
1135  */
1136 bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
1137 				  struct extent_state **cached_state)
1138 {
1139 	struct btrfs_ordered_extent *ordered;
1140 
1141 	if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
1142 		return false;
1143 
1144 	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
1145 	if (!ordered)
1146 		return true;
1147 
1148 	btrfs_put_ordered_extent(ordered);
1149 	unlock_extent(&inode->io_tree, start, end, cached_state);
1150 
1151 	return false;
1152 }
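
/*
 * Illustrative only: a hedged sketch of a nowait write path falling back to
 * the blocking variant above.  The helper name and the -EAGAIN policy are
 * assumptions for the example.
 */
static int example_lock_for_write(struct btrfs_inode *inode, u64 start,
				  u64 end, bool nowait,
				  struct extent_state **cached)
{
	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, start, end, cached))
			return -EAGAIN;
		return 0;
	}
	btrfs_lock_and_flush_ordered_range(inode, start, end, cached);
	return 0;
}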
1153 
1154 /* Split out a new ordered extent for this first @len bytes of @ordered. */
1155 struct btrfs_ordered_extent *btrfs_split_ordered_extent(
1156 			struct btrfs_ordered_extent *ordered, u64 len)
1157 {
1158 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1159 	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
1160 	struct btrfs_root *root = inode->root;
1161 	struct btrfs_fs_info *fs_info = root->fs_info;
1162 	u64 file_offset = ordered->file_offset;
1163 	u64 disk_bytenr = ordered->disk_bytenr;
1164 	unsigned long flags = ordered->flags;
1165 	struct btrfs_ordered_sum *sum, *tmpsum;
1166 	struct btrfs_ordered_extent *new;
1167 	struct rb_node *node;
1168 	u64 offset = 0;
1169 
1170 	trace_btrfs_ordered_extent_split(inode, ordered);
1171 
1172 	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
1173 
1174 	/*
1175 	 * The entire bio must be covered by the ordered extent, but we can't
1176 	 * reduce the original extent to a zero length either.
1177 	 */
1178 	if (WARN_ON_ONCE(len >= ordered->num_bytes))
1179 		return ERR_PTR(-EINVAL);
1180 	/*
1181 	 * If our ordered extent had an error there's no point in continuing.
1182 	 * The error may have come from a transaction abort done either by this
1183 	 * task or some other concurrent task, and the transaction abort path
1184 	 * iterates over all existing ordered extents and sets the flag
1185 	 * BTRFS_ORDERED_IOERR on them.
1186 	 */
1187 	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
1188 		const int fs_error = BTRFS_FS_ERROR(fs_info);
1189 
1190 		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
1191 	}
1192 	/* We cannot split partially completed ordered extents. */
1193 	if (ordered->bytes_left) {
1194 		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
1195 		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
1196 			return ERR_PTR(-EINVAL);
1197 	}
1198 	/* We cannot split a compressed ordered extent. */
1199 	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
1200 		return ERR_PTR(-EINVAL);
1201 
1202 	new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
1203 				   len, 0, flags, ordered->compress_type);
1204 	if (IS_ERR(new))
1205 		return new;
1206 
1207 	/* One ref for the tree. */
1208 	refcount_inc(&new->refs);
1209 
1210 	spin_lock_irq(&root->ordered_extent_lock);
1211 	spin_lock(&tree->lock);
1212 	/* Remove from tree once */
1213 	node = &ordered->rb_node;
1214 	rb_erase(node, &tree->tree);
1215 	RB_CLEAR_NODE(node);
1216 	if (tree->last == node)
1217 		tree->last = NULL;
1218 
1219 	ordered->file_offset += len;
1220 	ordered->disk_bytenr += len;
1221 	ordered->num_bytes -= len;
1222 	ordered->disk_num_bytes -= len;
1223 	ordered->ram_bytes -= len;
1224 
1225 	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
1226 		ASSERT(ordered->bytes_left == 0);
1227 		new->bytes_left = 0;
1228 	} else {
1229 		ordered->bytes_left -= len;
1230 	}
1231 
1232 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
1233 		if (ordered->truncated_len > len) {
1234 			ordered->truncated_len -= len;
1235 		} else {
1236 			new->truncated_len = ordered->truncated_len;
1237 			ordered->truncated_len = 0;
1238 		}
1239 	}
1240 
1241 	list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
1242 		if (offset == len)
1243 			break;
1244 		list_move_tail(&sum->list, &new->list);
1245 		offset += sum->len;
1246 	}
1247 
1248 	/* Re-insert the node */
1249 	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
1250 	if (node)
1251 		btrfs_panic(fs_info, -EEXIST,
1252 			"zoned: inconsistency in ordered tree at offset %llu",
1253 			ordered->file_offset);
1254 
1255 	node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
1256 	if (node)
1257 		btrfs_panic(fs_info, -EEXIST,
1258 			"zoned: inconsistency in ordered tree at offset %llu",
1259 			new->file_offset);
1260 	spin_unlock(&tree->lock);
1261 
1262 	list_add_tail(&new->root_extent_list, &root->ordered_extents);
1263 	root->nr_ordered_extents++;
1264 	spin_unlock_irq(&root->ordered_extent_lock);
1265 	return new;
1266 }
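
/*
 * Illustrative only: a hypothetical zoned-write style caller splitting off
 * the first @pre bytes of an ordered extent so they can be finished
 * independently of the remainder.  The helper name and @pre are assumptions.
 */
static int example_split_front(struct btrfs_ordered_extent *ordered, u64 pre)
{
	struct btrfs_ordered_extent *new;

	if (!pre || pre >= ordered->num_bytes)
		return 0;	/* nothing to split */

	new = btrfs_split_ordered_extent(ordered, pre);
	if (IS_ERR(new))
		return PTR_ERR(new);
	/*
	 * The returned extent carries the tree's reference plus this caller
	 * reference; the example keeps neither, so drop the caller one.
	 */
	btrfs_put_ordered_extent(new);
	return 0;
}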
1267 
1268 int __init ordered_data_init(void)
1269 {
1270 	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1271 				     sizeof(struct btrfs_ordered_extent), 0,
1272 				     SLAB_MEM_SPREAD,
1273 				     NULL);
1274 	if (!btrfs_ordered_extent_cache)
1275 		return -ENOMEM;
1276 
1277 	return 0;
1278 }
1279 
1280 void __cold ordered_data_exit(void)
1281 {
1282 	kmem_cache_destroy(btrfs_ordered_extent_cache);
1283 }
1284