1  /*
2   * Copyright (C) 2007 Oracle.  All rights reserved.
3   *
4   * This program is free software; you can redistribute it and/or
5   * modify it under the terms of the GNU General Public
6   * License v2 as published by the Free Software Foundation.
7   *
8   * This program is distributed in the hope that it will be useful,
9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   * General Public License for more details.
12   *
13   * You should have received a copy of the GNU General Public
14   * License along with this program; if not, write to the
15   * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16   * Boston, MA 02111-1307, USA.
17   */
18  
19  #include <linux/kernel.h>
20  #include <linux/bio.h>
21  #include <linux/buffer_head.h>
22  #include <linux/file.h>
23  #include <linux/fs.h>
24  #include <linux/pagemap.h>
25  #include <linux/highmem.h>
26  #include <linux/time.h>
27  #include <linux/init.h>
28  #include <linux/string.h>
29  #include <linux/backing-dev.h>
30  #include <linux/mpage.h>
31  #include <linux/swap.h>
32  #include <linux/writeback.h>
33  #include <linux/statfs.h>
34  #include <linux/compat.h>
35  #include <linux/bit_spinlock.h>
36  #include <linux/xattr.h>
37  #include <linux/posix_acl.h>
38  #include <linux/falloc.h>
39  #include <linux/slab.h>
40  #include <linux/ratelimit.h>
41  #include <linux/mount.h>
42  #include <linux/btrfs.h>
43  #include <linux/blkdev.h>
44  #include <linux/posix_acl_xattr.h>
45  #include <linux/uio.h>
46  #include "ctree.h"
47  #include "disk-io.h"
48  #include "transaction.h"
49  #include "btrfs_inode.h"
50  #include "print-tree.h"
51  #include "ordered-data.h"
52  #include "xattr.h"
53  #include "tree-log.h"
54  #include "volumes.h"
55  #include "compression.h"
56  #include "locking.h"
57  #include "free-space-cache.h"
58  #include "inode-map.h"
59  #include "backref.h"
60  #include "hash.h"
61  #include "props.h"
62  #include "qgroup.h"
63  #include "dedupe.h"
64  
65  struct btrfs_iget_args {
66  	struct btrfs_key *location;
67  	struct btrfs_root *root;
68  };
69  
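/*
 * Bookkeeping carried through a direct IO call: the outstanding extent
 * count and reserved byte count this DIO still holds, and the
 * ordered-extent byte range that has been created but not yet
 * submitted (field meanings inferred from their use in the DIO paths).
 */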
70  struct btrfs_dio_data {
71  	u64 outstanding_extents;
72  	u64 reserve;
73  	u64 unsubmitted_oe_range_start;
74  	u64 unsubmitted_oe_range_end;
75  };
76  
77  static const struct inode_operations btrfs_dir_inode_operations;
78  static const struct inode_operations btrfs_symlink_inode_operations;
79  static const struct inode_operations btrfs_dir_ro_inode_operations;
80  static const struct inode_operations btrfs_special_inode_operations;
81  static const struct inode_operations btrfs_file_inode_operations;
82  static const struct address_space_operations btrfs_aops;
83  static const struct address_space_operations btrfs_symlink_aops;
84  static const struct file_operations btrfs_dir_file_operations;
85  static const struct extent_io_ops btrfs_extent_io_ops;
86  
87  static struct kmem_cache *btrfs_inode_cachep;
88  struct kmem_cache *btrfs_trans_handle_cachep;
89  struct kmem_cache *btrfs_transaction_cachep;
90  struct kmem_cache *btrfs_path_cachep;
91  struct kmem_cache *btrfs_free_space_cachep;
92  
93  #define S_SHIFT 12
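/*
 * Map the S_IFMT file-type bits of i_mode to the on-disk BTRFS_FT_*
 * directory entry types.  The type bits live in the top nibble of the
 * mode, so shifting by S_SHIFT (12) yields a small array index, e.g.
 * S_IFREG == 0100000 (octal) and 0100000 >> 12 == 8.
 */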
94  static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
95  	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
96  	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
97  	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
98  	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
99  	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
100  	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
101  	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
102  };
103  
104  static int btrfs_setsize(struct inode *inode, struct iattr *attr);
105  static int btrfs_truncate(struct inode *inode);
106  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
107  static noinline int cow_file_range(struct inode *inode,
108  				   struct page *locked_page,
109  				   u64 start, u64 end, u64 delalloc_end,
110  				   int *page_started, unsigned long *nr_written,
111  				   int unlock, struct btrfs_dedupe_hash *hash);
112  static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
113  					   u64 len, u64 orig_start,
114  					   u64 block_start, u64 block_len,
115  					   u64 orig_block_len, u64 ram_bytes,
116  					   int type);
117  
118  static int btrfs_dirty_inode(struct inode *inode);
119  
120  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
121  void btrfs_test_inode_set_ops(struct inode *inode)
122  {
123  	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
124  }
125  #endif
126  
127  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
128  				     struct inode *inode,  struct inode *dir,
129  				     const struct qstr *qstr)
130  {
131  	int err;
132  
133  	err = btrfs_init_acl(trans, inode, dir);
134  	if (!err)
135  		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
136  	return err;
137  }
138  
139  /*
140   * this does all the hard work for inserting an inline extent into
141   * the btree.  The caller should have done a btrfs_drop_extents so that
142   * no overlapping inline items exist in the btree
143   */
144  static int insert_inline_extent(struct btrfs_trans_handle *trans,
145  				struct btrfs_path *path, int extent_inserted,
146  				struct btrfs_root *root, struct inode *inode,
147  				u64 start, size_t size, size_t compressed_size,
148  				int compress_type,
149  				struct page **compressed_pages)
150  {
151  	struct extent_buffer *leaf;
152  	struct page *page = NULL;
153  	char *kaddr;
154  	unsigned long ptr;
155  	struct btrfs_file_extent_item *ei;
156  	int err = 0;
157  	int ret;
158  	size_t cur_size = size;
159  	unsigned long offset;
160  
161  	if (compressed_size && compressed_pages)
162  		cur_size = compressed_size;
163  
164  	inode_add_bytes(inode, size);
165  
166  	if (!extent_inserted) {
167  		struct btrfs_key key;
168  		size_t datasize;
169  
170  		key.objectid = btrfs_ino(inode);
171  		key.offset = start;
172  		key.type = BTRFS_EXTENT_DATA_KEY;
173  
174  		datasize = btrfs_file_extent_calc_inline_size(cur_size);
175  		path->leave_spinning = 1;
176  		ret = btrfs_insert_empty_item(trans, root, path, &key,
177  					      datasize);
178  		if (ret) {
179  			err = ret;
180  			goto fail;
181  		}
182  	}
183  	leaf = path->nodes[0];
184  	ei = btrfs_item_ptr(leaf, path->slots[0],
185  			    struct btrfs_file_extent_item);
186  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
187  	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
188  	btrfs_set_file_extent_encryption(leaf, ei, 0);
189  	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
190  	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
191  	ptr = btrfs_file_extent_inline_start(ei);
192  
193  	if (compress_type != BTRFS_COMPRESS_NONE) {
194  		struct page *cpage;
195  		int i = 0;
196  		while (compressed_size > 0) {
197  			cpage = compressed_pages[i];
198  			cur_size = min_t(unsigned long, compressed_size,
199  				       PAGE_SIZE);
200  
201  			kaddr = kmap_atomic(cpage);
202  			write_extent_buffer(leaf, kaddr, ptr, cur_size);
203  			kunmap_atomic(kaddr);
204  
205  			i++;
206  			ptr += cur_size;
207  			compressed_size -= cur_size;
208  		}
209  		btrfs_set_file_extent_compression(leaf, ei,
210  						  compress_type);
211  	} else {
212  		page = find_get_page(inode->i_mapping,
213  				     start >> PAGE_SHIFT);
214  		btrfs_set_file_extent_compression(leaf, ei, 0);
215  		kaddr = kmap_atomic(page);
216  		offset = start & (PAGE_SIZE - 1);
217  		write_extent_buffer(leaf, kaddr + offset, ptr, size);
218  		kunmap_atomic(kaddr);
219  		put_page(page);
220  	}
221  	btrfs_mark_buffer_dirty(leaf);
222  	btrfs_release_path(path);
223  
224  	/*
225  	 * we're an inline extent, so nobody can
226  	 * extend the file past i_size without locking
227  	 * a page we already have locked.
228  	 *
229  	 * We must do any isize and inode updates
230  	 * before we unlock the pages.  Otherwise we
231  	 * could end up racing with unlink.
232  	 */
233  	BTRFS_I(inode)->disk_i_size = inode->i_size;
234  	ret = btrfs_update_inode(trans, root, inode);
235  
236  	return ret;
237  fail:
238  	return err;
239  }
240  
241  
242  /*
243   * conditionally insert an inline extent into the file.  This
244   * does the checks required to make sure the data is small enough
245   * to fit as an inline extent.
246   */
247  static noinline int cow_file_range_inline(struct btrfs_root *root,
248  					  struct inode *inode, u64 start,
249  					  u64 end, size_t compressed_size,
250  					  int compress_type,
251  					  struct page **compressed_pages)
252  {
253  	struct btrfs_trans_handle *trans;
254  	u64 isize = i_size_read(inode);
255  	u64 actual_end = min(end + 1, isize);
256  	u64 inline_len = actual_end - start;
257  	u64 aligned_end = ALIGN(end, root->sectorsize);
258  	u64 data_len = inline_len;
259  	int ret;
260  	struct btrfs_path *path;
261  	int extent_inserted = 0;
262  	u32 extent_item_size;
263  
264  	if (compressed_size)
265  		data_len = compressed_size;
266  
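	/*
	 * An extent can only be inlined when the data starts at file
	 * offset 0, fits within a single sector, stays under both the
	 * max_inline mount option and the per-item inline limit, does
	 * not end exactly on a sector boundary while uncompressed, and
	 * covers the range all the way to EOF.  Otherwise return 1 so
	 * the caller falls back to regular extents.
	 */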
267  	if (start > 0 ||
268  	    actual_end > root->sectorsize ||
269  	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
270  	    (!compressed_size &&
271  	    (actual_end & (root->sectorsize - 1)) == 0) ||
272  	    end + 1 < isize ||
273  	    data_len > root->fs_info->max_inline) {
274  		return 1;
275  	}
276  
277  	path = btrfs_alloc_path();
278  	if (!path)
279  		return -ENOMEM;
280  
281  	trans = btrfs_join_transaction(root);
282  	if (IS_ERR(trans)) {
283  		btrfs_free_path(path);
284  		return PTR_ERR(trans);
285  	}
286  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
287  
288  	if (compressed_size && compressed_pages)
289  		extent_item_size = btrfs_file_extent_calc_inline_size(
290  		   compressed_size);
291  	else
292  		extent_item_size = btrfs_file_extent_calc_inline_size(
293  		    inline_len);
294  
295  	ret = __btrfs_drop_extents(trans, root, inode, path,
296  				   start, aligned_end, NULL,
297  				   1, 1, extent_item_size, &extent_inserted);
298  	if (ret) {
299  		btrfs_abort_transaction(trans, ret);
300  		goto out;
301  	}
302  
303  	if (isize > actual_end)
304  		inline_len = min_t(u64, isize, actual_end);
305  	ret = insert_inline_extent(trans, path, extent_inserted,
306  				   root, inode, start,
307  				   inline_len, compressed_size,
308  				   compress_type, compressed_pages);
309  	if (ret && ret != -ENOSPC) {
310  		btrfs_abort_transaction(trans, ret);
311  		goto out;
312  	} else if (ret == -ENOSPC) {
313  		ret = 1;
314  		goto out;
315  	}
316  
317  	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
318  	btrfs_delalloc_release_metadata(inode, end + 1 - start);
319  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
320  out:
321  	/*
322  	 * Don't forget to free the reserved space; an inlined extent
323  	 * doesn't count as a data extent, so free it directly here.
324  	 * At reserve time the space is always aligned to the page size,
325  	 * so just free one page here.
326  	 */
327  	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
328  	btrfs_free_path(path);
329  	btrfs_end_transaction(trans, root);
330  	return ret;
331  }
332  
333  struct async_extent {
334  	u64 start;
335  	u64 ram_size;
336  	u64 compressed_size;
337  	struct page **pages;
338  	unsigned long nr_pages;
339  	int compress_type;
340  	struct list_head list;
341  };
342  
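/*
 * One unit of work for the async compression machinery: the byte range
 * [start, end] of @inode to process, plus the list of async_extent
 * results that compress_file_range() fills in for
 * submit_compressed_extents() to write out.
 */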
343  struct async_cow {
344  	struct inode *inode;
345  	struct btrfs_root *root;
346  	struct page *locked_page;
347  	u64 start;
348  	u64 end;
349  	struct list_head extents;
350  	struct btrfs_work work;
351  };
352  
353  static noinline int add_async_extent(struct async_cow *cow,
354  				     u64 start, u64 ram_size,
355  				     u64 compressed_size,
356  				     struct page **pages,
357  				     unsigned long nr_pages,
358  				     int compress_type)
359  {
360  	struct async_extent *async_extent;
361  
362  	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
363  	BUG_ON(!async_extent); /* -ENOMEM */
364  	async_extent->start = start;
365  	async_extent->ram_size = ram_size;
366  	async_extent->compressed_size = compressed_size;
367  	async_extent->pages = pages;
368  	async_extent->nr_pages = nr_pages;
369  	async_extent->compress_type = compress_type;
370  	list_add_tail(&async_extent->list, &cow->extents);
371  	return 0;
372  }
373  
374  static inline int inode_need_compress(struct inode *inode)
375  {
376  	struct btrfs_root *root = BTRFS_I(inode)->root;
377  
378  	/* force compress */
379  	if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
380  		return 1;
381  	/* bad compression ratios */
382  	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
383  		return 0;
384  	if (btrfs_test_opt(root->fs_info, COMPRESS) ||
385  	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
386  	    BTRFS_I(inode)->force_compress)
387  		return 1;
388  	return 0;
389  }
390  
391  /*
392   * we create compressed extents in two phases.  The first
393   * phase compresses a range of pages that have already been
394   * locked (both pages and state bits are locked).
395   *
396   * This is done inside an ordered work queue, and the compression
397   * is spread across many cpus.  The actual IO submission is step
398   * two, and the ordered work queue takes care of making sure that
399   * happens in the same order things were put onto the queue by
400   * writepages and friends.
401   *
402   * If this code finds it can't get good compression, it puts an
403   * entry onto the work queue to write the uncompressed bytes.  This
404   * makes sure that both compressed inodes and uncompressed inodes
405   * are written in the same order that the flusher thread sent them
406   * down.
407   */
408  static noinline void compress_file_range(struct inode *inode,
409  					struct page *locked_page,
410  					u64 start, u64 end,
411  					struct async_cow *async_cow,
412  					int *num_added)
413  {
414  	struct btrfs_root *root = BTRFS_I(inode)->root;
415  	u64 num_bytes;
416  	u64 blocksize = root->sectorsize;
417  	u64 actual_end;
418  	u64 isize = i_size_read(inode);
419  	int ret = 0;
420  	struct page **pages = NULL;
421  	unsigned long nr_pages;
422  	unsigned long nr_pages_ret = 0;
423  	unsigned long total_compressed = 0;
424  	unsigned long total_in = 0;
425  	unsigned long max_compressed = SZ_128K;
426  	unsigned long max_uncompressed = SZ_128K;
427  	int i;
428  	int will_compress;
429  	int compress_type = root->fs_info->compress_type;
430  	int redirty = 0;
431  
432  	/* if this is a small write inside eof, kick off a defrag */
433  	if ((end - start + 1) < SZ_16K &&
434  	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
435  		btrfs_add_inode_defrag(NULL, inode);
436  
437  	actual_end = min_t(u64, isize, end + 1);
438  again:
439  	will_compress = 0;
440  	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
441  	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
442  
443  	/*
444  	 * we don't want to send crud past the end of i_size through
445  	 * compression, that's just a waste of CPU time.  So, if the
446  	 * end of the file is before the start of our current
447  	 * requested range of bytes, we bail out to the uncompressed
448  	 * cleanup code that can deal with all of this.
449  	 *
450  	 * It isn't really the fastest way to fix things, but this is a
451  	 * very uncommon corner.
452  	 */
453  	if (actual_end <= start)
454  		goto cleanup_and_bail_uncompressed;
455  
456  	total_compressed = actual_end - start;
457  
458  	/*
459  	 * skip compression for a small file range (<= blocksize) that
460  	 * isn't an inline extent, since it doesn't save disk space at all.
461  	 */
462  	if (total_compressed <= blocksize &&
463  	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
464  		goto cleanup_and_bail_uncompressed;
465  
466  	/* we want to make sure the amount of ram required to uncompress
467  	 * an extent is reasonable, so we limit the total size in ram
468  	 * of a compressed extent to 128k.  This is a crucial number
469  	 * because it also controls how easily we can spread reads across
470  	 * cpus for decompression.
471  	 *
472  	 * We also want to make sure the amount of IO required to do
473  	 * a random read is reasonably small, so we limit the size of
474  	 * a compressed extent to 128k.
475  	 */
476  	total_compressed = min(total_compressed, max_uncompressed);
477  	num_bytes = ALIGN(end - start + 1, blocksize);
478  	num_bytes = max(blocksize,  num_bytes);
479  	total_in = 0;
480  	ret = 0;
481  
482  	/*
483  	 * we do compression for mount -o compress and when the
484  	 * inode has not been flagged as nocompress.  This flag can
485  	 * change at any time if we discover bad compression ratios.
486  	 */
487  	if (inode_need_compress(inode)) {
488  		WARN_ON(pages);
489  		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
490  		if (!pages) {
491  			/* just bail out to the uncompressed code */
492  			goto cont;
493  		}
494  
495  		if (BTRFS_I(inode)->force_compress)
496  			compress_type = BTRFS_I(inode)->force_compress;
497  
498  		/*
499  		 * we need to call clear_page_dirty_for_io on each
500  		 * page in the range.  Otherwise applications with the file
501  		 * mmap'd can wander in and change the page contents while
502  		 * we are compressing them.
503  		 *
504  		 * If the compression fails for any reason, we set the pages
505  		 * dirty again later on.
506  		 */
507  		extent_range_clear_dirty_for_io(inode, start, end);
508  		redirty = 1;
509  		ret = btrfs_compress_pages(compress_type,
510  					   inode->i_mapping, start,
511  					   total_compressed, pages,
512  					   nr_pages, &nr_pages_ret,
513  					   &total_in,
514  					   &total_compressed,
515  					   max_compressed);
516  
517  		if (!ret) {
518  			unsigned long offset = total_compressed &
519  				(PAGE_SIZE - 1);
520  			struct page *page = pages[nr_pages_ret - 1];
521  			char *kaddr;
522  
523  			/* zero the tail end of the last page, we might be
524  			 * sending it down to disk
525  			 */
526  			if (offset) {
527  				kaddr = kmap_atomic(page);
528  				memset(kaddr + offset, 0,
529  				       PAGE_SIZE - offset);
530  				kunmap_atomic(kaddr);
531  			}
532  			will_compress = 1;
533  		}
534  	}
535  cont:
536  	if (start == 0) {
537  		/* lets try to make an inline extent */
538  		if (ret || total_in < (actual_end - start)) {
539  			/* we didn't compress the entire range, try
540  			 * to make an uncompressed inline extent.
541  			 */
542  			ret = cow_file_range_inline(root, inode, start, end,
543  						    0, 0, NULL);
544  		} else {
545  			/* try making a compressed inline extent */
546  			ret = cow_file_range_inline(root, inode, start, end,
547  						    total_compressed,
548  						    compress_type, pages);
549  		}
550  		if (ret <= 0) {
551  			unsigned long clear_flags = EXTENT_DELALLOC |
552  				EXTENT_DEFRAG;
553  			unsigned long page_error_op;
554  
555  			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
556  			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
557  
558  			/*
559  			 * inline extent creation worked or returned error,
560  			 * we don't need to create any more async work items.
561  			 * Unlock and free up our temp pages.
562  			 */
563  			extent_clear_unlock_delalloc(inode, start, end, end,
564  						     NULL, clear_flags,
565  						     PAGE_UNLOCK |
566  						     PAGE_CLEAR_DIRTY |
567  						     PAGE_SET_WRITEBACK |
568  						     page_error_op |
569  						     PAGE_END_WRITEBACK);
570  			if (ret == 0)
571  				btrfs_free_reserved_data_space_noquota(inode,
572  							       start,
573  							       end - start + 1);
574  			goto free_pages_out;
575  		}
576  	}
577  
578  	if (will_compress) {
579  		/*
580  		 * we aren't doing an inline extent, so round the compressed size
581  		 * up to a block size boundary so that the allocator does sane
582  		 * things
583  		 */
584  		total_compressed = ALIGN(total_compressed, blocksize);
585  
586  		/*
587  		 * one last check to make sure the compression is really a
588  		 * win, compare the page count read with the blocks on disk
589  		 */
590  		total_in = ALIGN(total_in, PAGE_SIZE);
591  		if (total_compressed >= total_in) {
592  			will_compress = 0;
593  		} else {
594  			num_bytes = total_in;
595  			*num_added += 1;
596  
597  			/*
598  			 * The async work queues will take care of doing actual
599  			 * allocation on disk for these compressed pages, and
600  			 * will submit them to the elevator.
601  			 */
602  			add_async_extent(async_cow, start, num_bytes,
603  					total_compressed, pages, nr_pages_ret,
604  					compress_type);
605  
606  			if (start + num_bytes < end) {
607  				start += num_bytes;
608  				pages = NULL;
609  				cond_resched();
610  				goto again;
611  			}
612  			return;
613  		}
614  	}
615  	if (pages) {
616  		/*
617  		 * the compression code ran but failed to make things smaller,
618  		 * free any pages it allocated and our page pointer array
619  		 */
620  		for (i = 0; i < nr_pages_ret; i++) {
621  			WARN_ON(pages[i]->mapping);
622  			put_page(pages[i]);
623  		}
624  		kfree(pages);
625  		pages = NULL;
626  		total_compressed = 0;
627  		nr_pages_ret = 0;
628  
629  		/* flag the file so we don't compress in the future */
630  		if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
631  		    !(BTRFS_I(inode)->force_compress)) {
632  			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
633  		}
634  	}
635  cleanup_and_bail_uncompressed:
636  	/*
637  	 * No compression, but we still need to write the pages in the file
638  	 * we've been given so far.  redirty the locked page if it corresponds
639  	 * to our extent and set things up for the async work queue to run
640  	 * cow_file_range to do the normal delalloc dance.
641  	 */
642  	if (page_offset(locked_page) >= start &&
643  	    page_offset(locked_page) <= end)
644  		__set_page_dirty_nobuffers(locked_page);
645  		/* unlocked later on in the async handlers */
646  
647  	if (redirty)
648  		extent_range_redirty_for_io(inode, start, end);
649  	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
650  			 BTRFS_COMPRESS_NONE);
651  	*num_added += 1;
652  
653  	return;
654  
655  free_pages_out:
656  	for (i = 0; i < nr_pages_ret; i++) {
657  		WARN_ON(pages[i]->mapping);
658  		put_page(pages[i]);
659  	}
660  	kfree(pages);
661  }
662  
663  static void free_async_extent_pages(struct async_extent *async_extent)
664  {
665  	int i;
666  
667  	if (!async_extent->pages)
668  		return;
669  
670  	for (i = 0; i < async_extent->nr_pages; i++) {
671  		WARN_ON(async_extent->pages[i]->mapping);
672  		put_page(async_extent->pages[i]);
673  	}
674  	kfree(async_extent->pages);
675  	async_extent->nr_pages = 0;
676  	async_extent->pages = NULL;
677  }
678  
679  /*
680   * phase two of compressed writeback.  This is the ordered portion
681   * of the code, which only gets called in the order the work was
682   * queued.  We walk all the async extents created by compress_file_range
683   * and send them down to the disk.
684   */
685  static noinline void submit_compressed_extents(struct inode *inode,
686  					      struct async_cow *async_cow)
687  {
688  	struct async_extent *async_extent;
689  	u64 alloc_hint = 0;
690  	struct btrfs_key ins;
691  	struct extent_map *em;
692  	struct btrfs_root *root = BTRFS_I(inode)->root;
693  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
694  	struct extent_io_tree *io_tree;
695  	int ret = 0;
696  
697  again:
698  	while (!list_empty(&async_cow->extents)) {
699  		async_extent = list_entry(async_cow->extents.next,
700  					  struct async_extent, list);
701  		list_del(&async_extent->list);
702  
703  		io_tree = &BTRFS_I(inode)->io_tree;
704  
705  retry:
706  		/* did the compression code fall back to uncompressed IO? */
707  		if (!async_extent->pages) {
708  			int page_started = 0;
709  			unsigned long nr_written = 0;
710  
711  			lock_extent(io_tree, async_extent->start,
712  					 async_extent->start +
713  					 async_extent->ram_size - 1);
714  
715  			/* allocate blocks */
716  			ret = cow_file_range(inode, async_cow->locked_page,
717  					     async_extent->start,
718  					     async_extent->start +
719  					     async_extent->ram_size - 1,
720  					     async_extent->start +
721  					     async_extent->ram_size - 1,
722  					     &page_started, &nr_written, 0,
723  					     NULL);
724  
725  			/* JDM XXX */
726  
727  			/*
728  			 * if page_started, cow_file_range inserted an
729  			 * inline extent and took care of all the unlocking
730  			 * and IO for us.  Otherwise, we need to submit
731  			 * all those pages down to the drive.
732  			 */
733  			if (!page_started && !ret)
734  				extent_write_locked_range(io_tree,
735  						  inode, async_extent->start,
736  						  async_extent->start +
737  						  async_extent->ram_size - 1,
738  						  btrfs_get_extent,
739  						  WB_SYNC_ALL);
740  			else if (ret)
741  				unlock_page(async_cow->locked_page);
742  			kfree(async_extent);
743  			cond_resched();
744  			continue;
745  		}
746  
747  		lock_extent(io_tree, async_extent->start,
748  			    async_extent->start + async_extent->ram_size - 1);
749  
750  		ret = btrfs_reserve_extent(root, async_extent->ram_size,
751  					   async_extent->compressed_size,
752  					   async_extent->compressed_size,
753  					   0, alloc_hint, &ins, 1, 1);
754  		if (ret) {
755  			free_async_extent_pages(async_extent);
756  
757  			if (ret == -ENOSPC) {
758  				unlock_extent(io_tree, async_extent->start,
759  					      async_extent->start +
760  					      async_extent->ram_size - 1);
761  
762  				/*
763  				 * we need to redirty the pages if we decide to
764  				 * fall back to uncompressed IO, otherwise we
765  				 * will not submit these pages down to lower
766  				 * layers.
767  				 */
768  				extent_range_redirty_for_io(inode,
769  						async_extent->start,
770  						async_extent->start +
771  						async_extent->ram_size - 1);
772  
773  				goto retry;
774  			}
775  			goto out_free;
776  		}
777  		/*
778  		 * here we're doing allocation and writeback of the
779  		 * compressed pages
780  		 */
781  		btrfs_drop_extent_cache(inode, async_extent->start,
782  					async_extent->start +
783  					async_extent->ram_size - 1, 0);
784  
785  		em = alloc_extent_map();
786  		if (!em) {
787  			ret = -ENOMEM;
788  			goto out_free_reserve;
789  		}
790  		em->start = async_extent->start;
791  		em->len = async_extent->ram_size;
792  		em->orig_start = em->start;
793  		em->mod_start = em->start;
794  		em->mod_len = em->len;
795  
796  		em->block_start = ins.objectid;
797  		em->block_len = ins.offset;
798  		em->orig_block_len = ins.offset;
799  		em->ram_bytes = async_extent->ram_size;
800  		em->bdev = root->fs_info->fs_devices->latest_bdev;
801  		em->compress_type = async_extent->compress_type;
802  		set_bit(EXTENT_FLAG_PINNED, &em->flags);
803  		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
804  		em->generation = -1;
805  
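		/*
		 * add_extent_mapping() returns -EEXIST while a stale
		 * mapping overlaps this range; drop the cached range and
		 * retry until the new mapping is inserted (any other
		 * error breaks out of the loop).
		 */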
806  		while (1) {
807  			write_lock(&em_tree->lock);
808  			ret = add_extent_mapping(em_tree, em, 1);
809  			write_unlock(&em_tree->lock);
810  			if (ret != -EEXIST) {
811  				free_extent_map(em);
812  				break;
813  			}
814  			btrfs_drop_extent_cache(inode, async_extent->start,
815  						async_extent->start +
816  						async_extent->ram_size - 1, 0);
817  		}
818  
819  		if (ret)
820  			goto out_free_reserve;
821  
822  		ret = btrfs_add_ordered_extent_compress(inode,
823  						async_extent->start,
824  						ins.objectid,
825  						async_extent->ram_size,
826  						ins.offset,
827  						BTRFS_ORDERED_COMPRESSED,
828  						async_extent->compress_type);
829  		if (ret) {
830  			btrfs_drop_extent_cache(inode, async_extent->start,
831  						async_extent->start +
832  						async_extent->ram_size - 1, 0);
833  			goto out_free_reserve;
834  		}
835  		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
836  
837  		/*
838  		 * clear dirty, set writeback and unlock the pages.
839  		 */
840  		extent_clear_unlock_delalloc(inode, async_extent->start,
841  				async_extent->start +
842  				async_extent->ram_size - 1,
843  				async_extent->start +
844  				async_extent->ram_size - 1,
845  				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
846  				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
847  				PAGE_SET_WRITEBACK);
848  		ret = btrfs_submit_compressed_write(inode,
849  				    async_extent->start,
850  				    async_extent->ram_size,
851  				    ins.objectid,
852  				    ins.offset, async_extent->pages,
853  				    async_extent->nr_pages);
854  		if (ret) {
855  			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
856  			struct page *p = async_extent->pages[0];
857  			const u64 start = async_extent->start;
858  			const u64 end = start + async_extent->ram_size - 1;
859  
860  			p->mapping = inode->i_mapping;
861  			tree->ops->writepage_end_io_hook(p, start, end,
862  							 NULL, 0);
863  			p->mapping = NULL;
864  			extent_clear_unlock_delalloc(inode, start, end, end,
865  						     NULL, 0,
866  						     PAGE_END_WRITEBACK |
867  						     PAGE_SET_ERROR);
868  			free_async_extent_pages(async_extent);
869  		}
870  		alloc_hint = ins.objectid + ins.offset;
871  		kfree(async_extent);
872  		cond_resched();
873  	}
874  	return;
875  out_free_reserve:
876  	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
877  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
878  out_free:
879  	extent_clear_unlock_delalloc(inode, async_extent->start,
880  				     async_extent->start +
881  				     async_extent->ram_size - 1,
882  				     async_extent->start +
883  				     async_extent->ram_size - 1,
884  				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
885  				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
886  				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
887  				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
888  				     PAGE_SET_ERROR);
889  	free_async_extent_pages(async_extent);
890  	kfree(async_extent);
891  	goto again;
892  }
893  
894  static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
895  				      u64 num_bytes)
896  {
897  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
898  	struct extent_map *em;
899  	u64 alloc_hint = 0;
900  
901  	read_lock(&em_tree->lock);
902  	em = search_extent_mapping(em_tree, start, num_bytes);
903  	if (em) {
904  		/*
905  		 * if block start isn't an actual block number then find the
906  		 * first block in this inode and use that as a hint.  If that
907  		 * block is also bogus then just don't worry about it.
908  		 */
909  		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
910  			free_extent_map(em);
911  			em = search_extent_mapping(em_tree, 0, 0);
912  			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
913  				alloc_hint = em->block_start;
914  			if (em)
915  				free_extent_map(em);
916  		} else {
917  			alloc_hint = em->block_start;
918  			free_extent_map(em);
919  		}
920  	}
921  	read_unlock(&em_tree->lock);
922  
923  	return alloc_hint;
924  }
925  
926  /*
927   * when extent_io.c finds a delayed allocation range in the file,
928   * the callbacks end up in this code.  The basic idea is to
929   * allocate extents on disk for the range, and create ordered data structs
930   * in ram to track those extents.
931   *
932   * locked_page is the page that writepage had locked already.  We use
933   * it to make sure we don't do extra locks or unlocks.
934   *
935   * *page_started is set to one if we unlock locked_page and do everything
936   * required to start IO on it.  It may be clean and already done with
937   * IO when we return.
938   */
939  static noinline int cow_file_range(struct inode *inode,
940  				   struct page *locked_page,
941  				   u64 start, u64 end, u64 delalloc_end,
942  				   int *page_started, unsigned long *nr_written,
943  				   int unlock, struct btrfs_dedupe_hash *hash)
944  {
945  	struct btrfs_root *root = BTRFS_I(inode)->root;
946  	u64 alloc_hint = 0;
947  	u64 num_bytes;
948  	unsigned long ram_size;
949  	u64 disk_num_bytes;
950  	u64 cur_alloc_size;
951  	u64 blocksize = root->sectorsize;
952  	struct btrfs_key ins;
953  	struct extent_map *em;
954  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
955  	int ret = 0;
956  
957  	if (btrfs_is_free_space_inode(inode)) {
958  		WARN_ON_ONCE(1);
959  		ret = -EINVAL;
960  		goto out_unlock;
961  	}
962  
963  	num_bytes = ALIGN(end - start + 1, blocksize);
964  	num_bytes = max(blocksize,  num_bytes);
965  	disk_num_bytes = num_bytes;
966  
967  	/* if this is a small write inside eof, kick off defrag */
968  	if (num_bytes < SZ_64K &&
969  	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
970  		btrfs_add_inode_defrag(NULL, inode);
971  
972  	if (start == 0) {
973  		/* lets try to make an inline extent */
974  		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
975  					    NULL);
976  		if (ret == 0) {
977  			extent_clear_unlock_delalloc(inode, start, end,
978  				     delalloc_end, NULL,
979  				     EXTENT_LOCKED | EXTENT_DELALLOC |
980  				     EXTENT_DEFRAG, PAGE_UNLOCK |
981  				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
982  				     PAGE_END_WRITEBACK);
983  			btrfs_free_reserved_data_space_noquota(inode, start,
984  						end - start + 1);
985  			*nr_written = *nr_written +
986  			     (end - start + PAGE_SIZE) / PAGE_SIZE;
987  			*page_started = 1;
988  			goto out;
989  		} else if (ret < 0) {
990  			goto out_unlock;
991  		}
992  	}
993  
994  	BUG_ON(disk_num_bytes >
995  	       btrfs_super_total_bytes(root->fs_info->super_copy));
996  
997  	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
998  	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
999  
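	/*
	 * Allocate the range piecewise: btrfs_reserve_extent() may return
	 * less than we ask for, so keep allocating, mapping and queueing
	 * ordered extents until the whole range is covered.
	 */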
1000  	while (disk_num_bytes > 0) {
1001  		unsigned long op;
1002  
1003  		cur_alloc_size = disk_num_bytes;
1004  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1005  					   root->sectorsize, 0, alloc_hint,
1006  					   &ins, 1, 1);
1007  		if (ret < 0)
1008  			goto out_unlock;
1009  
1010  		em = alloc_extent_map();
1011  		if (!em) {
1012  			ret = -ENOMEM;
1013  			goto out_reserve;
1014  		}
1015  		em->start = start;
1016  		em->orig_start = em->start;
1017  		ram_size = ins.offset;
1018  		em->len = ins.offset;
1019  		em->mod_start = em->start;
1020  		em->mod_len = em->len;
1021  
1022  		em->block_start = ins.objectid;
1023  		em->block_len = ins.offset;
1024  		em->orig_block_len = ins.offset;
1025  		em->ram_bytes = ram_size;
1026  		em->bdev = root->fs_info->fs_devices->latest_bdev;
1027  		set_bit(EXTENT_FLAG_PINNED, &em->flags);
1028  		em->generation = -1;
1029  
1030  		while (1) {
1031  			write_lock(&em_tree->lock);
1032  			ret = add_extent_mapping(em_tree, em, 1);
1033  			write_unlock(&em_tree->lock);
1034  			if (ret != -EEXIST) {
1035  				free_extent_map(em);
1036  				break;
1037  			}
1038  			btrfs_drop_extent_cache(inode, start,
1039  						start + ram_size - 1, 0);
1040  		}
1041  		if (ret)
1042  			goto out_reserve;
1043  
1044  		cur_alloc_size = ins.offset;
1045  		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1046  					       ram_size, cur_alloc_size, 0);
1047  		if (ret)
1048  			goto out_drop_extent_cache;
1049  
1050  		if (root->root_key.objectid ==
1051  		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1052  			ret = btrfs_reloc_clone_csums(inode, start,
1053  						      cur_alloc_size);
1054  			if (ret)
1055  				goto out_drop_extent_cache;
1056  		}
1057  
1058  		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1059  
1060  		if (disk_num_bytes < cur_alloc_size)
1061  			break;
1062  
1063  		/* we're not doing compressed IO, don't unlock the first
1064  		 * page (which the caller expects to stay locked), don't
1065  		 * clear any dirty bits and don't set any writeback bits
1066  		 *
1067  		 * Do set the Private2 bit so we know this page was properly
1068  		 * setup for writepage
1069  		 */
1070  		op = unlock ? PAGE_UNLOCK : 0;
1071  		op |= PAGE_SET_PRIVATE2;
1072  
1073  		extent_clear_unlock_delalloc(inode, start,
1074  					     start + ram_size - 1,
1075  					     delalloc_end, locked_page,
1076  					     EXTENT_LOCKED | EXTENT_DELALLOC,
1077  					     op);
1078  		disk_num_bytes -= cur_alloc_size;
1079  		num_bytes -= cur_alloc_size;
1080  		alloc_hint = ins.objectid + ins.offset;
1081  		start += cur_alloc_size;
1082  	}
1083  out:
1084  	return ret;
1085  
1086  out_drop_extent_cache:
1087  	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1088  out_reserve:
1089  	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1090  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1091  out_unlock:
1092  	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1093  				     locked_page,
1094  				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1095  				     EXTENT_DELALLOC | EXTENT_DEFRAG,
1096  				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1097  				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1098  	goto out;
1099  }
1100  
1101  /*
1102   * work queue callback that starts compression on a file's pages
1103   */
1104  static noinline void async_cow_start(struct btrfs_work *work)
1105  {
1106  	struct async_cow *async_cow;
1107  	int num_added = 0;
1108  	async_cow = container_of(work, struct async_cow, work);
1109  
1110  	compress_file_range(async_cow->inode, async_cow->locked_page,
1111  			    async_cow->start, async_cow->end, async_cow,
1112  			    &num_added);
1113  	if (num_added == 0) {
1114  		btrfs_add_delayed_iput(async_cow->inode);
1115  		async_cow->inode = NULL;
1116  	}
1117  }
1118  
1119  /*
1120   * work queue callback to submit previously compressed pages
1121   */
1122  static noinline void async_cow_submit(struct btrfs_work *work)
1123  {
1124  	struct async_cow *async_cow;
1125  	struct btrfs_root *root;
1126  	unsigned long nr_pages;
1127  
1128  	async_cow = container_of(work, struct async_cow, work);
1129  
1130  	root = async_cow->root;
1131  	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1132  		PAGE_SHIFT;
1133  
1134  	/*
1135  	 * atomic_sub_return implies a barrier for waitqueue_active
1136  	 */
1137  	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1138  	    5 * SZ_1M &&
1139  	    waitqueue_active(&root->fs_info->async_submit_wait))
1140  		wake_up(&root->fs_info->async_submit_wait);
1141  
1142  	if (async_cow->inode)
1143  		submit_compressed_extents(async_cow->inode, async_cow);
1144  }
1145  
1146  static noinline void async_cow_free(struct btrfs_work *work)
1147  {
1148  	struct async_cow *async_cow;
1149  	async_cow = container_of(work, struct async_cow, work);
1150  	if (async_cow->inode)
1151  		btrfs_add_delayed_iput(async_cow->inode);
1152  	kfree(async_cow);
1153  }
1154  
1155  static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1156  				u64 start, u64 end, int *page_started,
1157  				unsigned long *nr_written)
1158  {
1159  	struct async_cow *async_cow;
1160  	struct btrfs_root *root = BTRFS_I(inode)->root;
1161  	unsigned long nr_pages;
1162  	u64 cur_end;
1163  	int limit = 10 * SZ_1M;
1164  
1165  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1166  			 1, 0, NULL, GFP_NOFS);
1167  	while (start < end) {
1168  		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1169  		BUG_ON(!async_cow); /* -ENOMEM */
1170  		async_cow->inode = igrab(inode);
1171  		async_cow->root = root;
1172  		async_cow->locked_page = locked_page;
1173  		async_cow->start = start;
1174  
1175  		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1176  		    !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
1177  			cur_end = end;
1178  		else
1179  			cur_end = min(end, start + SZ_512K - 1);
1180  
1181  		async_cow->end = cur_end;
1182  		INIT_LIST_HEAD(&async_cow->extents);
1183  
1184  		btrfs_init_work(&async_cow->work,
1185  				btrfs_delalloc_helper,
1186  				async_cow_start, async_cow_submit,
1187  				async_cow_free);
1188  
1189  		nr_pages = (cur_end - start + PAGE_SIZE) >>
1190  			PAGE_SHIFT;
1191  		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1192  
1193  		btrfs_queue_work(root->fs_info->delalloc_workers,
1194  				 &async_cow->work);
1195  
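		/*
		 * Throttle: stop queueing more work while too many async
		 * delalloc pages are already in flight, and wait for a
		 * full drain if another task is draining them.
		 */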
1196  		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1197  			wait_event(root->fs_info->async_submit_wait,
1198  			   (atomic_read(&root->fs_info->async_delalloc_pages) <
1199  			    limit));
1200  		}
1201  
1202  		while (atomic_read(&root->fs_info->async_submit_draining) &&
1203  		      atomic_read(&root->fs_info->async_delalloc_pages)) {
1204  			wait_event(root->fs_info->async_submit_wait,
1205  			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
1206  			   0));
1207  		}
1208  
1209  		*nr_written += nr_pages;
1210  		start = cur_end + 1;
1211  	}
1212  	*page_started = 1;
1213  	return 0;
1214  }
1215  
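/*
 * Return 0 when no checksums exist for the byte range, nonzero
 * otherwise.  Callers force COW when csums are present so that the
 * csums for a given extent are either fully valid or fully absent.
 */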
1216  static noinline int csum_exist_in_range(struct btrfs_root *root,
1217  					u64 bytenr, u64 num_bytes)
1218  {
1219  	int ret;
1220  	struct btrfs_ordered_sum *sums;
1221  	LIST_HEAD(list);
1222  
1223  	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1224  				       bytenr + num_bytes - 1, &list, 0);
1225  	if (ret == 0 && list_empty(&list))
1226  		return 0;
1227  
1228  	while (!list_empty(&list)) {
1229  		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1230  		list_del(&sums->list);
1231  		kfree(sums);
1232  	}
1233  	return 1;
1234  }
1235  
1236  /*
1237   * called from the nocow writeback path.  This checks for snapshots or COW copies
1238   * of the extents that exist in the file, and COWs the file as required.
1239   *
1240   * If no cow copies or snapshots exist, we write directly to the existing
1241   * blocks on disk
1242   */
1243  static noinline int run_delalloc_nocow(struct inode *inode,
1244  				       struct page *locked_page,
1245  			      u64 start, u64 end, int *page_started, int force,
1246  			      unsigned long *nr_written)
1247  {
1248  	struct btrfs_root *root = BTRFS_I(inode)->root;
1249  	struct btrfs_trans_handle *trans;
1250  	struct extent_buffer *leaf;
1251  	struct btrfs_path *path;
1252  	struct btrfs_file_extent_item *fi;
1253  	struct btrfs_key found_key;
1254  	u64 cow_start;
1255  	u64 cur_offset;
1256  	u64 extent_end;
1257  	u64 extent_offset;
1258  	u64 disk_bytenr;
1259  	u64 num_bytes;
1260  	u64 disk_num_bytes;
1261  	u64 ram_bytes;
1262  	int extent_type;
1263  	int ret, err;
1264  	int type;
1265  	int nocow;
1266  	int check_prev = 1;
1267  	bool nolock;
1268  	u64 ino = btrfs_ino(inode);
1269  
1270  	path = btrfs_alloc_path();
1271  	if (!path) {
1272  		extent_clear_unlock_delalloc(inode, start, end, end,
1273  					     locked_page,
1274  					     EXTENT_LOCKED | EXTENT_DELALLOC |
1275  					     EXTENT_DO_ACCOUNTING |
1276  					     EXTENT_DEFRAG, PAGE_UNLOCK |
1277  					     PAGE_CLEAR_DIRTY |
1278  					     PAGE_SET_WRITEBACK |
1279  					     PAGE_END_WRITEBACK);
1280  		return -ENOMEM;
1281  	}
1282  
1283  	nolock = btrfs_is_free_space_inode(inode);
1284  
1285  	if (nolock)
1286  		trans = btrfs_join_transaction_nolock(root);
1287  	else
1288  		trans = btrfs_join_transaction(root);
1289  
1290  	if (IS_ERR(trans)) {
1291  		extent_clear_unlock_delalloc(inode, start, end, end,
1292  					     locked_page,
1293  					     EXTENT_LOCKED | EXTENT_DELALLOC |
1294  					     EXTENT_DO_ACCOUNTING |
1295  					     EXTENT_DEFRAG, PAGE_UNLOCK |
1296  					     PAGE_CLEAR_DIRTY |
1297  					     PAGE_SET_WRITEBACK |
1298  					     PAGE_END_WRITEBACK);
1299  		btrfs_free_path(path);
1300  		return PTR_ERR(trans);
1301  	}
1302  
1303  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1304  
1305  	cow_start = (u64)-1;
1306  	cur_offset = start;
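	/*
	 * Walk every file extent item overlapping [start, end] and
	 * decide, extent by extent, whether we can safely write over it
	 * in place (nocow) or must fall back to the normal COW path.
	 */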
1307  	while (1) {
1308  		ret = btrfs_lookup_file_extent(trans, root, path, ino,
1309  					       cur_offset, 0);
1310  		if (ret < 0)
1311  			goto error;
1312  		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1313  			leaf = path->nodes[0];
1314  			btrfs_item_key_to_cpu(leaf, &found_key,
1315  					      path->slots[0] - 1);
1316  			if (found_key.objectid == ino &&
1317  			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1318  				path->slots[0]--;
1319  		}
1320  		check_prev = 0;
1321  next_slot:
1322  		leaf = path->nodes[0];
1323  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1324  			ret = btrfs_next_leaf(root, path);
1325  			if (ret < 0) {
1326  				if (cow_start != (u64)-1)
1327  					cur_offset = cow_start;
1328  				goto error;
1329  			}
1330  			if (ret > 0)
1331  				break;
1332  			leaf = path->nodes[0];
1333  		}
1334  
1335  		nocow = 0;
1336  		disk_bytenr = 0;
1337  		num_bytes = 0;
1338  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1339  
1340  		if (found_key.objectid > ino)
1341  			break;
1342  		if (WARN_ON_ONCE(found_key.objectid < ino) ||
1343  		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1344  			path->slots[0]++;
1345  			goto next_slot;
1346  		}
1347  		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1348  		    found_key.offset > end)
1349  			break;
1350  
1351  		if (found_key.offset > cur_offset) {
1352  			extent_end = found_key.offset;
1353  			extent_type = 0;
1354  			goto out_check;
1355  		}
1356  
1357  		fi = btrfs_item_ptr(leaf, path->slots[0],
1358  				    struct btrfs_file_extent_item);
1359  		extent_type = btrfs_file_extent_type(leaf, fi);
1360  
1361  		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1362  		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1363  		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1364  			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1365  			extent_offset = btrfs_file_extent_offset(leaf, fi);
1366  			extent_end = found_key.offset +
1367  				btrfs_file_extent_num_bytes(leaf, fi);
1368  			disk_num_bytes =
1369  				btrfs_file_extent_disk_num_bytes(leaf, fi);
1370  			if (extent_end <= start) {
1371  				path->slots[0]++;
1372  				goto next_slot;
1373  			}
1374  			if (disk_bytenr == 0)
1375  				goto out_check;
1376  			if (btrfs_file_extent_compression(leaf, fi) ||
1377  			    btrfs_file_extent_encryption(leaf, fi) ||
1378  			    btrfs_file_extent_other_encoding(leaf, fi))
1379  				goto out_check;
1380  			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1381  				goto out_check;
1382  			if (btrfs_extent_readonly(root, disk_bytenr))
1383  				goto out_check;
1384  			if (btrfs_cross_ref_exist(trans, root, ino,
1385  						  found_key.offset -
1386  						  extent_offset, disk_bytenr))
1387  				goto out_check;
1388  			disk_bytenr += extent_offset;
1389  			disk_bytenr += cur_offset - found_key.offset;
1390  			num_bytes = min(end + 1, extent_end) - cur_offset;
1391  			/*
1392  			 * if there are pending snapshots for this root,
1393  			 * we fall back to the common COW path.
1394  			 */
1395  			if (!nolock) {
1396  				err = btrfs_start_write_no_snapshoting(root);
1397  				if (!err)
1398  					goto out_check;
1399  			}
1400  			/*
1401  			 * force cow if csums exist in the range.
1402  			 * this ensures that the csums for a given extent
1403  			 * are either valid or do not exist.
1404  			 */
1405  			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1406  				goto out_check;
1407  			if (!btrfs_inc_nocow_writers(root->fs_info,
1408  						     disk_bytenr))
1409  				goto out_check;
1410  			nocow = 1;
1411  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1412  			extent_end = found_key.offset +
1413  				btrfs_file_extent_inline_len(leaf,
1414  						     path->slots[0], fi);
1415  			extent_end = ALIGN(extent_end, root->sectorsize);
1416  		} else {
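			/* inline, reg and prealloc are the only extent types */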
1417  			BUG_ON(1);
1418  		}
1419  out_check:
1420  		if (extent_end <= start) {
1421  			path->slots[0]++;
1422  			if (!nolock && nocow)
1423  				btrfs_end_write_no_snapshoting(root);
1424  			if (nocow)
1425  				btrfs_dec_nocow_writers(root->fs_info,
1426  							disk_bytenr);
1427  			goto next_slot;
1428  		}
1429  		if (!nocow) {
1430  			if (cow_start == (u64)-1)
1431  				cow_start = cur_offset;
1432  			cur_offset = extent_end;
1433  			if (cur_offset > end)
1434  				break;
1435  			path->slots[0]++;
1436  			goto next_slot;
1437  		}
1438  
1439  		btrfs_release_path(path);
1440  		if (cow_start != (u64)-1) {
1441  			ret = cow_file_range(inode, locked_page,
1442  					     cow_start, found_key.offset - 1,
1443  					     end, page_started, nr_written, 1,
1444  					     NULL);
1445  			if (ret) {
1446  				if (!nolock && nocow)
1447  					btrfs_end_write_no_snapshoting(root);
1448  				if (nocow)
1449  					btrfs_dec_nocow_writers(root->fs_info,
1450  								disk_bytenr);
1451  				goto error;
1452  			}
1453  			cow_start = (u64)-1;
1454  		}
1455  
1456  		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1457  			struct extent_map *em;
1458  			struct extent_map_tree *em_tree;
1459  			em_tree = &BTRFS_I(inode)->extent_tree;
1460  			em = alloc_extent_map();
1461  			BUG_ON(!em); /* -ENOMEM */
1462  			em->start = cur_offset;
1463  			em->orig_start = found_key.offset - extent_offset;
1464  			em->len = num_bytes;
1465  			em->block_len = num_bytes;
1466  			em->block_start = disk_bytenr;
1467  			em->orig_block_len = disk_num_bytes;
1468  			em->ram_bytes = ram_bytes;
1469  			em->bdev = root->fs_info->fs_devices->latest_bdev;
1470  			em->mod_start = em->start;
1471  			em->mod_len = em->len;
1472  			set_bit(EXTENT_FLAG_PINNED, &em->flags);
1473  			set_bit(EXTENT_FLAG_FILLING, &em->flags);
1474  			em->generation = -1;
1475  			while (1) {
1476  				write_lock(&em_tree->lock);
1477  				ret = add_extent_mapping(em_tree, em, 1);
1478  				write_unlock(&em_tree->lock);
1479  				if (ret != -EEXIST) {
1480  					free_extent_map(em);
1481  					break;
1482  				}
1483  				btrfs_drop_extent_cache(inode, em->start,
1484  						em->start + em->len - 1, 0);
1485  			}
1486  			type = BTRFS_ORDERED_PREALLOC;
1487  		} else {
1488  			type = BTRFS_ORDERED_NOCOW;
1489  		}
1490  
1491  		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1492  					       num_bytes, num_bytes, type);
1493  		if (nocow)
1494  			btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1495  		BUG_ON(ret); /* -ENOMEM */
1496  
1497  		if (root->root_key.objectid ==
1498  		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1499  			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1500  						      num_bytes);
1501  			if (ret) {
1502  				if (!nolock && nocow)
1503  					btrfs_end_write_no_snapshoting(root);
1504  				goto error;
1505  			}
1506  		}
1507  
1508  		extent_clear_unlock_delalloc(inode, cur_offset,
1509  					     cur_offset + num_bytes - 1, end,
1510  					     locked_page, EXTENT_LOCKED |
1511  					     EXTENT_DELALLOC |
1512  					     EXTENT_CLEAR_DATA_RESV,
1513  					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1514  
1515  		if (!nolock && nocow)
1516  			btrfs_end_write_no_snapshoting(root);
1517  		cur_offset = extent_end;
1518  		if (cur_offset > end)
1519  			break;
1520  	}
1521  	btrfs_release_path(path);
1522  
1523  	if (cur_offset <= end && cow_start == (u64)-1) {
1524  		cow_start = cur_offset;
1525  		cur_offset = end;
1526  	}
1527  
1528  	if (cow_start != (u64)-1) {
1529  		ret = cow_file_range(inode, locked_page, cow_start, end, end,
1530  				     page_started, nr_written, 1, NULL);
1531  		if (ret)
1532  			goto error;
1533  	}
1534  
1535  error:
1536  	err = btrfs_end_transaction(trans, root);
1537  	if (!ret)
1538  		ret = err;
1539  
1540  	if (ret && cur_offset < end)
1541  		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1542  					     locked_page, EXTENT_LOCKED |
1543  					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1544  					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1545  					     PAGE_CLEAR_DIRTY |
1546  					     PAGE_SET_WRITEBACK |
1547  					     PAGE_END_WRITEBACK);
1548  	btrfs_free_path(path);
1549  	return ret;
1550  }
1551  
1552  static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1553  {
1554  
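	/* only nodatacow or prealloc files can possibly skip COW at all */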
1555  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1556  	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1557  		return 0;
1558  
1559  	/*
1560  	 * @defrag_bytes is a hint value; no spinlock is held here.
1561  	 * If it is not zero, it means the file is being defragged.
1562  	 * Force cow if the given extent needs to be defragged.
1563  	 */
1564  	if (BTRFS_I(inode)->defrag_bytes &&
1565  	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1566  			   EXTENT_DEFRAG, 0, NULL))
1567  		return 1;
1568  
1569  	return 0;
1570  }
1571  
1572  /*
1573   * extent_io.c callback to do delayed allocation processing
1574   */
1575  static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1576  			      u64 start, u64 end, int *page_started,
1577  			      unsigned long *nr_written)
1578  {
1579  	int ret;
1580  	int force_cow = need_force_cow(inode, start, end);
1581  
1582  	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1583  		ret = run_delalloc_nocow(inode, locked_page, start, end,
1584  					 page_started, 1, nr_written);
1585  	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1586  		ret = run_delalloc_nocow(inode, locked_page, start, end,
1587  					 page_started, 0, nr_written);
1588  	} else if (!inode_need_compress(inode)) {
1589  		ret = cow_file_range(inode, locked_page, start, end, end,
1590  				      page_started, nr_written, 1, NULL);
1591  	} else {
1592  		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1593  			&BTRFS_I(inode)->runtime_flags);
1594  		ret = cow_file_range_async(inode, locked_page, start, end,
1595  					   page_started, nr_written);
1596  	}
1597  	return ret;
1598  }
1599  
1600  static void btrfs_split_extent_hook(struct inode *inode,
1601  				    struct extent_state *orig, u64 split)
1602  {
1603  	u64 size;
1604  
1605  	/* not delalloc, ignore it */
1606  	if (!(orig->state & EXTENT_DELALLOC))
1607  		return;
1608  
1609  	size = orig->end - orig->start + 1;
1610  	if (size > BTRFS_MAX_EXTENT_SIZE) {
1611  		u64 num_extents;
1612  		u64 new_size;
1613  
1614  		/*
1615  		 * See the explanation in btrfs_merge_extent_hook, the same
1616  		 * applies here, just in reverse.
1617  		 */
1618  		new_size = orig->end - split + 1;
1619  		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1620  					BTRFS_MAX_EXTENT_SIZE);
1621  		new_size = split - orig->start;
1622  		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1623  					BTRFS_MAX_EXTENT_SIZE);
1624  		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1625  			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1626  			return;
1627  	}
1628  
1629  	spin_lock(&BTRFS_I(inode)->lock);
1630  	BTRFS_I(inode)->outstanding_extents++;
1631  	spin_unlock(&BTRFS_I(inode)->lock);
1632  }
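/*
 * Worked example for the split accounting above, assuming the usual
 * BTRFS_MAX_EXTENT_SIZE of 128M: any extent <= 128M splits into two
 * pieces that each need their own outstanding extent, so one is always
 * added.  For larger extents we compare before and after: 256M split
 * into two 128M halves needs 1 + 1 = 2, no more than the
 * ceil(256M/128M) = 2 already accounted, so nothing changes; but
 * 128M+8K split into 128M+4K and 4K needs 2 + 1 = 3 against the 2
 * accounted, so one more outstanding extent is charged.
 */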
1633  
1634  /*
1635   * extent_io.c merge_extent_hook, used to track merged delayed allocation
1636   * extents so we can keep track of new extents that are just merged onto old
1637   * extents, such as when we are doing sequential writes, so we can properly
1638   * account for the metadata space we'll need.
1639   */
1640  static void btrfs_merge_extent_hook(struct inode *inode,
1641  				    struct extent_state *new,
1642  				    struct extent_state *other)
1643  {
1644  	u64 new_size, old_size;
1645  	u64 num_extents;
1646  
1647  	/* not delalloc, ignore it */
1648  	if (!(other->state & EXTENT_DELALLOC))
1649  		return;
1650  
1651  	if (new->start > other->start)
1652  		new_size = new->end - other->start + 1;
1653  	else
1654  		new_size = other->end - new->start + 1;
1655  
1656  	/* we're not bigger than the max, unreserve the space and go */
1657  	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1658  		spin_lock(&BTRFS_I(inode)->lock);
1659  		BTRFS_I(inode)->outstanding_extents--;
1660  		spin_unlock(&BTRFS_I(inode)->lock);
1661  		return;
1662  	}
1663  
1664  	/*
1665  	 * We have to add up either side to figure out how many extents were
1666  	 * accounted for before we merged into one big extent.  If the number of
1667  	 * extents we accounted for is <= the amount we need for the new range
1668  	 * then we can return, otherwise drop.  Think of it like this
1669  	 *
1670  	 * [ 4k][MAX_SIZE]
1671  	 *
1672  	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1673  	 * need 2 outstanding extents, on one side we have 1 and the other side
1674  	 * we have 1 so they are == and we can return.  But in this case
1675  	 *
1676  	 * [MAX_SIZE+4k][MAX_SIZE+4k]
1677  	 *
1678  	 * Each range on their own accounts for 2 extents, but merged together
1679  	 * they are only 3 extents worth of accounting, so we need to drop in
1680  	 * this case.
1681  	 */
1682  	old_size = other->end - other->start + 1;
1683  	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1684  				BTRFS_MAX_EXTENT_SIZE);
1685  	old_size = new->end - new->start + 1;
1686  	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1687  				 BTRFS_MAX_EXTENT_SIZE);
1688  
1689  	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1690  		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1691  		return;
1692  
1693  	spin_lock(&BTRFS_I(inode)->lock);
1694  	BTRFS_I(inode)->outstanding_extents--;
1695  	spin_unlock(&BTRFS_I(inode)->lock);
1696  }
1697  
1698  static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1699  				      struct inode *inode)
1700  {
1701  	spin_lock(&root->delalloc_lock);
1702  	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1703  		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1704  			      &root->delalloc_inodes);
1705  		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1706  			&BTRFS_I(inode)->runtime_flags);
1707  		root->nr_delalloc_inodes++;
1708  		if (root->nr_delalloc_inodes == 1) {
1709  			spin_lock(&root->fs_info->delalloc_root_lock);
1710  			BUG_ON(!list_empty(&root->delalloc_root));
1711  			list_add_tail(&root->delalloc_root,
1712  				      &root->fs_info->delalloc_roots);
1713  			spin_unlock(&root->fs_info->delalloc_root_lock);
1714  		}
1715  	}
1716  	spin_unlock(&root->delalloc_lock);
1717  }
1718  
1719  static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1720  				     struct inode *inode)
1721  {
1722  	spin_lock(&root->delalloc_lock);
1723  	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1724  		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1725  		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1726  			  &BTRFS_I(inode)->runtime_flags);
1727  		root->nr_delalloc_inodes--;
1728  		if (!root->nr_delalloc_inodes) {
1729  			spin_lock(&root->fs_info->delalloc_root_lock);
1730  			BUG_ON(list_empty(&root->delalloc_root));
1731  			list_del_init(&root->delalloc_root);
1732  			spin_unlock(&root->fs_info->delalloc_root_lock);
1733  		}
1734  	}
1735  	spin_unlock(&root->delalloc_lock);
1736  }
1737  
1738  /*
1739   * extent_io.c set_bit_hook, used to track delayed allocation
1740   * bytes in this file, and to maintain the list of inodes that
1741   * have pending delalloc work to be done.
1742   */
1743  static void btrfs_set_bit_hook(struct inode *inode,
1744  			       struct extent_state *state, unsigned *bits)
1745  {
1746  
1747  	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1748  		WARN_ON(1);
1749  	/*
1750  	 * set_bit and clear bit hooks normally require _irqsave/restore
1751  	 * but in this case, we are only testing for the DELALLOC
1752  	 * bit, which is only set or cleared with irqs on
1753  	 */
1754  	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1755  		struct btrfs_root *root = BTRFS_I(inode)->root;
1756  		u64 len = state->end + 1 - state->start;
1757  		bool do_list = !btrfs_is_free_space_inode(inode);
1758  
1759  		if (*bits & EXTENT_FIRST_DELALLOC) {
1760  			*bits &= ~EXTENT_FIRST_DELALLOC;
1761  		} else {
1762  			spin_lock(&BTRFS_I(inode)->lock);
1763  			BTRFS_I(inode)->outstanding_extents++;
1764  			spin_unlock(&BTRFS_I(inode)->lock);
1765  		}
1766  
1767  		/* For sanity tests */
1768  		if (btrfs_is_testing(root->fs_info))
1769  			return;
1770  
1771  		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1772  				     root->fs_info->delalloc_batch);
1773  		spin_lock(&BTRFS_I(inode)->lock);
1774  		BTRFS_I(inode)->delalloc_bytes += len;
1775  		if (*bits & EXTENT_DEFRAG)
1776  			BTRFS_I(inode)->defrag_bytes += len;
1777  		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1778  					 &BTRFS_I(inode)->runtime_flags))
1779  			btrfs_add_delalloc_inodes(root, inode);
1780  		spin_unlock(&BTRFS_I(inode)->lock);
1781  	}
1782  }
1783  
1784  /*
1785   * extent_io.c clear_bit_hook, see set_bit_hook for why
1786   */
1787  static void btrfs_clear_bit_hook(struct inode *inode,
1788  				 struct extent_state *state,
1789  				 unsigned *bits)
1790  {
1791  	u64 len = state->end + 1 - state->start;
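  	/* ceil(len / BTRFS_MAX_EXTENT_SIZE): max-sized extents needed to cover len */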
1792  	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
1793  				    BTRFS_MAX_EXTENT_SIZE);
1794  
1795  	spin_lock(&BTRFS_I(inode)->lock);
1796  	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1797  		BTRFS_I(inode)->defrag_bytes -= len;
1798  	spin_unlock(&BTRFS_I(inode)->lock);
1799  
1800  	/*
1801  	 * set_bit and clear bit hooks normally require _irqsave/restore
1802  	 * but in this case, we are only testing for the DELALLOC
1803  	 * bit, which is only set or cleared with irqs on
1804  	 */
1805  	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1806  		struct btrfs_root *root = BTRFS_I(inode)->root;
1807  		bool do_list = !btrfs_is_free_space_inode(inode);
1808  
1809  		if (*bits & EXTENT_FIRST_DELALLOC) {
1810  			*bits &= ~EXTENT_FIRST_DELALLOC;
1811  		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1812  			spin_lock(&BTRFS_I(inode)->lock);
1813  			BTRFS_I(inode)->outstanding_extents -= num_extents;
1814  			spin_unlock(&BTRFS_I(inode)->lock);
1815  		}
1816  
1817  		/*
1818  		 * We don't reserve metadata space for space cache inodes,
1819  		 * so we don't need to call btrfs_delalloc_release_metadata
1820  		 * if there is an error.
1821  		 */
1822  		if (*bits & EXTENT_DO_ACCOUNTING &&
1823  		    root != root->fs_info->tree_root)
1824  			btrfs_delalloc_release_metadata(inode, len);
1825  
1826  		/* For sanity tests. */
1827  		if (btrfs_is_testing(root->fs_info))
1828  			return;
1829  
1830  		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1831  		    && do_list && !(state->state & EXTENT_NORESERVE)
1832  		    && (*bits & (EXTENT_DO_ACCOUNTING |
1833  		    EXTENT_CLEAR_DATA_RESV)))
1834  			btrfs_free_reserved_data_space_noquota(inode,
1835  					state->start, len);
1836  
1837  		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1838  				     root->fs_info->delalloc_batch);
1839  		spin_lock(&BTRFS_I(inode)->lock);
1840  		BTRFS_I(inode)->delalloc_bytes -= len;
1841  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1842  		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1843  			     &BTRFS_I(inode)->runtime_flags))
1844  			btrfs_del_delalloc_inode(root, inode);
1845  		spin_unlock(&BTRFS_I(inode)->lock);
1846  	}
1847  }
1848  
1849  /*
1850   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1851   * we don't create bios that span stripes or chunks
1852   *
1853   * return 1 if the page cannot be merged into the bio
1854   * return 0 if the page can be merged into the bio
1855   * return error otherwise
1856   */
1857  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1858  			 size_t size, struct bio *bio,
1859  			 unsigned long bio_flags)
1860  {
1861  	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1862  	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1863  	u64 length = 0;
1864  	u64 map_length;
1865  	int ret;
1866  
1867  	if (bio_flags & EXTENT_BIO_COMPRESSED)
1868  		return 0;
1869  
1870  	length = bio->bi_iter.bi_size;
1871  	map_length = length;
1872  	ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
1873  			      &map_length, NULL, 0);
1874  	if (ret < 0)
1875  		return ret;
1876  	if (map_length < length + size)
1877  		return 1;
1878  	return 0;
1879  }
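/*
 * Illustrative numbers for the check above: with 64K stripes, a bio
 * whose logical address sits 60K into a stripe gets map_length = 4K
 * back from btrfs_map_block().  If the bio already holds 4K of data,
 * adding another 4K page would make length + size = 8K > map_length,
 * so we return 1 and the caller starts a new bio at the stripe
 * boundary.
 */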
1880  
1881  /*
1882   * in order to insert checksums into the metadata in large chunks,
1883   * we wait until bio submission time.  All the pages in the bio are
1884   * checksummed and sums are attached onto the ordered extent record.
1885   *
1886   * At IO completion time the csums attached to the ordered extent record
1887   * are inserted into the btree
1888   */
1889  static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
1890  				    int mirror_num, unsigned long bio_flags,
1891  				    u64 bio_offset)
1892  {
1893  	struct btrfs_root *root = BTRFS_I(inode)->root;
1894  	int ret = 0;
1895  
1896  	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1897  	BUG_ON(ret); /* -ENOMEM */
1898  	return 0;
1899  }
1900  
1901  /*
1902   * in order to insert checksums into the metadata in large chunks,
1903   * we wait until bio submission time.  All the pages in the bio are
1904   * checksummed and sums are attached onto the ordered extent record.
1905   *
1906   * This hook runs once the async checksumming started in
1907   * __btrfs_submit_bio_start has finished; it submits the bio to the devices.
1908   */
1909  static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
1910  			  int mirror_num, unsigned long bio_flags,
1911  			  u64 bio_offset)
1912  {
1913  	struct btrfs_root *root = BTRFS_I(inode)->root;
1914  	int ret;
1915  
1916  	ret = btrfs_map_bio(root, bio, mirror_num, 1);
1917  	if (ret) {
1918  		bio->bi_error = ret;
1919  		bio_endio(bio);
1920  	}
1921  	return ret;
1922  }
1923  
1924  /*
1925   * extent_io.c submission hook. This does the right thing for csum calculation
1926   * on write, or reading the csums from the tree before a read
1927   */
1928  static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1929  			  int mirror_num, unsigned long bio_flags,
1930  			  u64 bio_offset)
1931  {
1932  	struct btrfs_root *root = BTRFS_I(inode)->root;
1933  	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1934  	int ret = 0;
1935  	int skip_sum;
1936  	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1937  
1938  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1939  
1940  	if (btrfs_is_free_space_inode(inode))
1941  		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1942  
1943  	if (bio_op(bio) != REQ_OP_WRITE) {
1944  		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1945  		if (ret)
1946  			goto out;
1947  
1948  		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1949  			ret = btrfs_submit_compressed_read(inode, bio,
1950  							   mirror_num,
1951  							   bio_flags);
1952  			goto out;
1953  		} else if (!skip_sum) {
1954  			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1955  			if (ret)
1956  				goto out;
1957  		}
1958  		goto mapit;
1959  	} else if (async && !skip_sum) {
1960  		/* csum items have already been cloned */
1961  		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1962  			goto mapit;
1963  		/* we're doing a write, do the async checksumming */
1964  		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1965  				   inode, bio, mirror_num,
1966  				   bio_flags, bio_offset,
1967  				   __btrfs_submit_bio_start,
1968  				   __btrfs_submit_bio_done);
1969  		goto out;
1970  	} else if (!skip_sum) {
1971  		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1972  		if (ret)
1973  			goto out;
1974  	}
1975  
1976  mapit:
1977  	ret = btrfs_map_bio(root, bio, mirror_num, 0);
1978  
1979  out:
1980  	if (ret < 0) {
1981  		bio->bi_error = ret;
1982  		bio_endio(bio);
1983  	}
1984  	return ret;
1985  }
1986  
1987  /*
1988   * given a list of ordered sums record them in the inode.  This happens
1989   * at IO completion time based on sums calculated at bio submission time.
1990   */
1991  static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1992  			     struct inode *inode, u64 file_offset,
1993  			     struct list_head *list)
1994  {
1995  	struct btrfs_ordered_sum *sum;
1996  
1997  	list_for_each_entry(sum, list, list) {
1998  		trans->adding_csums = 1;
1999  		btrfs_csum_file_blocks(trans,
2000  		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
2001  		trans->adding_csums = 0;
2002  	}
2003  	return 0;
2004  }
2005  
2006  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2007  			      struct extent_state **cached_state, int dedupe)
2008  {
2009  	WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2010  	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2011  				   cached_state);
2012  }
2013  
2014  /* see btrfs_writepage_start_hook for details on why this is required */
2015  struct btrfs_writepage_fixup {
2016  	struct page *page;
2017  	struct btrfs_work work;
2018  };
2019  
2020  static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2021  {
2022  	struct btrfs_writepage_fixup *fixup;
2023  	struct btrfs_ordered_extent *ordered;
2024  	struct extent_state *cached_state = NULL;
2025  	struct page *page;
2026  	struct inode *inode;
2027  	u64 page_start;
2028  	u64 page_end;
2029  	int ret;
2030  
2031  	fixup = container_of(work, struct btrfs_writepage_fixup, work);
2032  	page = fixup->page;
2033  again:
2034  	lock_page(page);
2035  	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2036  		ClearPageChecked(page);
2037  		goto out_page;
2038  	}
2039  
2040  	inode = page->mapping->host;
2041  	page_start = page_offset(page);
2042  	page_end = page_offset(page) + PAGE_SIZE - 1;
2043  
2044  	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2045  			 &cached_state);
2046  
2047  	/* already ordered? We're done */
2048  	if (PagePrivate2(page))
2049  		goto out;
2050  
2051  	ordered = btrfs_lookup_ordered_range(inode, page_start,
2052  					PAGE_SIZE);
2053  	if (ordered) {
2054  		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2055  				     page_end, &cached_state, GFP_NOFS);
2056  		unlock_page(page);
2057  		btrfs_start_ordered_extent(inode, ordered, 1);
2058  		btrfs_put_ordered_extent(ordered);
2059  		goto again;
2060  	}
2061  
2062  	ret = btrfs_delalloc_reserve_space(inode, page_start,
2063  					   PAGE_SIZE);
2064  	if (ret) {
2065  		mapping_set_error(page->mapping, ret);
2066  		end_extent_writepage(page, ret, page_start, page_end);
2067  		ClearPageChecked(page);
2068  		goto out;
2069  	}
2070  
2071  	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
2072  					&cached_state, 0);
2073  	if (ret) {
2074  		mapping_set_error(page->mapping, ret);
2075  		end_extent_writepage(page, ret, page_start, page_end);
2076  		ClearPageChecked(page);
2077  		goto out;
2078  	}
2079  
2080  	ClearPageChecked(page);
2081  	set_page_dirty(page);
2082  out:
2083  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2084  			     &cached_state, GFP_NOFS);
2085  out_page:
2086  	unlock_page(page);
2087  	put_page(page);
2088  	kfree(fixup);
2089  }
2090  
2091  /*
2092   * There are a few paths in the higher layers of the kernel that directly
2093   * set the page dirty bit without asking the filesystem if it is a
2094   * good idea.  This causes problems because we want to make sure COW
2095   * properly happens and the data=ordered rules are followed.
2096   *
2097   * In our case any range that doesn't have the ORDERED bit set
2098   * hasn't been properly set up for IO.  We kick off an async process
2099   * to fix it up.  The async helper will wait for ordered extents, set
2100   * the delalloc bit and make it safe to write the page.
2101   */
2102  static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2103  {
2104  	struct inode *inode = page->mapping->host;
2105  	struct btrfs_writepage_fixup *fixup;
2106  	struct btrfs_root *root = BTRFS_I(inode)->root;
2107  
2108  	/* this page is properly in the ordered list */
2109  	if (TestClearPagePrivate2(page))
2110  		return 0;
2111  
2112  	if (PageChecked(page))
2113  		return -EAGAIN;
2114  
2115  	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2116  	if (!fixup)
2117  		return -EAGAIN;
2118  
2119  	SetPageChecked(page);
2120  	get_page(page);
2121  	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2122  			btrfs_writepage_fixup_worker, NULL, NULL);
2123  	fixup->page = page;
2124  	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
2125  	return -EBUSY;
2126  }
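/*
 * Return semantics above: 0 means the page was already covered by an
 * ordered extent; a nonzero return makes the caller skip writing the
 * page for now, either -EAGAIN (a fixup is already pending, or one
 * could not be allocated) or -EBUSY (a fixup worker was queued and
 * holds its own page reference).
 */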
2127  
2128  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2129  				       struct inode *inode, u64 file_pos,
2130  				       u64 disk_bytenr, u64 disk_num_bytes,
2131  				       u64 num_bytes, u64 ram_bytes,
2132  				       u8 compression, u8 encryption,
2133  				       u16 other_encoding, int extent_type)
2134  {
2135  	struct btrfs_root *root = BTRFS_I(inode)->root;
2136  	struct btrfs_file_extent_item *fi;
2137  	struct btrfs_path *path;
2138  	struct extent_buffer *leaf;
2139  	struct btrfs_key ins;
2140  	int extent_inserted = 0;
2141  	int ret;
2142  
2143  	path = btrfs_alloc_path();
2144  	if (!path)
2145  		return -ENOMEM;
2146  
2147  	/*
2148  	 * we may be replacing one extent in the tree with another.
2149  	 * The new extent is pinned in the extent map, and we don't want
2150  	 * to drop it from the cache until it is completely in the btree.
2151  	 *
2152  	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2153  	 * The caller is expected to unpin it and allow it to be merged
2154  	 * with the others.
2155  	 */
2156  	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2157  				   file_pos + num_bytes, NULL, 0,
2158  				   1, sizeof(*fi), &extent_inserted);
2159  	if (ret)
2160  		goto out;
2161  
2162  	if (!extent_inserted) {
2163  		ins.objectid = btrfs_ino(inode);
2164  		ins.offset = file_pos;
2165  		ins.type = BTRFS_EXTENT_DATA_KEY;
2166  
2167  		path->leave_spinning = 1;
2168  		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2169  					      sizeof(*fi));
2170  		if (ret)
2171  			goto out;
2172  	}
2173  	leaf = path->nodes[0];
2174  	fi = btrfs_item_ptr(leaf, path->slots[0],
2175  			    struct btrfs_file_extent_item);
2176  	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2177  	btrfs_set_file_extent_type(leaf, fi, extent_type);
2178  	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2179  	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2180  	btrfs_set_file_extent_offset(leaf, fi, 0);
2181  	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2182  	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2183  	btrfs_set_file_extent_compression(leaf, fi, compression);
2184  	btrfs_set_file_extent_encryption(leaf, fi, encryption);
2185  	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2186  
2187  	btrfs_mark_buffer_dirty(leaf);
2188  	btrfs_release_path(path);
2189  
2190  	inode_add_bytes(inode, num_bytes);
2191  
2192  	ins.objectid = disk_bytenr;
2193  	ins.offset = disk_num_bytes;
2194  	ins.type = BTRFS_EXTENT_ITEM_KEY;
2195  	ret = btrfs_alloc_reserved_file_extent(trans, root,
2196  					root->root_key.objectid,
2197  					btrfs_ino(inode), file_pos,
2198  					ram_bytes, &ins);
2199  	/*
2200  	 * Release the reserved range from inode dirty range map, as it is
2201  	 * already moved into delayed_ref_head
2202  	 */
2203  	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2204  out:
2205  	btrfs_free_path(path);
2206  
2207  	return ret;
2208  }
2209  
2210  /* snapshot-aware defrag */
2211  struct sa_defrag_extent_backref {
2212  	struct rb_node node;
2213  	struct old_sa_defrag_extent *old;
2214  	u64 root_id;
2215  	u64 inum;
2216  	u64 file_pos;
2217  	u64 extent_offset;
2218  	u64 num_bytes;
2219  	u64 generation;
2220  };
2221  
2222  struct old_sa_defrag_extent {
2223  	struct list_head list;
2224  	struct new_sa_defrag_extent *new;
2225  
2226  	u64 extent_offset;
2227  	u64 bytenr;
2228  	u64 offset;
2229  	u64 len;
2230  	int count;
2231  };
2232  
2233  struct new_sa_defrag_extent {
2234  	struct rb_root root;
2235  	struct list_head head;
2236  	struct btrfs_path *path;
2237  	struct inode *inode;
2238  	u64 file_pos;
2239  	u64 len;
2240  	u64 bytenr;
2241  	u64 disk_len;
2242  	u8 compress_type;
2243  };
2244  
2245  static int backref_comp(struct sa_defrag_extent_backref *b1,
2246  			struct sa_defrag_extent_backref *b2)
2247  {
2248  	if (b1->root_id < b2->root_id)
2249  		return -1;
2250  	else if (b1->root_id > b2->root_id)
2251  		return 1;
2252  
2253  	if (b1->inum < b2->inum)
2254  		return -1;
2255  	else if (b1->inum > b2->inum)
2256  		return 1;
2257  
2258  	if (b1->file_pos < b2->file_pos)
2259  		return -1;
2260  	else if (b1->file_pos > b2->file_pos)
2261  		return 1;
2262  
2263  	/*
2264  	 * [------------------------------] ===> (a range of space)
2265  	 *     |<--->|   |<---->| =============> (fs/file tree A)
2266  	 * |<---------------------------->| ===> (fs/file tree B)
2267  	 *
2268  	 * A range of space can refer to two file extents in one tree while
2269  	 * refer to only one file extent in another tree.
2270  	 *
2271  	 * So we may process a disk offset more than once (two extents in A)
2272  	 * that land on the same extent (one extent in B), and then insert
2273  	 * two identical backrefs (both referring to the extent in B).
2274  	 */
2275  	return 0;
2276  }
2277  
2278  static void backref_insert(struct rb_root *root,
2279  			   struct sa_defrag_extent_backref *backref)
2280  {
2281  	struct rb_node **p = &root->rb_node;
2282  	struct rb_node *parent = NULL;
2283  	struct sa_defrag_extent_backref *entry;
2284  	int ret;
2285  
2286  	while (*p) {
2287  		parent = *p;
2288  		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2289  
2290  		ret = backref_comp(backref, entry);
2291  		if (ret < 0)
2292  			p = &(*p)->rb_left;
2293  		else
2294  			p = &(*p)->rb_right;
2295  	}
2296  
2297  	rb_link_node(&backref->node, parent, p);
2298  	rb_insert_color(&backref->node, root);
2299  }
2300  
2301  /*
2302   * Note the backref might have changed, and in this case we just return 0.
2303   */
2304  static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2305  				       void *ctx)
2306  {
2307  	struct btrfs_file_extent_item *extent;
2308  	struct btrfs_fs_info *fs_info;
2309  	struct old_sa_defrag_extent *old = ctx;
2310  	struct new_sa_defrag_extent *new = old->new;
2311  	struct btrfs_path *path = new->path;
2312  	struct btrfs_key key;
2313  	struct btrfs_root *root;
2314  	struct sa_defrag_extent_backref *backref;
2315  	struct extent_buffer *leaf;
2316  	struct inode *inode = new->inode;
2317  	int slot;
2318  	int ret;
2319  	u64 extent_offset;
2320  	u64 num_bytes;
2321  
2322  	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2323  	    inum == btrfs_ino(inode))
2324  		return 0;
2325  
2326  	key.objectid = root_id;
2327  	key.type = BTRFS_ROOT_ITEM_KEY;
2328  	key.offset = (u64)-1;
2329  
2330  	fs_info = BTRFS_I(inode)->root->fs_info;
2331  	root = btrfs_read_fs_root_no_name(fs_info, &key);
2332  	if (IS_ERR(root)) {
2333  		if (PTR_ERR(root) == -ENOENT)
2334  			return 0;
2335  		WARN_ON(1);
2336  		btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2337  			 inum, offset, root_id);
2338  		return PTR_ERR(root);
2339  	}
2340  
2341  	key.objectid = inum;
2342  	key.type = BTRFS_EXTENT_DATA_KEY;
2343  	if (offset > (u64)-1 << 32)
2344  		key.offset = 0;
2345  	else
2346  		key.offset = offset;
2347  
2348  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2349  	if (WARN_ON(ret < 0))
2350  		return ret;
2351  	ret = 0;
2352  
2353  	while (1) {
2354  		cond_resched();
2355  
2356  		leaf = path->nodes[0];
2357  		slot = path->slots[0];
2358  
2359  		if (slot >= btrfs_header_nritems(leaf)) {
2360  			ret = btrfs_next_leaf(root, path);
2361  			if (ret < 0) {
2362  				goto out;
2363  			} else if (ret > 0) {
2364  				ret = 0;
2365  				goto out;
2366  			}
2367  			continue;
2368  		}
2369  
2370  		path->slots[0]++;
2371  
2372  		btrfs_item_key_to_cpu(leaf, &key, slot);
2373  
2374  		if (key.objectid > inum)
2375  			goto out;
2376  
2377  		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2378  			continue;
2379  
2380  		extent = btrfs_item_ptr(leaf, slot,
2381  					struct btrfs_file_extent_item);
2382  
2383  		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2384  			continue;
2385  
2386  		/*
2387  		 * 'offset' refers to the exact key.offset,
2388  		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2389  		 * (key.offset - extent_offset).
2390  		 */
2391  		if (key.offset != offset)
2392  			continue;
2393  
2394  		extent_offset = btrfs_file_extent_offset(leaf, extent);
2395  		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2396  
2397  		if (extent_offset >= old->extent_offset + old->offset +
2398  		    old->len || extent_offset + num_bytes <=
2399  		    old->extent_offset + old->offset)
2400  			continue;
2401  		break;
2402  	}
2403  
2404  	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2405  	if (!backref) {
2406  		ret = -ENOMEM;
2407  		goto out;
2408  	}
2409  
2410  	backref->root_id = root_id;
2411  	backref->inum = inum;
2412  	backref->file_pos = offset;
2413  	backref->num_bytes = num_bytes;
2414  	backref->extent_offset = extent_offset;
2415  	backref->generation = btrfs_file_extent_generation(leaf, extent);
2416  	backref->old = old;
2417  	backref_insert(&new->root, backref);
2418  	old->count++;
2419  out:
2420  	btrfs_release_path(path);
2421  	WARN_ON(ret);
2422  	return ret;
2423  }
2424  
2425  static noinline bool record_extent_backrefs(struct btrfs_path *path,
2426  				   struct new_sa_defrag_extent *new)
2427  {
2428  	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2429  	struct old_sa_defrag_extent *old, *tmp;
2430  	int ret;
2431  
2432  	new->path = path;
2433  
2434  	list_for_each_entry_safe(old, tmp, &new->head, list) {
2435  		ret = iterate_inodes_from_logical(old->bytenr +
2436  						  old->extent_offset, fs_info,
2437  						  path, record_one_backref,
2438  						  old);
2439  		if (ret < 0 && ret != -ENOENT)
2440  			return false;
2441  
2442  		/* no backref to be processed for this extent */
2443  		if (!old->count) {
2444  			list_del(&old->list);
2445  			kfree(old);
2446  		}
2447  	}
2448  
2449  	if (list_empty(&new->head))
2450  		return false;
2451  
2452  	return true;
2453  }
2454  
2455  static int relink_is_mergable(struct extent_buffer *leaf,
2456  			      struct btrfs_file_extent_item *fi,
2457  			      struct new_sa_defrag_extent *new)
2458  {
2459  	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2460  		return 0;
2461  
2462  	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2463  		return 0;
2464  
2465  	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2466  		return 0;
2467  
2468  	if (btrfs_file_extent_encryption(leaf, fi) ||
2469  	    btrfs_file_extent_other_encoding(leaf, fi))
2470  		return 0;
2471  
2472  	return 1;
2473  }
2474  
2475  /*
2476   * Note the backref might have changed, and in this case we just return 0.
2477   */
2478  static noinline int relink_extent_backref(struct btrfs_path *path,
2479  				 struct sa_defrag_extent_backref *prev,
2480  				 struct sa_defrag_extent_backref *backref)
2481  {
2482  	struct btrfs_file_extent_item *extent;
2483  	struct btrfs_file_extent_item *item;
2484  	struct btrfs_ordered_extent *ordered;
2485  	struct btrfs_trans_handle *trans;
2486  	struct btrfs_fs_info *fs_info;
2487  	struct btrfs_root *root;
2488  	struct btrfs_key key;
2489  	struct extent_buffer *leaf;
2490  	struct old_sa_defrag_extent *old = backref->old;
2491  	struct new_sa_defrag_extent *new = old->new;
2492  	struct inode *src_inode = new->inode;
2493  	struct inode *inode;
2494  	struct extent_state *cached = NULL;
2495  	int ret = 0;
2496  	u64 start;
2497  	u64 len;
2498  	u64 lock_start;
2499  	u64 lock_end;
2500  	bool merge = false;
2501  	int index;
2502  
2503  	if (prev && prev->root_id == backref->root_id &&
2504  	    prev->inum == backref->inum &&
2505  	    prev->file_pos + prev->num_bytes == backref->file_pos)
2506  		merge = true;
2507  
2508  	/* step 1: get root */
2509  	key.objectid = backref->root_id;
2510  	key.type = BTRFS_ROOT_ITEM_KEY;
2511  	key.offset = (u64)-1;
2512  
2513  	fs_info = BTRFS_I(src_inode)->root->fs_info;
2514  	index = srcu_read_lock(&fs_info->subvol_srcu);
2515  
2516  	root = btrfs_read_fs_root_no_name(fs_info, &key);
2517  	if (IS_ERR(root)) {
2518  		srcu_read_unlock(&fs_info->subvol_srcu, index);
2519  		if (PTR_ERR(root) == -ENOENT)
2520  			return 0;
2521  		return PTR_ERR(root);
2522  	}
2523  
2524  	if (btrfs_root_readonly(root)) {
2525  		srcu_read_unlock(&fs_info->subvol_srcu, index);
2526  		return 0;
2527  	}
2528  
2529  	/* step 2: get inode */
2530  	key.objectid = backref->inum;
2531  	key.type = BTRFS_INODE_ITEM_KEY;
2532  	key.offset = 0;
2533  
2534  	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2535  	if (IS_ERR(inode)) {
2536  		srcu_read_unlock(&fs_info->subvol_srcu, index);
2537  		return 0;
2538  	}
2539  
2540  	srcu_read_unlock(&fs_info->subvol_srcu, index);
2541  
2542  	/* step 3: relink backref */
2543  	lock_start = backref->file_pos;
2544  	lock_end = backref->file_pos + backref->num_bytes - 1;
2545  	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2546  			 &cached);
2547  
2548  	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2549  	if (ordered) {
2550  		btrfs_put_ordered_extent(ordered);
2551  		goto out_unlock;
2552  	}
2553  
2554  	trans = btrfs_join_transaction(root);
2555  	if (IS_ERR(trans)) {
2556  		ret = PTR_ERR(trans);
2557  		goto out_unlock;
2558  	}
2559  
2560  	key.objectid = backref->inum;
2561  	key.type = BTRFS_EXTENT_DATA_KEY;
2562  	key.offset = backref->file_pos;
2563  
2564  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2565  	if (ret < 0) {
2566  		goto out_free_path;
2567  	} else if (ret > 0) {
2568  		ret = 0;
2569  		goto out_free_path;
2570  	}
2571  
2572  	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2573  				struct btrfs_file_extent_item);
2574  
2575  	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2576  	    backref->generation)
2577  		goto out_free_path;
2578  
2579  	btrfs_release_path(path);
2580  
2581  	start = backref->file_pos;
2582  	if (backref->extent_offset < old->extent_offset + old->offset)
2583  		start += old->extent_offset + old->offset -
2584  			 backref->extent_offset;
2585  
2586  	len = min(backref->extent_offset + backref->num_bytes,
2587  		  old->extent_offset + old->offset + old->len);
2588  	len -= max(backref->extent_offset, old->extent_offset + old->offset);
2589  
2590  	ret = btrfs_drop_extents(trans, root, inode, start,
2591  				 start + len, 1);
2592  	if (ret)
2593  		goto out_free_path;
2594  again:
2595  	key.objectid = btrfs_ino(inode);
2596  	key.type = BTRFS_EXTENT_DATA_KEY;
2597  	key.offset = start;
2598  
2599  	path->leave_spinning = 1;
2600  	if (merge) {
2601  		struct btrfs_file_extent_item *fi;
2602  		u64 extent_len;
2603  		struct btrfs_key found_key;
2604  
2605  		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2606  		if (ret < 0)
2607  			goto out_free_path;
2608  
2609  		path->slots[0]--;
2610  		leaf = path->nodes[0];
2611  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2612  
2613  		fi = btrfs_item_ptr(leaf, path->slots[0],
2614  				    struct btrfs_file_extent_item);
2615  		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2616  
2617  		if (extent_len + found_key.offset == start &&
2618  		    relink_is_mergable(leaf, fi, new)) {
2619  			btrfs_set_file_extent_num_bytes(leaf, fi,
2620  							extent_len + len);
2621  			btrfs_mark_buffer_dirty(leaf);
2622  			inode_add_bytes(inode, len);
2623  
2624  			ret = 1;
2625  			goto out_free_path;
2626  		} else {
2627  			merge = false;
2628  			btrfs_release_path(path);
2629  			goto again;
2630  		}
2631  	}
2632  
2633  	ret = btrfs_insert_empty_item(trans, root, path, &key,
2634  					sizeof(*extent));
2635  	if (ret) {
2636  		btrfs_abort_transaction(trans, ret);
2637  		goto out_free_path;
2638  	}
2639  
2640  	leaf = path->nodes[0];
2641  	item = btrfs_item_ptr(leaf, path->slots[0],
2642  				struct btrfs_file_extent_item);
2643  	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2644  	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2645  	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2646  	btrfs_set_file_extent_num_bytes(leaf, item, len);
2647  	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2648  	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2649  	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2650  	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2651  	btrfs_set_file_extent_encryption(leaf, item, 0);
2652  	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2653  
2654  	btrfs_mark_buffer_dirty(leaf);
2655  	inode_add_bytes(inode, len);
2656  	btrfs_release_path(path);
2657  
2658  	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2659  			new->disk_len, 0,
2660  			backref->root_id, backref->inum,
2661  			new->file_pos);	/* start - extent_offset */
2662  	if (ret) {
2663  		btrfs_abort_transaction(trans, ret);
2664  		goto out_free_path;
2665  	}
2666  
2667  	ret = 1;
2668  out_free_path:
2669  	btrfs_release_path(path);
2670  	path->leave_spinning = 0;
2671  	btrfs_end_transaction(trans, root);
2672  out_unlock:
2673  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2674  			     &cached, GFP_NOFS);
2675  	iput(inode);
2676  	return ret;
2677  }
2678  
2679  static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2680  {
2681  	struct old_sa_defrag_extent *old, *tmp;
2682  
2683  	if (!new)
2684  		return;
2685  
2686  	list_for_each_entry_safe(old, tmp, &new->head, list) {
2687  		kfree(old);
2688  	}
2689  	kfree(new);
2690  }
2691  
2692  static void relink_file_extents(struct new_sa_defrag_extent *new)
2693  {
2694  	struct btrfs_path *path;
2695  	struct sa_defrag_extent_backref *backref;
2696  	struct sa_defrag_extent_backref *prev = NULL;
2697  	struct inode *inode;
2698  	struct btrfs_root *root;
2699  	struct rb_node *node;
2700  	int ret;
2701  
2702  	inode = new->inode;
2703  	root = BTRFS_I(inode)->root;
2704  
2705  	path = btrfs_alloc_path();
2706  	if (!path)
2707  		return;
2708  
2709  	if (!record_extent_backrefs(path, new)) {
2710  		btrfs_free_path(path);
2711  		goto out;
2712  	}
2713  	btrfs_release_path(path);
2714  
2715  	while (1) {
2716  		node = rb_first(&new->root);
2717  		if (!node)
2718  			break;
2719  		rb_erase(node, &new->root);
2720  
2721  		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2722  
2723  		ret = relink_extent_backref(path, prev, backref);
2724  		WARN_ON(ret < 0);
2725  
2726  		kfree(prev);
2727  
2728  		if (ret == 1)
2729  			prev = backref;
2730  		else
2731  			prev = NULL;
2732  		cond_resched();
2733  	}
2734  	kfree(prev);
2735  
2736  	btrfs_free_path(path);
2737  out:
2738  	free_sa_defrag_extent(new);
2739  
2740  	atomic_dec(&root->fs_info->defrag_running);
2741  	wake_up(&root->fs_info->transaction_wait);
2742  }
2743  
2744  static struct new_sa_defrag_extent *
2745  record_old_file_extents(struct inode *inode,
2746  			struct btrfs_ordered_extent *ordered)
2747  {
2748  	struct btrfs_root *root = BTRFS_I(inode)->root;
2749  	struct btrfs_path *path;
2750  	struct btrfs_key key;
2751  	struct old_sa_defrag_extent *old;
2752  	struct new_sa_defrag_extent *new;
2753  	int ret;
2754  
2755  	new = kmalloc(sizeof(*new), GFP_NOFS);
2756  	if (!new)
2757  		return NULL;
2758  
2759  	new->inode = inode;
2760  	new->file_pos = ordered->file_offset;
2761  	new->len = ordered->len;
2762  	new->bytenr = ordered->start;
2763  	new->disk_len = ordered->disk_len;
2764  	new->compress_type = ordered->compress_type;
2765  	new->root = RB_ROOT;
2766  	INIT_LIST_HEAD(&new->head);
2767  
2768  	path = btrfs_alloc_path();
2769  	if (!path)
2770  		goto out_kfree;
2771  
2772  	key.objectid = btrfs_ino(inode);
2773  	key.type = BTRFS_EXTENT_DATA_KEY;
2774  	key.offset = new->file_pos;
2775  
2776  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2777  	if (ret < 0)
2778  		goto out_free_path;
2779  	if (ret > 0 && path->slots[0] > 0)
2780  		path->slots[0]--;
2781  
2782  	/* find out all the old extents for the file range */
2783  	while (1) {
2784  		struct btrfs_file_extent_item *extent;
2785  		struct extent_buffer *l;
2786  		int slot;
2787  		u64 num_bytes;
2788  		u64 offset;
2789  		u64 end;
2790  		u64 disk_bytenr;
2791  		u64 extent_offset;
2792  
2793  		l = path->nodes[0];
2794  		slot = path->slots[0];
2795  
2796  		if (slot >= btrfs_header_nritems(l)) {
2797  			ret = btrfs_next_leaf(root, path);
2798  			if (ret < 0)
2799  				goto out_free_path;
2800  			else if (ret > 0)
2801  				break;
2802  			continue;
2803  		}
2804  
2805  		btrfs_item_key_to_cpu(l, &key, slot);
2806  
2807  		if (key.objectid != btrfs_ino(inode))
2808  			break;
2809  		if (key.type != BTRFS_EXTENT_DATA_KEY)
2810  			break;
2811  		if (key.offset >= new->file_pos + new->len)
2812  			break;
2813  
2814  		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2815  
2816  		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2817  		if (key.offset + num_bytes < new->file_pos)
2818  			goto next;
2819  
2820  		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2821  		if (!disk_bytenr)
2822  			goto next;
2823  
2824  		extent_offset = btrfs_file_extent_offset(l, extent);
2825  
2826  		old = kmalloc(sizeof(*old), GFP_NOFS);
2827  		if (!old)
2828  			goto out_free_path;
2829  
2830  		offset = max(new->file_pos, key.offset);
2831  		end = min(new->file_pos + new->len, key.offset + num_bytes);
2832  
2833  		old->bytenr = disk_bytenr;
2834  		old->extent_offset = extent_offset;
2835  		old->offset = offset - key.offset;
2836  		old->len = end - offset;
2837  		old->new = new;
2838  		old->count = 0;
2839  		list_add_tail(&old->list, &new->head);
2840  next:
2841  		path->slots[0]++;
2842  		cond_resched();
2843  	}
2844  
2845  	btrfs_free_path(path);
2846  	atomic_inc(&root->fs_info->defrag_running);
2847  
2848  	return new;
2849  
2850  out_free_path:
2851  	btrfs_free_path(path);
2852  out_kfree:
2853  	free_sa_defrag_extent(new);
2854  	return NULL;
2855  }
2856  
2857  static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2858  					 u64 start, u64 len)
2859  {
2860  	struct btrfs_block_group_cache *cache;
2861  
2862  	cache = btrfs_lookup_block_group(root->fs_info, start);
2863  	ASSERT(cache);
2864  
2865  	spin_lock(&cache->lock);
2866  	cache->delalloc_bytes -= len;
2867  	spin_unlock(&cache->lock);
2868  
2869  	btrfs_put_block_group(cache);
2870  }
2871  
2872  /* as ordered data IO finishes, this gets called so we can finish
2873   * an ordered extent if the range of bytes in the file it covers is
2874   * fully written.
2875   */
2876  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2877  {
2878  	struct inode *inode = ordered_extent->inode;
2879  	struct btrfs_root *root = BTRFS_I(inode)->root;
2880  	struct btrfs_trans_handle *trans = NULL;
2881  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2882  	struct extent_state *cached_state = NULL;
2883  	struct new_sa_defrag_extent *new = NULL;
2884  	int compress_type = 0;
2885  	int ret = 0;
2886  	u64 logical_len = ordered_extent->len;
2887  	bool nolock;
2888  	bool truncated = false;
2889  
2890  	nolock = btrfs_is_free_space_inode(inode);
2891  
2892  	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2893  		ret = -EIO;
2894  		goto out;
2895  	}
2896  
2897  	btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2898  				     ordered_extent->file_offset +
2899  				     ordered_extent->len - 1);
2900  
2901  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2902  		truncated = true;
2903  		logical_len = ordered_extent->truncated_len;
2904  		/* Truncated the entire extent, don't bother adding */
2905  		if (!logical_len)
2906  			goto out;
2907  	}
2908  
2909  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2910  		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2911  
2912  		/*
2913  		 * For the mwrite (mmap + memset to write) case, we still
2914  		 * reserve space for the NOCOW range.  As NOCOW won't cause
2915  		 * a new delayed ref, just free the space.
2916  		 */
2917  		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
2918  				       ordered_extent->len);
2919  		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2920  		if (nolock)
2921  			trans = btrfs_join_transaction_nolock(root);
2922  		else
2923  			trans = btrfs_join_transaction(root);
2924  		if (IS_ERR(trans)) {
2925  			ret = PTR_ERR(trans);
2926  			trans = NULL;
2927  			goto out;
2928  		}
2929  		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2930  		ret = btrfs_update_inode_fallback(trans, root, inode);
2931  		if (ret) /* -ENOMEM or corruption */
2932  			btrfs_abort_transaction(trans, ret);
2933  		goto out;
2934  	}
2935  
2936  	lock_extent_bits(io_tree, ordered_extent->file_offset,
2937  			 ordered_extent->file_offset + ordered_extent->len - 1,
2938  			 &cached_state);
2939  
2940  	ret = test_range_bit(io_tree, ordered_extent->file_offset,
2941  			ordered_extent->file_offset + ordered_extent->len - 1,
2942  			EXTENT_DEFRAG, 1, cached_state);
2943  	if (ret) {
2944  		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
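  		/*
  		 * The "0 &&" short-circuit below leaves snapshot-aware
  		 * defrag disabled: record_old_file_extents() is never
  		 * called and only the EXTENT_DEFRAG bits get cleared.
  		 */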
2945  		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2946  			/* the inode is shared */
2947  			new = record_old_file_extents(inode, ordered_extent);
2948  
2949  		clear_extent_bit(io_tree, ordered_extent->file_offset,
2950  			ordered_extent->file_offset + ordered_extent->len - 1,
2951  			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2952  	}
2953  
2954  	if (nolock)
2955  		trans = btrfs_join_transaction_nolock(root);
2956  	else
2957  		trans = btrfs_join_transaction(root);
2958  	if (IS_ERR(trans)) {
2959  		ret = PTR_ERR(trans);
2960  		trans = NULL;
2961  		goto out_unlock;
2962  	}
2963  
2964  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2965  
2966  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2967  		compress_type = ordered_extent->compress_type;
2968  	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2969  		BUG_ON(compress_type);
2970  		ret = btrfs_mark_extent_written(trans, inode,
2971  						ordered_extent->file_offset,
2972  						ordered_extent->file_offset +
2973  						logical_len);
2974  	} else {
2975  		BUG_ON(root == root->fs_info->tree_root);
2976  		ret = insert_reserved_file_extent(trans, inode,
2977  						ordered_extent->file_offset,
2978  						ordered_extent->start,
2979  						ordered_extent->disk_len,
2980  						logical_len, logical_len,
2981  						compress_type, 0, 0,
2982  						BTRFS_FILE_EXTENT_REG);
2983  		if (!ret)
2984  			btrfs_release_delalloc_bytes(root,
2985  						     ordered_extent->start,
2986  						     ordered_extent->disk_len);
2987  	}
2988  	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2989  			   ordered_extent->file_offset, ordered_extent->len,
2990  			   trans->transid);
2991  	if (ret < 0) {
2992  		btrfs_abort_transaction(trans, ret);
2993  		goto out_unlock;
2994  	}
2995  
2996  	add_pending_csums(trans, inode, ordered_extent->file_offset,
2997  			  &ordered_extent->list);
2998  
2999  	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3000  	ret = btrfs_update_inode_fallback(trans, root, inode);
3001  	if (ret) { /* -ENOMEM or corruption */
3002  		btrfs_abort_transaction(trans, ret);
3003  		goto out_unlock;
3004  	}
3005  	ret = 0;
3006  out_unlock:
3007  	unlock_extent_cached(io_tree, ordered_extent->file_offset,
3008  			     ordered_extent->file_offset +
3009  			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
3010  out:
3011  	if (root != root->fs_info->tree_root)
3012  		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
3013  	if (trans)
3014  		btrfs_end_transaction(trans, root);
3015  
3016  	if (ret || truncated) {
3017  		u64 start, end;
3018  
3019  		if (truncated)
3020  			start = ordered_extent->file_offset + logical_len;
3021  		else
3022  			start = ordered_extent->file_offset;
3023  		end = ordered_extent->file_offset + ordered_extent->len - 1;
3024  		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
3025  
3026  		/* Drop the cache for the part of the extent we didn't write. */
3027  		btrfs_drop_extent_cache(inode, start, end, 0);
3028  
3029  		/*
3030  		 * If the ordered extent had an IOERR or something else went
3031  		 * wrong we need to return the space for this ordered extent
3032  		 * back to the allocator.  We only free the extent in the
3033  		 * truncated case if we didn't write out the extent at all.
3034  		 */
3035  		if ((ret || !logical_len) &&
3036  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3037  		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3038  			btrfs_free_reserved_extent(root, ordered_extent->start,
3039  						   ordered_extent->disk_len, 1);
3040  	}
3041  
3042  
3043  	/*
3044  	 * This needs to be done to make sure anybody waiting knows we are done
3045  	 * updating everything for this ordered extent.
3046  	 */
3047  	btrfs_remove_ordered_extent(inode, ordered_extent);
3048  
3049  	/* for snapshot-aware defrag */
3050  	if (new) {
3051  		if (ret) {
3052  			free_sa_defrag_extent(new);
3053  			atomic_dec(&root->fs_info->defrag_running);
3054  		} else {
3055  			relink_file_extents(new);
3056  		}
3057  	}
3058  
3059  	/* once for us */
3060  	btrfs_put_ordered_extent(ordered_extent);
3061  	/* once for the tree */
3062  	btrfs_put_ordered_extent(ordered_extent);
3063  
3064  	return ret;
3065  }
3066  
3067  static void finish_ordered_fn(struct btrfs_work *work)
3068  {
3069  	struct btrfs_ordered_extent *ordered_extent;
3070  	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3071  	btrfs_finish_ordered_io(ordered_extent);
3072  }
3073  
3074  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3075  				struct extent_state *state, int uptodate)
3076  {
3077  	struct inode *inode = page->mapping->host;
3078  	struct btrfs_root *root = BTRFS_I(inode)->root;
3079  	struct btrfs_ordered_extent *ordered_extent = NULL;
3080  	struct btrfs_workqueue *wq;
3081  	btrfs_work_func_t func;
3082  
3083  	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3084  
3085  	ClearPagePrivate2(page);
3086  	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3087  					    end - start + 1, uptodate))
3088  		return 0;
3089  
3090  	if (btrfs_is_free_space_inode(inode)) {
3091  		wq = root->fs_info->endio_freespace_worker;
3092  		func = btrfs_freespace_write_helper;
3093  	} else {
3094  		wq = root->fs_info->endio_write_workers;
3095  		func = btrfs_endio_write_helper;
3096  	}
3097  
3098  	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3099  			NULL);
3100  	btrfs_queue_work(wq, &ordered_extent->work);
3101  
3102  	return 0;
3103  }
3104  
3105  static int __readpage_endio_check(struct inode *inode,
3106  				  struct btrfs_io_bio *io_bio,
3107  				  int icsum, struct page *page,
3108  				  int pgoff, u64 start, size_t len)
3109  {
3110  	char *kaddr;
3111  	u32 csum_expected;
3112  	u32 csum = ~(u32)0;
3113  
3114  	csum_expected = *(((u32 *)io_bio->csum) + icsum);
3115  
3116  	kaddr = kmap_atomic(page);
3117  	csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
3118  	btrfs_csum_final(csum, (char *)&csum);
3119  	if (csum != csum_expected)
3120  		goto zeroit;
3121  
3122  	kunmap_atomic(kaddr);
3123  	return 0;
3124  zeroit:
3125  	btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
3126  		"csum failed ino %llu off %llu csum %u expected csum %u",
3127  			   btrfs_ino(inode), start, csum, csum_expected);
3128  	memset(kaddr + pgoff, 1, len);
3129  	flush_dcache_page(page);
3130  	kunmap_atomic(kaddr);
3131  	if (csum_expected == 0)
3132  		return 0;
3133  	return -EIO;
3134  }
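/*
 * The csum above is crc32c seeded with ~0 and inverted by
 * btrfs_csum_final() before the comparison.  On a mismatch the page
 * range is overwritten with 0x01 bytes so stale disk contents are
 * never exposed; an expected csum of 0 is still not reported as an
 * error.
 */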
3135  
3136  /*
3137   * when reads are done, we need to check csums to verify the data is correct.
3138   * If there's a match, we allow the bio to finish.  If not, the code in
3139   * extent_io.c will try to find good copies for us.
3140   */
3141  static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3142  				      u64 phy_offset, struct page *page,
3143  				      u64 start, u64 end, int mirror)
3144  {
3145  	size_t offset = start - page_offset(page);
3146  	struct inode *inode = page->mapping->host;
3147  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148  	struct btrfs_root *root = BTRFS_I(inode)->root;
3149  
3150  	if (PageChecked(page)) {
3151  		ClearPageChecked(page);
3152  		return 0;
3153  	}
3154  
3155  	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3156  		return 0;
3157  
3158  	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3159  	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3160  		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3161  		return 0;
3162  	}
3163  
3164  	phy_offset >>= inode->i_sb->s_blocksize_bits;
3165  	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3166  				      start, (size_t)(end - start + 1));
3167  }
3168  
3169  void btrfs_add_delayed_iput(struct inode *inode)
3170  {
3171  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3172  	struct btrfs_inode *binode = BTRFS_I(inode);
3173  
3174  	if (atomic_add_unless(&inode->i_count, -1, 1))
3175  		return;
3176  
3177  	spin_lock(&fs_info->delayed_iput_lock);
3178  	if (binode->delayed_iput_count == 0) {
3179  		ASSERT(list_empty(&binode->delayed_iput));
3180  		list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3181  	} else {
3182  		binode->delayed_iput_count++;
3183  	}
3184  	spin_unlock(&fs_info->delayed_iput_lock);
3185  }
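/*
 * Delayed iputs let callers that cannot safely run the final iput
 * themselves (e.g. end-io completion or transaction commit context)
 * drop their reference: if i_count would hit zero, the inode is parked
 * on fs_info->delayed_iputs and the real iput happens later in
 * btrfs_run_delayed_iputs().
 */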
3186  
3187  void btrfs_run_delayed_iputs(struct btrfs_root *root)
3188  {
3189  	struct btrfs_fs_info *fs_info = root->fs_info;
3190  
3191  	spin_lock(&fs_info->delayed_iput_lock);
3192  	while (!list_empty(&fs_info->delayed_iputs)) {
3193  		struct btrfs_inode *inode;
3194  
3195  		inode = list_first_entry(&fs_info->delayed_iputs,
3196  				struct btrfs_inode, delayed_iput);
3197  		if (inode->delayed_iput_count) {
3198  			inode->delayed_iput_count--;
3199  			list_move_tail(&inode->delayed_iput,
3200  					&fs_info->delayed_iputs);
3201  		} else {
3202  			list_del_init(&inode->delayed_iput);
3203  		}
3204  		spin_unlock(&fs_info->delayed_iput_lock);
3205  		iput(&inode->vfs_inode);
3206  		spin_lock(&fs_info->delayed_iput_lock);
3207  	}
3208  	spin_unlock(&fs_info->delayed_iput_lock);
3209  }
3210  
3211  /*
3212   * This is called at transaction commit time. If there are no orphan
3213   * files in the subvolume, it removes orphan item and frees block_rsv
3214   * structure.
3215   */
3216  void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3217  			      struct btrfs_root *root)
3218  {
3219  	struct btrfs_block_rsv *block_rsv;
3220  	int ret;
3221  
3222  	if (atomic_read(&root->orphan_inodes) ||
3223  	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3224  		return;
3225  
3226  	spin_lock(&root->orphan_lock);
3227  	if (atomic_read(&root->orphan_inodes)) {
3228  		spin_unlock(&root->orphan_lock);
3229  		return;
3230  	}
3231  
3232  	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3233  		spin_unlock(&root->orphan_lock);
3234  		return;
3235  	}
3236  
3237  	block_rsv = root->orphan_block_rsv;
3238  	root->orphan_block_rsv = NULL;
3239  	spin_unlock(&root->orphan_lock);
3240  
3241  	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3242  	    btrfs_root_refs(&root->root_item) > 0) {
3243  		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
3244  					    root->root_key.objectid);
3245  		if (ret)
3246  			btrfs_abort_transaction(trans, ret);
3247  		else
3248  			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3249  				  &root->state);
3250  	}
3251  
3252  	if (block_rsv) {
3253  		WARN_ON(block_rsv->size > 0);
3254  		btrfs_free_block_rsv(root, block_rsv);
3255  	}
3256  }
3257  
3258  /*
3259   * This creates an orphan entry for the given inode in case something goes
3260   * wrong in the middle of an unlink/truncate.
3261   *
3262   * NOTE: the caller of this function should reserve 5 units of metadata
3263   *	 before calling it.
3264   */
3265  int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3266  {
3267  	struct btrfs_root *root = BTRFS_I(inode)->root;
3268  	struct btrfs_block_rsv *block_rsv = NULL;
3269  	int reserve = 0;
3270  	int insert = 0;
3271  	int ret;
3272  
3273  	if (!root->orphan_block_rsv) {
3274  		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3275  		if (!block_rsv)
3276  			return -ENOMEM;
3277  	}
3278  
3279  	spin_lock(&root->orphan_lock);
3280  	if (!root->orphan_block_rsv) {
3281  		root->orphan_block_rsv = block_rsv;
3282  	} else if (block_rsv) {
3283  		btrfs_free_block_rsv(root, block_rsv);
3284  		block_rsv = NULL;
3285  	}
3286  
3287  	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3288  			      &BTRFS_I(inode)->runtime_flags)) {
3289  #if 0
3290  		/*
3291  		 * For proper ENOSPC handling, we should do orphan
3292  		 * cleanup when mounting. But this introduces a backward
3293  		 * compatibility issue.
3294  		 */
3295  		if (!xchg(&root->orphan_item_inserted, 1))
3296  			insert = 2;
3297  		else
3298  			insert = 1;
3299  #endif
3300  		insert = 1;
3301  		atomic_inc(&root->orphan_inodes);
3302  	}
3303  
3304  	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3305  			      &BTRFS_I(inode)->runtime_flags))
3306  		reserve = 1;
3307  	spin_unlock(&root->orphan_lock);
3308  
3309  	/* grab metadata reservation from transaction handle */
3310  	if (reserve) {
3311  		ret = btrfs_orphan_reserve_metadata(trans, inode);
3312  		ASSERT(!ret);
3313  		if (ret) {
3314  			atomic_dec(&root->orphan_inodes);
3315  			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3316  				  &BTRFS_I(inode)->runtime_flags);
3317  			if (insert)
3318  				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3319  					  &BTRFS_I(inode)->runtime_flags);
3320  			return ret;
3321  		}
3322  	}
3323  
3324  	/* insert an orphan item to track this unlinked/truncated file */
3325  	if (insert >= 1) {
3326  		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3327  		if (ret) {
3328  			atomic_dec(&root->orphan_inodes);
3329  			if (reserve) {
3330  				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3331  					  &BTRFS_I(inode)->runtime_flags);
3332  				btrfs_orphan_release_metadata(inode);
3333  			}
3334  			if (ret != -EEXIST) {
3335  				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3336  					  &BTRFS_I(inode)->runtime_flags);
3337  				btrfs_abort_transaction(trans, ret);
3338  				return ret;
3339  			}
3340  		}
3341  		ret = 0;
3342  	}
3343  
3344  	/* insert an orphan item to record that the subvolume contains orphan files */
3345  	if (insert >= 2) {
3346  		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3347  					       root->root_key.objectid);
3348  		if (ret && ret != -EEXIST) {
3349  			btrfs_abort_transaction(trans, ret);
3350  			return ret;
3351  		}
3352  	}
3353  	return 0;
3354  }
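
/*
 * Illustrative note (added, not in the original source): the orphan item
 * inserted above lives in the subvolume tree and is keyed as
 *
 *	key.objectid = BTRFS_ORPHAN_OBJECTID;
 *	key.type     = BTRFS_ORPHAN_ITEM_KEY;
 *	key.offset   = btrfs_ino(inode);
 *
 * which is exactly the key shape btrfs_orphan_cleanup() searches for
 * below when replaying unfinished unlinks/truncates after a crash.
 */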
3355  
3356  /*
3357   * We have done the truncate/delete so we can go ahead and remove the orphan
3358   * item for this particular inode.
3359   */
3360  static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3361  			    struct inode *inode)
3362  {
3363  	struct btrfs_root *root = BTRFS_I(inode)->root;
3364  	int delete_item = 0;
3365  	int release_rsv = 0;
3366  	int ret = 0;
3367  
3368  	spin_lock(&root->orphan_lock);
3369  	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3370  			       &BTRFS_I(inode)->runtime_flags))
3371  		delete_item = 1;
3372  
3373  	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3374  			       &BTRFS_I(inode)->runtime_flags))
3375  		release_rsv = 1;
3376  	spin_unlock(&root->orphan_lock);
3377  
3378  	if (delete_item) {
3379  		atomic_dec(&root->orphan_inodes);
3380  		if (trans)
3381  			ret = btrfs_del_orphan_item(trans, root,
3382  						    btrfs_ino(inode));
3383  	}
3384  
3385  	if (release_rsv)
3386  		btrfs_orphan_release_metadata(inode);
3387  
3388  	return ret;
3389  }
3390  
3391  /*
3392   * this cleans up any orphans that may be left on the list from the last use
3393   * of this root.
3394   */
3395  int btrfs_orphan_cleanup(struct btrfs_root *root)
3396  {
3397  	struct btrfs_path *path;
3398  	struct extent_buffer *leaf;
3399  	struct btrfs_key key, found_key;
3400  	struct btrfs_trans_handle *trans;
3401  	struct inode *inode;
3402  	u64 last_objectid = 0;
3403  	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3404  
3405  	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3406  		return 0;
3407  
3408  	path = btrfs_alloc_path();
3409  	if (!path) {
3410  		ret = -ENOMEM;
3411  		goto out;
3412  	}
3413  	path->reada = READA_BACK;
3414  
3415  	key.objectid = BTRFS_ORPHAN_OBJECTID;
3416  	key.type = BTRFS_ORPHAN_ITEM_KEY;
3417  	key.offset = (u64)-1;
3418  
3419  	while (1) {
3420  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3421  		if (ret < 0)
3422  			goto out;
3423  
3424  		/*
3425  		 * ret == 0 means we found what we were searching for, which
3426  		 * is weird, but possible; so only adjust the path if we didn't
3427  		 * find the key, and see if we have stuff that matches
3428  		 */
3429  		if (ret > 0) {
3430  			ret = 0;
3431  			if (path->slots[0] == 0)
3432  				break;
3433  			path->slots[0]--;
3434  		}
3435  
3436  		/* pull out the item */
3437  		leaf = path->nodes[0];
3438  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3439  
3440  		/* make sure the item matches what we want */
3441  		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3442  			break;
3443  		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3444  			break;
3445  
3446  		/* release the path since we're done with it */
3447  		btrfs_release_path(path);
3448  
3449  		/*
3450  		 * this is basically a btrfs_lookup, without the
3451  		 * crossing-root thing.  we store the inode number in the
3452  		 * offset of the orphan item.
3453  		 */
3454  
3455  		if (found_key.offset == last_objectid) {
3456  			btrfs_err(root->fs_info,
3457  				"Error removing orphan entry, stopping orphan cleanup");
3458  			ret = -EINVAL;
3459  			goto out;
3460  		}
3461  
3462  		last_objectid = found_key.offset;
3463  
3464  		found_key.objectid = found_key.offset;
3465  		found_key.type = BTRFS_INODE_ITEM_KEY;
3466  		found_key.offset = 0;
3467  		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3468  		ret = PTR_ERR_OR_ZERO(inode);
3469  		if (ret && ret != -ENOENT)
3470  			goto out;
3471  
3472  		if (ret == -ENOENT && root == root->fs_info->tree_root) {
3473  			struct btrfs_root *dead_root;
3474  			struct btrfs_fs_info *fs_info = root->fs_info;
3475  			int is_dead_root = 0;
3476  
3477  			/*
3478  			 * this is an orphan in the tree root. Currently these
3479  			 * could come from 2 sources:
3480  			 *  a) a snapshot deletion in progress
3481  			 *  b) a free space cache inode
3482  			 * We need to distinguish those two, as the snapshot
3483  			 * orphan must not get deleted.
3484  			 * find_dead_roots already ran before us, so if this
3485  			 * is a snapshot deletion, we should find the root
3486  			 * in the dead_roots list
3487  			 */
3488  			spin_lock(&fs_info->trans_lock);
3489  			list_for_each_entry(dead_root, &fs_info->dead_roots,
3490  					    root_list) {
3491  				if (dead_root->root_key.objectid ==
3492  				    found_key.objectid) {
3493  					is_dead_root = 1;
3494  					break;
3495  				}
3496  			}
3497  			spin_unlock(&fs_info->trans_lock);
3498  			if (is_dead_root) {
3499  				/* prevent this orphan from being found again */
3500  				key.offset = found_key.objectid - 1;
3501  				continue;
3502  			}
3503  		}
3504  		/*
3505  		 * Inode is already gone but the orphan item is still there,
3506  		 * kill the orphan item.
3507  		 */
3508  		if (ret == -ENOENT) {
3509  			trans = btrfs_start_transaction(root, 1);
3510  			if (IS_ERR(trans)) {
3511  				ret = PTR_ERR(trans);
3512  				goto out;
3513  			}
3514  			btrfs_debug(root->fs_info, "auto deleting %Lu",
3515  				found_key.objectid);
3516  			ret = btrfs_del_orphan_item(trans, root,
3517  						    found_key.objectid);
3518  			btrfs_end_transaction(trans, root);
3519  			if (ret)
3520  				goto out;
3521  			continue;
3522  		}
3523  
3524  		/*
3525  		 * add this inode to the orphan list so btrfs_orphan_del does
3526  		 * the proper thing when we hit it
3527  		 */
3528  		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3529  			&BTRFS_I(inode)->runtime_flags);
3530  		atomic_inc(&root->orphan_inodes);
3531  
3532  		/* if we have links, this was a truncate, let's do that */
3533  		if (inode->i_nlink) {
3534  			if (WARN_ON(!S_ISREG(inode->i_mode))) {
3535  				iput(inode);
3536  				continue;
3537  			}
3538  			nr_truncate++;
3539  
3540  			/* 1 for the orphan item deletion. */
3541  			trans = btrfs_start_transaction(root, 1);
3542  			if (IS_ERR(trans)) {
3543  				iput(inode);
3544  				ret = PTR_ERR(trans);
3545  				goto out;
3546  			}
3547  			ret = btrfs_orphan_add(trans, inode);
3548  			btrfs_end_transaction(trans, root);
3549  			if (ret) {
3550  				iput(inode);
3551  				goto out;
3552  			}
3553  
3554  			ret = btrfs_truncate(inode);
3555  			if (ret)
3556  				btrfs_orphan_del(NULL, inode);
3557  		} else {
3558  			nr_unlink++;
3559  		}
3560  
3561  		/* this will do delete_inode and everything for us */
3562  		iput(inode);
3563  		if (ret)
3564  			goto out;
3565  	}
3566  	/* release the path since we're done with it */
3567  	btrfs_release_path(path);
3568  
3569  	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3570  
3571  	if (root->orphan_block_rsv)
3572  		btrfs_block_rsv_release(root, root->orphan_block_rsv,
3573  					(u64)-1);
3574  
3575  	if (root->orphan_block_rsv ||
3576  	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3577  		trans = btrfs_join_transaction(root);
3578  		if (!IS_ERR(trans))
3579  			btrfs_end_transaction(trans, root);
3580  	}
3581  
3582  	if (nr_unlink)
3583  		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3584  	if (nr_truncate)
3585  		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3586  
3587  out:
3588  	if (ret)
3589  		btrfs_err(root->fs_info,
3590  			"could not do orphan cleanup %d", ret);
3591  	btrfs_free_path(path);
3592  	return ret;
3593  }
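
/*
 * Note (added): the cmpxchg on orphan_cleanup_state at the top makes the
 * cleanup idempotent -- only the first caller moves the state from 0 to
 * ORPHAN_CLEANUP_STARTED and does the work; any concurrent or repeated
 * caller sees a non-zero state and returns 0 immediately.
 */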
3594  
3595  /*
3596   * very simple check to peek ahead in the leaf looking for xattrs.  If we
3597   * don't find any xattrs, we know there can't be any acls.
3598   *
3599   * slot is the slot the inode is in, objectid is the objectid of the inode
3600   */
3601  static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3602  					  int slot, u64 objectid,
3603  					  int *first_xattr_slot)
3604  {
3605  	u32 nritems = btrfs_header_nritems(leaf);
3606  	struct btrfs_key found_key;
3607  	static u64 xattr_access = 0;
3608  	static u64 xattr_default = 0;
3609  	int scanned = 0;
3610  
3611  	if (!xattr_access) {
3612  		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3613  					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3614  		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3615  					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3616  	}
3617  
3618  	slot++;
3619  	*first_xattr_slot = -1;
3620  	while (slot < nritems) {
3621  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3622  
3623  		/* we found a different objectid, there must not be acls */
3624  		if (found_key.objectid != objectid)
3625  			return 0;
3626  
3627  		/* we found an xattr, assume we've got an acl */
3628  		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3629  			if (*first_xattr_slot == -1)
3630  				*first_xattr_slot = slot;
3631  			if (found_key.offset == xattr_access ||
3632  			    found_key.offset == xattr_default)
3633  				return 1;
3634  		}
3635  
3636  		/*
3637  		 * we found a key greater than an xattr key, there can't
3638  		 * be any acls later on
3639  		 */
3640  		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3641  			return 0;
3642  
3643  		slot++;
3644  		scanned++;
3645  
3646  		/*
3647  		 * it goes inode, inode backrefs, xattrs, extents,
3648  		 * so if there are a ton of hard links to an inode there can
3649  		 * be a lot of backrefs.  Don't waste time searching too hard,
3650  		 * this is just an optimization
3651  		 */
3652  		if (scanned >= 8)
3653  			break;
3654  	}
3655  	/* we hit the end of the leaf before we found an xattr or
3656  	 * something larger than an xattr.  We have to assume the inode
3657  	 * has acls
3658  	 */
3659  	if (*first_xattr_slot == -1)
3660  		*first_xattr_slot = slot;
3661  	return 1;
3662  }
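
/*
 * Illustrative example (added): the item ordering within a leaf that the
 * scan above relies on, for a single inode:
 *
 *	(ino, INODE_ITEM,  0)
 *	(ino, INODE_REF,   parent)	// can be many with hard links
 *	(ino, XATTR_ITEM,  name_hash)	// acls live here, if present
 *	(ino, EXTENT_DATA, file_offset)
 *
 * Keys sort by (objectid, type, offset), so once a key with a type
 * greater than BTRFS_XATTR_ITEM_KEY shows up, no acl xattr can follow.
 */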
3663  
3664  /*
3665   * read an inode from the btree into the in-memory inode
3666   */
3667  static int btrfs_read_locked_inode(struct inode *inode)
3668  {
3669  	struct btrfs_path *path;
3670  	struct extent_buffer *leaf;
3671  	struct btrfs_inode_item *inode_item;
3672  	struct btrfs_root *root = BTRFS_I(inode)->root;
3673  	struct btrfs_key location;
3674  	unsigned long ptr;
3675  	int maybe_acls;
3676  	u32 rdev;
3677  	int ret;
3678  	bool filled = false;
3679  	int first_xattr_slot;
3680  
3681  	ret = btrfs_fill_inode(inode, &rdev);
3682  	if (!ret)
3683  		filled = true;
3684  
3685  	path = btrfs_alloc_path();
3686  	if (!path) {
3687  		ret = -ENOMEM;
3688  		goto make_bad;
3689  	}
3690  
3691  	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3692  
3693  	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3694  	if (ret) {
3695  		if (ret > 0)
3696  			ret = -ENOENT;
3697  		goto make_bad;
3698  	}
3699  
3700  	leaf = path->nodes[0];
3701  
3702  	if (filled)
3703  		goto cache_index;
3704  
3705  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3706  				    struct btrfs_inode_item);
3707  	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3708  	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3709  	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3710  	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3711  	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3712  
3713  	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3714  	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3715  
3716  	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3717  	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3718  
3719  	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3720  	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3721  
3722  	BTRFS_I(inode)->i_otime.tv_sec =
3723  		btrfs_timespec_sec(leaf, &inode_item->otime);
3724  	BTRFS_I(inode)->i_otime.tv_nsec =
3725  		btrfs_timespec_nsec(leaf, &inode_item->otime);
3726  
3727  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3728  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3729  	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3730  
3731  	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3732  	inode->i_generation = BTRFS_I(inode)->generation;
3733  	inode->i_rdev = 0;
3734  	rdev = btrfs_inode_rdev(leaf, inode_item);
3735  
3736  	BTRFS_I(inode)->index_cnt = (u64)-1;
3737  	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3738  
3739  cache_index:
3740  	/*
3741  	 * If we were modified in the current generation and evicted from memory
3742  	 * and then re-read we need to do a full sync since we don't have any
3743  	 * idea about which extents were modified before we were evicted from
3744  	 * cache.
3745  	 *
3746  	 * This is required for both inode re-read from disk and delayed inode
3747  	 * in delayed_nodes_tree.
3748  	 */
3749  	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3750  		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3751  			&BTRFS_I(inode)->runtime_flags);
3752  
3753  	/*
3754  	 * We don't persist the id of the transaction where an unlink operation
3755  	 * against the inode was last made. So here we assume the inode might
3756  	 * have been evicted, and therefore the exact value of last_unlink_trans
3757  	 * have been evicted, and therefore the exact value of last_unlink_trans
3758  	 * was lost, so we set it to last_trans to avoid metadata inconsistencies
3759  	 * replayed. For example, in the scenario:
3760  	 *
3761  	 * touch mydir/foo
3762  	 * ln mydir/foo mydir/bar
3763  	 * sync
3764  	 * unlink mydir/bar
3765  	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3766  	 * xfs_io -c fsync mydir/foo
3767  	 * <power failure>
3768  	 * mount fs, triggers fsync log replay
3769  	 *
3770  	 * We must make sure that when we fsync our inode foo we also log its
3771  	 * parent inode, otherwise after log replay the parent still has the
3772  	 * dentry with the "bar" name but our inode foo has a link count of 1
3773  	 * and doesn't have an inode ref with the name "bar" anymore.
3774  	 *
3775  	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3776  	 * but it guarantees correctness at the expense of occasional full
3777  	 * transaction commits on fsync if our inode is a directory, or if our
3778  	 * inode is not a directory, logging its parent unnecessarily.
3779  	 */
3780  	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3781  
3782  	path->slots[0]++;
3783  	if (inode->i_nlink != 1 ||
3784  	    path->slots[0] >= btrfs_header_nritems(leaf))
3785  		goto cache_acl;
3786  
3787  	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3788  	if (location.objectid != btrfs_ino(inode))
3789  		goto cache_acl;
3790  
3791  	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3792  	if (location.type == BTRFS_INODE_REF_KEY) {
3793  		struct btrfs_inode_ref *ref;
3794  
3795  		ref = (struct btrfs_inode_ref *)ptr;
3796  		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3797  	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3798  		struct btrfs_inode_extref *extref;
3799  
3800  		extref = (struct btrfs_inode_extref *)ptr;
3801  		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3802  								     extref);
3803  	}
3804  cache_acl:
3805  	/*
3806  	 * try to precache a NULL acl entry for files that don't have
3807  	 * any xattrs or acls
3808  	 */
3809  	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3810  					   btrfs_ino(inode), &first_xattr_slot);
3811  	if (first_xattr_slot != -1) {
3812  		path->slots[0] = first_xattr_slot;
3813  		ret = btrfs_load_inode_props(inode, path);
3814  		if (ret)
3815  			btrfs_err(root->fs_info,
3816  				  "error loading props for ino %llu (root %llu): %d",
3817  				  btrfs_ino(inode),
3818  				  root->root_key.objectid, ret);
3819  	}
3820  	btrfs_free_path(path);
3821  
3822  	if (!maybe_acls)
3823  		cache_no_acl(inode);
3824  
3825  	switch (inode->i_mode & S_IFMT) {
3826  	case S_IFREG:
3827  		inode->i_mapping->a_ops = &btrfs_aops;
3828  		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3829  		inode->i_fop = &btrfs_file_operations;
3830  		inode->i_op = &btrfs_file_inode_operations;
3831  		break;
3832  	case S_IFDIR:
3833  		inode->i_fop = &btrfs_dir_file_operations;
3834  		inode->i_op = &btrfs_dir_inode_operations;
3835  		break;
3836  	case S_IFLNK:
3837  		inode->i_op = &btrfs_symlink_inode_operations;
3838  		inode_nohighmem(inode);
3839  		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3840  		break;
3841  	default:
3842  		inode->i_op = &btrfs_special_inode_operations;
3843  		init_special_inode(inode, inode->i_mode, rdev);
3844  		break;
3845  	}
3846  
3847  	btrfs_update_iflags(inode);
3848  	return 0;
3849  
3850  make_bad:
3851  	btrfs_free_path(path);
3852  	make_bad_inode(inode);
3853  	return ret;
3854  }
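
/*
 * Note (added): the slot peek before cache_acl only pays off for inodes
 * with a single link -- in that case there is exactly one INODE_REF (or
 * EXTREF) item, stored right after the inode item in the same leaf, so
 * its index can be cached in dir_index, which later spares
 * __btrfs_unlink_inode() a separate inode ref lookup.
 */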
3855  
3856  /*
3857   * given a leaf and an inode, copy the inode fields into the leaf
3858   */
3859  static void fill_inode_item(struct btrfs_trans_handle *trans,
3860  			    struct extent_buffer *leaf,
3861  			    struct btrfs_inode_item *item,
3862  			    struct inode *inode)
3863  {
3864  	struct btrfs_map_token token;
3865  
3866  	btrfs_init_map_token(&token);
3867  
3868  	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3869  	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3870  	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3871  				   &token);
3872  	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3873  	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3874  
3875  	btrfs_set_token_timespec_sec(leaf, &item->atime,
3876  				     inode->i_atime.tv_sec, &token);
3877  	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3878  				      inode->i_atime.tv_nsec, &token);
3879  
3880  	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3881  				     inode->i_mtime.tv_sec, &token);
3882  	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3883  				      inode->i_mtime.tv_nsec, &token);
3884  
3885  	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3886  				     inode->i_ctime.tv_sec, &token);
3887  	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3888  				      inode->i_ctime.tv_nsec, &token);
3889  
3890  	btrfs_set_token_timespec_sec(leaf, &item->otime,
3891  				     BTRFS_I(inode)->i_otime.tv_sec, &token);
3892  	btrfs_set_token_timespec_nsec(leaf, &item->otime,
3893  				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
3894  
3895  	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3896  				     &token);
3897  	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3898  					 &token);
3899  	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3900  	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3901  	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3902  	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3903  	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3904  }
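
/*
 * Note (added): the btrfs_map_token passed to the btrfs_set_token_*()
 * helpers above caches the mapped extent buffer page between calls, so
 * filling every field of the inode item does not redo the page lookup
 * and mapping once per field.
 */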
3905  
3906  /*
3907   * copy everything in the in-memory inode into the btree.
3908   */
3909  static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3910  				struct btrfs_root *root, struct inode *inode)
3911  {
3912  	struct btrfs_inode_item *inode_item;
3913  	struct btrfs_path *path;
3914  	struct extent_buffer *leaf;
3915  	int ret;
3916  
3917  	path = btrfs_alloc_path();
3918  	if (!path)
3919  		return -ENOMEM;
3920  
3921  	path->leave_spinning = 1;
3922  	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3923  				 1);
3924  	if (ret) {
3925  		if (ret > 0)
3926  			ret = -ENOENT;
3927  		goto failed;
3928  	}
3929  
3930  	leaf = path->nodes[0];
3931  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3932  				    struct btrfs_inode_item);
3933  
3934  	fill_inode_item(trans, leaf, inode_item, inode);
3935  	btrfs_mark_buffer_dirty(leaf);
3936  	btrfs_set_inode_last_trans(trans, inode);
3937  	ret = 0;
3938  failed:
3939  	btrfs_free_path(path);
3940  	return ret;
3941  }
3942  
3943  /*
3944   * copy everything in the in-memory inode into the btree.
3945   */
3946  noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3947  				struct btrfs_root *root, struct inode *inode)
3948  {
3949  	int ret;
3950  
3951  	/*
3952  	 * If the inode is a free space inode, we can deadlock during commit
3953  	 * if we put it into the delayed code.
3954  	 *
3955  	 * The data relocation inode should also be directly updated
3956  	 * without delay
3957  	 */
3958  	if (!btrfs_is_free_space_inode(inode)
3959  	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3960  	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
3961  		btrfs_update_root_times(trans, root);
3962  
3963  		ret = btrfs_delayed_update_inode(trans, root, inode);
3964  		if (!ret)
3965  			btrfs_set_inode_last_trans(trans, inode);
3966  		return ret;
3967  	}
3968  
3969  	return btrfs_update_inode_item(trans, root, inode);
3970  }
3971  
3972  noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3973  					 struct btrfs_root *root,
3974  					 struct inode *inode)
3975  {
3976  	int ret;
3977  
3978  	ret = btrfs_update_inode(trans, root, inode);
3979  	if (ret == -ENOSPC)
3980  		return btrfs_update_inode_item(trans, root, inode);
3981  	return ret;
3982  }
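
/*
 * Note (added): the fallback exists because the delayed-inode path taken
 * by btrfs_update_inode() can fail with -ENOSPC when it cannot get its
 * own metadata reservation; callers that cannot tolerate that failure
 * then update the item in place through btrfs_update_inode_item(), which
 * only needs the reservation already held by the transaction handle.
 */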
3983  
3984  /*
3985   * unlink helper that gets used here in inode.c and in the tree logging
3986   * recovery code.  It remove a link in a directory with a given name, and
3987   * recovery code.  It removes a link in a directory with a given name, and
3988   */
3989  static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3990  				struct btrfs_root *root,
3991  				struct inode *dir, struct inode *inode,
3992  				const char *name, int name_len)
3993  {
3994  	struct btrfs_path *path;
3995  	int ret = 0;
3996  	struct extent_buffer *leaf;
3997  	struct btrfs_dir_item *di;
3998  	struct btrfs_key key;
3999  	u64 index;
4000  	u64 ino = btrfs_ino(inode);
4001  	u64 dir_ino = btrfs_ino(dir);
4002  
4003  	path = btrfs_alloc_path();
4004  	if (!path) {
4005  		ret = -ENOMEM;
4006  		goto out;
4007  	}
4008  
4009  	path->leave_spinning = 1;
4010  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4011  				    name, name_len, -1);
4012  	if (IS_ERR(di)) {
4013  		ret = PTR_ERR(di);
4014  		goto err;
4015  	}
4016  	if (!di) {
4017  		ret = -ENOENT;
4018  		goto err;
4019  	}
4020  	leaf = path->nodes[0];
4021  	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4022  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4023  	if (ret)
4024  		goto err;
4025  	btrfs_release_path(path);
4026  
4027  	/*
4028  	 * If we don't have a cached dir index, we have to get it by looking
4029  	 * up the inode ref; and since we fetch the inode ref anyway, we
4030  	 * remove it directly, making delayed deletion unnecessary.
4031  	 *
4032  	 * But if we do have the dir index, there is no need to search the
4033  	 * inode ref to get it.  Since the inode ref sits close to the inode
4034  	 * item, it is better to delay its deletion and do it when we update
4035  	 * the inode item.
4036  	 */
4037  	if (BTRFS_I(inode)->dir_index) {
4038  		ret = btrfs_delayed_delete_inode_ref(inode);
4039  		if (!ret) {
4040  			index = BTRFS_I(inode)->dir_index;
4041  			goto skip_backref;
4042  		}
4043  	}
4044  
4045  	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4046  				  dir_ino, &index);
4047  	if (ret) {
4048  		btrfs_info(root->fs_info,
4049  			"failed to delete reference to %.*s, inode %llu parent %llu",
4050  			name_len, name, ino, dir_ino);
4051  		btrfs_abort_transaction(trans, ret);
4052  		goto err;
4053  	}
4054  skip_backref:
4055  	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
4056  	if (ret) {
4057  		btrfs_abort_transaction(trans, ret);
4058  		goto err;
4059  	}
4060  
4061  	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
4062  					 inode, dir_ino);
4063  	if (ret != 0 && ret != -ENOENT) {
4064  		btrfs_abort_transaction(trans, ret);
4065  		goto err;
4066  	}
4067  
4068  	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
4069  					   dir, index);
4070  	if (ret == -ENOENT)
4071  		ret = 0;
4072  	else if (ret)
4073  		btrfs_abort_transaction(trans, ret);
4074  err:
4075  	btrfs_free_path(path);
4076  	if (ret)
4077  		goto out;
4078  
4079  	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4080  	inode_inc_iversion(inode);
4081  	inode_inc_iversion(dir);
4082  	inode->i_ctime = dir->i_mtime =
4083  		dir->i_ctime = current_time(inode);
4084  	ret = btrfs_update_inode(trans, root, dir);
4085  out:
4086  	return ret;
4087  }
4088  
4089  int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4090  		       struct btrfs_root *root,
4091  		       struct inode *dir, struct inode *inode,
4092  		       const char *name, int name_len)
4093  {
4094  	int ret;
4095  	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4096  	if (!ret) {
4097  		drop_nlink(inode);
4098  		ret = btrfs_update_inode(trans, root, inode);
4099  	}
4100  	return ret;
4101  }
4102  
4103  /*
4104   * helper to start transaction for unlink and rmdir.
4105   *
4106   * unlink and rmdir are special in btrfs: they do not always free space, so
4107   * if we cannot make our reservations the normal way we try to see if there
4108   * is plenty of slack room in the global reserve to migrate to; otherwise we
4109   * cannot allow the unlink to occur.
4110   */
4111  static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4112  {
4113  	struct btrfs_root *root = BTRFS_I(dir)->root;
4114  
4115  	/*
4116  	 * 1 for the possible orphan item
4117  	 * 1 for the dir item
4118  	 * 1 for the dir index
4119  	 * 1 for the inode ref
4120  	 * 1 for the inode
4121  	 */
4122  	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4123  }
4124  
4125  static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4126  {
4127  	struct btrfs_root *root = BTRFS_I(dir)->root;
4128  	struct btrfs_trans_handle *trans;
4129  	struct inode *inode = d_inode(dentry);
4130  	int ret;
4131  
4132  	trans = __unlink_start_trans(dir);
4133  	if (IS_ERR(trans))
4134  		return PTR_ERR(trans);
4135  
4136  	btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
4137  
4138  	ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
4139  				 dentry->d_name.name, dentry->d_name.len);
4140  	if (ret)
4141  		goto out;
4142  
4143  	if (inode->i_nlink == 0) {
4144  		ret = btrfs_orphan_add(trans, inode);
4145  		if (ret)
4146  			goto out;
4147  	}
4148  
4149  out:
4150  	btrfs_end_transaction(trans, root);
4151  	btrfs_btree_balance_dirty(root);
4152  	return ret;
4153  }
4154  
4155  int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4156  			struct btrfs_root *root,
4157  			struct inode *dir, u64 objectid,
4158  			const char *name, int name_len)
4159  {
4160  	struct btrfs_path *path;
4161  	struct extent_buffer *leaf;
4162  	struct btrfs_dir_item *di;
4163  	struct btrfs_key key;
4164  	u64 index;
4165  	int ret;
4166  	u64 dir_ino = btrfs_ino(dir);
4167  
4168  	path = btrfs_alloc_path();
4169  	if (!path)
4170  		return -ENOMEM;
4171  
4172  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4173  				   name, name_len, -1);
4174  	if (IS_ERR_OR_NULL(di)) {
4175  		if (!di)
4176  			ret = -ENOENT;
4177  		else
4178  			ret = PTR_ERR(di);
4179  		goto out;
4180  	}
4181  
4182  	leaf = path->nodes[0];
4183  	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4184  	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4185  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4186  	if (ret) {
4187  		btrfs_abort_transaction(trans, ret);
4188  		goto out;
4189  	}
4190  	btrfs_release_path(path);
4191  
4192  	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
4193  				 objectid, root->root_key.objectid,
4194  				 dir_ino, &index, name, name_len);
4195  	if (ret < 0) {
4196  		if (ret != -ENOENT) {
4197  			btrfs_abort_transaction(trans, ret);
4198  			goto out;
4199  		}
4200  		di = btrfs_search_dir_index_item(root, path, dir_ino,
4201  						 name, name_len);
4202  		if (IS_ERR_OR_NULL(di)) {
4203  			if (!di)
4204  				ret = -ENOENT;
4205  			else
4206  				ret = PTR_ERR(di);
4207  			btrfs_abort_transaction(trans, ret);
4208  			goto out;
4209  		}
4210  
4211  		leaf = path->nodes[0];
4212  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4213  		btrfs_release_path(path);
4214  		index = key.offset;
4215  	}
4216  	btrfs_release_path(path);
4217  
4218  	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
4219  	if (ret) {
4220  		btrfs_abort_transaction(trans, ret);
4221  		goto out;
4222  	}
4223  
4224  	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4225  	inode_inc_iversion(dir);
4226  	dir->i_mtime = dir->i_ctime = current_time(dir);
4227  	ret = btrfs_update_inode_fallback(trans, root, dir);
4228  	if (ret)
4229  		btrfs_abort_transaction(trans, ret);
4230  out:
4231  	btrfs_free_path(path);
4232  	return ret;
4233  }
4234  
4235  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4236  {
4237  	struct inode *inode = d_inode(dentry);
4238  	int err = 0;
4239  	struct btrfs_root *root = BTRFS_I(dir)->root;
4240  	struct btrfs_trans_handle *trans;
4241  	u64 last_unlink_trans;
4242  
4243  	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4244  		return -ENOTEMPTY;
4245  	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
4246  		return -EPERM;
4247  
4248  	trans = __unlink_start_trans(dir);
4249  	if (IS_ERR(trans))
4250  		return PTR_ERR(trans);
4251  
4252  	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4253  		err = btrfs_unlink_subvol(trans, root, dir,
4254  					  BTRFS_I(inode)->location.objectid,
4255  					  dentry->d_name.name,
4256  					  dentry->d_name.len);
4257  		goto out;
4258  	}
4259  
4260  	err = btrfs_orphan_add(trans, inode);
4261  	if (err)
4262  		goto out;
4263  
4264  	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4265  
4266  	/* now the directory is empty */
4267  	err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
4268  				 dentry->d_name.name, dentry->d_name.len);
4269  	if (!err) {
4270  		btrfs_i_size_write(inode, 0);
4271  		/*
4272  		 * Propagate the last_unlink_trans value of the deleted dir to
4273  		 * its parent directory. This is to prevent an unrecoverable
4274  		 * log tree in the case we do something like this:
4275  		 * 1) create dir foo
4276  		 * 2) create snapshot under dir foo
4277  		 * 3) delete the snapshot
4278  		 * 4) rmdir foo
4279  		 * 5) mkdir foo
4280  		 * 6) fsync foo or some file inside foo
4281  		 */
4282  		if (last_unlink_trans >= trans->transid)
4283  			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4284  	}
4285  out:
4286  	btrfs_end_transaction(trans, root);
4287  	btrfs_btree_balance_dirty(root);
4288  
4289  	return err;
4290  }
4291  
4292  static int truncate_space_check(struct btrfs_trans_handle *trans,
4293  				struct btrfs_root *root,
4294  				u64 bytes_deleted)
4295  {
4296  	int ret;
4297  
4298  	/*
4299  	 * This is only used to apply pressure to the enospc system, we don't
4300  	 * intend to use this reservation at all.
4301  	 */
4302  	bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4303  	bytes_deleted *= root->nodesize;
4304  	ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4305  				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4306  	if (!ret) {
4307  		trace_btrfs_space_reservation(root->fs_info, "transaction",
4308  					      trans->transid,
4309  					      bytes_deleted, 1);
4310  		trans->bytes_reserved += bytes_deleted;
4311  	}
4312  	return ret;
4313  
4314  }
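
/*
 * Note (added): the amount reserved here is only a heuristic -- the csum
 * bytes covering the deleted range converted to leaves, times the node
 * size.  Adding it to trans->bytes_reserved makes the enospc machinery
 * account for the metadata the pending delayed refs will dirty; the
 * reservation itself is given back when the transaction handle ends.
 */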
4315  
4316  static int truncate_inline_extent(struct inode *inode,
4317  				  struct btrfs_path *path,
4318  				  struct btrfs_key *found_key,
4319  				  const u64 item_end,
4320  				  const u64 new_size)
4321  {
4322  	struct extent_buffer *leaf = path->nodes[0];
4323  	int slot = path->slots[0];
4324  	struct btrfs_file_extent_item *fi;
4325  	u32 size = (u32)(new_size - found_key->offset);
4326  	struct btrfs_root *root = BTRFS_I(inode)->root;
4327  
4328  	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4329  
4330  	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4331  		loff_t offset = new_size;
4332  		loff_t page_end = ALIGN(offset, PAGE_SIZE);
4333  
4334  		/*
4335  		 * Zero out the remainder of the last page of our inline extent,
4336  		 * instead of directly truncating our inline extent here - that
4337  		 * would be much more complex (decompressing all the data, then
4338  		 * compressing the truncated data, which might be bigger than
4339  		 * the size of the inline extent, resize the extent, etc).
4340  		 * We release the path because to get the page we might need to
4341  		 * read the extent item from disk (data not in the page cache).
4342  		 */
4343  		btrfs_release_path(path);
4344  		return btrfs_truncate_block(inode, offset, page_end - offset,
4345  					0);
4346  	}
4347  
4348  	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4349  	size = btrfs_file_extent_calc_inline_size(size);
4350  	btrfs_truncate_item(root, path, size, 1);
4351  
4352  	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4353  		inode_sub_bytes(inode, item_end + 1 - new_size);
4354  
4355  	return 0;
4356  }
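
/*
 * Note (added): in the uncompressed case above the item is shrunk in
 * place to the inline header plus `size` bytes of data, while compressed
 * inline extents are instead handled by zeroing the tail of the last
 * page, since truly shrinking them would require a decompress/recompress
 * cycle.
 */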
4357  
4358  /*
4359   * this can truncate away extent items, csum items and directory items.
4360   * It starts at a high offset and removes keys until it can't find
4361   * any higher than new_size
4362   *
4363   * csum items that cross the new i_size are truncated to the new size
4364   * as well.
4365   *
4366   * min_type is the minimum key type to truncate down to.  If set to 0, this
4367   * will kill all the items on this inode, including the INODE_ITEM_KEY.
4368   */
4369  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4370  			       struct btrfs_root *root,
4371  			       struct inode *inode,
4372  			       u64 new_size, u32 min_type)
4373  {
4374  	struct btrfs_path *path;
4375  	struct extent_buffer *leaf;
4376  	struct btrfs_file_extent_item *fi;
4377  	struct btrfs_key key;
4378  	struct btrfs_key found_key;
4379  	u64 extent_start = 0;
4380  	u64 extent_num_bytes = 0;
4381  	u64 extent_offset = 0;
4382  	u64 item_end = 0;
4383  	u64 last_size = new_size;
4384  	u32 found_type = (u8)-1;
4385  	int found_extent;
4386  	int del_item;
4387  	int pending_del_nr = 0;
4388  	int pending_del_slot = 0;
4389  	int extent_type = -1;
4390  	int ret;
4391  	int err = 0;
4392  	u64 ino = btrfs_ino(inode);
4393  	u64 bytes_deleted = 0;
4394  	bool be_nice = 0;
4395  	bool should_throttle = 0;
4396  	bool should_end = 0;
4397  
4398  	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4399  
4400  	/*
4401  	 * for non-free space inodes and ref cows, we want to back off from
4402  	 * time to time
4403  	 */
4404  	if (!btrfs_is_free_space_inode(inode) &&
4405  	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4406  		be_nice = 1;
4407  
4408  	path = btrfs_alloc_path();
4409  	if (!path)
4410  		return -ENOMEM;
4411  	path->reada = READA_BACK;
4412  
4413  	/*
4414  	 * We want to drop from the next block forward in case this new size is
4415  	 * not block aligned since we will be keeping the last block of the
4416  	 * extent just the way it is.
4417  	 */
4418  	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4419  	    root == root->fs_info->tree_root)
4420  		btrfs_drop_extent_cache(inode, ALIGN(new_size,
4421  					root->sectorsize), (u64)-1, 0);
4422  
4423  	/*
4424  	 * This function is also used to drop the items in the log tree before
4425  	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4426  	 * it is used to drop the logged items. So we shouldn't kill the delayed
4427  	 * items.
4428  	 */
4429  	if (min_type == 0 && root == BTRFS_I(inode)->root)
4430  		btrfs_kill_delayed_inode_items(inode);
4431  
4432  	key.objectid = ino;
4433  	key.offset = (u64)-1;
4434  	key.type = (u8)-1;
4435  
4436  search_again:
4437  	/*
4438  	 * with a 16K leaf size and 128MB extents, you can actually queue
4439  	 * up a huge file in a single leaf.  Whenever bytes_deleted is > 0,
4440  	 * it is usually already huge by the time we get here
4441  	 */
4442  	if (be_nice && bytes_deleted > SZ_32M) {
4443  		if (btrfs_should_end_transaction(trans, root)) {
4444  			err = -EAGAIN;
4445  			goto error;
4446  		}
4447  	}
4448  
4449  
4450  	path->leave_spinning = 1;
4451  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4452  	if (ret < 0) {
4453  		err = ret;
4454  		goto out;
4455  	}
4456  
4457  	if (ret > 0) {
4458  		/* there are no items in the tree for us to truncate, we're
4459  		 * done
4460  		 */
4461  		if (path->slots[0] == 0)
4462  			goto out;
4463  		path->slots[0]--;
4464  	}
4465  
4466  	while (1) {
4467  		fi = NULL;
4468  		leaf = path->nodes[0];
4469  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4470  		found_type = found_key.type;
4471  
4472  		if (found_key.objectid != ino)
4473  			break;
4474  
4475  		if (found_type < min_type)
4476  			break;
4477  
4478  		item_end = found_key.offset;
4479  		if (found_type == BTRFS_EXTENT_DATA_KEY) {
4480  			fi = btrfs_item_ptr(leaf, path->slots[0],
4481  					    struct btrfs_file_extent_item);
4482  			extent_type = btrfs_file_extent_type(leaf, fi);
4483  			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4484  				item_end +=
4485  				    btrfs_file_extent_num_bytes(leaf, fi);
4486  			} else { /* BTRFS_FILE_EXTENT_INLINE */
4487  				item_end += btrfs_file_extent_inline_len(leaf,
4488  							 path->slots[0], fi);
4489  			}
4490  			item_end--;
4491  		}
4492  		if (found_type > min_type) {
4493  			del_item = 1;
4494  		} else {
4495  			if (item_end < new_size) {
4496  				/*
4497  				 * With NO_HOLES mode, for the following mapping
4498  				 *
4499  				 * [0-4k][hole][8k-12k]
4500  				 *
4501  				 * if truncating isize down to 6k, the isize
4502  				 * would end up being 8k.
4503  				 */
4504  				if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
4505  					last_size = new_size;
4506  				break;
4507  			}
4508  			if (found_key.offset >= new_size)
4509  				del_item = 1;
4510  			else
4511  				del_item = 0;
4512  		}
4513  		found_extent = 0;
4514  		/* FIXME, shrink the extent if the ref count is only 1 */
4515  		if (found_type != BTRFS_EXTENT_DATA_KEY)
4516  			goto delete;
4517  
4518  		if (del_item)
4519  			last_size = found_key.offset;
4520  		else
4521  			last_size = new_size;
4522  
4523  		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4524  			u64 num_dec;
4525  			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4526  			if (!del_item) {
4527  				u64 orig_num_bytes =
4528  					btrfs_file_extent_num_bytes(leaf, fi);
4529  				extent_num_bytes = ALIGN(new_size -
4530  						found_key.offset,
4531  						root->sectorsize);
4532  				btrfs_set_file_extent_num_bytes(leaf, fi,
4533  							 extent_num_bytes);
4534  				num_dec = (orig_num_bytes -
4535  					   extent_num_bytes);
4536  				if (test_bit(BTRFS_ROOT_REF_COWS,
4537  					     &root->state) &&
4538  				    extent_start != 0)
4539  					inode_sub_bytes(inode, num_dec);
4540  				btrfs_mark_buffer_dirty(leaf);
4541  			} else {
4542  				extent_num_bytes =
4543  					btrfs_file_extent_disk_num_bytes(leaf,
4544  									 fi);
4545  				extent_offset = found_key.offset -
4546  					btrfs_file_extent_offset(leaf, fi);
4547  
4548  				/* FIXME blocksize != 4096 */
4549  				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4550  				if (extent_start != 0) {
4551  					found_extent = 1;
4552  					if (test_bit(BTRFS_ROOT_REF_COWS,
4553  						     &root->state))
4554  						inode_sub_bytes(inode, num_dec);
4555  				}
4556  			}
4557  		} else { /* BTRFS_FILE_EXTENT_INLINE */
4558  			/*
4559  			 * we can't truncate inline items that have had
4560  			 * special encodings
4561  			 */
4562  			if (!del_item &&
4563  			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
4564  			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4565  
4566  				/*
4567  				 * Need to release path in order to truncate a
4568  				 * compressed extent. So delete any accumulated
4569  				 * extent items so far.
4570  				 */
4571  				if (btrfs_file_extent_compression(leaf, fi) !=
4572  				    BTRFS_COMPRESS_NONE && pending_del_nr) {
4573  					err = btrfs_del_items(trans, root, path,
4574  							      pending_del_slot,
4575  							      pending_del_nr);
4576  					if (err) {
4577  						btrfs_abort_transaction(trans,
4578  									err);
4579  						goto error;
4580  					}
4581  					pending_del_nr = 0;
4582  				}
4583  
4584  				err = truncate_inline_extent(inode, path,
4585  							     &found_key,
4586  							     item_end,
4587  							     new_size);
4588  				if (err) {
4589  					btrfs_abort_transaction(trans, err);
4590  					goto error;
4591  				}
4592  			} else if (test_bit(BTRFS_ROOT_REF_COWS,
4593  					    &root->state)) {
4594  				inode_sub_bytes(inode, item_end + 1 - new_size);
4595  			}
4596  		}
4597  delete:
4598  		if (del_item) {
4599  			if (!pending_del_nr) {
4600  				/* no pending yet, add ourselves */
4601  				pending_del_slot = path->slots[0];
4602  				pending_del_nr = 1;
4603  			} else if (pending_del_nr &&
4604  				   path->slots[0] + 1 == pending_del_slot) {
4605  				/* hop on the pending chunk */
4606  				pending_del_nr++;
4607  				pending_del_slot = path->slots[0];
4608  			} else {
4609  				BUG();
4610  			}
4611  		} else {
4612  			break;
4613  		}
4614  		should_throttle = 0;
4615  
4616  		if (found_extent &&
4617  		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4618  		     root == root->fs_info->tree_root)) {
4619  			btrfs_set_path_blocking(path);
4620  			bytes_deleted += extent_num_bytes;
4621  			ret = btrfs_free_extent(trans, root, extent_start,
4622  						extent_num_bytes, 0,
4623  						btrfs_header_owner(leaf),
4624  						ino, extent_offset);
4625  			BUG_ON(ret);
4626  			if (btrfs_should_throttle_delayed_refs(trans, root))
4627  				btrfs_async_run_delayed_refs(root,
4628  					trans->delayed_ref_updates * 2,
4629  					trans->transid, 0);
4630  			if (be_nice) {
4631  				if (truncate_space_check(trans, root,
4632  							 extent_num_bytes)) {
4633  					should_end = 1;
4634  				}
4635  				if (btrfs_should_throttle_delayed_refs(trans,
4636  								       root)) {
4637  					should_throttle = 1;
4638  				}
4639  			}
4640  		}
4641  
4642  		if (found_type == BTRFS_INODE_ITEM_KEY)
4643  			break;
4644  
4645  		if (path->slots[0] == 0 ||
4646  		    path->slots[0] != pending_del_slot ||
4647  		    should_throttle || should_end) {
4648  			if (pending_del_nr) {
4649  				ret = btrfs_del_items(trans, root, path,
4650  						pending_del_slot,
4651  						pending_del_nr);
4652  				if (ret) {
4653  					btrfs_abort_transaction(trans, ret);
4654  					goto error;
4655  				}
4656  				pending_del_nr = 0;
4657  			}
4658  			btrfs_release_path(path);
4659  			if (should_throttle) {
4660  				unsigned long updates = trans->delayed_ref_updates;
4661  				if (updates) {
4662  					trans->delayed_ref_updates = 0;
4663  					ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4664  					if (ret && !err)
4665  						err = ret;
4666  				}
4667  			}
4668  			/*
4669  			 * if we failed to refill our space rsv, bail out
4670  			 * and let the transaction restart
4671  			 */
4672  			if (should_end) {
4673  				err = -EAGAIN;
4674  				goto error;
4675  			}
4676  			goto search_again;
4677  		} else {
4678  			path->slots[0]--;
4679  		}
4680  	}
4681  out:
4682  	if (pending_del_nr) {
4683  		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4684  				      pending_del_nr);
4685  		if (ret)
4686  			btrfs_abort_transaction(trans, ret);
4687  	}
4688  error:
4689  	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4690  		btrfs_ordered_update_i_size(inode, last_size, NULL);
4691  
4692  	btrfs_free_path(path);
4693  
4694  	if (be_nice && bytes_deleted > SZ_32M) {
4695  		unsigned long updates = trans->delayed_ref_updates;
4696  		if (updates) {
4697  			trans->delayed_ref_updates = 0;
4698  			ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4699  			if (ret && !err)
4700  				err = ret;
4701  		}
4702  	}
4703  	return err;
4704  }
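
/*
 * Usage sketch (added for illustration): callers are expected to treat
 * -EAGAIN as "end this transaction and retry", roughly:
 *
 *	while (1) {
 *		ret = btrfs_truncate_inode_items(trans, root, inode,
 *						 new_size,
 *						 BTRFS_EXTENT_DATA_KEY);
 *		if (ret != -EAGAIN)
 *			break;
 *		// end the transaction and start a fresh one so delayed
 *		// refs can run and space reservations can be refilled
 *	}
 *
 * which matches the throttling above that deliberately bails out with
 * -EAGAIN once a lot of metadata has been deleted in one transaction.
 */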
4705  
4706  /*
4707   * btrfs_truncate_block - read, zero a chunk and write a block
4708   * @inode - inode that we're zeroing
4709   * @from - the offset to start zeroing
4710   * @len - the length to zero, 0 to zero the entire range relative to the
4711   *	offset
4712   * @front - zero up to the offset instead of from the offset on
4713   *
4714   * This will find the block for the "from" offset, cow it, and zero the
4715   * part we want to zero.  This is used with truncate and hole punching.
4716   */
4717  int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4718  			int front)
4719  {
4720  	struct address_space *mapping = inode->i_mapping;
4721  	struct btrfs_root *root = BTRFS_I(inode)->root;
4722  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4723  	struct btrfs_ordered_extent *ordered;
4724  	struct extent_state *cached_state = NULL;
4725  	char *kaddr;
4726  	u32 blocksize = root->sectorsize;
4727  	pgoff_t index = from >> PAGE_SHIFT;
4728  	unsigned offset = from & (blocksize - 1);
4729  	struct page *page;
4730  	gfp_t mask = btrfs_alloc_write_mask(mapping);
4731  	int ret = 0;
4732  	u64 block_start;
4733  	u64 block_end;
4734  
4735  	if ((offset & (blocksize - 1)) == 0 &&
4736  	    (!len || ((len & (blocksize - 1)) == 0)))
4737  		goto out;
4738  
4739  	ret = btrfs_delalloc_reserve_space(inode,
4740  			round_down(from, blocksize), blocksize);
4741  	if (ret)
4742  		goto out;
4743  
4744  again:
4745  	page = find_or_create_page(mapping, index, mask);
4746  	if (!page) {
4747  		btrfs_delalloc_release_space(inode,
4748  				round_down(from, blocksize),
4749  				blocksize);
4750  		ret = -ENOMEM;
4751  		goto out;
4752  	}
4753  
4754  	block_start = round_down(from, blocksize);
4755  	block_end = block_start + blocksize - 1;
4756  
4757  	if (!PageUptodate(page)) {
4758  		ret = btrfs_readpage(NULL, page);
4759  		lock_page(page);
4760  		if (page->mapping != mapping) {
4761  			unlock_page(page);
4762  			put_page(page);
4763  			goto again;
4764  		}
4765  		if (!PageUptodate(page)) {
4766  			ret = -EIO;
4767  			goto out_unlock;
4768  		}
4769  	}
4770  	wait_on_page_writeback(page);
4771  
4772  	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4773  	set_page_extent_mapped(page);
4774  
4775  	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4776  	if (ordered) {
4777  		unlock_extent_cached(io_tree, block_start, block_end,
4778  				     &cached_state, GFP_NOFS);
4779  		unlock_page(page);
4780  		put_page(page);
4781  		btrfs_start_ordered_extent(inode, ordered, 1);
4782  		btrfs_put_ordered_extent(ordered);
4783  		goto again;
4784  	}
4785  
4786  	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4787  			  EXTENT_DIRTY | EXTENT_DELALLOC |
4788  			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4789  			  0, 0, &cached_state, GFP_NOFS);
4790  
4791  	ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4792  					&cached_state, 0);
4793  	if (ret) {
4794  		unlock_extent_cached(io_tree, block_start, block_end,
4795  				     &cached_state, GFP_NOFS);
4796  		goto out_unlock;
4797  	}
4798  
4799  	if (offset != blocksize) {
4800  		if (!len)
4801  			len = blocksize - offset;
4802  		kaddr = kmap(page);
4803  		if (front)
4804  			memset(kaddr + (block_start - page_offset(page)),
4805  				0, offset);
4806  		else
4807  			memset(kaddr + (block_start - page_offset(page)) + offset,
4808  				0, len);
4809  		flush_dcache_page(page);
4810  		kunmap(page);
4811  	}
4812  	ClearPageChecked(page);
4813  	set_page_dirty(page);
4814  	unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4815  			     GFP_NOFS);
4816  
4817  out_unlock:
4818  	if (ret)
4819  		btrfs_delalloc_release_space(inode, block_start,
4820  					     blocksize);
4821  	unlock_page(page);
4822  	put_page(page);
4823  out:
4824  	return ret;
4825  }
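
/*
 * Usage sketch (added for illustration): hole punching zeroes the partial
 * blocks at both ends of the punched range before dropping the whole
 * blocks in between, roughly:
 *
 *	ret = btrfs_truncate_block(inode, offset, 0, 0);	// head
 *	...
 *	ret = btrfs_truncate_block(inode, offset + len, 0, 1);	// tail
 *
 * With front == 0 the block is zeroed from `from` to its end (len == 0),
 * with front == 1 it is zeroed from the start of the block up to `from`.
 */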
4826  
4827  static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4828  			     u64 offset, u64 len)
4829  {
4830  	struct btrfs_trans_handle *trans;
4831  	int ret;
4832  
4833  	/*
4834  	 * Still need to make sure the inode looks like it's been updated so
4835  	 * that any holes get logged if we fsync.
4836  	 */
4837  	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4838  		BTRFS_I(inode)->last_trans = root->fs_info->generation;
4839  		BTRFS_I(inode)->last_sub_trans = root->log_transid;
4840  		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4841  		return 0;
4842  	}
4843  
4844  	/*
4845  	 * 1 - for the one we're dropping
4846  	 * 1 - for the one we're adding
4847  	 * 1 - for updating the inode.
4848  	 */
4849  	trans = btrfs_start_transaction(root, 3);
4850  	if (IS_ERR(trans))
4851  		return PTR_ERR(trans);
4852  
4853  	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4854  	if (ret) {
4855  		btrfs_abort_transaction(trans, ret);
4856  		btrfs_end_transaction(trans, root);
4857  		return ret;
4858  	}
4859  
4860  	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4861  				       0, 0, len, 0, len, 0, 0, 0);
4862  	if (ret)
4863  		btrfs_abort_transaction(trans, ret);
4864  	else
4865  		btrfs_update_inode(trans, root, inode);
4866  	btrfs_end_transaction(trans, root);
4867  	return ret;
4868  }
4869  
4870  /*
4871   * This function puts in dummy file extents for the area we're creating a hole
4872   * for.  So if we are truncating this file to a larger size we need to insert
4873   * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
4874   * the range between oldsize and size
4875   */
4876  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4877  {
4878  	struct btrfs_root *root = BTRFS_I(inode)->root;
4879  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4880  	struct extent_map *em = NULL;
4881  	struct extent_state *cached_state = NULL;
4882  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4883  	u64 hole_start = ALIGN(oldsize, root->sectorsize);
4884  	u64 block_end = ALIGN(size, root->sectorsize);
4885  	u64 last_byte;
4886  	u64 cur_offset;
4887  	u64 hole_size;
4888  	int err = 0;
4889  
4890  	/*
4891  	 * If our size started in the middle of a block we need to zero out the
4892  	 * rest of the block before we expand the i_size, otherwise we could
4893  	 * expose stale data.
4894  	 */
4895  	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4896  	if (err)
4897  		return err;
4898  
4899  	if (size <= hole_start)
4900  		return 0;
4901  
4902  	while (1) {
4903  		struct btrfs_ordered_extent *ordered;
4904  
4905  		lock_extent_bits(io_tree, hole_start, block_end - 1,
4906  				 &cached_state);
4907  		ordered = btrfs_lookup_ordered_range(inode, hole_start,
4908  						     block_end - hole_start);
4909  		if (!ordered)
4910  			break;
4911  		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4912  				     &cached_state, GFP_NOFS);
4913  		btrfs_start_ordered_extent(inode, ordered, 1);
4914  		btrfs_put_ordered_extent(ordered);
4915  	}
4916  
4917  	cur_offset = hole_start;
4918  	while (1) {
4919  		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4920  				block_end - cur_offset, 0);
4921  		if (IS_ERR(em)) {
4922  			err = PTR_ERR(em);
4923  			em = NULL;
4924  			break;
4925  		}
4926  		last_byte = min(extent_map_end(em), block_end);
4927  		last_byte = ALIGN(last_byte, root->sectorsize);
4928  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4929  			struct extent_map *hole_em;
4930  			hole_size = last_byte - cur_offset;
4931  
4932  			err = maybe_insert_hole(root, inode, cur_offset,
4933  						hole_size);
4934  			if (err)
4935  				break;
4936  			btrfs_drop_extent_cache(inode, cur_offset,
4937  						cur_offset + hole_size - 1, 0);
4938  			hole_em = alloc_extent_map();
4939  			if (!hole_em) {
4940  				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4941  					&BTRFS_I(inode)->runtime_flags);
4942  				goto next;
4943  			}
4944  			hole_em->start = cur_offset;
4945  			hole_em->len = hole_size;
4946  			hole_em->orig_start = cur_offset;
4947  
4948  			hole_em->block_start = EXTENT_MAP_HOLE;
4949  			hole_em->block_len = 0;
4950  			hole_em->orig_block_len = 0;
4951  			hole_em->ram_bytes = hole_size;
4952  			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4953  			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4954  			hole_em->generation = root->fs_info->generation;
4955  
4956  			while (1) {
4957  				write_lock(&em_tree->lock);
4958  				err = add_extent_mapping(em_tree, hole_em, 1);
4959  				write_unlock(&em_tree->lock);
4960  				if (err != -EEXIST)
4961  					break;
4962  				btrfs_drop_extent_cache(inode, cur_offset,
4963  							cur_offset +
4964  							hole_size - 1, 0);
4965  			}
4966  			free_extent_map(hole_em);
4967  		}
4968  next:
4969  		free_extent_map(em);
4970  		em = NULL;
4971  		cur_offset = last_byte;
4972  		if (cur_offset >= block_end)
4973  			break;
4974  	}
4975  	free_extent_map(em);
4976  	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4977  			     GFP_NOFS);
4978  	return err;
4979  }
4980  
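/*
 * Editorial note, not part of the original file: the hole range above
 * is derived purely from sector alignment.  Worked example assuming a
 * 4096-byte sectorsize: oldsize = 10000 and size = 20000 give
 * hole_start = ALIGN(10000, 4096) = 12288 and
 * block_end = ALIGN(20000, 4096) = 20480, so btrfs_truncate_block()
 * zeroes bytes 10000..12287 and the loop fills 12288..20479 with hole
 * extents.
 */
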
4981  static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4982  {
4983  	struct btrfs_root *root = BTRFS_I(inode)->root;
4984  	struct btrfs_trans_handle *trans;
4985  	loff_t oldsize = i_size_read(inode);
4986  	loff_t newsize = attr->ia_size;
4987  	int mask = attr->ia_valid;
4988  	int ret;
4989  
4990  	/*
4991  	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4992  	 * special case where we need to update the times despite not having
4993   * these flags set.  For all other operations the VFS sets these flags
4994  	 * explicitly if it wants a timestamp update.
4995  	 */
4996  	if (newsize != oldsize) {
4997  		inode_inc_iversion(inode);
4998  		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4999  			inode->i_ctime = inode->i_mtime =
5000  				current_time(inode);
5001  	}
5002  
5003  	if (newsize > oldsize) {
5004  		/*
5005  		 * Don't do an expanding truncate while snapshotting is ongoing.
5006  		 * This is to ensure the snapshot captures a fully consistent
5007  		 * state of this file - if the snapshot captures this expanding
5008  		 * truncation, it must capture all writes that happened before
5009  		 * this truncation.
5010  		 */
5011  		btrfs_wait_for_snapshot_creation(root);
5012  		ret = btrfs_cont_expand(inode, oldsize, newsize);
5013  		if (ret) {
5014  			btrfs_end_write_no_snapshoting(root);
5015  			return ret;
5016  		}
5017  
5018  		trans = btrfs_start_transaction(root, 1);
5019  		if (IS_ERR(trans)) {
5020  			btrfs_end_write_no_snapshoting(root);
5021  			return PTR_ERR(trans);
5022  		}
5023  
5024  		i_size_write(inode, newsize);
5025  		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5026  		pagecache_isize_extended(inode, oldsize, newsize);
5027  		ret = btrfs_update_inode(trans, root, inode);
5028  		btrfs_end_write_no_snapshoting(root);
5029  		btrfs_end_transaction(trans, root);
5030  	} else {
5031  
5032  		/*
5033  		 * We're truncating a file that used to have good data down to
5034  		 * zero. Make sure it gets into the ordered flush list so that
5035  		 * any new writes get down to disk quickly.
5036  		 */
5037  		if (newsize == 0)
5038  			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5039  				&BTRFS_I(inode)->runtime_flags);
5040  
5041  		/*
5042  		 * 1 for the orphan item we're going to add
5043  		 * 1 for the orphan item deletion.
5044  		 */
5045  		trans = btrfs_start_transaction(root, 2);
5046  		if (IS_ERR(trans))
5047  			return PTR_ERR(trans);
5048  
5049  		/*
5050  		 * We need to do this in case we fail at _any_ point during the
5051  		 * actual truncate.  Once we do the truncate_setsize we could
5052  		 * invalidate pages which forces any outstanding ordered io to
5053  		 * be instantly completed which will give us extents that need
5054  		 * to be truncated.  If we fail to get an orphan inode down we
5055  		 * could have left over extents that were never meant to live,
5056  		 * so we need to guarantee from this point on that everything
5057  		 * will be consistent.
5058  		 */
5059  		ret = btrfs_orphan_add(trans, inode);
5060  		btrfs_end_transaction(trans, root);
5061  		if (ret)
5062  			return ret;
5063  
5064  		/* we don't support swapfiles, so vmtruncate shouldn't fail */
5065  		truncate_setsize(inode, newsize);
5066  
5067  		/* Disable nonlocked read DIO to avoid an endless truncate */
5068  		btrfs_inode_block_unlocked_dio(inode);
5069  		inode_dio_wait(inode);
5070  		btrfs_inode_resume_unlocked_dio(inode);
5071  
5072  		ret = btrfs_truncate(inode);
5073  		if (ret && inode->i_nlink) {
5074  			int err;
5075  
5076  			/*
5077  			 * failed to truncate, disk_i_size is only adjusted down
5078  			 * as we remove extents, so it should represent the true
5079  			 * size of the inode, so reset the in memory size and
5080  			 * delete our orphan entry.
5081  			 */
5082  			trans = btrfs_join_transaction(root);
5083  			if (IS_ERR(trans)) {
5084  				btrfs_orphan_del(NULL, inode);
5085  				return ret;
5086  			}
5087  			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5088  			err = btrfs_orphan_del(trans, inode);
5089  			if (err)
5090  				btrfs_abort_transaction(trans, err);
5091  			btrfs_end_transaction(trans, root);
5092  		}
5093  	}
5094  
5095  	return ret;
5096  }
5097  
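/*
 * Editorial sketch, not part of the original file: btrfs_setsize() is
 * reached through ->setattr.  An in-kernel caller resizing a file
 * would build the same request the VFS builds for truncate(2) and
 * ftruncate(2), roughly:
 */
static int example_resize(struct inode *inode, loff_t newsize)
{
	struct iattr attr = {
		.ia_valid = ATTR_SIZE,
		.ia_size = newsize,
	};

	/*
	 * Without ATTR_CTIME/ATTR_MTIME in ia_valid, btrfs_setsize()
	 * updates the timestamps itself, matching the truncate() case.
	 */
	return btrfs_setsize(inode, &attr);
}
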
5098  static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5099  {
5100  	struct inode *inode = d_inode(dentry);
5101  	struct btrfs_root *root = BTRFS_I(inode)->root;
5102  	int err;
5103  
5104  	if (btrfs_root_readonly(root))
5105  		return -EROFS;
5106  
5107  	err = setattr_prepare(dentry, attr);
5108  	if (err)
5109  		return err;
5110  
5111  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5112  		err = btrfs_setsize(inode, attr);
5113  		if (err)
5114  			return err;
5115  	}
5116  
5117  	if (attr->ia_valid) {
5118  		setattr_copy(inode, attr);
5119  		inode_inc_iversion(inode);
5120  		err = btrfs_dirty_inode(inode);
5121  
5122  		if (!err && attr->ia_valid & ATTR_MODE)
5123  			err = posix_acl_chmod(inode, inode->i_mode);
5124  	}
5125  
5126  	return err;
5127  }
5128  
5129  /*
5130   * While truncating the inode pages during eviction, we get the VFS calling
5131   * btrfs_invalidatepage() against each page of the inode. This is slow because
5132   * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5133   * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5134   * extent_state structures over and over, wasting lots of time.
5135   *
5136   * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5137   * those expensive operations on a per page basis and do only the ordered io
5138   * finishing, while we release here the extent_map and extent_state structures,
5139   * without the excessive merging and splitting.
5140   */
5141  static void evict_inode_truncate_pages(struct inode *inode)
5142  {
5143  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5144  	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5145  	struct rb_node *node;
5146  
5147  	ASSERT(inode->i_state & I_FREEING);
5148  	truncate_inode_pages_final(&inode->i_data);
5149  
5150  	write_lock(&map_tree->lock);
5151  	while (!RB_EMPTY_ROOT(&map_tree->map)) {
5152  		struct extent_map *em;
5153  
5154  		node = rb_first(&map_tree->map);
5155  		em = rb_entry(node, struct extent_map, rb_node);
5156  		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5157  		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5158  		remove_extent_mapping(map_tree, em);
5159  		free_extent_map(em);
5160  		if (need_resched()) {
5161  			write_unlock(&map_tree->lock);
5162  			cond_resched();
5163  			write_lock(&map_tree->lock);
5164  		}
5165  	}
5166  	write_unlock(&map_tree->lock);
5167  
5168  	/*
5169  	 * Keep looping until we have no more ranges in the io tree.
5170  	 * We can have ongoing bios started by readpages (called from readahead)
5171  	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
5172   * still in progress (unlocked the pages in the bio but did not yet
5173   * unlock the ranges in the io tree). This means some
5174  	 * ranges can still be locked and eviction started because before
5175  	 * submitting those bios, which are executed by a separate task (work
5176  	 * queue kthread), inode references (inode->i_count) were not taken
5177  	 * (which would be dropped in the end io callback of each bio).
5178  	 * Therefore here we effectively end up waiting for those bios and
5179  	 * anyone else holding locked ranges without having bumped the inode's
5180  	 * reference count - if we don't do it, when they access the inode's
5181   * io_tree to unlock a range, it may be too late, leading to a
5182   * use-after-free issue.
5183  	 */
5184  	spin_lock(&io_tree->lock);
5185  	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5186  		struct extent_state *state;
5187  		struct extent_state *cached_state = NULL;
5188  		u64 start;
5189  		u64 end;
5190  
5191  		node = rb_first(&io_tree->state);
5192  		state = rb_entry(node, struct extent_state, rb_node);
5193  		start = state->start;
5194  		end = state->end;
5195  		spin_unlock(&io_tree->lock);
5196  
5197  		lock_extent_bits(io_tree, start, end, &cached_state);
5198  
5199  		/*
5200  		 * If the range still has the DELALLOC flag, the extent didn't reach disk,
5201  		 * and its reserved space won't be freed by delayed_ref.
5202  		 * So we need to free its reserved space here.
5203  		 * (Refer to comment in btrfs_invalidatepage, case 2)
5204  		 *
5205  		 * Note, end is the bytenr of last byte, so we need + 1 here.
5206  		 */
5207  		if (state->state & EXTENT_DELALLOC)
5208  			btrfs_qgroup_free_data(inode, start, end - start + 1);
5209  
5210  		clear_extent_bit(io_tree, start, end,
5211  				 EXTENT_LOCKED | EXTENT_DIRTY |
5212  				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5213  				 EXTENT_DEFRAG, 1, 1,
5214  				 &cached_state, GFP_NOFS);
5215  
5216  		cond_resched();
5217  		spin_lock(&io_tree->lock);
5218  	}
5219  	spin_unlock(&io_tree->lock);
5220  }
5221  
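/*
 * Editorial note, not part of the original file: both loops above use
 * the same "drain an rbtree without hogging the lock" shape.  Reduced
 * to a skeleton (hypothetical tree and entry names):
 *
 *	write_lock(&tree->lock);
 *	while (!RB_EMPTY_ROOT(&tree->root)) {
 *		entry = rb_entry(rb_first(&tree->root), ...);
 *		... remove and free entry ...
 *		if (need_resched()) {
 *			write_unlock(&tree->lock);
 *			cond_resched();
 *			write_lock(&tree->lock);
 *		}
 *	}
 *	write_unlock(&tree->lock);
 *
 * The io_tree loop differs only in that it must drop its spinlock
 * before calling lock_extent_bits(), since that call can sleep.
 */
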
5222  void btrfs_evict_inode(struct inode *inode)
5223  {
5224  	struct btrfs_trans_handle *trans;
5225  	struct btrfs_root *root = BTRFS_I(inode)->root;
5226  	struct btrfs_block_rsv *rsv, *global_rsv;
5227  	int steal_from_global = 0;
5228  	u64 min_size;
5229  	int ret;
5230  
5231  	trace_btrfs_inode_evict(inode);
5232  
5233  	if (!root) {
5234  		clear_inode(inode);
5235  		return;
5236  	}
5237  
5238  	min_size = btrfs_calc_trunc_metadata_size(root, 1);
5239  
5240  	evict_inode_truncate_pages(inode);
5241  
5242  	if (inode->i_nlink &&
5243  	    ((btrfs_root_refs(&root->root_item) != 0 &&
5244  	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5245  	     btrfs_is_free_space_inode(inode)))
5246  		goto no_delete;
5247  
5248  	if (is_bad_inode(inode)) {
5249  		btrfs_orphan_del(NULL, inode);
5250  		goto no_delete;
5251  	}
5252  	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5253  	if (!special_file(inode->i_mode))
5254  		btrfs_wait_ordered_range(inode, 0, (u64)-1);
5255  
5256  	btrfs_free_io_failure_record(inode, 0, (u64)-1);
5257  
5258  	if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
5259  		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5260  				 &BTRFS_I(inode)->runtime_flags));
5261  		goto no_delete;
5262  	}
5263  
5264  	if (inode->i_nlink > 0) {
5265  		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5266  		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5267  		goto no_delete;
5268  	}
5269  
5270  	ret = btrfs_commit_inode_delayed_inode(inode);
5271  	if (ret) {
5272  		btrfs_orphan_del(NULL, inode);
5273  		goto no_delete;
5274  	}
5275  
5276  	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
5277  	if (!rsv) {
5278  		btrfs_orphan_del(NULL, inode);
5279  		goto no_delete;
5280  	}
5281  	rsv->size = min_size;
5282  	rsv->failfast = 1;
5283  	global_rsv = &root->fs_info->global_block_rsv;
5284  
5285  	btrfs_i_size_write(inode, 0);
5286  
5287  	/*
5288  	 * This is a bit simpler than btrfs_truncate since we've already
5289  	 * reserved our space for our orphan item in the unlink, so we just
5290  	 * need to reserve some slack space in case we add bytes and update
5291  	 * inode item when doing the truncate.
5292  	 */
5293  	while (1) {
5294  		ret = btrfs_block_rsv_refill(root, rsv, min_size,
5295  					     BTRFS_RESERVE_FLUSH_LIMIT);
5296  
5297  		/*
5298  		 * Try and steal from the global reserve since we will
5299  		 * likely not use this space anyway, we want to try as
5300  		 * hard as possible to get this to work.
5301  		 */
5302  		if (ret)
5303  			steal_from_global++;
5304  		else
5305  			steal_from_global = 0;
5306  		ret = 0;
5307  
5308  		/*
5309  		 * steal_from_global == 0: we reserved stuff, hooray!
5310  		 * steal_from_global == 1: we didn't reserve stuff, boo!
5311  		 * steal_from_global == 2: we've committed, still not a lot of
5312  		 * room but maybe we'll have room in the global reserve this
5313  		 * time.
5314  		 * steal_from_global == 3: abandon all hope!
5315  		 */
5316  		if (steal_from_global > 2) {
5317  			btrfs_warn(root->fs_info,
5318  				"Could not get space for a delete, will truncate on mount %d",
5319  				ret);
5320  			btrfs_orphan_del(NULL, inode);
5321  			btrfs_free_block_rsv(root, rsv);
5322  			goto no_delete;
5323  		}
5324  
5325  		trans = btrfs_join_transaction(root);
5326  		if (IS_ERR(trans)) {
5327  			btrfs_orphan_del(NULL, inode);
5328  			btrfs_free_block_rsv(root, rsv);
5329  			goto no_delete;
5330  		}
5331  
5332  		/*
5333  		 * We can't just steal from the global reserve, we need to make
5334  		 * sure there is room to do it, if not we need to commit and try
5335  		 * again.
5336  		 */
5337  		if (steal_from_global) {
5338  			if (!btrfs_check_space_for_delayed_refs(trans, root))
5339  				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5340  							      min_size, 0);
5341  			else
5342  				ret = -ENOSPC;
5343  		}
5344  
5345  		/*
5346  		 * Couldn't steal from the global reserve, we have too much
5347  		 * pending stuff built up, commit the transaction and try it
5348  		 * again.
5349  		 */
5350  		if (ret) {
5351  			ret = btrfs_commit_transaction(trans, root);
5352  			if (ret) {
5353  				btrfs_orphan_del(NULL, inode);
5354  				btrfs_free_block_rsv(root, rsv);
5355  				goto no_delete;
5356  			}
5357  			continue;
5358  		} else {
5359  			steal_from_global = 0;
5360  		}
5361  
5362  		trans->block_rsv = rsv;
5363  
5364  		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5365  		if (ret != -ENOSPC && ret != -EAGAIN)
5366  			break;
5367  
5368  		trans->block_rsv = &root->fs_info->trans_block_rsv;
5369  		btrfs_end_transaction(trans, root);
5370  		trans = NULL;
5371  		btrfs_btree_balance_dirty(root);
5372  	}
5373  
5374  	btrfs_free_block_rsv(root, rsv);
5375  
5376  	/*
5377  	 * Errors here aren't a big deal; it just means we leave orphan items
5378  	 * in the tree.  They will be cleaned up on the next mount.
5379  	 */
5380  	if (ret == 0) {
5381  		trans->block_rsv = root->orphan_block_rsv;
5382  		btrfs_orphan_del(trans, inode);
5383  	} else {
5384  		btrfs_orphan_del(NULL, inode);
5385  	}
5386  
5387  	trans->block_rsv = &root->fs_info->trans_block_rsv;
5388  	if (!(root == root->fs_info->tree_root ||
5389  	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5390  		btrfs_return_ino(root, btrfs_ino(inode));
5391  
5392  	btrfs_end_transaction(trans, root);
5393  	btrfs_btree_balance_dirty(root);
5394  no_delete:
5395  	btrfs_remove_delayed_node(inode);
5396  	clear_inode(inode);
5397  }
5398  
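/*
 * Editorial note, not part of the original file: the steal_from_global
 * counter above is a small escalation ladder.  Worst-case trace: each
 * failed btrfs_block_rsv_refill() bumps the counter; while it is 1 or
 * 2 the loop tries to migrate min_size from the global reserve and,
 * failing that, commits the transaction and retries; once it reaches 3
 * (the "> 2" test) the loop warns and leaves the orphan item to be
 * cleaned up on the next mount.
 */
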
5399  /*
5400   * this returns the key found in the dir entry in the location pointer.
5401   * If no dir entry was found, location->objectid is 0.
5402   */
5403  static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5404  			       struct btrfs_key *location)
5405  {
5406  	const char *name = dentry->d_name.name;
5407  	int namelen = dentry->d_name.len;
5408  	struct btrfs_dir_item *di;
5409  	struct btrfs_path *path;
5410  	struct btrfs_root *root = BTRFS_I(dir)->root;
5411  	int ret = 0;
5412  
5413  	path = btrfs_alloc_path();
5414  	if (!path)
5415  		return -ENOMEM;
5416  
5417  	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
5418  				    namelen, 0);
5419  	if (IS_ERR(di))
5420  		ret = PTR_ERR(di);
5421  
5422  	if (IS_ERR_OR_NULL(di))
5423  		goto out_err;
5424  
5425  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5426  out:
5427  	btrfs_free_path(path);
5428  	return ret;
5429  out_err:
5430  	location->objectid = 0;
5431  	goto out;
5432  }
5433  
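/*
 * Editorial sketch, not part of the original file:
 * btrfs_lookup_dir_item() reports "no such entry" as NULL and failures
 * as ERR_PTR(), which is why the code above checks both.  A
 * hypothetical existence test, with the caller owning @path:
 */
static bool example_dir_entry_exists(struct inode *dir, const char *name,
				     int name_len, struct btrfs_path *path)
{
	struct btrfs_dir_item *di;

	di = btrfs_lookup_dir_item(NULL, BTRFS_I(dir)->root, path,
				   btrfs_ino(dir), name, name_len, 0);
	/* ERR_PTR() means the search itself failed, NULL means absent. */
	return !IS_ERR_OR_NULL(di);
}
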
5434  /*
5435   * when we hit a tree root in a directory, the btrfs part of the inode
5436   * needs to be changed to reflect the root directory of the tree root.  This
5437   * is kind of like crossing a mount point.
5438   */
5439  static int fixup_tree_root_location(struct btrfs_root *root,
5440  				    struct inode *dir,
5441  				    struct dentry *dentry,
5442  				    struct btrfs_key *location,
5443  				    struct btrfs_root **sub_root)
5444  {
5445  	struct btrfs_path *path;
5446  	struct btrfs_root *new_root;
5447  	struct btrfs_root_ref *ref;
5448  	struct extent_buffer *leaf;
5449  	struct btrfs_key key;
5450  	int ret;
5451  	int err = 0;
5452  
5453  	path = btrfs_alloc_path();
5454  	if (!path) {
5455  		err = -ENOMEM;
5456  		goto out;
5457  	}
5458  
5459  	err = -ENOENT;
5460  	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5461  	key.type = BTRFS_ROOT_REF_KEY;
5462  	key.offset = location->objectid;
5463  
5464  	ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
5465  				0, 0);
5466  	if (ret) {
5467  		if (ret < 0)
5468  			err = ret;
5469  		goto out;
5470  	}
5471  
5472  	leaf = path->nodes[0];
5473  	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5474  	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5475  	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5476  		goto out;
5477  
5478  	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5479  				   (unsigned long)(ref + 1),
5480  				   dentry->d_name.len);
5481  	if (ret)
5482  		goto out;
5483  
5484  	btrfs_release_path(path);
5485  
5486  	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
5487  	if (IS_ERR(new_root)) {
5488  		err = PTR_ERR(new_root);
5489  		goto out;
5490  	}
5491  
5492  	*sub_root = new_root;
5493  	location->objectid = btrfs_root_dirid(&new_root->root_item);
5494  	location->type = BTRFS_INODE_ITEM_KEY;
5495  	location->offset = 0;
5496  	err = 0;
5497  out:
5498  	btrfs_free_path(path);
5499  	return err;
5500  }
5501  
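/*
 * Editorial note, not part of the original file: worked example of the
 * ROOT_REF key built above.  For a directory in the top-level
 * subvolume (tree id 5) containing a snapshot whose tree id is 257,
 * the key is { .objectid = 5, .type = BTRFS_ROOT_REF_KEY,
 * .offset = 257 }, and the btrfs_root_ref item it finds records the
 * parent directory's inode number and the entry name checked above.
 */
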
5502  static void inode_tree_add(struct inode *inode)
5503  {
5504  	struct btrfs_root *root = BTRFS_I(inode)->root;
5505  	struct btrfs_inode *entry;
5506  	struct rb_node **p;
5507  	struct rb_node *parent;
5508  	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5509  	u64 ino = btrfs_ino(inode);
5510  
5511  	if (inode_unhashed(inode))
5512  		return;
5513  	parent = NULL;
5514  	spin_lock(&root->inode_lock);
5515  	p = &root->inode_tree.rb_node;
5516  	while (*p) {
5517  		parent = *p;
5518  		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5519  
5520  		if (ino < btrfs_ino(&entry->vfs_inode))
5521  			p = &parent->rb_left;
5522  		else if (ino > btrfs_ino(&entry->vfs_inode))
5523  			p = &parent->rb_right;
5524  		else {
5525  			WARN_ON(!(entry->vfs_inode.i_state &
5526  				  (I_WILL_FREE | I_FREEING)));
5527  			rb_replace_node(parent, new, &root->inode_tree);
5528  			RB_CLEAR_NODE(parent);
5529  			spin_unlock(&root->inode_lock);
5530  			return;
5531  		}
5532  	}
5533  	rb_link_node(new, parent, p);
5534  	rb_insert_color(new, &root->inode_tree);
5535  	spin_unlock(&root->inode_lock);
5536  }
5537  
5538  static void inode_tree_del(struct inode *inode)
5539  {
5540  	struct btrfs_root *root = BTRFS_I(inode)->root;
5541  	int empty = 0;
5542  
5543  	spin_lock(&root->inode_lock);
5544  	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5545  		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5546  		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5547  		empty = RB_EMPTY_ROOT(&root->inode_tree);
5548  	}
5549  	spin_unlock(&root->inode_lock);
5550  
5551  	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5552  		synchronize_srcu(&root->fs_info->subvol_srcu);
5553  		spin_lock(&root->inode_lock);
5554  		empty = RB_EMPTY_ROOT(&root->inode_tree);
5555  		spin_unlock(&root->inode_lock);
5556  		if (empty)
5557  			btrfs_add_dead_root(root);
5558  	}
5559  }
5560  
5561  void btrfs_invalidate_inodes(struct btrfs_root *root)
5562  {
5563  	struct rb_node *node;
5564  	struct rb_node *prev;
5565  	struct btrfs_inode *entry;
5566  	struct inode *inode;
5567  	u64 objectid = 0;
5568  
5569  	if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5570  		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5571  
5572  	spin_lock(&root->inode_lock);
5573  again:
5574  	node = root->inode_tree.rb_node;
5575  	prev = NULL;
5576  	while (node) {
5577  		prev = node;
5578  		entry = rb_entry(node, struct btrfs_inode, rb_node);
5579  
5580  		if (objectid < btrfs_ino(&entry->vfs_inode))
5581  			node = node->rb_left;
5582  		else if (objectid > btrfs_ino(&entry->vfs_inode))
5583  			node = node->rb_right;
5584  		else
5585  			break;
5586  	}
5587  	if (!node) {
5588  		while (prev) {
5589  			entry = rb_entry(prev, struct btrfs_inode, rb_node);
5590  			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5591  				node = prev;
5592  				break;
5593  			}
5594  			prev = rb_next(prev);
5595  		}
5596  	}
5597  	while (node) {
5598  		entry = rb_entry(node, struct btrfs_inode, rb_node);
5599  		objectid = btrfs_ino(&entry->vfs_inode) + 1;
5600  		inode = igrab(&entry->vfs_inode);
5601  		if (inode) {
5602  			spin_unlock(&root->inode_lock);
5603  			if (atomic_read(&inode->i_count) > 1)
5604  				d_prune_aliases(inode);
5605  			/*
5606  			 * btrfs_drop_inode will have it removed from
5607  			 * the inode cache when its usage count
5608  			 * hits zero.
5609  			 */
5610  			iput(inode);
5611  			cond_resched();
5612  			spin_lock(&root->inode_lock);
5613  			goto again;
5614  		}
5615  
5616  		if (cond_resched_lock(&root->inode_lock))
5617  			goto again;
5618  
5619  		node = rb_next(node);
5620  	}
5621  	spin_unlock(&root->inode_lock);
5622  }
5623  
5624  static int btrfs_init_locked_inode(struct inode *inode, void *p)
5625  {
5626  	struct btrfs_iget_args *args = p;
5627  	inode->i_ino = args->location->objectid;
5628  	memcpy(&BTRFS_I(inode)->location, args->location,
5629  	       sizeof(*args->location));
5630  	BTRFS_I(inode)->root = args->root;
5631  	return 0;
5632  }
5633  
5634  static int btrfs_find_actor(struct inode *inode, void *opaque)
5635  {
5636  	struct btrfs_iget_args *args = opaque;
5637  	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5638  		args->root == BTRFS_I(inode)->root;
5639  }
5640  
5641  static struct inode *btrfs_iget_locked(struct super_block *s,
5642  				       struct btrfs_key *location,
5643  				       struct btrfs_root *root)
5644  {
5645  	struct inode *inode;
5646  	struct btrfs_iget_args args;
5647  	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5648  
5649  	args.location = location;
5650  	args.root = root;
5651  
5652  	inode = iget5_locked(s, hashval, btrfs_find_actor,
5653  			     btrfs_init_locked_inode,
5654  			     (void *)&args);
5655  	return inode;
5656  }
5657  
5658  /* Get an inode object given its location and corresponding root.
5659   * Returns in *new whether the inode was read from disk.
5660   */
5661  struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5662  			 struct btrfs_root *root, int *new)
5663  {
5664  	struct inode *inode;
5665  
5666  	inode = btrfs_iget_locked(s, location, root);
5667  	if (!inode)
5668  		return ERR_PTR(-ENOMEM);
5669  
5670  	if (inode->i_state & I_NEW) {
5671  		int ret;
5672  
5673  		ret = btrfs_read_locked_inode(inode);
5674  		if (!is_bad_inode(inode)) {
5675  			inode_tree_add(inode);
5676  			unlock_new_inode(inode);
5677  			if (new)
5678  				*new = 1;
5679  		} else {
5680  			unlock_new_inode(inode);
5681  			iput(inode);
5682  			ASSERT(ret < 0);
5683  			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5684  		}
5685  	}
5686  
5687  	return inode;
5688  }
5689  
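/*
 * Editorial sketch, not part of the original file: a typical lookup by
 * inode number given a known root builds an INODE_ITEM location key,
 * the same shape btrfs_new_inode() fills in further down:
 */
static struct inode *example_iget_by_ino(struct super_block *sb, u64 ino,
					 struct btrfs_root *root)
{
	struct btrfs_key key = {
		.objectid = ino,
		.type = BTRFS_INODE_ITEM_KEY,
		.offset = 0,
	};

	/* Passing new == NULL: caller doesn't care if it came from disk. */
	return btrfs_iget(sb, &key, root, NULL);
}
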
5690  static struct inode *new_simple_dir(struct super_block *s,
5691  				    struct btrfs_key *key,
5692  				    struct btrfs_root *root)
5693  {
5694  	struct inode *inode = new_inode(s);
5695  
5696  	if (!inode)
5697  		return ERR_PTR(-ENOMEM);
5698  
5699  	BTRFS_I(inode)->root = root;
5700  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5701  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5702  
5703  	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5704  	inode->i_op = &btrfs_dir_ro_inode_operations;
5705  	inode->i_opflags &= ~IOP_XATTR;
5706  	inode->i_fop = &simple_dir_operations;
5707  	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5708  	inode->i_mtime = current_time(inode);
5709  	inode->i_atime = inode->i_mtime;
5710  	inode->i_ctime = inode->i_mtime;
5711  	BTRFS_I(inode)->i_otime = inode->i_mtime;
5712  
5713  	return inode;
5714  }
5715  
5716  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5717  {
5718  	struct inode *inode;
5719  	struct btrfs_root *root = BTRFS_I(dir)->root;
5720  	struct btrfs_root *sub_root = root;
5721  	struct btrfs_key location;
5722  	int index;
5723  	int ret = 0;
5724  
5725  	if (dentry->d_name.len > BTRFS_NAME_LEN)
5726  		return ERR_PTR(-ENAMETOOLONG);
5727  
5728  	ret = btrfs_inode_by_name(dir, dentry, &location);
5729  	if (ret < 0)
5730  		return ERR_PTR(ret);
5731  
5732  	if (location.objectid == 0)
5733  		return ERR_PTR(-ENOENT);
5734  
5735  	if (location.type == BTRFS_INODE_ITEM_KEY) {
5736  		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5737  		return inode;
5738  	}
5739  
5740  	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5741  
5742  	index = srcu_read_lock(&root->fs_info->subvol_srcu);
5743  	ret = fixup_tree_root_location(root, dir, dentry,
5744  				       &location, &sub_root);
5745  	if (ret < 0) {
5746  		if (ret != -ENOENT)
5747  			inode = ERR_PTR(ret);
5748  		else
5749  			inode = new_simple_dir(dir->i_sb, &location, sub_root);
5750  	} else {
5751  		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5752  	}
5753  	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5754  
5755  	if (!IS_ERR(inode) && root != sub_root) {
5756  		down_read(&root->fs_info->cleanup_work_sem);
5757  		if (!(inode->i_sb->s_flags & MS_RDONLY))
5758  			ret = btrfs_orphan_cleanup(sub_root);
5759  		up_read(&root->fs_info->cleanup_work_sem);
5760  		if (ret) {
5761  			iput(inode);
5762  			inode = ERR_PTR(ret);
5763  		}
5764  	}
5765  
5766  	return inode;
5767  }
5768  
5769  static int btrfs_dentry_delete(const struct dentry *dentry)
5770  {
5771  	struct btrfs_root *root;
5772  	struct inode *inode = d_inode(dentry);
5773  
5774  	if (!inode && !IS_ROOT(dentry))
5775  		inode = d_inode(dentry->d_parent);
5776  
5777  	if (inode) {
5778  		root = BTRFS_I(inode)->root;
5779  		if (btrfs_root_refs(&root->root_item) == 0)
5780  			return 1;
5781  
5782  		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5783  			return 1;
5784  	}
5785  	return 0;
5786  }
5787  
5788  static void btrfs_dentry_release(struct dentry *dentry)
5789  {
5790  	kfree(dentry->d_fsdata);
5791  }
5792  
5793  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5794  				   unsigned int flags)
5795  {
5796  	struct inode *inode;
5797  
5798  	inode = btrfs_lookup_dentry(dir, dentry);
5799  	if (IS_ERR(inode)) {
5800  		if (PTR_ERR(inode) == -ENOENT)
5801  			inode = NULL;
5802  		else
5803  			return ERR_CAST(inode);
5804  	}
5805  
5806  	return d_splice_alias(inode, dentry);
5807  }
5808  
5809  unsigned char btrfs_filetype_table[] = {
5810  	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5811  };
5812  
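/*
 * Editorial note, not part of the original file: the table above maps
 * on-disk BTRFS_FT_* values (used as the array index) to the DT_*
 * constants readdir reports, e.g. btrfs_filetype_table[BTRFS_FT_DIR]
 * == DT_DIR and btrfs_filetype_table[BTRFS_FT_SYMLINK] == DT_LNK.
 */
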
5813  static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5814  {
5815  	struct inode *inode = file_inode(file);
5816  	struct btrfs_root *root = BTRFS_I(inode)->root;
5817  	struct btrfs_item *item;
5818  	struct btrfs_dir_item *di;
5819  	struct btrfs_key key;
5820  	struct btrfs_key found_key;
5821  	struct btrfs_path *path;
5822  	struct list_head ins_list;
5823  	struct list_head del_list;
5824  	int ret;
5825  	struct extent_buffer *leaf;
5826  	int slot;
5827  	unsigned char d_type;
5828  	int over = 0;
5829  	u32 di_cur;
5830  	u32 di_total;
5831  	u32 di_len;
5832  	int key_type = BTRFS_DIR_INDEX_KEY;
5833  	char tmp_name[32];
5834  	char *name_ptr;
5835  	int name_len;
5836  	int is_curr = 0;	/* ctx->pos points to the current index? */
5837  	bool emitted;
5838  	bool put = false;
5839  
5840  	/* FIXME, use a real flag for deciding about the key type */
5841  	if (root->fs_info->tree_root == root)
5842  		key_type = BTRFS_DIR_ITEM_KEY;
5843  
5844  	if (!dir_emit_dots(file, ctx))
5845  		return 0;
5846  
5847  	path = btrfs_alloc_path();
5848  	if (!path)
5849  		return -ENOMEM;
5850  
5851  	path->reada = READA_FORWARD;
5852  
5853  	if (key_type == BTRFS_DIR_INDEX_KEY) {
5854  		INIT_LIST_HEAD(&ins_list);
5855  		INIT_LIST_HEAD(&del_list);
5856  		put = btrfs_readdir_get_delayed_items(inode, &ins_list,
5857  						      &del_list);
5858  	}
5859  
5860  	key.type = key_type;
5861  	key.offset = ctx->pos;
5862  	key.objectid = btrfs_ino(inode);
5863  
5864  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5865  	if (ret < 0)
5866  		goto err;
5867  
5868  	emitted = false;
5869  	while (1) {
5870  		leaf = path->nodes[0];
5871  		slot = path->slots[0];
5872  		if (slot >= btrfs_header_nritems(leaf)) {
5873  			ret = btrfs_next_leaf(root, path);
5874  			if (ret < 0)
5875  				goto err;
5876  			else if (ret > 0)
5877  				break;
5878  			continue;
5879  		}
5880  
5881  		item = btrfs_item_nr(slot);
5882  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5883  
5884  		if (found_key.objectid != key.objectid)
5885  			break;
5886  		if (found_key.type != key_type)
5887  			break;
5888  		if (found_key.offset < ctx->pos)
5889  			goto next;
5890  		if (key_type == BTRFS_DIR_INDEX_KEY &&
5891  		    btrfs_should_delete_dir_index(&del_list,
5892  						  found_key.offset))
5893  			goto next;
5894  
5895  		ctx->pos = found_key.offset;
5896  		is_curr = 1;
5897  
5898  		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5899  		di_cur = 0;
5900  		di_total = btrfs_item_size(leaf, item);
5901  
5902  		while (di_cur < di_total) {
5903  			struct btrfs_key location;
5904  
5905  			if (verify_dir_item(root, leaf, di))
5906  				break;
5907  
5908  			name_len = btrfs_dir_name_len(leaf, di);
5909  			if (name_len <= sizeof(tmp_name)) {
5910  				name_ptr = tmp_name;
5911  			} else {
5912  				name_ptr = kmalloc(name_len, GFP_KERNEL);
5913  				if (!name_ptr) {
5914  					ret = -ENOMEM;
5915  					goto err;
5916  				}
5917  			}
5918  			read_extent_buffer(leaf, name_ptr,
5919  					   (unsigned long)(di + 1), name_len);
5920  
5921  			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5922  			btrfs_dir_item_key_to_cpu(leaf, di, &location);
5923  
5924  
5925  			/* is this a reference to our own snapshot? If so
5926  			 * skip it.
5927  			 *
5928  			 * In contrast to old kernels, we insert the snapshot's
5929  			 * dir item and dir index after it has been created, so
5930  			 * we won't find a reference to our own snapshot. We
5931  			 * still keep the following code for backward
5932  			 * compatibility.
5933  			 */
5934  			if (location.type == BTRFS_ROOT_ITEM_KEY &&
5935  			    location.objectid == root->root_key.objectid) {
5936  				over = 0;
5937  				goto skip;
5938  			}
5939  			over = !dir_emit(ctx, name_ptr, name_len,
5940  				       location.objectid, d_type);
5941  
5942  skip:
5943  			if (name_ptr != tmp_name)
5944  				kfree(name_ptr);
5945  
5946  			if (over)
5947  				goto nopos;
5948  			emitted = true;
5949  			di_len = btrfs_dir_name_len(leaf, di) +
5950  				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5951  			di_cur += di_len;
5952  			di = (struct btrfs_dir_item *)((char *)di + di_len);
5953  		}
5954  next:
5955  		path->slots[0]++;
5956  	}
5957  
5958  	if (key_type == BTRFS_DIR_INDEX_KEY) {
5959  		if (is_curr)
5960  			ctx->pos++;
5961  		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
5962  		if (ret)
5963  			goto nopos;
5964  	}
5965  
5966  	/*
5967  	 * If we haven't emitted any dir entry, we must not touch ctx->pos as
5968   * it was set to the termination value in a previous call. We assume
5969  	 * that "." and ".." were emitted if we reach this point and set the
5970  	 * termination value as well for an empty directory.
5971  	 */
5972  	if (ctx->pos > 2 && !emitted)
5973  		goto nopos;
5974  
5975  	/* Reached end of directory/root. Bump pos past the last item. */
5976  	ctx->pos++;
5977  
5978  	/*
5979  	 * Stop new entries from being returned after we return the last
5980  	 * entry.
5981  	 *
5982  	 * New directory entries are assigned a strictly increasing
5983  	 * offset.  This means that new entries created during readdir
5984  	 * are *guaranteed* to be seen in the future by that readdir.
5985  	 * This has broken buggy programs which operate on names as
5986  	 * they're returned by readdir.  Until we re-use freed offsets
5987  	 * we have this hack to stop new entries from being returned
5988  	 * under the assumption that they'll never reach this huge
5989  	 * offset.
5990  	 *
5991  	 * This is being careful not to overflow 32bit loff_t unless the
5992  	 * last entry requires it because doing so has broken 32bit apps
5993  	 * in the past.
5994  	 */
5995  	if (key_type == BTRFS_DIR_INDEX_KEY) {
5996  		if (ctx->pos >= INT_MAX)
5997  			ctx->pos = LLONG_MAX;
5998  		else
5999  			ctx->pos = INT_MAX;
6000  	}
6001  nopos:
6002  	ret = 0;
6003  err:
6004  	if (put)
6005  		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6006  	btrfs_free_path(path);
6007  	return ret;
6008  }
6009  
6010  int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6011  {
6012  	struct btrfs_root *root = BTRFS_I(inode)->root;
6013  	struct btrfs_trans_handle *trans;
6014  	int ret = 0;
6015  	bool nolock = false;
6016  
6017  	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6018  		return 0;
6019  
6020  	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
6021  		nolock = true;
6022  
6023  	if (wbc->sync_mode == WB_SYNC_ALL) {
6024  		if (nolock)
6025  			trans = btrfs_join_transaction_nolock(root);
6026  		else
6027  			trans = btrfs_join_transaction(root);
6028  		if (IS_ERR(trans))
6029  			return PTR_ERR(trans);
6030  		ret = btrfs_commit_transaction(trans, root);
6031  	}
6032  	return ret;
6033  }
6034  
6035  /*
6036   * This is somewhat expensive, updating the tree every time the
6037   * inode changes.  But it is most likely to find the inode in cache.
6038   * FIXME: needs more benchmarking... there are no reasons other than performance
6039   * to keep or drop this code.
6040   */
6041  static int btrfs_dirty_inode(struct inode *inode)
6042  {
6043  	struct btrfs_root *root = BTRFS_I(inode)->root;
6044  	struct btrfs_trans_handle *trans;
6045  	int ret;
6046  
6047  	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6048  		return 0;
6049  
6050  	trans = btrfs_join_transaction(root);
6051  	if (IS_ERR(trans))
6052  		return PTR_ERR(trans);
6053  
6054  	ret = btrfs_update_inode(trans, root, inode);
6055  	if (ret == -ENOSPC) {
6056  		/* whoops, let's try again with a full transaction */
6057  		btrfs_end_transaction(trans, root);
6058  		trans = btrfs_start_transaction(root, 1);
6059  		if (IS_ERR(trans))
6060  			return PTR_ERR(trans);
6061  
6062  		ret = btrfs_update_inode(trans, root, inode);
6063  	}
6064  	btrfs_end_transaction(trans, root);
6065  	if (BTRFS_I(inode)->delayed_node)
6066  		btrfs_balance_delayed_items(root);
6067  
6068  	return ret;
6069  }
6070  
6071  /*
6072   * This is a copy of file_update_time.  We need this so we can return error on
6073   * ENOSPC for updating the inode in the case of file write and mmap writes.
6074   */
6075  static int btrfs_update_time(struct inode *inode, struct timespec *now,
6076  			     int flags)
6077  {
6078  	struct btrfs_root *root = BTRFS_I(inode)->root;
6079  
6080  	if (btrfs_root_readonly(root))
6081  		return -EROFS;
6082  
6083  	if (flags & S_VERSION)
6084  		inode_inc_iversion(inode);
6085  	if (flags & S_CTIME)
6086  		inode->i_ctime = *now;
6087  	if (flags & S_MTIME)
6088  		inode->i_mtime = *now;
6089  	if (flags & S_ATIME)
6090  		inode->i_atime = *now;
6091  	return btrfs_dirty_inode(inode);
6092  }
6093  
6094  /*
6095   * find the highest existing sequence number in a directory
6096   * and then set the in-memory index_cnt variable to reflect
6097   * free sequence numbers
6098   */
6099  static int btrfs_set_inode_index_count(struct inode *inode)
6100  {
6101  	struct btrfs_root *root = BTRFS_I(inode)->root;
6102  	struct btrfs_key key, found_key;
6103  	struct btrfs_path *path;
6104  	struct extent_buffer *leaf;
6105  	int ret;
6106  
6107  	key.objectid = btrfs_ino(inode);
6108  	key.type = BTRFS_DIR_INDEX_KEY;
6109  	key.offset = (u64)-1;
6110  
6111  	path = btrfs_alloc_path();
6112  	if (!path)
6113  		return -ENOMEM;
6114  
6115  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6116  	if (ret < 0)
6117  		goto out;
6118  	/* FIXME: we should be able to handle this */
6119  	if (ret == 0)
6120  		goto out;
6121  	ret = 0;
6122  
6123  	/*
6124  	 * MAGIC NUMBER EXPLANATION:
6125  	 * since we search a directory based on f_pos, and '.' and '..'
6126  	 * have f_pos of 0 and 1 respectively, every other entry has to
6127  	 * start at 2
6128  	 */
6129  	if (path->slots[0] == 0) {
6130  		BTRFS_I(inode)->index_cnt = 2;
6131  		goto out;
6132  	}
6133  
6134  	path->slots[0]--;
6135  
6136  	leaf = path->nodes[0];
6137  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6138  
6139  	if (found_key.objectid != btrfs_ino(inode) ||
6140  	    found_key.type != BTRFS_DIR_INDEX_KEY) {
6141  		BTRFS_I(inode)->index_cnt = 2;
6142  		goto out;
6143  	}
6144  
6145  	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
6146  out:
6147  	btrfs_free_path(path);
6148  	return ret;
6149  }
6150  
6151  /*
6152   * helper to find a free sequence number in a given directory.  This current
6153   * code is very simple; later versions will do smarter things in the btree
6154   */
6155  int btrfs_set_inode_index(struct inode *dir, u64 *index)
6156  {
6157  	int ret = 0;
6158  
6159  	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
6160  		ret = btrfs_inode_delayed_dir_index_count(dir);
6161  		if (ret) {
6162  			ret = btrfs_set_inode_index_count(dir);
6163  			if (ret)
6164  				return ret;
6165  		}
6166  	}
6167  
6168  	*index = BTRFS_I(dir)->index_cnt;
6169  	BTRFS_I(dir)->index_cnt++;
6170  
6171  	return ret;
6172  }
6173  
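/*
 * Editorial note, not part of the original file: worked example of the
 * numbering above.  A fresh directory starts with index_cnt == 2
 * ('.' and '..' own f_pos 0 and 1), so its first three new entries get
 * dir index offsets 2, 3 and 4; a later btrfs_set_inode_index_count()
 * would find offset 4 as the highest and reset index_cnt to 5.
 */
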
6174  static int btrfs_insert_inode_locked(struct inode *inode)
6175  {
6176  	struct btrfs_iget_args args;
6177  	args.location = &BTRFS_I(inode)->location;
6178  	args.root = BTRFS_I(inode)->root;
6179  
6180  	return insert_inode_locked4(inode,
6181  		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6182  		   btrfs_find_actor, &args);
6183  }
6184  
6185  static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6186  				     struct btrfs_root *root,
6187  				     struct inode *dir,
6188  				     const char *name, int name_len,
6189  				     u64 ref_objectid, u64 objectid,
6190  				     umode_t mode, u64 *index)
6191  {
6192  	struct inode *inode;
6193  	struct btrfs_inode_item *inode_item;
6194  	struct btrfs_key *location;
6195  	struct btrfs_path *path;
6196  	struct btrfs_inode_ref *ref;
6197  	struct btrfs_key key[2];
6198  	u32 sizes[2];
6199  	int nitems = name ? 2 : 1;
6200  	unsigned long ptr;
6201  	int ret;
6202  
6203  	path = btrfs_alloc_path();
6204  	if (!path)
6205  		return ERR_PTR(-ENOMEM);
6206  
6207  	inode = new_inode(root->fs_info->sb);
6208  	if (!inode) {
6209  		btrfs_free_path(path);
6210  		return ERR_PTR(-ENOMEM);
6211  	}
6212  
6213  	/*
6214  	 * For O_TMPFILE, set the link count to 0, so that from this point
6215  	 * on we fill in an inode item with the correct link count.
6216  	 */
6217  	if (!name)
6218  		set_nlink(inode, 0);
6219  
6220  	/*
6221  	 * we have to initialize this early, so we can reclaim the inode
6222  	 * number if we fail afterwards in this function.
6223  	 */
6224  	inode->i_ino = objectid;
6225  
6226  	if (dir && name) {
6227  		trace_btrfs_inode_request(dir);
6228  
6229  		ret = btrfs_set_inode_index(dir, index);
6230  		if (ret) {
6231  			btrfs_free_path(path);
6232  			iput(inode);
6233  			return ERR_PTR(ret);
6234  		}
6235  	} else if (dir) {
6236  		*index = 0;
6237  	}
6238  	/*
6239  	 * index_cnt is ignored for everything but a dir,
6240  	 * btrfs_set_inode_index_count() has an explanation for the magic
6241  	 * number
6242  	 */
6243  	BTRFS_I(inode)->index_cnt = 2;
6244  	BTRFS_I(inode)->dir_index = *index;
6245  	BTRFS_I(inode)->root = root;
6246  	BTRFS_I(inode)->generation = trans->transid;
6247  	inode->i_generation = BTRFS_I(inode)->generation;
6248  
6249  	/*
6250  	 * We could have gotten an inode number from somebody who was fsynced
6251  	 * and then removed in this same transaction, so let's just set full
6252  	 * sync since it will be a full sync anyway and this will blow away the
6253  	 * old info in the log.
6254  	 */
6255  	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6256  
6257  	key[0].objectid = objectid;
6258  	key[0].type = BTRFS_INODE_ITEM_KEY;
6259  	key[0].offset = 0;
6260  
6261  	sizes[0] = sizeof(struct btrfs_inode_item);
6262  
6263  	if (name) {
6264  		/*
6265  		 * Start new inodes with an inode_ref. This is slightly more
6266  		 * efficient for small numbers of hard links since they will
6267  		 * be packed into one item. Extended refs will kick in if we
6268  		 * add more hard links than can fit in the ref item.
6269  		 */
6270  		key[1].objectid = objectid;
6271  		key[1].type = BTRFS_INODE_REF_KEY;
6272  		key[1].offset = ref_objectid;
6273  
6274  		sizes[1] = name_len + sizeof(*ref);
6275  	}
6276  
6277  	location = &BTRFS_I(inode)->location;
6278  	location->objectid = objectid;
6279  	location->offset = 0;
6280  	location->type = BTRFS_INODE_ITEM_KEY;
6281  
6282  	ret = btrfs_insert_inode_locked(inode);
6283  	if (ret < 0)
6284  		goto fail;
6285  
6286  	path->leave_spinning = 1;
6287  	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6288  	if (ret != 0)
6289  		goto fail_unlock;
6290  
6291  	inode_init_owner(inode, dir, mode);
6292  	inode_set_bytes(inode, 0);
6293  
6294  	inode->i_mtime = current_time(inode);
6295  	inode->i_atime = inode->i_mtime;
6296  	inode->i_ctime = inode->i_mtime;
6297  	BTRFS_I(inode)->i_otime = inode->i_mtime;
6298  
6299  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6300  				  struct btrfs_inode_item);
6301  	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
6302  			     sizeof(*inode_item));
6303  	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6304  
6305  	if (name) {
6306  		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6307  				     struct btrfs_inode_ref);
6308  		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6309  		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6310  		ptr = (unsigned long)(ref + 1);
6311  		write_extent_buffer(path->nodes[0], name, ptr, name_len);
6312  	}
6313  
6314  	btrfs_mark_buffer_dirty(path->nodes[0]);
6315  	btrfs_free_path(path);
6316  
6317  	btrfs_inherit_iflags(inode, dir);
6318  
6319  	if (S_ISREG(mode)) {
6320  		if (btrfs_test_opt(root->fs_info, NODATASUM))
6321  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6322  		if (btrfs_test_opt(root->fs_info, NODATACOW))
6323  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6324  				BTRFS_INODE_NODATASUM;
6325  	}
6326  
6327  	inode_tree_add(inode);
6328  
6329  	trace_btrfs_inode_new(inode);
6330  	btrfs_set_inode_last_trans(trans, inode);
6331  
6332  	btrfs_update_root_times(trans, root);
6333  
6334  	ret = btrfs_inode_inherit_props(trans, inode, dir);
6335  	if (ret)
6336  		btrfs_err(root->fs_info,
6337  			  "error inheriting props for ino %llu (root %llu): %d",
6338  			  btrfs_ino(inode), root->root_key.objectid, ret);
6339  
6340  	return inode;
6341  
6342  fail_unlock:
6343  	unlock_new_inode(inode);
6344  fail:
6345  	if (dir && name)
6346  		BTRFS_I(dir)->index_cnt--;
6347  	btrfs_free_path(path);
6348  	iput(inode);
6349  	return ERR_PTR(ret);
6350  }
6351  
6352  static inline u8 btrfs_inode_type(struct inode *inode)
6353  {
6354  	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
6355  }
6356  
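/*
 * Editorial note, not part of the original file: btrfs_inode_type() is
 * what btrfs_add_link() below passes to btrfs_insert_dir_item(), so
 * the type byte stored in each directory entry always mirrors the
 * inode's i_mode file type bits.
 */
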
6357  /*
6358   * utility function to add 'inode' into 'parent_inode' with
6359   * a given name and a given sequence number.
6360   * If 'add_backref' is true, also insert a backref from the
6361   * inode to the parent directory.
6362   */
6363  int btrfs_add_link(struct btrfs_trans_handle *trans,
6364  		   struct inode *parent_inode, struct inode *inode,
6365  		   const char *name, int name_len, int add_backref, u64 index)
6366  {
6367  	int ret = 0;
6368  	struct btrfs_key key;
6369  	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
6370  	u64 ino = btrfs_ino(inode);
6371  	u64 parent_ino = btrfs_ino(parent_inode);
6372  
6373  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6374  		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
6375  	} else {
6376  		key.objectid = ino;
6377  		key.type = BTRFS_INODE_ITEM_KEY;
6378  		key.offset = 0;
6379  	}
6380  
6381  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6382  		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
6383  					 key.objectid, root->root_key.objectid,
6384  					 parent_ino, index, name, name_len);
6385  	} else if (add_backref) {
6386  		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6387  					     parent_ino, index);
6388  	}
6389  
6390  	/* Nothing to clean up yet */
6391  	if (ret)
6392  		return ret;
6393  
6394  	ret = btrfs_insert_dir_item(trans, root, name, name_len,
6395  				    parent_inode, &key,
6396  				    btrfs_inode_type(inode), index);
6397  	if (ret == -EEXIST || ret == -EOVERFLOW)
6398  		goto fail_dir_item;
6399  	else if (ret) {
6400  		btrfs_abort_transaction(trans, ret);
6401  		return ret;
6402  	}
6403  
6404  	btrfs_i_size_write(parent_inode, parent_inode->i_size +
6405  			   name_len * 2);
6406  	inode_inc_iversion(parent_inode);
6407  	parent_inode->i_mtime = parent_inode->i_ctime =
6408  		current_time(parent_inode);
6409  	ret = btrfs_update_inode(trans, root, parent_inode);
6410  	if (ret)
6411  		btrfs_abort_transaction(trans, ret);
6412  	return ret;
6413  
6414  fail_dir_item:
6415  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6416  		u64 local_index;
6417  		int err;
6418  		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
6419  				 key.objectid, root->root_key.objectid,
6420  				 parent_ino, &local_index, name, name_len);
6421  
6422  	} else if (add_backref) {
6423  		u64 local_index;
6424  		int err;
6425  
6426  		err = btrfs_del_inode_ref(trans, root, name, name_len,
6427  					  ino, parent_ino, &local_index);
6428  	}
6429  	return ret;
6430  }
6431  
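/*
 * Editorial note, not part of the original file: the "name_len * 2"
 * bump above reflects that every entry is stored twice, as a DIR_ITEM
 * (keyed by name hash) and as a DIR_INDEX (keyed by the sequence
 * number from btrfs_set_inode_index()); a btrfs directory's i_size is
 * the sum of the name lengths of both kinds of items.
 */
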
6432  static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6433  			    struct inode *dir, struct dentry *dentry,
6434  			    struct inode *inode, int backref, u64 index)
6435  {
6436  	int err = btrfs_add_link(trans, dir, inode,
6437  				 dentry->d_name.name, dentry->d_name.len,
6438  				 backref, index);
6439  	if (err > 0)
6440  		err = -EEXIST;
6441  	return err;
6442  }
6443  
6444  static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6445  			umode_t mode, dev_t rdev)
6446  {
6447  	struct btrfs_trans_handle *trans;
6448  	struct btrfs_root *root = BTRFS_I(dir)->root;
6449  	struct inode *inode = NULL;
6450  	int err;
6451  	int drop_inode = 0;
6452  	u64 objectid;
6453  	u64 index = 0;
6454  
6455  	/*
6456  	 * 2 for inode item and ref
6457  	 * 2 for dir items
6458  	 * 1 for xattr if selinux is on
6459  	 */
6460  	trans = btrfs_start_transaction(root, 5);
6461  	if (IS_ERR(trans))
6462  		return PTR_ERR(trans);
6463  
6464  	err = btrfs_find_free_ino(root, &objectid);
6465  	if (err)
6466  		goto out_unlock;
6467  
6468  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6469  				dentry->d_name.len, btrfs_ino(dir), objectid,
6470  				mode, &index);
6471  	if (IS_ERR(inode)) {
6472  		err = PTR_ERR(inode);
6473  		goto out_unlock;
6474  	}
6475  
6476  	/*
6477  	* If the active LSM wants to access the inode during
6478  	* d_instantiate it needs these. Smack checks to see
6479  	* if the filesystem supports xattrs by looking at the
6480  	* ops vector.
6481  	*/
6482  	inode->i_op = &btrfs_special_inode_operations;
6483  	init_special_inode(inode, inode->i_mode, rdev);
6484  
6485  	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6486  	if (err)
6487  		goto out_unlock_inode;
6488  
6489  	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6490  	if (err) {
6491  		goto out_unlock_inode;
6492  	} else {
6493  		btrfs_update_inode(trans, root, inode);
6494  		unlock_new_inode(inode);
6495  		d_instantiate(dentry, inode);
6496  	}
6497  
6498  out_unlock:
6499  	btrfs_end_transaction(trans, root);
6500  	btrfs_balance_delayed_items(root);
6501  	btrfs_btree_balance_dirty(root);
6502  	if (drop_inode) {
6503  		inode_dec_link_count(inode);
6504  		iput(inode);
6505  	}
6506  	return err;
6507  
6508  out_unlock_inode:
6509  	drop_inode = 1;
6510  	unlock_new_inode(inode);
6511  	goto out_unlock;
6512  
6513  }
6514  
6515  static int btrfs_create(struct inode *dir, struct dentry *dentry,
6516  			umode_t mode, bool excl)
6517  {
6518  	struct btrfs_trans_handle *trans;
6519  	struct btrfs_root *root = BTRFS_I(dir)->root;
6520  	struct inode *inode = NULL;
6521  	int drop_inode_on_err = 0;
6522  	int err;
6523  	u64 objectid;
6524  	u64 index = 0;
6525  
6526  	/*
6527  	 * 2 for inode item and ref
6528  	 * 2 for dir items
6529  	 * 1 for xattr if selinux is on
6530  	 */
6531  	trans = btrfs_start_transaction(root, 5);
6532  	if (IS_ERR(trans))
6533  		return PTR_ERR(trans);
6534  
6535  	err = btrfs_find_free_ino(root, &objectid);
6536  	if (err)
6537  		goto out_unlock;
6538  
6539  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6540  				dentry->d_name.len, btrfs_ino(dir), objectid,
6541  				mode, &index);
6542  	if (IS_ERR(inode)) {
6543  		err = PTR_ERR(inode);
6544  		goto out_unlock;
6545  	}
6546  	drop_inode_on_err = 1;
6547  	/*
6548  	* If the active LSM wants to access the inode during
6549  	* d_instantiate it needs these. Smack checks to see
6550  	* if the filesystem supports xattrs by looking at the
6551  	* ops vector.
6552  	*/
6553  	inode->i_fop = &btrfs_file_operations;
6554  	inode->i_op = &btrfs_file_inode_operations;
6555  	inode->i_mapping->a_ops = &btrfs_aops;
6556  
6557  	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6558  	if (err)
6559  		goto out_unlock_inode;
6560  
6561  	err = btrfs_update_inode(trans, root, inode);
6562  	if (err)
6563  		goto out_unlock_inode;
6564  
6565  	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6566  	if (err)
6567  		goto out_unlock_inode;
6568  
6569  	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6570  	unlock_new_inode(inode);
6571  	d_instantiate(dentry, inode);
6572  
6573  out_unlock:
6574  	btrfs_end_transaction(trans, root);
6575  	if (err && drop_inode_on_err) {
6576  		inode_dec_link_count(inode);
6577  		iput(inode);
6578  	}
6579  	btrfs_balance_delayed_items(root);
6580  	btrfs_btree_balance_dirty(root);
6581  	return err;
6582  
6583  out_unlock_inode:
6584  	unlock_new_inode(inode);
6585  	goto out_unlock;
6586  
6587  }
6588  
6589  static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6590  		      struct dentry *dentry)
6591  {
6592  	struct btrfs_trans_handle *trans = NULL;
6593  	struct btrfs_root *root = BTRFS_I(dir)->root;
6594  	struct inode *inode = d_inode(old_dentry);
6595  	u64 index;
6596  	int err;
6597  	int drop_inode = 0;
6598  
6599  	/* do not allow sys_link's with other subvols of the same device */
6600  	if (root->objectid != BTRFS_I(inode)->root->objectid)
6601  		return -EXDEV;
6602  
6603  	if (inode->i_nlink >= BTRFS_LINK_MAX)
6604  		return -EMLINK;
6605  
6606  	err = btrfs_set_inode_index(dir, &index);
6607  	if (err)
6608  		goto fail;
6609  
6610  	/*
6611  	 * 2 items for inode and inode ref
6612  	 * 2 items for dir items
6613  	 * 1 item for parent inode
6614  	 */
6615  	trans = btrfs_start_transaction(root, 5);
6616  	if (IS_ERR(trans)) {
6617  		err = PTR_ERR(trans);
6618  		trans = NULL;
6619  		goto fail;
6620  	}
6621  
6622  	/* There are several dir indexes for this inode, clear the cache. */
6623  	BTRFS_I(inode)->dir_index = 0ULL;
6624  	inc_nlink(inode);
6625  	inode_inc_iversion(inode);
6626  	inode->i_ctime = current_time(inode);
6627  	ihold(inode);
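	/*
	 * A new name is being added, so make the next fsync of this
	 * inode copy everything into the log (including the new ref).
	 */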
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);

	if (err) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;
		err = btrfs_update_inode(trans, root, inode);
		if (err)
			goto fail;
		if (inode->i_nlink == 1) {
			/*
			 * If the new hard link count is 1, it's a file created
			 * with the open(2) O_TMPFILE flag.
			 */
			err = btrfs_orphan_del(trans, inode);
			if (err)
				goto fail;
		}
		d_instantiate(dentry, inode);
		btrfs_log_new_name(trans, inode, NULL, parent);
	}

	btrfs_balance_delayed_items(root);
fail:
	if (trans)
		btrfs_end_transaction(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root);
	return err;
}

static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int err = 0;
	int drop_on_err = 0;
	u64 objectid = 0;
	u64 index = 0;

	/*
	 * 2 items for inode and ref
	 * 2 items for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_fail;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFDIR | mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_fail;
	}

	drop_on_err = 1;
	/* these must be set before we unlock the inode */
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_fail_inode;

	btrfs_i_size_write(inode, 0);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_fail_inode;

	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
			     dentry->d_name.len, 0, index);
	if (err)
		goto out_fail_inode;

	d_instantiate(dentry, inode);
	/*
	 * mkdir is special.  We're unlocking after we call d_instantiate
	 * to avoid a race with nfsd calling d_instantiate.
	 */
	unlock_new_inode(inode);
	drop_on_err = 0;

out_fail:
	btrfs_end_transaction(trans, root);
	if (drop_on_err) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	return err;

out_fail_inode:
	unlock_new_inode(inode);
	goto out_fail;
}

/* Find next extent map of a given extent map, caller needs to ensure locks */
static struct extent_map *next_extent_map(struct extent_map *em)
{
	struct rb_node *next;

	next = rb_next(&em->rb_node);
	if (!next)
		return NULL;
	return container_of(next, struct extent_map, rb_node);
}

static struct extent_map *prev_extent_map(struct extent_map *em)
{
	struct rb_node *prev;

	prev = rb_prev(&em->rb_node);
	if (!prev)
		return NULL;
	return container_of(prev, struct extent_map, rb_node);
}

/* helper for btrfs_get_extent.  Given an existing extent in the tree
 * (the nearest extent to map_start) and a new extent to insert, deal
 * with any overlap and insert the best-fitting new extent into the
 * tree.
 */
static int merge_extent_mapping(struct extent_map_tree *em_tree,
				struct extent_map *existing,
				struct extent_map *em,
				u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

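	/*
	 * Clamp the new extent to the gap between its neighbours so it
	 * cannot overlap anything already in the tree.
	 */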
	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len -= start_diff;
	}
	return add_extent_mapping(em_tree, em, 0);
}

static noinline int uncompress_inline(struct btrfs_path *path,
				      struct page *page,
				      size_t pg_offset, u64 extent_offset,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
	int compress_type;

	WARN_ON(pg_offset != 0);
	compress_type = btrfs_file_extent_compression(leaf, item);
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
	inline_size = btrfs_file_extent_inline_item_len(leaf,
					btrfs_item_nr(path->slots[0]));
	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
	ret = btrfs_decompress(compress_type, tmp, page,
			       extent_offset, inline_size, max_size);

	/*
	 * decompression code contains a memset to fill in any space between the end
	 * of the uncompressed data and the end of max_size in case the decompressed
	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
	 * the end of an inline extent and the beginning of the next block, so we
	 * cover that region here.
	 */

	if (max_size + pg_offset < PAGE_SIZE) {
		char *map = kmap(page);
		memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
		kunmap(page);
	}
	kfree(tmp);
	return ret;
}

/*
 * a bit scary, this does extent mapping from logical file offset to the disk.
 * the ugly parts come from merging extents from the disk with the in-ram
 * representation.  This gets more complex because of the data=ordered code,
 * where the in-ram extents might be locked pending data=ordered completion.
 *
 * This also copies inline extents directly into the page.
 */

struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
				    size_t pg_offset, u64 start, u64 len,
				    int create)
{
	int ret;
	int err = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	u32 found_type;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_trans_handle *trans = NULL;
	const bool new_inline = !page || create;

again:
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em)
		em->bdev = root->fs_info->fs_devices->latest_bdev;
	read_unlock(&em_tree->lock);

	if (em) {
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		err = -ENOMEM;
		goto out;
	}
	em->bdev = root->fs_info->fs_devices->latest_bdev;
	em->start = EXTENT_MAP_HOLE;
	em->orig_start = EXTENT_MAP_HOLE;
	em->len = (u64)-1;
	em->block_len = (u64)-1;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			err = -ENOMEM;
			goto out;
		}
		/*
		 * Chances are we'll be called again, so go ahead and do
		 * readahead
		 */
		path->reada = READA_FORWARD;
	}

	ret = btrfs_lookup_file_extent(trans, root, path,
				       objectid, start, trans != NULL);
	if (ret < 0) {
		err = ret;
		goto out;
	}

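	/*
	 * ret > 0 means the key was not found exactly; the previous item
	 * may still be the extent that covers our range, so step back
	 * one slot and check it.
	 */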
	if (ret != 0) {
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	/* are we inside the extent that was found? */
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	found_type = found_key.type;
	if (found_key.objectid != objectid ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we back up past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
	}

	found_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		extent_end = extent_start +
		       btrfs_file_extent_num_bytes(leaf, item);
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size_t size;
		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
		extent_end = ALIGN(extent_start + size, root->sectorsize);
	}
next:
	if (start >= extent_end) {
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				err = ret;
				goto out;
			}
			if (ret > 0)
				goto not_found;
			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;
		em->start = start;
		em->orig_start = start;
		em->len = found_key.offset - start;
		goto not_found_em;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		unsigned long ptr;
		char *map;
		size_t size;
		size_t extent_offset;
		size_t copy_size;

		if (new_inline)
			goto out;

		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
		extent_offset = page_offset(page) + pg_offset - extent_start;
		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
				  size - extent_offset);
		em->start = extent_start + extent_offset;
		em->len = ALIGN(copy_size, root->sectorsize);
		em->orig_block_len = em->len;
		em->orig_start = em->start;
		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
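		/*
		 * For a plain read, copy (decompressing if needed) the
		 * inline data into the page and zero the rest of the block.
		 */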
		if (create == 0 && !PageUptodate(page)) {
			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
				ret = uncompress_inline(path, page, pg_offset,
							extent_offset, item);
				if (ret) {
					err = ret;
					goto out;
				}
			} else {
				map = kmap(page);
				read_extent_buffer(leaf, map + pg_offset, ptr,
						   copy_size);
				if (pg_offset + copy_size < PAGE_SIZE) {
					memset(map + pg_offset + copy_size, 0,
					       PAGE_SIZE - pg_offset -
					       copy_size);
				}
				kunmap(page);
			}
			flush_dcache_page(page);
		} else if (create && PageUptodate(page)) {
			BUG();
			if (!trans) {
				kunmap(page);
				free_extent_map(em);
				em = NULL;

				btrfs_release_path(path);
				trans = btrfs_join_transaction(root);

				if (IS_ERR(trans))
					return ERR_CAST(trans);
				goto again;
			}
			map = kmap(page);
			write_extent_buffer(leaf, map + pg_offset, ptr,
					    copy_size);
			kunmap(page);
			btrfs_mark_buffer_dirty(leaf);
		}
		set_extent_uptodate(io_tree, em->start,
				    extent_map_end(em) - 1, NULL, GFP_NOFS);
		goto insert;
	}
not_found:
	em->start = start;
	em->orig_start = start;
	em->len = len;
not_found_em:
	em->block_start = EXTENT_MAP_HOLE;
	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
	btrfs_release_path(path);
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(root->fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		err = -EIO;
		goto out;
	}

	err = 0;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	/* it is possible that someone inserted the extent into the tree
	 * while we had the lock dropped.  It is also possible that
	 * an overlapping map exists in the tree
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		ret = 0;

		existing = search_extent_mapping(em_tree, start, len);
		/*
		 * existing will always be non-NULL, since there must be an
		 * extent causing the -EEXIST.
		 */
		if (existing->start == em->start &&
		    extent_map_end(existing) == extent_map_end(em) &&
		    em->block_start == existing->block_start) {
			/*
			 * these two extents are the same, it happens
			 * with inlines especially
			 */
			free_extent_map(em);
			em = existing;
			err = 0;

		} else if (start >= extent_map_end(existing) ||
		    start <= existing->start) {
			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps
			 */
			err = merge_extent_mapping(em_tree, existing,
						   em, start);
			free_extent_map(existing);
			if (err) {
				free_extent_map(em);
				em = NULL;
			}
		} else {
			free_extent_map(em);
			em = existing;
			err = 0;
		}
	}
	write_unlock(&em_tree->lock);
out:

	trace_btrfs_get_extent(root, em);

	btrfs_free_path(path);
	if (trans) {
		ret = btrfs_end_transaction(trans, root);
		if (!err)
			err = ret;
	}
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	BUG_ON(!em); /* Error is always set */
	return em;
}

struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
					   size_t pg_offset, u64 start, u64 len,
					   int create)
{
	struct extent_map *em;
	struct extent_map *hole_em = NULL;
	u64 range_start = start;
	u64 end;
	u64 found;
	u64 found_end;
	int err = 0;

	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
	if (IS_ERR(em))
		return em;
	if (em) {
		/*
		 * if our em maps to
		 * -  a hole or
		 * -  a pre-alloc extent,
		 * there might actually be delalloc bytes behind it.
		 */
		if (em->block_start != EXTENT_MAP_HOLE &&
		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			return em;
		else
			hole_em = em;
	}

	/* check to see if we've wrapped (len == -1 or similar) */
	end = start + len;
	if (end < start)
		end = (u64)-1;
	else
		end -= 1;

	em = NULL;

	/* ok, we didn't find anything, let's look for delalloc */
	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
				 end, len, EXTENT_DELALLOC, 1);
	found_end = range_start + found;
	if (found_end < range_start)
		found_end = (u64)-1;

	/*
	 * we didn't find anything useful, return
	 * the original results from get_extent()
	 */
	if (range_start > end || found_end <= start) {
		em = hole_em;
		hole_em = NULL;
		goto out;
	}

	/* adjust the range_start to make sure it doesn't
	 * go backwards from the start they passed in
	 */
	range_start = max(start, range_start);
	found = found_end - range_start;

	if (found > 0) {
		u64 hole_start = start;
		u64 hole_len = len;

		em = alloc_extent_map();
		if (!em) {
			err = -ENOMEM;
			goto out;
		}
		/*
		 * when btrfs_get_extent can't find anything it
		 * returns one huge hole
		 *
		 * make sure what it found really fits our range, and
		 * adjust to make sure it is based on the start from
		 * the caller
		 */
		if (hole_em) {
			u64 calc_end = extent_map_end(hole_em);

			if (calc_end <= start || (hole_em->start > end)) {
				free_extent_map(hole_em);
				hole_em = NULL;
			} else {
				hole_start = max(hole_em->start, start);
				hole_len = calc_end - hole_start;
			}
		}
		em->bdev = NULL;
		if (hole_em && range_start > hole_start) {
			/* our hole starts before our delalloc, so we
			 * have to return just the parts of the hole
			 * that go until the delalloc starts
			 */
			em->len = min(hole_len,
				      range_start - hole_start);
			em->start = hole_start;
			em->orig_start = hole_start;
			/*
			 * don't adjust block start at all,
			 * it is fixed at EXTENT_MAP_HOLE
			 */
			em->block_start = hole_em->block_start;
			em->block_len = hole_len;
			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		} else {
			em->start = range_start;
			em->len = found;
			em->orig_start = range_start;
			em->block_start = EXTENT_MAP_DELALLOC;
			em->block_len = found;
		}
	} else if (hole_em) {
		return hole_em;
	}
out:

	free_extent_map(hole_em);
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	return em;
}

static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
						  const u64 start,
						  const u64 len,
						  const u64 orig_start,
						  const u64 block_start,
						  const u64 block_len,
						  const u64 orig_block_len,
						  const u64 ram_bytes,
						  const int type)
{
	struct extent_map *em = NULL;
	int ret;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = create_pinned_em(inode, start, len, orig_start,
				      block_start, block_len, orig_block_len,
				      ram_bytes, type);
		if (IS_ERR(em))
			goto out;
	}
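	/*
	 * The ordered extent is what the endio handlers will look up
	 * and complete once the write finishes.
	 */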
	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
					   len, block_len, type);
	if (ret) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_cache(inode, start,
						start + len - 1, 0);
		}
		em = ERR_PTR(ret);
	}
out:

	return em;
}

static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = get_extent_allocation_hint(inode, start, len);
	ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
				   alloc_hint, &ins, 1, 1);
	if (ret)
		return ERR_PTR(ret);

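	/*
	 * Wrap the freshly reserved extent in a pinned extent map plus
	 * an ordered extent; hand the reservation back if that fails.
	 */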
	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
				     ins.objectid, ins.offset, ins.offset,
				     ins.offset, 0);
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);

	return em;
}

/*
 * returns 1 when the nocow is safe, < 0 on error, 0 if the
 * block must be cow'd
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 disk_bytenr;
	u64 backref_offset;
	u64 extent_end;
	u64 num_bytes;
	int slot;
	int found_type;
	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       offset, 0);
	if (ret < 0)
		goto out;

	slot = path->slots[0];
	if (ret == 1) {
		if (slot == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		slot--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != btrfs_ino(inode) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (found_type != BTRFS_FILE_EXTENT_REG &&
	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
		/* not a regular extent, must cow */
		goto out;
	}

	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
		goto out;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (extent_end <= offset)
		goto out;

	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	if (disk_bytenr == 0)
		goto out;

	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

	backref_offset = btrfs_file_extent_offset(leaf, fi);

	if (orig_start) {
		*orig_start = key.offset - backref_offset;
		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	}

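	/* can't nocow into a read-only block group, must cow */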
	if (btrfs_extent_readonly(root, disk_bytenr))
		goto out;

	num_bytes = min(offset + *len, extent_end) - offset;
	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
		ret = test_range_bit(io_tree, offset, range_end,
				     EXTENT_DELALLOC, 0, NULL);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	btrfs_release_path(path);

	/*
	 * look for other files referencing this extent, if we
	 * find any we must cow
	 */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = 0;
		goto out;
	}

	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
				    key.offset - backref_offset, disk_bytenr);
	btrfs_end_transaction(trans, root);
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * adjust disk_bytenr and num_bytes to cover just the bytes
	 * in this extent we are about to write.  If there
	 * are any csums in that range we have to cow in order
	 * to keep the csums correct
	 */
	disk_bytenr += backref_offset;
	disk_bytenr += offset - key.offset;
	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
		goto out;
	/*
	 * all of the above have passed, it is safe to overwrite this extent
	 * without cow
	 */
	*len = num_bytes;
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
{
	struct radix_tree_root *root = &inode->i_mapping->page_tree;
	bool found = false;
	void **pagep = NULL;
	struct page *page = NULL;
	unsigned long start_idx;
	unsigned long end_idx;

	start_idx = start >> PAGE_SHIFT;

	/*
	 * end is the last byte in the last page.  end == start is legal
	 */
	end_idx = end >> PAGE_SHIFT;

	rcu_read_lock();

	/*
	 * Most of the code in this while loop is lifted from
	 * find_get_page.  It's been modified to begin searching from a
	 * page and return just the first page found in that range.  If the
	 * found idx is less than or equal to the end idx then we know that
	 * a page exists.  If no pages are found or if those pages are
	 * outside of the range then we're fine (yay!)
	 */
	while (page == NULL &&
	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			break;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				page = NULL;
				continue;
			}
			/*
			 * Otherwise, shmem/tmpfs must be storing a swap entry
			 * here as an exceptional entry: so return it without
			 * attempting to raise page count.
			 */
			page = NULL;
			break; /* TODO: Is this relevant for this use case? */
		}

		if (!page_cache_get_speculative(page)) {
			page = NULL;
			continue;
		}

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			put_page(page);
			page = NULL;
		}
	}

	if (page) {
		if (page->index <= end_idx)
			found = true;
		put_page(page);
	}

	rcu_read_unlock();
	return found;
}

static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state, int writing)
{
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 cached_state);
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there's no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing ||
		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
			break;

		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     cached_state, GFP_NOFS);

		if (ordered) {
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we cannot wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point we have already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range has started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(inode, ordered, 1);
			else
				ret = -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readpages() (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readpages() wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	return ret;
}

static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	em_tree = &BTRFS_I(inode)->extent_tree;
	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->orig_start = orig_start;
	em->mod_start = start;
	em->mod_len = len;
	em->len = len;
	em->block_len = block_len;
	em->block_start = block_start;
	em->bdev = root->fs_info->fs_devices->latest_bdev;
	em->orig_block_len = orig_block_len;
	em->ram_bytes = ram_bytes;
	em->generation = -1;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
	if (type == BTRFS_ORDERED_PREALLOC)
		set_bit(EXTENT_FLAG_FILLING, &em->flags);

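	/*
	 * Keep dropping the cached range until the new mapping goes in;
	 * a racing reader may have re-inserted an overlapping extent map.
	 */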
	do {
		btrfs_drop_extent_cache(inode, em->start,
				em->start + em->len - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 1);
		write_unlock(&em_tree->lock);
	} while (ret == -EEXIST);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

	return em;
}

static void adjust_dio_outstanding_extents(struct inode *inode,
					   struct btrfs_dio_data *dio_data,
					   const u64 len)
{
	unsigned num_extents;

	num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
					   BTRFS_MAX_EXTENT_SIZE);
	/*
	 * If we have an outstanding_extents count still set then we're
	 * within our reservation, otherwise we need to adjust our inode
	 * counter appropriately.
	 */
	if (dio_data->outstanding_extents >= num_extents) {
		dio_data->outstanding_extents -= num_extents;
	} else {
		/*
		 * If the dio write length has been split because there was
		 * not enough contiguous space, we need to compensate our
		 * inode counter appropriately.
		 */
		u64 num_needed = num_extents - dio_data->outstanding_extents;

		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents += num_needed;
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = NULL;
	u64 start = iblock << inode->i_blkbits;
	u64 lockstart, lockend;
	u64 len = bh_result->b_size;
	int unlock_bits = EXTENT_LOCKED;
	int ret = 0;

	if (create)
		unlock_bits |= EXTENT_DIRTY;
	else
		len = min_t(u64, len, root->sectorsize);

	lockstart = start;
	lockend = start + len - 1;

	if (current->journal_info) {
		/*
		 * Need to pull our outstanding extents and set journal_info to NULL so
		 * that anything that needs to check if there's a transaction doesn't get
		 * confused.
		 */
		dio_data = current->journal_info;
		current->journal_info = NULL;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fall back to buffered.
	 */
	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
			       create)) {
		ret = -ENOTBLK;
		goto err;
	}

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fall back on
	 * buffered io.  INLINE is special, and we could probably kludge it
	 * in here, but it's still buffered so for safety let's just fall
	 * back to the generic buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fall back to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		ret = -ENOTBLK;
		goto unlock_err;
	}

	/* Just a good old fashioned hole, return */
	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
		free_extent_map(em);
		goto unlock_err;
	}

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
	 * just use the extent.
	 *
	 */
	if (!create) {
		len = min(len, em->len - (start - em->start));
		lockstart = start + len;
		goto unlock;
	}

	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		int type;
		u64 block_start, orig_start, orig_block_len, ram_bytes;

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes) == 1 &&
		    btrfs_inc_nocow_writers(root->fs_info, block_start)) {
			struct extent_map *em2;

			em2 = btrfs_create_dio_extent(inode, start, len,
						      orig_start, block_start,
						      len, orig_block_len,
						      ram_bytes, type);
			btrfs_dec_nocow_writers(root->fs_info, block_start);
			if (type == BTRFS_ORDERED_PREALLOC) {
				free_extent_map(em);
				em = em2;
			}
			if (em2 && IS_ERR(em2)) {
				ret = PTR_ERR(em2);
				goto unlock_err;
			}
			/*
			 * For an inode marked NODATACOW or an extent marked
			 * PREALLOC, use the existing or preallocated extent,
			 * so there is no need to adjust btrfs_space_info's
			 * bytes_may_use.
			 */
			btrfs_free_reserved_data_space_noquota(inode,
					start, len);
			goto unlock;
		}
	}

	/*
	 * this will cow the extent, reset the len in case we changed
	 * it above
	 */
	len = bh_result->b_size;
	free_extent_map(em);
	em = btrfs_new_extent_direct(inode, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}
	len = min(len, em->len - (start - em->start));
unlock:
	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
		inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = em->bdev;
	set_buffer_mapped(bh_result);
	if (create) {
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			set_buffer_new(bh_result);

		/*
		 * Need to update the i_size under the extent lock so buffered
		 * readers will get the updated i_size when we unlock.
		 */
		if (start + len > i_size_read(inode))
			i_size_write(inode, start + len);

		adjust_dio_outstanding_extents(inode, dio_data, len);
		WARN_ON(dio_data->reserve < len);
		dio_data->reserve -= len;
		dio_data->unsubmitted_oe_range_end = start + len;
		current->journal_info = dio_data;
	}

	/*
	 * In the case of write we need to clear and unlock the entire range,
	 * in the case of read we need to unlock only the end area that we
	 * aren't using if there is any left over space.
	 */
	if (lockstart < lockend) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				 lockend, unlock_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	} else {
		free_extent_state(cached_state);
	}

	free_extent_map(em);

	return 0;

unlock_err:
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
err:
	if (dio_data)
		current->journal_info = dio_data;
	/*
	 * Compensate the delalloc release we do in btrfs_direct_IO() when we
	 * write less data than expected, so that we don't underflow our inode's
	 * outstanding extents counter.
	 */
	if (create && dio_data)
		adjust_dio_outstanding_extents(inode, dio_data, len);

	return ret;
}

static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
					int mirror_num)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	BUG_ON(bio_op(bio) == REQ_OP_WRITE);

	bio_get(bio);

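	/* run the repair read's completion on the dedicated DIO repair workqueue */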
	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
				  BTRFS_WQ_ENDIO_DIO_REPAIR);
	if (ret)
		goto err;

	ret = btrfs_map_bio(root, bio, mirror_num, 0);
err:
	bio_put(bio);
	return ret;
}

static int btrfs_check_dio_repairable(struct inode *inode,
				      struct bio *failed_bio,
				      struct io_failure_record *failrec,
				      int failed_mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int num_copies;

	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		btrfs_debug(fs_info,
			"Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
			num_copies, failrec->this_mirror, failed_mirror);
		return 0;
	}

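	/* move on to the next mirror, skipping the one that just failed */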
	failrec->failed_mirror = failed_mirror;
	failrec->this_mirror++;
	if (failrec->this_mirror == failed_mirror)
		failrec->this_mirror++;

	if (failrec->this_mirror > num_copies) {
		btrfs_debug(fs_info,
			"Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
			num_copies, failrec->this_mirror, failed_mirror);
		return 0;
	}

	return 1;
}

static int dio_read_error(struct inode *inode, struct bio *failed_bio,
			struct page *page, unsigned int pgoff,
			u64 start, u64 end, int failed_mirror,
			bio_end_io_t *repair_endio, void *repair_arg)
{
	struct io_failure_record *failrec;
	struct bio *bio;
	int isector;
	int read_mode;
	int ret;

	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
	if (ret)
		return ret;

	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
					 failed_mirror);
	if (!ret) {
		free_io_failure(inode, failrec);
		return -EIO;
	}

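	/* use a fail-fast read when the failed bio covered more than one sector */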
	if ((failed_bio->bi_vcnt > 1)
		|| (failed_bio->bi_io_vec->bv_len
			> BTRFS_I(inode)->root->sectorsize))
		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
	else
		read_mode = READ_SYNC;

	isector = start - btrfs_io_bio(failed_bio)->logical;
	isector >>= inode->i_sb->s_blocksize_bits;
	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				pgoff, isector, repair_endio, repair_arg);
	if (!bio) {
		free_io_failure(inode, failrec);
		return -EIO;
	}
	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);

	btrfs_debug(BTRFS_I(inode)->root->fs_info,
		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
		    read_mode, failrec->this_mirror, failrec->in_validation);

	ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
	if (ret) {
		free_io_failure(inode, failrec);
		bio_put(bio);
	}

	return ret;
}

struct btrfs_retry_complete {
	struct completion done;
	struct inode *inode;
	u64 start;
	int uptodate;
};

static void btrfs_retry_endio_nocsum(struct bio *bio)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct inode *inode;
	struct bio_vec *bvec;
	int i;

	if (bio->bi_error)
		goto end;

	ASSERT(bio->bi_vcnt == 1);
	inode = bio->bi_io_vec->bv_page->mapping->host;
	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);

	done->uptodate = 1;
	bio_for_each_segment_all(bvec, bio, i)
		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
end:
	complete(&done->done);
	bio_put(bio);
}

static int __btrfs_correct_data_nocsum(struct inode *inode,
				       struct btrfs_io_bio *io_bio)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec *bvec;
	struct btrfs_retry_complete done;
	u64 start;
	unsigned int pgoff;
	u32 sectorsize;
	int nr_sectors;
	int i;
	int ret;

	fs_info = BTRFS_I(inode)->root->fs_info;
	sectorsize = BTRFS_I(inode)->root->sectorsize;

	start = io_bio->logical;
	done.inode = inode;

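	/*
	 * Walk the failed bio one sector at a time, re-reading each
	 * sector from the remaining mirrors until it comes back good.
	 */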
	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
		pgoff = bvec->bv_offset;

next_block_or_try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
				pgoff, start, start + sectorsize - 1,
				io_bio->mirror_num,
				btrfs_retry_endio_nocsum, &done);
		if (ret)
			return ret;

		wait_for_completion(&done.done);

		if (!done.uptodate) {
			/* We might have another mirror, so try again */
			goto next_block_or_try_again;
		}

		start += sectorsize;

		nr_sectors--;
		if (nr_sectors) {
			pgoff += sectorsize;
			ASSERT(pgoff < PAGE_SIZE);
			goto next_block_or_try_again;
		}
	}

	return 0;
}

static void btrfs_retry_endio(struct bio *bio)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct inode *inode;
	struct bio_vec *bvec;
	u64 start;
	int uptodate;
	int ret;
	int i;

	if (bio->bi_error)
		goto end;

	uptodate = 1;

	start = done->start;

	ASSERT(bio->bi_vcnt == 1);
	inode = bio->bi_io_vec->bv_page->mapping->host;
	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);

	bio_for_each_segment_all(bvec, bio, i) {
		ret = __readpage_endio_check(done->inode, io_bio, i,
					bvec->bv_page, bvec->bv_offset,
					done->start, bvec->bv_len);
		if (!ret)
			clean_io_failure(done->inode, done->start,
					bvec->bv_page, bvec->bv_offset);
		else
			uptodate = 0;
	}

	done->uptodate = uptodate;
end:
	complete(&done->done);
	bio_put(bio);
}

static int __btrfs_subio_endio_read(struct inode *inode,
				    struct btrfs_io_bio *io_bio, int err)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec *bvec;
	struct btrfs_retry_complete done;
	u64 start;
	u64 offset = 0;
	u32 sectorsize;
	int nr_sectors;
	unsigned int pgoff;
	int csum_pos;
	int i;
	int ret;

	fs_info = BTRFS_I(inode)->root->fs_info;
	sectorsize = BTRFS_I(inode)->root->sectorsize;

	err = 0;
	start = io_bio->logical;
	done.inode = inode;

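	/*
	 * Check each sector against its csum; re-read any bad sector
	 * from the other mirrors until one passes the check.
	 */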
	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);

		pgoff = bvec->bv_offset;
next_block:
		csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
		ret = __readpage_endio_check(inode, io_bio, csum_pos,
					bvec->bv_page, pgoff, start,
					sectorsize);
		if (likely(!ret))
			goto next;
try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
				pgoff, start, start + sectorsize - 1,
				io_bio->mirror_num,
				btrfs_retry_endio, &done);
		if (ret) {
			err = ret;
			goto next;
		}

		wait_for_completion(&done.done);

		if (!done.uptodate) {
			/* We might have another mirror, so try again */
			goto try_again;
		}
next:
		offset += sectorsize;
		start += sectorsize;

		ASSERT(nr_sectors);

		nr_sectors--;
		if (nr_sectors) {
			pgoff += sectorsize;
			ASSERT(pgoff < PAGE_SIZE);
			goto next_block;
		}
	}

	return err;
}

static int btrfs_subio_endio_read(struct inode *inode,
				  struct btrfs_io_bio *io_bio, int err)
{
	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (skip_csum) {
		if (unlikely(err))
			return __btrfs_correct_data_nocsum(inode, io_bio);
		else
			return 0;
	} else {
		return __btrfs_subio_endio_read(inode, io_bio, err);
	}
}

static void btrfs_endio_direct_read(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct bio *dio_bio;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	int err = bio->bi_error;

	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
		err = btrfs_subio_endio_read(inode, io_bio, err);

	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
		      dip->logical_offset + dip->bytes - 1);
	dio_bio = dip->dio_bio;

	kfree(dip);

	dio_bio->bi_error = bio->bi_error;
	dio_end_io(dio_bio, bio->bi_error);

	if (io_bio->end_io)
		io_bio->end_io(io_bio, err);
	bio_put(bio);
}

static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
						    const u64 offset,
						    const u64 bytes,
						    const int uptodate)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered = NULL;
	u64 ordered_offset = offset;
	u64 ordered_bytes = bytes;
	int ret;

again:
	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
						   &ordered_offset,
						   ordered_bytes,
						   uptodate);
	if (!ret)
		goto out_test;

	btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
			finish_ordered_fn, NULL, NULL);
	btrfs_queue_work(root->fs_info->endio_write_workers,
			 &ordered->work);
out_test:
	/*
	 * our bio might span multiple ordered extents.  If we haven't
	 * completed the accounting for the whole dio, go back and try again
	 */
	if (ordered_offset < offset + bytes) {
		ordered_bytes = offset + bytes - ordered_offset;
		ordered = NULL;
		goto again;
	}
}

static void btrfs_endio_direct_write(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct bio *dio_bio = dip->dio_bio;

	btrfs_endio_direct_write_update_ordered(dip->inode,
						dip->logical_offset,
						dip->bytes,
						!bio->bi_error);

	kfree(dip);

	dio_bio->bi_error = bio->bi_error;
	dio_end_io(dio_bio, bio->bi_error);
	bio_put(bio);
}

static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags, u64 offset)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

static void btrfs_end_dio_bio(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	int err = bio->bi_error;

	if (err)
		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
			   btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf,
			   (unsigned long long)bio->bi_iter.bi_sector,
			   bio->bi_iter.bi_size, err);

	if (dip->subio_endio)
		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);

	if (err) {
		dip->errors = 1;

		/*
		 * before the atomic variable goes to zero, we must make sure
		 * dip->errors is perceived to be set.
		 */
		smp_mb__before_atomic();
	}

	/* if there are more bios still pending for this dio, just exit */
	if (!atomic_dec_and_test(&dip->pending_bios))
		goto out;

	if (dip->errors) {
		bio_io_error(dip->orig_bio);
	} else {
		dip->dio_bio->bi_error = 0;
		bio_endio(dip->orig_bio);
	}
out:
	bio_put(bio);
}

8341  static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
8342  				       u64 first_sector, gfp_t gfp_flags)
8343  {
8344  	struct bio *bio;
8345  	bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
8346  	if (bio)
8347  		bio_associate_current(bio);
8348  	return bio;
8349  }
8350  
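/*
 * For reads, all csums for the dio are looked up once, when the first
 * (original) bio is submitted.  Split bios then simply point into the
 * original bio's csum array at the offset of their first block.
 */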
8351  static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
8352  						 struct inode *inode,
8353  						 struct btrfs_dio_private *dip,
8354  						 struct bio *bio,
8355  						 u64 file_offset)
8356  {
8357  	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8358  	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8359  	int ret;
8360  
8361  	/*
8362  	 * We load all the csum data we need when we submit
8363  	 * the first bio to reduce the csum tree search and
8364  	 * contention.
8365  	 */
8366  	if (dip->logical_offset == file_offset) {
8367  		ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
8368  						file_offset);
8369  		if (ret)
8370  			return ret;
8371  	}
8372  
8373  	if (bio == dip->orig_bio)
8374  		return 0;
8375  
8376  	file_offset -= dip->logical_offset;
8377  	file_offset >>= inode->i_sb->s_blocksize_bits;
8378  	io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8379  
8380  	return 0;
8381  }
8382  
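/*
 * Submit one (possibly split) bio of a direct IO: reads are given an
 * end-io workqueue and have their csums bound, writes either compute
 * csums inline or defer them to the async submission workers, and
 * NODATASUM inodes skip checksumming entirely before the bio is mapped.
 */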
8383  static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
8384  					 u64 file_offset, int skip_sum,
8385  					 int async_submit)
8386  {
8387  	struct btrfs_dio_private *dip = bio->bi_private;
8388  	bool write = bio_op(bio) == REQ_OP_WRITE;
8389  	struct btrfs_root *root = BTRFS_I(inode)->root;
8390  	int ret;
8391  
8392  	if (async_submit)
8393  		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8394  
8395  	bio_get(bio);
8396  
8397  	if (!write) {
8398  		ret = btrfs_bio_wq_end_io(root->fs_info, bio,
8399  				BTRFS_WQ_ENDIO_DATA);
8400  		if (ret)
8401  			goto err;
8402  	}
8403  
8404  	if (skip_sum)
8405  		goto map;
8406  
8407  	if (write && async_submit) {
8408  		ret = btrfs_wq_submit_bio(root->fs_info,
8409  				   inode, bio, 0, 0, file_offset,
8410  				   __btrfs_submit_bio_start_direct_io,
8411  				   __btrfs_submit_bio_done);
8412  		goto err;
8413  	} else if (write) {
8414  		/*
8415  		 * If we aren't doing async submit, calculate the csum of the
8416  		 * bio now.
8417  		 */
8418  		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
8419  		if (ret)
8420  			goto err;
8421  	} else {
8422  		ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
8423  						     file_offset);
8424  		if (ret)
8425  			goto err;
8426  	}
8427  map:
8428  	ret = btrfs_map_bio(root, bio, 0, async_submit);
8429  err:
8430  	bio_put(bio);
8431  	return ret;
8432  }
8433  
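/*
 * Split the original dio bio along the chunk mapping: btrfs_map_block()
 * tells us how far the current bio may extend on one stripe, and a new
 * bio is started (block by block) whenever that length would be
 * exceeded.  If the whole bio fits in one mapping it is submitted as-is.
 */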
8434  static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
8435  				    int skip_sum)
8436  {
8437  	struct inode *inode = dip->inode;
8438  	struct btrfs_root *root = BTRFS_I(inode)->root;
8439  	struct bio *bio;
8440  	struct bio *orig_bio = dip->orig_bio;
8441  	struct bio_vec *bvec = orig_bio->bi_io_vec;
8442  	u64 start_sector = orig_bio->bi_iter.bi_sector;
8443  	u64 file_offset = dip->logical_offset;
8444  	u64 submit_len = 0;
8445  	u64 map_length;
8446  	u32 blocksize = root->sectorsize;
8447  	int async_submit = 0;
8448  	int nr_sectors;
8449  	int ret;
8450  	int i;
8451  
8452  	map_length = orig_bio->bi_iter.bi_size;
8453  	ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
8454  			      start_sector << 9, &map_length, NULL, 0);
8455  	if (ret)
8456  		return -EIO;
8457  
8458  	if (map_length >= orig_bio->bi_iter.bi_size) {
8459  		bio = orig_bio;
8460  		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8461  		goto submit;
8462  	}
8463  
8464  	/* async crcs make it difficult to collect full stripe writes. */
8465  	if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8466  		async_submit = 0;
8467  	else
8468  		async_submit = 1;
8469  
8470  	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
8471  	if (!bio)
8472  		return -ENOMEM;
8473  
8474  	bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
8475  	bio->bi_private = dip;
8476  	bio->bi_end_io = btrfs_end_dio_bio;
8477  	btrfs_io_bio(bio)->logical = file_offset;
8478  	atomic_inc(&dip->pending_bios);
8479  
8480  	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8481  		nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
8482  		i = 0;
8483  next_block:
8484  		if (unlikely(map_length < submit_len + blocksize ||
8485  		    bio_add_page(bio, bvec->bv_page, blocksize,
8486  			    bvec->bv_offset + (i * blocksize)) < blocksize)) {
8487  			/*
8488  			 * Increment the count before we submit the bio so
8489  			 * we know the end IO handler won't run before we're
8490  			 * done incrementing it. Otherwise, the dip might get freed
8491  			 * before we're done setting it up.
8492  			 */
8493  			atomic_inc(&dip->pending_bios);
8494  			ret = __btrfs_submit_dio_bio(bio, inode,
8495  						     file_offset, skip_sum,
8496  						     async_submit);
8497  			if (ret) {
8498  				bio_put(bio);
8499  				atomic_dec(&dip->pending_bios);
8500  				goto out_err;
8501  			}
8502  
8503  			start_sector += submit_len >> 9;
8504  			file_offset += submit_len;
8505  
8506  			submit_len = 0;
8507  
8508  			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8509  						  start_sector, GFP_NOFS);
8510  			if (!bio)
8511  				goto out_err;
8512  			bio_set_op_attrs(bio, bio_op(orig_bio),
8513  					 bio_flags(orig_bio));
8514  			bio->bi_private = dip;
8515  			bio->bi_end_io = btrfs_end_dio_bio;
8516  			btrfs_io_bio(bio)->logical = file_offset;
8517  
8518  			map_length = orig_bio->bi_iter.bi_size;
8519  			ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
8520  					      start_sector << 9,
8521  					      &map_length, NULL, 0);
8522  			if (ret) {
8523  				bio_put(bio);
8524  				goto out_err;
8525  			}
8526  
8527  			goto next_block;
8528  		} else {
8529  			submit_len += blocksize;
8530  			if (--nr_sectors) {
8531  				i++;
8532  				goto next_block;
8533  			}
8534  			bvec++;
8535  		}
8536  	}
8537  
8538  submit:
8539  	ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8540  				     async_submit);
8541  	if (!ret)
8542  		return 0;
8543  
8544  	bio_put(bio);
8545  out_err:
8546  	dip->errors = 1;
8547  	/*
8548  	 * Before the atomic variable goes to zero, we must
8549  	 * make sure dip->errors is perceived to be set.
8550  	 */
8551  	smp_mb__before_atomic();
8552  	if (atomic_dec_and_test(&dip->pending_bios))
8553  		bio_io_error(dip->orig_bio);
8554  
8555  	/* bio_end_io() will handle the error, so we needn't return it */
8556  	return 0;
8557  }
8558  
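/*
 * Entry point handed to __blockdev_direct_IO().  Clones the dio_bio,
 * allocates the btrfs_dio_private that tracks the whole operation, wires
 * up the read/write end-io callbacks and hands the clone to
 * btrfs_submit_direct_hook() for splitting and submission.
 */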
8559  static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8560  				loff_t file_offset)
8561  {
8562  	struct btrfs_dio_private *dip = NULL;
8563  	struct bio *io_bio = NULL;
8564  	struct btrfs_io_bio *btrfs_bio;
8565  	int skip_sum;
8566  	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8567  	int ret = 0;
8568  
8569  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8570  
8571  	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
8572  	if (!io_bio) {
8573  		ret = -ENOMEM;
8574  		goto free_ordered;
8575  	}
8576  
8577  	dip = kzalloc(sizeof(*dip), GFP_NOFS);
8578  	if (!dip) {
8579  		ret = -ENOMEM;
8580  		goto free_ordered;
8581  	}
8582  
8583  	dip->private = dio_bio->bi_private;
8584  	dip->inode = inode;
8585  	dip->logical_offset = file_offset;
8586  	dip->bytes = dio_bio->bi_iter.bi_size;
8587  	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8588  	io_bio->bi_private = dip;
8589  	dip->orig_bio = io_bio;
8590  	dip->dio_bio = dio_bio;
8591  	atomic_set(&dip->pending_bios, 0);
8592  	btrfs_bio = btrfs_io_bio(io_bio);
8593  	btrfs_bio->logical = file_offset;
8594  
8595  	if (write) {
8596  		io_bio->bi_end_io = btrfs_endio_direct_write;
8597  	} else {
8598  		io_bio->bi_end_io = btrfs_endio_direct_read;
8599  		dip->subio_endio = btrfs_subio_endio_read;
8600  	}
8601  
8602  	/*
8603  	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
8604  	 * even if we fail to submit a bio, because in such case we do the
8605  	 * corresponding error handling below and it must not be done a second
8606  	 * time by btrfs_direct_IO().
8607  	 */
8608  	if (write) {
8609  		struct btrfs_dio_data *dio_data = current->journal_info;
8610  
8611  		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8612  			dip->bytes;
8613  		dio_data->unsubmitted_oe_range_start =
8614  			dio_data->unsubmitted_oe_range_end;
8615  	}
8616  
8617  	ret = btrfs_submit_direct_hook(dip, skip_sum);
8618  	if (!ret)
8619  		return;
8620  
8621  	if (btrfs_bio->end_io)
8622  		btrfs_bio->end_io(btrfs_bio, ret);
8623  
8624  free_ordered:
8625  	/*
8626  	 * If we arrived here it means we either failed to submit the dip,
8627  	 * failed to clone the dio_bio, or failed to allocate the
8628  	 * dip. If we cloned the dio_bio and allocated the dip, we can just
8629  	 * call bio_endio against our io_bio so that we get proper resource
8630  	 * cleanup if we fail to submit the dip, otherwise, we must do the
8631  	 * same as btrfs_endio_direct_[write|read] because we can't call these
8632  	 * callbacks - they require an allocated dip and a clone of dio_bio.
8633  	 */
8634  	if (io_bio && dip) {
8635  		io_bio->bi_error = -EIO;
8636  		bio_endio(io_bio);
8637  		/*
8638  		 * The end io callbacks free our dip, do the final put on io_bio
8639  		 * and all the cleanup and final put for dio_bio (through
8640  		 * dio_end_io()).
8641  		 */
8642  		dip = NULL;
8643  		io_bio = NULL;
8644  	} else {
8645  		if (write)
8646  			btrfs_endio_direct_write_update_ordered(inode,
8647  						file_offset,
8648  						dio_bio->bi_iter.bi_size,
8649  						0);
8650  		else
8651  			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8652  			      file_offset + dio_bio->bi_iter.bi_size - 1);
8653  
8654  		dio_bio->bi_error = -EIO;
8655  		/*
8656  		 * Releases and cleans up our dio_bio, no need to bio_put()
8657  		 * nor bio_endio()/bio_io_error() against dio_bio.
8658  		 */
8659  		dio_end_io(dio_bio, ret);
8660  	}
8661  	if (io_bio)
8662  		bio_put(io_bio);
8663  	kfree(dip);
8664  }
8665  
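/*
 * Validate O_DIRECT alignment: the file offset and the user buffers must
 * both be aligned to the sector size.  For example, assuming a 4K
 * sectorsize, a dio at file offset 4096 passes while one at offset 512
 * fails with -EINVAL, which makes btrfs_direct_IO() below return 0 and
 * the VFS fall back to buffered IO.  Reads also reject iovecs that
 * repeat an iov_base, since that would produce csum errors on readback.
 */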
8666  static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
8667  			const struct iov_iter *iter, loff_t offset)
8668  {
8669  	int seg;
8670  	int i;
8671  	unsigned blocksize_mask = root->sectorsize - 1;
8672  	ssize_t retval = -EINVAL;
8673  
8674  	if (offset & blocksize_mask)
8675  		goto out;
8676  
8677  	if (iov_iter_alignment(iter) & blocksize_mask)
8678  		goto out;
8679  
8680  	/* If this is a write we don't need to check any further */
8681  	if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8682  		return 0;
8683  	/*
8684  	 * Check to make sure we don't have duplicate iov_base's in this
8685  	 * iovec; if so, return -EINVAL, otherwise we'll get csum errors
8686  	 * when reading back.
8687  	 */
8688  	for (seg = 0; seg < iter->nr_segs; seg++) {
8689  		for (i = seg + 1; i < iter->nr_segs; i++) {
8690  			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8691  				goto out;
8692  		}
8693  	}
8694  	retval = 0;
8695  out:
8696  	return retval;
8697  }
8698  
8699  static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8700  {
8701  	struct file *file = iocb->ki_filp;
8702  	struct inode *inode = file->f_mapping->host;
8703  	struct btrfs_root *root = BTRFS_I(inode)->root;
8704  	struct btrfs_dio_data dio_data = { 0 };
8705  	loff_t offset = iocb->ki_pos;
8706  	size_t count = 0;
8707  	int flags = 0;
8708  	bool wakeup = true;
8709  	bool relock = false;
8710  	ssize_t ret;
8711  
8712  	if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
8713  		return 0;
8714  
8715  	inode_dio_begin(inode);
8716  	smp_mb__after_atomic();
8717  
8718  	/*
8719  	 * The generic stuff only does filemap_write_and_wait_range, which
8720  	 * isn't enough if we've written compressed pages to this area, so
8721  	 * we need to flush the dirty pages again to make absolutely sure
8722  	 * that any outstanding dirty pages are on disk.
8723  	 */
8724  	count = iov_iter_count(iter);
8725  	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8726  		     &BTRFS_I(inode)->runtime_flags))
8727  		filemap_fdatawrite_range(inode->i_mapping, offset,
8728  					 offset + count - 1);
8729  
8730  	if (iov_iter_rw(iter) == WRITE) {
8731  		/*
8732  		 * If the write DIO is beyond the EOF, we need to update
8733  		 * the isize, but it is protected by i_mutex. So we
8734  		 * cannot unlock the i_mutex in this case.
8735  		 */
8736  		if (offset + count <= inode->i_size) {
8737  			inode_unlock(inode);
8738  			relock = true;
8739  		}
8740  		ret = btrfs_delalloc_reserve_space(inode, offset, count);
8741  		if (ret)
8742  			goto out;
8743  		dio_data.outstanding_extents = div64_u64(count +
8744  						BTRFS_MAX_EXTENT_SIZE - 1,
8745  						BTRFS_MAX_EXTENT_SIZE);
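		/* i.e. a ceiling division: one outstanding extent per BTRFS_MAX_EXTENT_SIZE chunk */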
8746  
8747  		/*
8748  		 * We need to know how many extents we reserved so that we can
8749  		 * do the accounting properly if we go over the number we
8750  		 * originally calculated.  Abuse current->journal_info for this.
8751  		 */
8752  		dio_data.reserve = round_up(count, root->sectorsize);
8753  		dio_data.unsubmitted_oe_range_start = (u64)offset;
8754  		dio_data.unsubmitted_oe_range_end = (u64)offset;
8755  		current->journal_info = &dio_data;
8756  		down_read(&BTRFS_I(inode)->dio_sem);
8757  	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8758  				     &BTRFS_I(inode)->runtime_flags)) {
8759  		inode_dio_end(inode);
8760  		flags = DIO_LOCKING | DIO_SKIP_HOLES;
8761  		wakeup = false;
8762  	}
8763  
8764  	ret = __blockdev_direct_IO(iocb, inode,
8765  				   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
8766  				   iter, btrfs_get_blocks_direct, NULL,
8767  				   btrfs_submit_direct, flags);
8768  	if (iov_iter_rw(iter) == WRITE) {
8769  		up_read(&BTRFS_I(inode)->dio_sem);
8770  		current->journal_info = NULL;
8771  		if (ret < 0 && ret != -EIOCBQUEUED) {
8772  			if (dio_data.reserve)
8773  				btrfs_delalloc_release_space(inode, offset,
8774  							     dio_data.reserve);
8775  			/*
8776  			 * On error we might have left some ordered extents
8777  			 * without submitting corresponding bios for them, so
8778  			 * clean them up to avoid other tasks getting them
8779  			 * and waiting for them to complete forever.
8780  			 */
8781  			if (dio_data.unsubmitted_oe_range_start <
8782  			    dio_data.unsubmitted_oe_range_end)
8783  				btrfs_endio_direct_write_update_ordered(inode,
8784  					dio_data.unsubmitted_oe_range_start,
8785  					dio_data.unsubmitted_oe_range_end -
8786  					dio_data.unsubmitted_oe_range_start,
8787  					0);
8788  		} else if (ret >= 0 && (size_t)ret < count)
8789  			btrfs_delalloc_release_space(inode, offset,
8790  						     count - (size_t)ret);
8791  	}
8792  out:
8793  	if (wakeup)
8794  		inode_dio_end(inode);
8795  	if (relock)
8796  		inode_lock(inode);
8797  
8798  	return ret;
8799  }
8800  
8801  #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
8802  
8803  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8804  		__u64 start, __u64 len)
8805  {
8806  	int	ret;
8807  
8808  	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8809  	if (ret)
8810  		return ret;
8811  
8812  	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8813  }
8814  
8815  int btrfs_readpage(struct file *file, struct page *page)
8816  {
8817  	struct extent_io_tree *tree;
8818  	tree = &BTRFS_I(page->mapping->host)->io_tree;
8819  	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8820  }
8821  
8822  static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8823  {
8824  	struct extent_io_tree *tree;
8825  	struct inode *inode = page->mapping->host;
8826  	int ret;
8827  
8828  	if (current->flags & PF_MEMALLOC) {
8829  		redirty_page_for_writepage(wbc, page);
8830  		unlock_page(page);
8831  		return 0;
8832  	}
8833  
8834  	/*
8835  	 * If we are under memory pressure we will call this directly from the
8836  	 * VM, we need to make sure we have the inode referenced for the ordered
8837  	 * extent.  If not, just return as if we didn't do anything.
8838  	 */
8839  	if (!igrab(inode)) {
8840  		redirty_page_for_writepage(wbc, page);
8841  		return AOP_WRITEPAGE_ACTIVATE;
8842  	}
8843  	tree = &BTRFS_I(page->mapping->host)->io_tree;
8844  	ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8845  	btrfs_add_delayed_iput(inode);
8846  	return ret;
8847  }
8848  
8849  static int btrfs_writepages(struct address_space *mapping,
8850  			    struct writeback_control *wbc)
8851  {
8852  	struct extent_io_tree *tree;
8853  
8854  	tree = &BTRFS_I(mapping->host)->io_tree;
8855  	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8856  }
8857  
8858  static int
8859  btrfs_readpages(struct file *file, struct address_space *mapping,
8860  		struct list_head *pages, unsigned nr_pages)
8861  {
8862  	struct extent_io_tree *tree;
8863  	tree = &BTRFS_I(mapping->host)->io_tree;
8864  	return extent_readpages(tree, mapping, pages, nr_pages,
8865  				btrfs_get_extent);
8866  }
8867  static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8868  {
8869  	struct extent_io_tree *tree;
8870  	struct extent_map_tree *map;
8871  	int ret;
8872  
8873  	tree = &BTRFS_I(page->mapping->host)->io_tree;
8874  	map = &BTRFS_I(page->mapping->host)->extent_tree;
8875  	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8876  	if (ret == 1) {
8877  		ClearPagePrivate(page);
8878  		set_page_private(page, 0);
8879  		put_page(page);
8880  	}
8881  	return ret;
8882  }
8883  
8884  static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8885  {
8886  	if (PageWriteback(page) || PageDirty(page))
8887  		return 0;
8888  	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
8889  }
8890  
8891  static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8892  				 unsigned int length)
8893  {
8894  	struct inode *inode = page->mapping->host;
8895  	struct extent_io_tree *tree;
8896  	struct btrfs_ordered_extent *ordered;
8897  	struct extent_state *cached_state = NULL;
8898  	u64 page_start = page_offset(page);
8899  	u64 page_end = page_start + PAGE_SIZE - 1;
8900  	u64 start;
8901  	u64 end;
8902  	int inode_evicting = inode->i_state & I_FREEING;
8903  
8904  	/*
8905  	 * we have the page locked, so new writeback can't start,
8906  	 * and the dirty bit won't be cleared while we are here.
8907  	 *
8908  	 * Wait for IO on this page so that we can safely clear
8909  	 * the PagePrivate2 bit and do ordered accounting
8910  	 */
8911  	wait_on_page_writeback(page);
8912  
8913  	tree = &BTRFS_I(inode)->io_tree;
8914  	if (offset) {
8915  		btrfs_releasepage(page, GFP_NOFS);
8916  		return;
8917  	}
8918  
8919  	if (!inode_evicting)
8920  		lock_extent_bits(tree, page_start, page_end, &cached_state);
8921  again:
8922  	start = page_start;
8923  	ordered = btrfs_lookup_ordered_range(inode, start,
8924  					page_end - start + 1);
8925  	if (ordered) {
8926  		end = min(page_end, ordered->file_offset + ordered->len - 1);
8927  		/*
8928  		 * IO on this page will never be started, so we need
8929  		 * to account for any ordered extents now
8930  		 */
8931  		if (!inode_evicting)
8932  			clear_extent_bit(tree, start, end,
8933  					 EXTENT_DIRTY | EXTENT_DELALLOC |
8934  					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8935  					 EXTENT_DEFRAG, 1, 0, &cached_state,
8936  					 GFP_NOFS);
8937  		/*
8938  		 * whoever cleared the private bit is responsible
8939  		 * for the finish_ordered_io
8940  		 */
8941  		if (TestClearPagePrivate2(page)) {
8942  			struct btrfs_ordered_inode_tree *tree;
8943  			u64 new_len;
8944  
8945  			tree = &BTRFS_I(inode)->ordered_tree;
8946  
8947  			spin_lock_irq(&tree->lock);
8948  			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8949  			new_len = start - ordered->file_offset;
8950  			if (new_len < ordered->truncated_len)
8951  				ordered->truncated_len = new_len;
8952  			spin_unlock_irq(&tree->lock);
8953  
8954  			if (btrfs_dec_test_ordered_pending(inode, &ordered,
8955  							   start,
8956  							   end - start + 1, 1))
8957  				btrfs_finish_ordered_io(ordered);
8958  		}
8959  		btrfs_put_ordered_extent(ordered);
8960  		if (!inode_evicting) {
8961  			cached_state = NULL;
8962  			lock_extent_bits(tree, start, end,
8963  					 &cached_state);
8964  		}
8965  
8966  		start = end + 1;
8967  		if (start < page_end)
8968  			goto again;
8969  	}
8970  
8971  	/*
8972  	 * Qgroup reserved space handler
8973  	 * Page here will be either
8974  	 * 1) Already written to disk
8975  	 *    In this case, its reserved space is released from data rsv map
8976  	 *    and will eventually be freed by the delayed_ref handler.
8977  	 *    So even if we call qgroup_free_data(), it won't decrease
8978  	 *    the reserved space.
8979  	 * 2) Not written to disk
8980  	 *    This means the reserved space should be freed here. However,
8981  	 *    if a truncate invalidates the page (by clearing PageDirty)
8982  	 *    and the page was accounted for while allocating the extent
8983  	 *    in btrfs_check_data_free_space(), we let the delayed_ref
8984  	 *    handler free the entire extent.
8985  	 */
8986  	if (PageDirty(page))
8987  		btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
8988  	if (!inode_evicting) {
8989  		clear_extent_bit(tree, page_start, page_end,
8990  				 EXTENT_LOCKED | EXTENT_DIRTY |
8991  				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8992  				 EXTENT_DEFRAG, 1, 1,
8993  				 &cached_state, GFP_NOFS);
8994  
8995  		__btrfs_releasepage(page, GFP_NOFS);
8996  	}
8997  
8998  	ClearPageChecked(page);
8999  	if (PagePrivate(page)) {
9000  		ClearPagePrivate(page);
9001  		set_page_private(page, 0);
9002  		put_page(page);
9003  	}
9004  }
9005  
9006  /*
9007   * btrfs_page_mkwrite() is not allowed to change the file size as it gets
9008   * called from a page fault handler when a page is first dirtied. Hence we must
9009   * be careful to check for EOF conditions here. We set the page up correctly
9010   * for a written page which means we get ENOSPC checking when writing into
9011   * holes and correct delalloc and unwritten extent mapping on filesystems that
9012   * support these features.
9013   *
9014   * We are not allowed to take the i_mutex here so we have to play games to
9015   * protect against truncate races as the page could now be beyond EOF.  Because
9016   * vmtruncate() writes the inode size before removing pages, once we have the
9017   * page lock we can determine safely if the page is beyond EOF. If it is not
9018   * beyond EOF, then the page is guaranteed safe against truncation until we
9019   * unlock the page.
9020   */
9021  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
9022  {
9023  	struct page *page = vmf->page;
9024  	struct inode *inode = file_inode(vma->vm_file);
9025  	struct btrfs_root *root = BTRFS_I(inode)->root;
9026  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9027  	struct btrfs_ordered_extent *ordered;
9028  	struct extent_state *cached_state = NULL;
9029  	char *kaddr;
9030  	unsigned long zero_start;
9031  	loff_t size;
9032  	int ret;
9033  	int reserved = 0;
9034  	u64 reserved_space;
9035  	u64 page_start;
9036  	u64 page_end;
9037  	u64 end;
9038  
9039  	reserved_space = PAGE_SIZE;
9040  
9041  	sb_start_pagefault(inode->i_sb);
9042  	page_start = page_offset(page);
9043  	page_end = page_start + PAGE_SIZE - 1;
9044  	end = page_end;
9045  
9046  	/*
9047  	 * Reserving delalloc space after obtaining the page lock can lead to
9048  	 * deadlock. For example, if a dirty page is locked by this function
9049  	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
9050  	 * dirty page write out, then the btrfs_writepage() function could
9051  	 * end up waiting indefinitely to get a lock on the page currently
9052  	 * being processed by btrfs_page_mkwrite() function.
9053  	 */
9054  	ret = btrfs_delalloc_reserve_space(inode, page_start,
9055  					   reserved_space);
9056  	if (!ret) {
9057  		ret = file_update_time(vma->vm_file);
9058  		reserved = 1;
9059  	}
9060  	if (ret) {
9061  		if (ret == -ENOMEM)
9062  			ret = VM_FAULT_OOM;
9063  		else /* -ENOSPC, -EIO, etc */
9064  			ret = VM_FAULT_SIGBUS;
9065  		if (reserved)
9066  			goto out;
9067  		goto out_noreserve;
9068  	}
9069  
9070  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
9071  again:
9072  	lock_page(page);
9073  	size = i_size_read(inode);
9074  
9075  	if ((page->mapping != inode->i_mapping) ||
9076  	    (page_start >= size)) {
9077  		/* page got truncated out from underneath us */
9078  		goto out_unlock;
9079  	}
9080  	wait_on_page_writeback(page);
9081  
9082  	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
9083  	set_page_extent_mapped(page);
9084  
9085  	/*
9086  	 * we can't set the delalloc bits if there are pending ordered
9087  	 * extents.  Drop our locks and wait for them to finish
9088  	 */
9089  	ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
9090  	if (ordered) {
9091  		unlock_extent_cached(io_tree, page_start, page_end,
9092  				     &cached_state, GFP_NOFS);
9093  		unlock_page(page);
9094  		btrfs_start_ordered_extent(inode, ordered, 1);
9095  		btrfs_put_ordered_extent(ordered);
9096  		goto again;
9097  	}
9098  
9099  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
9100  		reserved_space = round_up(size - page_start, root->sectorsize);
9101  		if (reserved_space < PAGE_SIZE) {
9102  			end = page_start + reserved_space - 1;
9103  			spin_lock(&BTRFS_I(inode)->lock);
9104  			BTRFS_I(inode)->outstanding_extents++;
9105  			spin_unlock(&BTRFS_I(inode)->lock);
9106  			btrfs_delalloc_release_space(inode, page_start,
9107  						PAGE_SIZE - reserved_space);
9108  		}
9109  	}
9110  
9111  	/*
9112  	 * XXX - page_mkwrite gets called every time the page is dirtied, even
9113  	 * if it was already dirty, so for space accounting reasons we need to
9114  	 * clear any delalloc bits for the range we are fixing to save.  There
9115  	 * is probably a better way to do this, but for now keep consistent with
9116  	 * prepare_pages in the normal write path.
9117  	 */
9118  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9119  			  EXTENT_DIRTY | EXTENT_DELALLOC |
9120  			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9121  			  0, 0, &cached_state, GFP_NOFS);
9122  
9123  	ret = btrfs_set_extent_delalloc(inode, page_start, end,
9124  					&cached_state, 0);
9125  	if (ret) {
9126  		unlock_extent_cached(io_tree, page_start, page_end,
9127  				     &cached_state, GFP_NOFS);
9128  		ret = VM_FAULT_SIGBUS;
9129  		goto out_unlock;
9130  	}
9131  	ret = 0;
9132  
9133  	/* page is wholly or partially inside EOF */
9134  	if (page_start + PAGE_SIZE > size)
9135  		zero_start = size & ~PAGE_MASK;
9136  	else
9137  		zero_start = PAGE_SIZE;
9138  
9139  	if (zero_start != PAGE_SIZE) {
9140  		kaddr = kmap(page);
9141  		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
9142  		flush_dcache_page(page);
9143  		kunmap(page);
9144  	}
9145  	ClearPageChecked(page);
9146  	set_page_dirty(page);
9147  	SetPageUptodate(page);
9148  
9149  	BTRFS_I(inode)->last_trans = root->fs_info->generation;
9150  	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9151  	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
9152  
9153  	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
9154  
9155  out_unlock:
9156  	if (!ret) {
9157  		sb_end_pagefault(inode->i_sb);
9158  		return VM_FAULT_LOCKED;
9159  	}
9160  	unlock_page(page);
9161  out:
9162  	btrfs_delalloc_release_space(inode, page_start, reserved_space);
9163  out_noreserve:
9164  	sb_end_pagefault(inode->i_sb);
9165  	return ret;
9166  }
9167  
9168  static int btrfs_truncate(struct inode *inode)
9169  {
9170  	struct btrfs_root *root = BTRFS_I(inode)->root;
9171  	struct btrfs_block_rsv *rsv;
9172  	int ret = 0;
9173  	int err = 0;
9174  	struct btrfs_trans_handle *trans;
9175  	u64 mask = root->sectorsize - 1;
9176  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
9177  
9178  	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9179  				       (u64)-1);
9180  	if (ret)
9181  		return ret;
9182  
9183  	/*
9184  	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
9185  	 * 3 things going on here
9186  	 *
9187  	 * 1) We need to reserve space for our orphan item and the space to
9188  	 * delete our orphan item.  Lord knows we don't want to have a dangling
9189  	 * orphan item because we didn't reserve space to remove it.
9190  	 *
9191  	 * 2) We need to reserve space to update our inode.
9192  	 *
9193  	 * 3) We need to have something to cache all the space that is going to
9194  	 * be freed up by the truncate operation, but also have some slack
9195  	 * space reserved in case it uses space during the truncate (thank you
9196  	 * very much snapshotting).
9197  	 *
9198  	 * And we need these to all be separate.  The fact is we can use a lot of
9199  	 * space doing the truncate, and we have no earthly idea how much space
9200  	 * we will use, so we need the truncate reservation to be separate so it
9201  	 * doesn't end up using space reserved for updating the inode or
9202  	 * removing the orphan item.  We also need to be able to stop the
9203  	 * transaction and start a new one, which means we need to be able to
9204  	 * update the inode several times, and we have no way of knowing how
9205  	 * many times that will be, so we can't just reserve 1 item for the
9206  	 * entirety of the operation, so that has to be done separately as well.
9207  	 * Then there is the orphan item, which does indeed need to be held on
9208  	 * to for the whole operation, and we need nobody to touch this reserved
9209  	 * space except the orphan code.
9210  	 *
9211  	 * So that leaves us with
9212  	 *
9213  	 * 1) root->orphan_block_rsv - for the orphan deletion.
9214  	 * 2) rsv - for the truncate reservation, which we will steal from the
9215  	 * transaction reservation.
9216  	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
9217  	 * updating the inode.
9218  	 */
9219  	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
9220  	if (!rsv)
9221  		return -ENOMEM;
9222  	rsv->size = min_size;
9223  	rsv->failfast = 1;
9224  
9225  	/*
9226  	 * 1 for the truncate slack space
9227  	 * 1 for updating the inode.
9228  	 */
9229  	trans = btrfs_start_transaction(root, 2);
9230  	if (IS_ERR(trans)) {
9231  		err = PTR_ERR(trans);
9232  		goto out;
9233  	}
9234  
9235  	/* Migrate the slack space for the truncate to our reserve */
9236  	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
9237  				      min_size, 0);
9238  	BUG_ON(ret);
9239  
9240  	/*
9241  	 * So if we truncate and then write and fsync we normally would just
9242  	 * write the extents that changed, which is a problem if we need to
9243  	 * first truncate that entire inode.  So set this flag so we write out
9244  	 * all of the extents in the inode to the sync log so we're completely
9245  	 * safe.
9246  	 */
9247  	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9248  	trans->block_rsv = rsv;
9249  
9250  	while (1) {
9251  		ret = btrfs_truncate_inode_items(trans, root, inode,
9252  						 inode->i_size,
9253  						 BTRFS_EXTENT_DATA_KEY);
9254  		if (ret != -ENOSPC && ret != -EAGAIN) {
9255  			err = ret;
9256  			break;
9257  		}
9258  
9259  		trans->block_rsv = &root->fs_info->trans_block_rsv;
9260  		ret = btrfs_update_inode(trans, root, inode);
9261  		if (ret) {
9262  			err = ret;
9263  			break;
9264  		}
9265  
9266  		btrfs_end_transaction(trans, root);
9267  		btrfs_btree_balance_dirty(root);
9268  
9269  		trans = btrfs_start_transaction(root, 2);
9270  		if (IS_ERR(trans)) {
9271  			ret = err = PTR_ERR(trans);
9272  			trans = NULL;
9273  			break;
9274  		}
9275  
9276  		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
9277  					      rsv, min_size, 0);
9278  		BUG_ON(ret);	/* shouldn't happen */
9279  		trans->block_rsv = rsv;
9280  	}
9281  
9282  	if (ret == 0 && inode->i_nlink > 0) {
9283  		trans->block_rsv = root->orphan_block_rsv;
9284  		ret = btrfs_orphan_del(trans, inode);
9285  		if (ret)
9286  			err = ret;
9287  	}
9288  
9289  	if (trans) {
9290  		trans->block_rsv = &root->fs_info->trans_block_rsv;
9291  		ret = btrfs_update_inode(trans, root, inode);
9292  		if (ret && !err)
9293  			err = ret;
9294  
9295  		ret = btrfs_end_transaction(trans, root);
9296  		btrfs_btree_balance_dirty(root);
9297  	}
9298  out:
9299  	btrfs_free_block_rsv(root, rsv);
9300  
9301  	if (ret && !err)
9302  		err = ret;
9303  
9304  	return err;
9305  }
9306  
9307  /*
9308   * create a new subvolume directory/inode (helper for the ioctl).
9309   */
9310  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9311  			     struct btrfs_root *new_root,
9312  			     struct btrfs_root *parent_root,
9313  			     u64 new_dirid)
9314  {
9315  	struct inode *inode;
9316  	int err;
9317  	u64 index = 0;
9318  
9319  	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9320  				new_dirid, new_dirid,
9321  				S_IFDIR | (~current_umask() & S_IRWXUGO),
9322  				&index);
9323  	if (IS_ERR(inode))
9324  		return PTR_ERR(inode);
9325  	inode->i_op = &btrfs_dir_inode_operations;
9326  	inode->i_fop = &btrfs_dir_file_operations;
9327  
9328  	set_nlink(inode, 1);
9329  	btrfs_i_size_write(inode, 0);
9330  	unlock_new_inode(inode);
9331  
9332  	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9333  	if (err)
9334  		btrfs_err(new_root->fs_info,
9335  			  "error inheriting subvolume %llu properties: %d",
9336  			  new_root->root_key.objectid, err);
9337  
9338  	err = btrfs_update_inode(trans, new_root, inode);
9339  
9340  	iput(inode);
9341  	return err;
9342  }
9343  
9344  struct inode *btrfs_alloc_inode(struct super_block *sb)
9345  {
9346  	struct btrfs_inode *ei;
9347  	struct inode *inode;
9348  
9349  	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
9350  	if (!ei)
9351  		return NULL;
9352  
9353  	ei->root = NULL;
9354  	ei->generation = 0;
9355  	ei->last_trans = 0;
9356  	ei->last_sub_trans = 0;
9357  	ei->logged_trans = 0;
9358  	ei->delalloc_bytes = 0;
9359  	ei->defrag_bytes = 0;
9360  	ei->disk_i_size = 0;
9361  	ei->flags = 0;
9362  	ei->csum_bytes = 0;
9363  	ei->index_cnt = (u64)-1;
9364  	ei->dir_index = 0;
9365  	ei->last_unlink_trans = 0;
9366  	ei->last_log_commit = 0;
9367  	ei->delayed_iput_count = 0;
9368  
9369  	spin_lock_init(&ei->lock);
9370  	ei->outstanding_extents = 0;
9371  	ei->reserved_extents = 0;
9372  
9373  	ei->runtime_flags = 0;
9374  	ei->force_compress = BTRFS_COMPRESS_NONE;
9375  
9376  	ei->delayed_node = NULL;
9377  
9378  	ei->i_otime.tv_sec = 0;
9379  	ei->i_otime.tv_nsec = 0;
9380  
9381  	inode = &ei->vfs_inode;
9382  	extent_map_tree_init(&ei->extent_tree);
9383  	extent_io_tree_init(&ei->io_tree, &inode->i_data);
9384  	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
9385  	ei->io_tree.track_uptodate = 1;
9386  	ei->io_failure_tree.track_uptodate = 1;
9387  	atomic_set(&ei->sync_writers, 0);
9388  	mutex_init(&ei->log_mutex);
9389  	mutex_init(&ei->delalloc_mutex);
9390  	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9391  	INIT_LIST_HEAD(&ei->delalloc_inodes);
9392  	INIT_LIST_HEAD(&ei->delayed_iput);
9393  	RB_CLEAR_NODE(&ei->rb_node);
9394  	init_rwsem(&ei->dio_sem);
9395  
9396  	return inode;
9397  }
9398  
9399  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9400  void btrfs_test_destroy_inode(struct inode *inode)
9401  {
9402  	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9403  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9404  }
9405  #endif
9406  
9407  static void btrfs_i_callback(struct rcu_head *head)
9408  {
9409  	struct inode *inode = container_of(head, struct inode, i_rcu);
9410  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9411  }
9412  
9413  void btrfs_destroy_inode(struct inode *inode)
9414  {
9415  	struct btrfs_ordered_extent *ordered;
9416  	struct btrfs_root *root = BTRFS_I(inode)->root;
9417  
9418  	WARN_ON(!hlist_empty(&inode->i_dentry));
9419  	WARN_ON(inode->i_data.nrpages);
9420  	WARN_ON(BTRFS_I(inode)->outstanding_extents);
9421  	WARN_ON(BTRFS_I(inode)->reserved_extents);
9422  	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9423  	WARN_ON(BTRFS_I(inode)->csum_bytes);
9424  	WARN_ON(BTRFS_I(inode)->defrag_bytes);
9425  
9426  	/*
9427  	 * This can happen when we create an inode, but somebody else also
9428  	 * created the same inode and we need to destroy the one we already
9429  	 * created.
9430  	 */
9431  	if (!root)
9432  		goto free;
9433  
9434  	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
9435  		     &BTRFS_I(inode)->runtime_flags)) {
9436  		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
9437  			btrfs_ino(inode));
9438  		atomic_dec(&root->orphan_inodes);
9439  	}
9440  
9441  	while (1) {
9442  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9443  		if (!ordered)
9444  			break;
9445  		else {
9446  			btrfs_err(root->fs_info,
9447  				  "found ordered extent %llu %llu on inode cleanup",
9448  				  ordered->file_offset, ordered->len);
9449  			btrfs_remove_ordered_extent(inode, ordered);
9450  			btrfs_put_ordered_extent(ordered);
9451  			btrfs_put_ordered_extent(ordered);
9452  		}
9453  	}
9454  	btrfs_qgroup_check_reserved_leak(inode);
9455  	inode_tree_del(inode);
9456  	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9457  free:
9458  	call_rcu(&inode->i_rcu, btrfs_i_callback);
9459  }
9460  
9461  int btrfs_drop_inode(struct inode *inode)
9462  {
9463  	struct btrfs_root *root = BTRFS_I(inode)->root;
9464  
9465  	if (root == NULL)
9466  		return 1;
9467  
9468  	/* the snap/subvol tree is being deleted */
9469  	if (btrfs_root_refs(&root->root_item) == 0)
9470  		return 1;
9471  	else
9472  		return generic_drop_inode(inode);
9473  }
9474  
9475  static void init_once(void *foo)
9476  {
9477  	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9478  
9479  	inode_init_once(&ei->vfs_inode);
9480  }
9481  
9482  void btrfs_destroy_cachep(void)
9483  {
9484  	/*
9485  	 * Make sure all delayed rcu free inodes are flushed before we
9486  	 * destroy cache.
9487  	 */
9488  	rcu_barrier();
9489  	kmem_cache_destroy(btrfs_inode_cachep);
9490  	kmem_cache_destroy(btrfs_trans_handle_cachep);
9491  	kmem_cache_destroy(btrfs_transaction_cachep);
9492  	kmem_cache_destroy(btrfs_path_cachep);
9493  	kmem_cache_destroy(btrfs_free_space_cachep);
9494  }
9495  
9496  int btrfs_init_cachep(void)
9497  {
9498  	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9499  			sizeof(struct btrfs_inode), 0,
9500  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9501  			init_once);
9502  	if (!btrfs_inode_cachep)
9503  		goto fail;
9504  
9505  	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9506  			sizeof(struct btrfs_trans_handle), 0,
9507  			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9508  	if (!btrfs_trans_handle_cachep)
9509  		goto fail;
9510  
9511  	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
9512  			sizeof(struct btrfs_transaction), 0,
9513  			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9514  	if (!btrfs_transaction_cachep)
9515  		goto fail;
9516  
9517  	btrfs_path_cachep = kmem_cache_create("btrfs_path",
9518  			sizeof(struct btrfs_path), 0,
9519  			SLAB_MEM_SPREAD, NULL);
9520  	if (!btrfs_path_cachep)
9521  		goto fail;
9522  
9523  	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9524  			sizeof(struct btrfs_free_space), 0,
9525  			SLAB_MEM_SPREAD, NULL);
9526  	if (!btrfs_free_space_cachep)
9527  		goto fail;
9528  
9529  	return 0;
9530  fail:
9531  	btrfs_destroy_cachep();
9532  	return -ENOMEM;
9533  }
9534  
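/*
 * Unlike plain generic_fillattr(), also fold not-yet-flushed delalloc
 * bytes into stat->blocks so that stat(2) reflects buffered writes that
 * have no on-disk allocation yet.
 */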
9535  static int btrfs_getattr(struct vfsmount *mnt,
9536  			 struct dentry *dentry, struct kstat *stat)
9537  {
9538  	u64 delalloc_bytes;
9539  	struct inode *inode = d_inode(dentry);
9540  	u32 blocksize = inode->i_sb->s_blocksize;
9541  
9542  	generic_fillattr(inode, stat);
9543  	stat->dev = BTRFS_I(inode)->root->anon_dev;
9544  
9545  	spin_lock(&BTRFS_I(inode)->lock);
9546  	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
9547  	spin_unlock(&BTRFS_I(inode)->lock);
9548  	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9549  			ALIGN(delalloc_bytes, blocksize)) >> 9;
9550  	return 0;
9551  }
9552  
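/*
 * Implements RENAME_EXCHANGE: atomically swap the two directory entries.
 * Both inode refs are inserted and both unlinks/relinks are done inside
 * one transaction, with the log trees pinned so a concurrent fsync
 * cannot persist a half-swapped state.
 */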
9553  static int btrfs_rename_exchange(struct inode *old_dir,
9554  			      struct dentry *old_dentry,
9555  			      struct inode *new_dir,
9556  			      struct dentry *new_dentry)
9557  {
9558  	struct btrfs_trans_handle *trans;
9559  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9560  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9561  	struct inode *new_inode = new_dentry->d_inode;
9562  	struct inode *old_inode = old_dentry->d_inode;
9563  	struct timespec ctime = current_time(old_inode);
9564  	struct dentry *parent;
9565  	u64 old_ino = btrfs_ino(old_inode);
9566  	u64 new_ino = btrfs_ino(new_inode);
9567  	u64 old_idx = 0;
9568  	u64 new_idx = 0;
9569  	u64 root_objectid;
9570  	int ret;
9571  	bool root_log_pinned = false;
9572  	bool dest_log_pinned = false;
9573  
9574  	/* we only allow rename subvolume link between subvolumes */
9575  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9576  		return -EXDEV;
9577  
9578  	/* close the race window with snapshot create/destroy ioctl */
9579  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9580  		down_read(&root->fs_info->subvol_sem);
9581  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9582  		down_read(&dest->fs_info->subvol_sem);
9583  
9584  	/*
9585  	 * We want to reserve the absolute worst case amount of items.  So if
9586  	 * both inodes are subvols and we need to unlink them then that would
9587  	 * require 4 item modifications, but if they are both normal inodes it
9588  	 * would require 5 item modifications, so we'll assume they are normal
9589  	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9590  	 * should cover the worst case number of items we'll modify.
9591  	 */
9592  	trans = btrfs_start_transaction(root, 12);
9593  	if (IS_ERR(trans)) {
9594  		ret = PTR_ERR(trans);
9595  		goto out_notrans;
9596  	}
9597  
9598  	/*
9599  	 * We need to find a free sequence number both in the source and
9600  	 * in the destination directory for the exchange.
9601  	 */
9602  	ret = btrfs_set_inode_index(new_dir, &old_idx);
9603  	if (ret)
9604  		goto out_fail;
9605  	ret = btrfs_set_inode_index(old_dir, &new_idx);
9606  	if (ret)
9607  		goto out_fail;
9608  
9609  	BTRFS_I(old_inode)->dir_index = 0ULL;
9610  	BTRFS_I(new_inode)->dir_index = 0ULL;
9611  
9612  	/* Reference for the source. */
9613  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9614  		/* force full log commit if subvolume involved. */
9615  		btrfs_set_log_full_commit(root->fs_info, trans);
9616  	} else {
9617  		btrfs_pin_log_trans(root);
9618  		root_log_pinned = true;
9619  		ret = btrfs_insert_inode_ref(trans, dest,
9620  					     new_dentry->d_name.name,
9621  					     new_dentry->d_name.len,
9622  					     old_ino,
9623  					     btrfs_ino(new_dir), old_idx);
9624  		if (ret)
9625  			goto out_fail;
9626  	}
9627  
9628  	/* And now for the dest. */
9629  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9630  		/* force full log commit if subvolume involved. */
9631  		btrfs_set_log_full_commit(dest->fs_info, trans);
9632  	} else {
9633  		btrfs_pin_log_trans(dest);
9634  		dest_log_pinned = true;
9635  		ret = btrfs_insert_inode_ref(trans, root,
9636  					     old_dentry->d_name.name,
9637  					     old_dentry->d_name.len,
9638  					     new_ino,
9639  					     btrfs_ino(old_dir), new_idx);
9640  		if (ret)
9641  			goto out_fail;
9642  	}
9643  
9644  	/* Update inode version and ctime/mtime. */
9645  	inode_inc_iversion(old_dir);
9646  	inode_inc_iversion(new_dir);
9647  	inode_inc_iversion(old_inode);
9648  	inode_inc_iversion(new_inode);
9649  	old_dir->i_ctime = old_dir->i_mtime = ctime;
9650  	new_dir->i_ctime = new_dir->i_mtime = ctime;
9651  	old_inode->i_ctime = ctime;
9652  	new_inode->i_ctime = ctime;
9653  
9654  	if (old_dentry->d_parent != new_dentry->d_parent) {
9655  		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
9656  		btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
9657  	}
9658  
9659  	/* src is a subvolume */
9660  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9661  		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9662  		ret = btrfs_unlink_subvol(trans, root, old_dir,
9663  					  root_objectid,
9664  					  old_dentry->d_name.name,
9665  					  old_dentry->d_name.len);
9666  	} else { /* src is an inode */
9667  		ret = __btrfs_unlink_inode(trans, root, old_dir,
9668  					   old_dentry->d_inode,
9669  					   old_dentry->d_name.name,
9670  					   old_dentry->d_name.len);
9671  		if (!ret)
9672  			ret = btrfs_update_inode(trans, root, old_inode);
9673  	}
9674  	if (ret) {
9675  		btrfs_abort_transaction(trans, ret);
9676  		goto out_fail;
9677  	}
9678  
9679  	/* dest is a subvolume */
9680  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9681  		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9682  		ret = btrfs_unlink_subvol(trans, dest, new_dir,
9683  					  root_objectid,
9684  					  new_dentry->d_name.name,
9685  					  new_dentry->d_name.len);
9686  	} else { /* dest is an inode */
9687  		ret = __btrfs_unlink_inode(trans, dest, new_dir,
9688  					   new_dentry->d_inode,
9689  					   new_dentry->d_name.name,
9690  					   new_dentry->d_name.len);
9691  		if (!ret)
9692  			ret = btrfs_update_inode(trans, dest, new_inode);
9693  	}
9694  	if (ret) {
9695  		btrfs_abort_transaction(trans, ret);
9696  		goto out_fail;
9697  	}
9698  
9699  	ret = btrfs_add_link(trans, new_dir, old_inode,
9700  			     new_dentry->d_name.name,
9701  			     new_dentry->d_name.len, 0, old_idx);
9702  	if (ret) {
9703  		btrfs_abort_transaction(trans, ret);
9704  		goto out_fail;
9705  	}
9706  
9707  	ret = btrfs_add_link(trans, old_dir, new_inode,
9708  			     old_dentry->d_name.name,
9709  			     old_dentry->d_name.len, 0, new_idx);
9710  	if (ret) {
9711  		btrfs_abort_transaction(trans, ret);
9712  		goto out_fail;
9713  	}
9714  
9715  	if (old_inode->i_nlink == 1)
9716  		BTRFS_I(old_inode)->dir_index = old_idx;
9717  	if (new_inode->i_nlink == 1)
9718  		BTRFS_I(new_inode)->dir_index = new_idx;
9719  
9720  	if (root_log_pinned) {
9721  		parent = new_dentry->d_parent;
9722  		btrfs_log_new_name(trans, old_inode, old_dir, parent);
9723  		btrfs_end_log_trans(root);
9724  		root_log_pinned = false;
9725  	}
9726  	if (dest_log_pinned) {
9727  		parent = old_dentry->d_parent;
9728  		btrfs_log_new_name(trans, new_inode, new_dir, parent);
9729  		btrfs_end_log_trans(dest);
9730  		dest_log_pinned = false;
9731  	}
9732  out_fail:
9733  	/*
9734  	 * If we have pinned a log and an error happened, we unpin tasks
9735  	 * trying to sync the log and force them to fall back to a transaction
9736  	 * commit if the log currently contains any of the inodes involved in
9737  	 * this rename operation (to ensure we do not persist a log with an
9738  	 * inconsistent state for any of these inodes or lead to any
9739  	 * inconsistencies when replayed). If the transaction was aborted, the
9740  	 * reason for the abort is propagated to userspace when attempting to commit
9741  	 * the transaction. If the log does not contain any of these inodes, we
9742  	 * allow the tasks to sync it.
9743  	 */
9744  	if (ret && (root_log_pinned || dest_log_pinned)) {
9745  		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9746  		    btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9747  		    btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9748  		    (new_inode &&
9749  		     btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9750  		    btrfs_set_log_full_commit(root->fs_info, trans);
9751  
9752  		if (root_log_pinned) {
9753  			btrfs_end_log_trans(root);
9754  			root_log_pinned = false;
9755  		}
9756  		if (dest_log_pinned) {
9757  			btrfs_end_log_trans(dest);
9758  			dest_log_pinned = false;
9759  		}
9760  	}
9761  	ret = btrfs_end_transaction(trans, root);
9762  out_notrans:
9763  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9764  		up_read(&dest->fs_info->subvol_sem);
9765  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9766  		up_read(&root->fs_info->subvol_sem);
9767  
9768  	return ret;
9769  }
9770  
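/*
 * RENAME_WHITEOUT support: create the special S_IFCHR/WHITEOUT_DEV inode
 * that takes the place of the source dentry after the rename (used by
 * overlayfs).
 */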
9771  static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9772  				     struct btrfs_root *root,
9773  				     struct inode *dir,
9774  				     struct dentry *dentry)
9775  {
9776  	int ret;
9777  	struct inode *inode;
9778  	u64 objectid;
9779  	u64 index;
9780  
9781  	ret = btrfs_find_free_ino(root, &objectid);
9782  	if (ret)
9783  		return ret;
9784  
9785  	inode = btrfs_new_inode(trans, root, dir,
9786  				dentry->d_name.name,
9787  				dentry->d_name.len,
9788  				btrfs_ino(dir),
9789  				objectid,
9790  				S_IFCHR | WHITEOUT_MODE,
9791  				&index);
9792  
9793  	if (IS_ERR(inode)) {
9794  		ret = PTR_ERR(inode);
9795  		return ret;
9796  	}
9797  
9798  	inode->i_op = &btrfs_special_inode_operations;
9799  	init_special_inode(inode, inode->i_mode,
9800  		WHITEOUT_DEV);
9801  
9802  	ret = btrfs_init_inode_security(trans, inode, dir,
9803  				&dentry->d_name);
9804  	if (ret)
9805  		goto out;
9806  
9807  	ret = btrfs_add_nondir(trans, dir, dentry,
9808  				inode, 0, index);
9809  	if (ret)
9810  		goto out;
9811  
9812  	ret = btrfs_update_inode(trans, root, inode);
9813  out:
9814  	unlock_new_inode(inode);
9815  	if (ret)
9816  		inode_dec_link_count(inode);
9817  	iput(inode);
9818  
9819  	return ret;
9820  }
9821  
9822  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9823  			   struct inode *new_dir, struct dentry *new_dentry,
9824  			   unsigned int flags)
9825  {
9826  	struct btrfs_trans_handle *trans;
9827  	unsigned int trans_num_items;
9828  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9829  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9830  	struct inode *new_inode = d_inode(new_dentry);
9831  	struct inode *old_inode = d_inode(old_dentry);
9832  	u64 index = 0;
9833  	u64 root_objectid;
9834  	int ret;
9835  	u64 old_ino = btrfs_ino(old_inode);
9836  	bool log_pinned = false;
9837  
9838  	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9839  		return -EPERM;
9840  
9841  	/* we only allow rename subvolume link between subvolumes */
9842  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9843  		return -EXDEV;
9844  
9845  	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9846  	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
9847  		return -ENOTEMPTY;
9848  
9849  	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9850  	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9851  		return -ENOTEMPTY;
9852  
9853  
9854  	/* check for collisions, even if the name isn't there */
9855  	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9856  			     new_dentry->d_name.name,
9857  			     new_dentry->d_name.len);
9858  
9859  	if (ret) {
9860  		if (ret == -EEXIST) {
9861  			/* we shouldn't get
9862  			 * -EEXIST without a new_inode */
9863  			if (WARN_ON(!new_inode)) {
9864  				return ret;
9865  			}
9866  		} else {
9867  			/* maybe -EOVERFLOW */
9868  			return ret;
9869  		}
9870  	}
9871  	ret = 0;
9872  
9873  	/*
9874  	 * we're using rename to replace one file with another.  Start IO on it
9875  	 * now so we don't add too much work to the end of the transaction
9876  	 */
9877  	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9878  		filemap_flush(old_inode->i_mapping);
9879  
9880  	/* close the race window with snapshot create/destroy ioctl */
9881  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9882  		down_read(&root->fs_info->subvol_sem);
9883  	/*
9884  	 * We want to reserve the absolute worst case amount of items.  So if
9885  	 * both inodes are subvols and we need to unlink them then that would
9886  	 * require 4 item modifications, but if they are both normal inodes it
9887  	 * would require 5 item modifications, so we'll assume they are normal
9888  	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9889  	 * should cover the worst case number of items we'll modify.
9890  	 * If our rename has the whiteout flag, we need 5 more units for the
9891  	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9892  	 * when selinux is enabled).
9893  	 */
	trans_num_items = 11;
	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(new_dir, &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(root->fs_info, trans);
	} else {
		btrfs_pin_log_trans(root);
		log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(new_dir), index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	old_inode->i_ctime = current_time(old_dir);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
					old_dentry->d_name.name,
					old_dentry->d_name.len);
	} else {
		ret = __btrfs_unlink_inode(trans, root, old_dir,
					d_inode(old_dentry),
					old_dentry->d_name.name,
					old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		new_inode->i_ctime = current_time(new_inode);
		if (unlikely(btrfs_ino(new_inode) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, dest, new_dir,
						root_objectid,
						new_dentry->d_name.name,
						new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, new_dir,
						 d_inode(new_dentry),
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans, d_inode(new_dentry));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	ret = btrfs_add_link(trans, new_dir, old_inode,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (log_pinned) {
		struct dentry *parent = new_dentry->d_parent;

		btrfs_log_new_name(trans, old_inode, old_dir, parent);
		btrfs_end_log_trans(root);
		log_pinned = false;
	}

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
						old_dentry);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}
out_fail:
	/*
	 * If we have pinned the log and an error happened, we unpin tasks
	 * trying to sync the log and force them to fall back to a transaction
	 * commit if the log currently contains any of the inodes involved in
	 * this rename operation (to ensure we do not persist a log with an
	 * inconsistent state for any of these inodes, which would lead to
	 * inconsistencies when replayed). If the transaction was aborted, the
	 * abort reason is propagated to userspace when attempting to commit
	 * the transaction. If the log does not contain any of these inodes, we
	 * allow the tasks to sync it.
	 */
	if (ret && log_pinned) {
		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
		    btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
		    btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
		    (new_inode &&
		     btrfs_inode_in_log(new_inode, root->fs_info->generation)))
			btrfs_set_log_full_commit(root->fs_info, trans);

		btrfs_end_log_trans(root);
		log_pinned = false;
	}
	btrfs_end_transaction(trans, root);
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&root->fs_info->subvol_sem);

	return ret;
}

static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					     new_dentry);

	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}

static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	/*
	 * If async extents were kicked off by the first flush, flush a
	 * second time so pages queued behind the async work get written too.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				&BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	if (delalloc_work->delay_iput)
		btrfs_add_delayed_iput(inode);
	else
		iput(inode);
	complete(&delalloc_work->completion);
}

struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
						      int delay_iput)
{
	struct btrfs_delalloc_work *work;

	work = kmalloc(sizeof(*work), GFP_NOFS);
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
	work->delay_iput = delay_iput;
	WARN_ON_ONCE(!inode);
	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
			btrfs_run_delalloc_work, NULL, NULL);

	return work;
}

void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
	wait_for_completion(&work->completion);
	kfree(work);
}
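
/*
 * Illustrative sketch only (this helper is hypothetical and has no
 * callers): the intended pattern for the work items above is allocate,
 * queue on the fs_info->flush_workers pool, then wait and free.  The
 * caller must hold an inode reference, which btrfs_run_delalloc_work()
 * drops via iput() or a delayed iput.  __start_delalloc_inodes() below
 * does the same dance for every inode with pending delalloc.
 */
static inline int __maybe_unused btrfs_flush_one_delalloc_inode(
							struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	/* delay_iput == 0: the worker calls iput() directly when done */
	work = btrfs_alloc_delalloc_work(inode, 0);
	if (!work)
		return -ENOMEM;

	btrfs_queue_work(BTRFS_I(inode)->root->fs_info->flush_workers,
			 &work->work);
	/* blocks until btrfs_run_delalloc_work() signals completion */
	btrfs_wait_and_free_delalloc_work(work);
	return 0;
}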

/*
 * Some fairly slow code that needs optimization.  This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 * Returns the number of flush works queued (stopping early once 'nr'
 * works have been queued, unless nr is -1), or a negative errno.
 */
static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
				   int nr)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	struct list_head works;
	struct list_head splice;
	int ret = 0;

	INIT_LIST_HEAD(&works);
	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		work = btrfs_alloc_delalloc_work(inode, delay_iput);
		if (!work) {
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
			ret = -ENOMEM;
			goto out;
		}
		list_add_tail(&work->list, &works);
		btrfs_queue_work(root->fs_info->flush_workers,
				 &work->work);
		ret++;
		if (nr != -1 && ret >= nr)
			goto out;
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}

	if (!list_empty_careful(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}

int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
		return -EROFS;

	ret = __start_delalloc_inodes(root, delay_iput, -1);
	if (ret > 0)
		ret = 0;
	/*
	 * The filemap_flush will queue IO into the worker threads, but
	 * we have to make sure the IO is actually started and that
	 * ordered extents get created before we return.
	 */
	atomic_inc(&root->fs_info->async_submit_draining);
	while (atomic_read(&root->fs_info->nr_async_submits) ||
	      atomic_read(&root->fs_info->async_delalloc_pages)) {
		wait_event(root->fs_info->async_submit_wait,
		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&root->fs_info->async_submit_draining);
	return ret;
}

int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
			       int nr)
{
	struct btrfs_root *root;
	struct list_head splice;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = __start_delalloc_inodes(root, delay_iput, nr);
		btrfs_put_fs_root(root);
		if (ret < 0)
			goto out;

		if (nr != -1) {
			nr -= ret;
			WARN_ON(nr < 0);
		}
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
	atomic_inc(&fs_info->async_submit_draining);
	while (atomic_read(&fs_info->nr_async_submits) ||
	      atomic_read(&fs_info->async_delalloc_pages)) {
		wait_event(fs_info->async_submit_wait,
		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
		    atomic_read(&fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&fs_info->async_submit_draining);
out:
	if (!list_empty_careful(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}

static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
		return -ENAMETOOLONG;

	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for updating parent inode item
	 * 1 item for the inline extent item
	 * 1 item for xattr if selinux is on
	 */
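	/*
	 * Illustrative tally of the list above: 2 + 2 + 1 + 1 + 1 = 7,
	 * which is the reservation passed to btrfs_start_transaction()
	 * below.
	 */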
	trans = btrfs_start_transaction(root, 7);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	/*
	 * If the active LSM wants to access the inode during
	 * d_instantiate it needs these. Smack checks to see
	 * if the filesystem supports xattrs by looking at the
	 * ops vector.
	 */
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock_inode;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock_inode;
	}
	key.objectid = btrfs_ino(inode);
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_free_path(path);
		goto out_unlock_inode;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_symlink_aops;
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(inode, name_len);
	err = btrfs_update_inode(trans, root, inode);
	/*
	 * Last step: add the directory indexes for our symlink inode.
	 * Doing this last avoids extra cleanup of those indexes if an
	 * error happens anywhere above.
	 */
	if (!err)
		err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err) {
		drop_inode = 1;
		goto out_unlock_inode;
	}

	unlock_new_inode(inode);
	d_instantiate(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root);
	return err;

out_unlock_inode:
	drop_inode = 1;
	unlock_new_inode(inode);
	goto out_unlock;
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks let's make its job easier by only searching for
		 * chunks of that size.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}
		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		last_alloc = ins.offset;
		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		if (ret) {
			btrfs_free_reserved_extent(root, ins.objectid,
						   ins.offset, 0);
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST)
				break;
			btrfs_drop_extent_cache(inode, cur_offset,
						cur_offset + ins.offset - 1,
						0);
		}
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode->i_ctime = current_time(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		if (own_trans)
			btrfs_end_transaction(trans, root);
	}
	if (cur_offset < end)
		btrfs_free_reserved_data_space(inode, cur_offset,
			end - cur_offset + 1);
	return ret;
}
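
/*
 * The two wrappers below differ only in transaction ownership: with a NULL
 * trans, __btrfs_prealloc_file_range() starts and ends its own transaction
 * around each extent it allocates; with a caller-supplied trans it does all
 * of the work inside that handle.  A hypothetical caller preallocating 1MiB
 * at offset 0 (illustrative values only) would do:
 *
 *	u64 alloc_hint = 0;
 *	int ret = btrfs_prealloc_file_range(inode, 0, 0, SZ_1M, SZ_1M,
 *					    SZ_1M, &alloc_hint);
 */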
int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   trans);
}

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}

static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	u64 objectid;
	u64 index;
	int ret = 0;

	/*
	 * 5 units required for adding orphan entry
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		goto out;

	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
				btrfs_ino(dir), objectid, mode, &index);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (ret)
		goto out_inode;

	ret = btrfs_update_inode(trans, root, inode);
	if (ret)
		goto out_inode;
	ret = btrfs_orphan_add(trans, inode);
	if (ret)
		goto out_inode;

	/*
	 * We set number of links to 0 in btrfs_new_inode(), and here we set
	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
	 * through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
	unlock_new_inode(inode);
	d_tmpfile(dentry, inode);
	mark_inode_dirty(inode);

out:
	btrfs_end_transaction(trans, root);
	if (ret)
		iput(inode);
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	return ret;

out_inode:
	unlock_new_inode(inode);
	goto out;
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile	= btrfs_tmpfile,
};

static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.update_time	= btrfs_update_time,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static const struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * Btrfs doesn't support the bmap operation because swapfiles use bmap to
 * make a mapping of extents in the file.  They assume these extents won't
 * change over the life of the file and they use the bmap result to do IO
 * directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't suitable
 * for IO, and they also change frequently as COW operations happen.  So,
 * swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};

static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};

static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
	.d_release	= btrfs_dentry_release,
};