// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first stage (0) is to only pin down
 * the blocks we find. The second stage (1) is to make sure that all the
 * inodes we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}
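
/*
 * Illustrative pairing (a sketch, not copied from an actual caller;
 * error handling trimmed): every successful start_log_trans() is
 * balanced by a btrfs_end_log_trans() once the items were logged, e.g.
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL, ctx);
 *	btrfs_end_log_trans(root);
 */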

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
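
/*
 * Sketch of the pin/end pairing (assumed usage; callers such as the
 * rename path keep the running log transaction from syncing while they
 * update names):
 *
 *	btrfs_pin_log_trans(root);
 *	... rename or unlink work ...
 *	btrfs_end_log_trans(root);
 */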

static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer? This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish? Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
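
/*
 * A minimal sketch of how a walk is typically configured (assumed from
 * the walk users later in this file, not a verbatim caller): a pin-only
 * pass during replay would look roughly like
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.trans = trans,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.process_func = process_one_buffer,
 *	};
 */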

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero. This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
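
/*
 * Worked example for the size adjustment above (illustrative numbers):
 * if the log holds a 160 byte inode item and the destination tree already
 * has a 96 byte item at the same key, btrfs_insert_empty_item() returns
 * -EEXIST and btrfs_extend_item() grows the item by 160 - 96 = 64 bytes
 * before the copy; an oversized destination item is truncated instead.
 */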

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'. path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet. So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file. This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: we did a shallow copy of
		 * the file extent item and skipped the normal backref update,
		 * modifying the extent tree all by ourselves. So we need to
		 * manually record the dirty extent for qgroup, as the owner
		 * of the file extent changed from the log tree (doesn't
		 * affect qgroup) to the fs/file tree (affects qgroup).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree? If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							fs_info->csum_root,
							sums->bytenr,
							sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory?
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log. if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so we do not need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
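
/*
 * Layout reminder (a sketch): both ref item types pack a sequence of
 * (header + name) records into a single item, so callers walk them with
 *
 *	ref_ptr += sizeof(struct btrfs_inode_ref) + namelen;
 *
 * for INODE_REF items, or sizeof(struct btrfs_inode_extref) plus the
 * name length for INODE_EXTREF items (see unlink_old_inode_refs()
 * below).
 */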

/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists). For any name that is not in the inode reference item from the log
 * tree, do a proper unlink of that name (that is, remove its entry from the
 * inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode. If we don't find the dir, just don't
	 * copy the back ref in. The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. If we find one we have to unlink that name
			 * of the file before we add our new link. Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done. The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

1852 /*
1853 * when replaying the log for a directory, we only insert names
1854 * for inodes that actually exist. This means an fsync on a directory
1855 * does not implicitly fsync all the new files in it
1856 */
1857 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1858 struct btrfs_root *root,
1859 u64 dirid, u64 index,
1860 char *name, int name_len,
1861 struct btrfs_key *location)
1862 {
1863 struct inode *inode;
1864 struct inode *dir;
1865 int ret;
1866
1867 inode = read_one_inode(root, location->objectid);
1868 if (!inode)
1869 return -ENOENT;
1870
1871 dir = read_one_inode(root, dirid);
1872 if (!dir) {
1873 iput(inode);
1874 return -EIO;
1875 }
1876
1877 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1878 name_len, 1, index);
1879
1880 /* FIXME, put inode into FIXUP list */
1881
1882 iput(inode);
1883 iput(dir);
1884 return ret;
1885 }
1886
1887 /*
1888 * take a single entry in a log directory item and replay it into
1889 * the subvolume.
1890 *
1891 * if a conflicting item exists in the subdirectory already,
1892 * the inode it points to is unlinked and put into the link count
1893 * fix up tree.
1894 *
1895 * If a name from the log points to a file or directory that does
1896 * not exist in the FS, it is skipped. fsyncs on directories
1897 * do not force down inodes inside that directory, just changes to the
1898 * names or unlinks in a directory.
1899 *
1900 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1901 * non-existing inode) and 1 if the name was replayed.
1902 */
1903 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1904 struct btrfs_root *root,
1905 struct btrfs_path *path,
1906 struct extent_buffer *eb,
1907 struct btrfs_dir_item *di,
1908 struct btrfs_key *key)
1909 {
1910 char *name;
1911 int name_len;
1912 struct btrfs_dir_item *dst_di;
1913 struct btrfs_key found_key;
1914 struct btrfs_key log_key;
1915 struct inode *dir;
1916 u8 log_type;
1917 bool exists;
1918 int ret;
1919 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1920 bool name_added = false;
1921
1922 dir = read_one_inode(root, key->objectid);
1923 if (!dir)
1924 return -EIO;
1925
1926 name_len = btrfs_dir_name_len(eb, di);
1927 name = kmalloc(name_len, GFP_NOFS);
1928 if (!name) {
1929 ret = -ENOMEM;
1930 goto out;
1931 }
1932
1933 log_type = btrfs_dir_type(eb, di);
1934 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1935 name_len);
1936
1937 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1938 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1939 btrfs_release_path(path);
1940 if (ret < 0)
1941 goto out;
1942 exists = (ret == 0);
1943 ret = 0;
1944
1945 if (key->type == BTRFS_DIR_ITEM_KEY) {
1946 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1947 name, name_len, 1);
1948 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1949 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1950 key->objectid,
1951 key->offset, name,
1952 name_len, 1);
1953 } else {
1954 /* Corruption */
1955 ret = -EINVAL;
1956 goto out;
1957 }
1958
1959 if (dst_di == ERR_PTR(-ENOENT))
1960 dst_di = NULL;
1961
1962 if (IS_ERR(dst_di)) {
1963 ret = PTR_ERR(dst_di);
1964 goto out;
1965 } else if (!dst_di) {
1966 /* we need a sequence number to insert, so we only
1967 * do inserts for the BTRFS_DIR_INDEX_KEY types
1968 */
1969 if (key->type != BTRFS_DIR_INDEX_KEY)
1970 goto out;
1971 goto insert;
1972 }
1973
1974 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1975 /* the existing item matches the logged item */
1976 if (found_key.objectid == log_key.objectid &&
1977 found_key.type == log_key.type &&
1978 found_key.offset == log_key.offset &&
1979 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1980 update_size = false;
1981 goto out;
1982 }
1983
1984 /*
1985 * don't drop the conflicting directory entry if the inode
1986 * for the new entry doesn't exist
1987 */
1988 if (!exists)
1989 goto out;
1990
1991 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1992 if (ret)
1993 goto out;
1994
1995 if (key->type == BTRFS_DIR_INDEX_KEY)
1996 goto insert;
1997 out:
1998 btrfs_release_path(path);
1999 if (!ret && update_size) {
2000 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2001 ret = btrfs_update_inode(trans, root, dir);
2002 }
2003 kfree(name);
2004 iput(dir);
2005 if (!ret && name_added)
2006 ret = 1;
2007 return ret;
2008
2009 insert:
2010 /*
2011 * Check if the inode reference exists in the log for the given name,
2012 * inode and parent inode
2013 */
2014 found_key.objectid = log_key.objectid;
2015 found_key.type = BTRFS_INODE_REF_KEY;
2016 found_key.offset = key->objectid;
2017 ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2018 if (ret < 0) {
2019 goto out;
2020 } else if (ret) {
2021 /* The dentry will be added later. */
2022 ret = 0;
2023 update_size = false;
2024 goto out;
2025 }
2026
2027 found_key.objectid = log_key.objectid;
2028 found_key.type = BTRFS_INODE_EXTREF_KEY;
2029 found_key.offset = key->objectid;
2030 ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2031 name_len);
2032 if (ret < 0) {
2033 goto out;
2034 } else if (ret) {
2035 /* The dentry will be added later. */
2036 ret = 0;
2037 update_size = false;
2038 goto out;
2039 }
2040 btrfs_release_path(path);
2041 ret = insert_one_name(trans, root, key->objectid, key->offset,
2042 name, name_len, &log_key);
2043 if (ret && ret != -ENOENT && ret != -EEXIST)
2044 goto out;
2045 if (!ret)
2046 name_added = true;
2047 update_size = false;
2048 ret = 0;
2049 goto out;
2050 }
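
/*
 * Editor's summary of the return contract above, as consumed by
 * replay_one_dir_item() below:
 *
 *	ret = replay_one_name(trans, root, path, eb, di, key);
 *	ret < 0:  hard error, stop replay
 *	ret == 0: name skipped, the target inode does not exist in the FS
 *	ret == 1: name added, so a link count fixup must be queued
 */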
2051
2052 /*
2053 * find all the names in a directory item and reconcile them into
2054 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2055 * one name in a directory item, but the same code gets used for
2056 * both directory index types
2057 */
2058 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2059 struct btrfs_root *root,
2060 struct btrfs_path *path,
2061 struct extent_buffer *eb, int slot,
2062 struct btrfs_key *key)
2063 {
2064 int ret = 0;
2065 u32 item_size = btrfs_item_size_nr(eb, slot);
2066 struct btrfs_dir_item *di;
2067 int name_len;
2068 unsigned long ptr;
2069 unsigned long ptr_end;
2070 struct btrfs_path *fixup_path = NULL;
2071
2072 ptr = btrfs_item_ptr_offset(eb, slot);
2073 ptr_end = ptr + item_size;
2074 while (ptr < ptr_end) {
2075 di = (struct btrfs_dir_item *)ptr;
2076 name_len = btrfs_dir_name_len(eb, di);
2077 ret = replay_one_name(trans, root, path, eb, di, key);
2078 if (ret < 0)
2079 break;
2080 ptr = (unsigned long)(di + 1);
2081 ptr += name_len;
2082
2083 /*
2084 * If this entry refers to a non-directory (directories can not
2085 * have a link count > 1) and it was added in the transaction
2086 * that was not committed, make sure we fixup the link count of
2087 * the inode the entry points to. Otherwise something like
2088 * the following would result in a directory pointing to an
2089 * inode with a wrong link count that does not account for this dir
2090 * entry:
2091 *
2092 * mkdir testdir
2093 * touch testdir/foo
2094 * touch testdir/bar
2095 * sync
2096 *
2097 * ln testdir/bar testdir/bar_link
2098 * ln testdir/foo testdir/foo_link
2099 * xfs_io -c "fsync" testdir/bar
2100 *
2101 * <power failure>
2102 *
2103 * mount fs, log replay happens
2104 *
2105 * File foo would remain with a link count of 1 when it has two
2106 * entries pointing to it in the directory testdir. This would
2107 * make it impossible to ever delete the parent directory, as
2108 * it would result in stale dentries that can never be deleted.
2109 */
2110 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2111 struct btrfs_key di_key;
2112
2113 if (!fixup_path) {
2114 fixup_path = btrfs_alloc_path();
2115 if (!fixup_path) {
2116 ret = -ENOMEM;
2117 break;
2118 }
2119 }
2120
2121 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2122 ret = link_to_fixup_dir(trans, root, fixup_path,
2123 di_key.objectid);
2124 if (ret)
2125 break;
2126 }
2127 ret = 0;
2128 }
2129 btrfs_free_path(fixup_path);
2130 return ret;
2131 }
2132
2133 /*
2134 * directory replay has two parts. There are the standard directory
2135 * items in the log copied from the subvolume, and range items
2136 * created in the log while the subvolume was logged.
2137 *
2138 * The range items tell us which parts of the key space the log
2139 * is authoritative for. During replay, if a key in the subvolume
2140 * directory is in a logged range item, but not actually in the log
2141 * that means it was deleted from the directory before the fsync
2142 * and should be removed.
2143 */
2144 static noinline int find_dir_range(struct btrfs_root *root,
2145 struct btrfs_path *path,
2146 u64 dirid, int key_type,
2147 u64 *start_ret, u64 *end_ret)
2148 {
2149 struct btrfs_key key;
2150 u64 found_end;
2151 struct btrfs_dir_log_item *item;
2152 int ret;
2153 int nritems;
2154
2155 if (*start_ret == (u64)-1)
2156 return 1;
2157
2158 key.objectid = dirid;
2159 key.type = key_type;
2160 key.offset = *start_ret;
2161
2162 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2163 if (ret < 0)
2164 goto out;
2165 if (ret > 0) {
2166 if (path->slots[0] == 0)
2167 goto out;
2168 path->slots[0]--;
2169 }
2170 if (ret != 0)
2171 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2172
2173 if (key.type != key_type || key.objectid != dirid) {
2174 ret = 1;
2175 goto next;
2176 }
2177 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2178 struct btrfs_dir_log_item);
2179 found_end = btrfs_dir_log_end(path->nodes[0], item);
2180
2181 if (*start_ret >= key.offset && *start_ret <= found_end) {
2182 ret = 0;
2183 *start_ret = key.offset;
2184 *end_ret = found_end;
2185 goto out;
2186 }
2187 ret = 1;
2188 next:
2189 /* check the next slot in the tree to see if it is a valid item */
2190 nritems = btrfs_header_nritems(path->nodes[0]);
2191 path->slots[0]++;
2192 if (path->slots[0] >= nritems) {
2193 ret = btrfs_next_leaf(root, path);
2194 if (ret)
2195 goto out;
2196 }
2197
2198 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2199
2200 if (key.type != key_type || key.objectid != dirid) {
2201 ret = 1;
2202 goto out;
2203 }
2204 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2205 struct btrfs_dir_log_item);
2206 found_end = btrfs_dir_log_end(path->nodes[0], item);
2207 *start_ret = key.offset;
2208 *end_ret = found_end;
2209 ret = 0;
2210 out:
2211 btrfs_release_path(path);
2212 return ret;
2213 }
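
/*
 * Editor's sketch (hypothetical helper, mirroring the loop in
 * replay_dir_deletes() below): walking every range of a directory's key
 * space that the log claims to be authoritative for.
 */
static inline int example_walk_dir_ranges(struct btrfs_root *log,
					  struct btrfs_path *path, u64 dirid)
{
	u64 start = 0;
	u64 end = 0;
	int ret;

	while ((ret = find_dir_range(log, path, dirid, BTRFS_DIR_LOG_ITEM_KEY,
				     &start, &end)) == 0) {
		/* the log is authoritative for offsets [start, end] */
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
	return ret < 0 ? ret : 0;
}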
2214
2215 /*
2216 * this looks for a given directory item in the log. If the directory
2217 * item is not in the log, the item is removed and the inode it points
2218 * to is unlinked
2219 */
2220 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2221 struct btrfs_root *root,
2222 struct btrfs_root *log,
2223 struct btrfs_path *path,
2224 struct btrfs_path *log_path,
2225 struct inode *dir,
2226 struct btrfs_key *dir_key)
2227 {
2228 int ret;
2229 struct extent_buffer *eb;
2230 int slot;
2231 u32 item_size;
2232 struct btrfs_dir_item *di;
2233 struct btrfs_dir_item *log_di;
2234 int name_len;
2235 unsigned long ptr;
2236 unsigned long ptr_end;
2237 char *name;
2238 struct inode *inode;
2239 struct btrfs_key location;
2240
2241 again:
2242 eb = path->nodes[0];
2243 slot = path->slots[0];
2244 item_size = btrfs_item_size_nr(eb, slot);
2245 ptr = btrfs_item_ptr_offset(eb, slot);
2246 ptr_end = ptr + item_size;
2247 while (ptr < ptr_end) {
2248 di = (struct btrfs_dir_item *)ptr;
2249 name_len = btrfs_dir_name_len(eb, di);
2250 name = kmalloc(name_len, GFP_NOFS);
2251 if (!name) {
2252 ret = -ENOMEM;
2253 goto out;
2254 }
2255 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2256 name_len);
2257 log_di = NULL;
2258 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2259 log_di = btrfs_lookup_dir_item(trans, log, log_path,
2260 dir_key->objectid,
2261 name, name_len, 0);
2262 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2263 log_di = btrfs_lookup_dir_index_item(trans, log,
2264 log_path,
2265 dir_key->objectid,
2266 dir_key->offset,
2267 name, name_len, 0);
2268 }
2269 if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2270 btrfs_dir_item_key_to_cpu(eb, di, &location);
2271 btrfs_release_path(path);
2272 btrfs_release_path(log_path);
2273 inode = read_one_inode(root, location.objectid);
2274 if (!inode) {
2275 kfree(name);
2276 return -EIO;
2277 }
2278
2279 ret = link_to_fixup_dir(trans, root,
2280 path, location.objectid);
2281 if (ret) {
2282 kfree(name);
2283 iput(inode);
2284 goto out;
2285 }
2286
2287 inc_nlink(inode);
2288 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2289 BTRFS_I(inode), name, name_len);
2290 if (!ret)
2291 ret = btrfs_run_delayed_items(trans);
2292 kfree(name);
2293 iput(inode);
2294 if (ret)
2295 goto out;
2296
2297 /* there might still be more names under this key,
2298 * so check and repeat if required
2299 */
2300 ret = btrfs_search_slot(NULL, root, dir_key, path,
2301 0, 0);
2302 if (ret == 0)
2303 goto again;
2304 ret = 0;
2305 goto out;
2306 } else if (IS_ERR(log_di)) {
2307 kfree(name);
2308 return PTR_ERR(log_di);
2309 }
2310 btrfs_release_path(log_path);
2311 kfree(name);
2312
2313 ptr = (unsigned long)(di + 1);
2314 ptr += name_len;
2315 }
2316 ret = 0;
2317 out:
2318 btrfs_release_path(path);
2319 btrfs_release_path(log_path);
2320 return ret;
2321 }
2322
2323 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root,
2325 struct btrfs_root *log,
2326 struct btrfs_path *path,
2327 const u64 ino)
2328 {
2329 struct btrfs_key search_key;
2330 struct btrfs_path *log_path;
2331 int i;
2332 int nritems;
2333 int ret;
2334
2335 log_path = btrfs_alloc_path();
2336 if (!log_path)
2337 return -ENOMEM;
2338
2339 search_key.objectid = ino;
2340 search_key.type = BTRFS_XATTR_ITEM_KEY;
2341 search_key.offset = 0;
2342 again:
2343 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2344 if (ret < 0)
2345 goto out;
2346 process_leaf:
2347 nritems = btrfs_header_nritems(path->nodes[0]);
2348 for (i = path->slots[0]; i < nritems; i++) {
2349 struct btrfs_key key;
2350 struct btrfs_dir_item *di;
2351 struct btrfs_dir_item *log_di;
2352 u32 total_size;
2353 u32 cur;
2354
2355 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2356 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2357 ret = 0;
2358 goto out;
2359 }
2360
2361 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2362 total_size = btrfs_item_size_nr(path->nodes[0], i);
2363 cur = 0;
2364 while (cur < total_size) {
2365 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2366 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2367 u32 this_len = sizeof(*di) + name_len + data_len;
2368 char *name;
2369
2370 name = kmalloc(name_len, GFP_NOFS);
2371 if (!name) {
2372 ret = -ENOMEM;
2373 goto out;
2374 }
2375 read_extent_buffer(path->nodes[0], name,
2376 (unsigned long)(di + 1), name_len);
2377
2378 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2379 name, name_len, 0);
2380 btrfs_release_path(log_path);
2381 if (!log_di) {
2382 /* Doesn't exist in log tree, so delete it. */
2383 btrfs_release_path(path);
2384 di = btrfs_lookup_xattr(trans, root, path, ino,
2385 name, name_len, -1);
2386 kfree(name);
2387 if (IS_ERR(di)) {
2388 ret = PTR_ERR(di);
2389 goto out;
2390 }
2391 ASSERT(di);
2392 ret = btrfs_delete_one_dir_name(trans, root,
2393 path, di);
2394 if (ret)
2395 goto out;
2396 btrfs_release_path(path);
2397 search_key = key;
2398 goto again;
2399 }
2400 kfree(name);
2401 if (IS_ERR(log_di)) {
2402 ret = PTR_ERR(log_di);
2403 goto out;
2404 }
2405 cur += this_len;
2406 di = (struct btrfs_dir_item *)((char *)di + this_len);
2407 }
2408 }
2409 ret = btrfs_next_leaf(root, path);
2410 if (ret > 0)
2411 ret = 0;
2412 else if (ret == 0)
2413 goto process_leaf;
2414 out:
2415 btrfs_free_path(log_path);
2416 btrfs_release_path(path);
2417 return ret;
2418 }
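
/*
 * Editor's illustration of the delete-by-absence rule implemented above:
 *
 *	setfattr -n user.a f
 *	setfattr -n user.b f
 *	sync
 *	setfattr -x user.b f
 *	xfs_io -c "fsync" f
 *	<power failure>
 *
 * During replay, user.b is still present in the subvolume but has no
 * matching xattr in the log tree for the inode, so it must be deleted.
 */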
2419
2420
2421 /*
2422 * deletion replay happens before we copy any new directory items
2423 * out of the log or out of backreferences from inodes. It
2424 * scans the log to find ranges of keys that the log is authoritative for,
2425 * and then scans the directory to find items in those ranges that are
2426 * not present in the log.
2427 *
2428 * Anything we don't find in the log is unlinked and removed from the
2429 * directory.
2430 */
2431 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2432 struct btrfs_root *root,
2433 struct btrfs_root *log,
2434 struct btrfs_path *path,
2435 u64 dirid, int del_all)
2436 {
2437 u64 range_start;
2438 u64 range_end;
2439 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2440 int ret = 0;
2441 struct btrfs_key dir_key;
2442 struct btrfs_key found_key;
2443 struct btrfs_path *log_path;
2444 struct inode *dir;
2445
2446 dir_key.objectid = dirid;
2447 dir_key.type = BTRFS_DIR_ITEM_KEY;
2448 log_path = btrfs_alloc_path();
2449 if (!log_path)
2450 return -ENOMEM;
2451
2452 dir = read_one_inode(root, dirid);
2453 /* it isn't an error if the inode isn't there, that can happen
2454 * because we replay the deletes before we copy in the inode item
2455 * from the log
2456 */
2457 if (!dir) {
2458 btrfs_free_path(log_path);
2459 return 0;
2460 }
2461 again:
2462 range_start = 0;
2463 range_end = 0;
2464 while (1) {
2465 if (del_all)
2466 range_end = (u64)-1;
2467 else {
2468 ret = find_dir_range(log, path, dirid, key_type,
2469 &range_start, &range_end);
2470 if (ret < 0)
2471 goto out;
2472 else if (ret > 0)
2473 break;
2474 }
2475
2476 dir_key.offset = range_start;
2477 while (1) {
2478 int nritems;
2479 ret = btrfs_search_slot(NULL, root, &dir_key, path,
2480 0, 0);
2481 if (ret < 0)
2482 goto out;
2483
2484 nritems = btrfs_header_nritems(path->nodes[0]);
2485 if (path->slots[0] >= nritems) {
2486 ret = btrfs_next_leaf(root, path);
2487 if (ret == 1)
2488 break;
2489 else if (ret < 0)
2490 goto out;
2491 }
2492 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2493 path->slots[0]);
2494 if (found_key.objectid != dirid ||
2495 found_key.type != dir_key.type)
2496 goto next_type;
2497
2498 if (found_key.offset > range_end)
2499 break;
2500
2501 ret = check_item_in_log(trans, root, log, path,
2502 log_path, dir,
2503 &found_key);
2504 if (ret)
2505 goto out;
2506 if (found_key.offset == (u64)-1)
2507 break;
2508 dir_key.offset = found_key.offset + 1;
2509 }
2510 btrfs_release_path(path);
2511 if (range_end == (u64)-1)
2512 break;
2513 range_start = range_end + 1;
2514 }
2515
2516 next_type:
2517 ret = 0;
2518 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2519 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2520 dir_key.type = BTRFS_DIR_INDEX_KEY;
2521 btrfs_release_path(path);
2522 goto again;
2523 }
2524 out:
2525 btrfs_release_path(path);
2526 btrfs_free_path(log_path);
2527 iput(dir);
2528 return ret;
2529 }
2530
2531 /*
2532 * the process_func used to replay items from the log tree. This
2533 * gets called in two different stages. The first stage just looks
2534 * for inodes and makes sure they are all copied into the subvolume.
2535 *
2536 * The second stage copies all the other item types from the log into
2537 * the subvolume. The two stage approach is slower, but gets rid of
2538 * lots of complexity around inodes referencing other inodes that exist
2539 * only in the log (references come from either directory items or inode
2540 * back refs).
2541 */
2542 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2543 struct walk_control *wc, u64 gen, int level)
2544 {
2545 int nritems;
2546 struct btrfs_path *path;
2547 struct btrfs_root *root = wc->replay_dest;
2548 struct btrfs_key key;
2549 int i;
2550 int ret;
2551
2552 ret = btrfs_read_buffer(eb, gen, level, NULL);
2553 if (ret)
2554 return ret;
2555
2556 level = btrfs_header_level(eb);
2557
2558 if (level != 0)
2559 return 0;
2560
2561 path = btrfs_alloc_path();
2562 if (!path)
2563 return -ENOMEM;
2564
2565 nritems = btrfs_header_nritems(eb);
2566 for (i = 0; i < nritems; i++) {
2567 btrfs_item_key_to_cpu(eb, &key, i);
2568
2569 /* inode keys are done during the first stage */
2570 if (key.type == BTRFS_INODE_ITEM_KEY &&
2571 wc->stage == LOG_WALK_REPLAY_INODES) {
2572 struct btrfs_inode_item *inode_item;
2573 u32 mode;
2574
2575 inode_item = btrfs_item_ptr(eb, i,
2576 struct btrfs_inode_item);
2577 /*
2578 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2579 * and never got linked before the fsync, skip it, as
2580 * replaying it is pointless since it would be deleted
2581 * later. We skip logging tmpfiles, but it's always
2582 * possible we are replaying a log created with a kernel
2583 * that used to log tmpfiles.
2584 */
2585 if (btrfs_inode_nlink(eb, inode_item) == 0) {
2586 wc->ignore_cur_inode = true;
2587 continue;
2588 } else {
2589 wc->ignore_cur_inode = false;
2590 }
2591 ret = replay_xattr_deletes(wc->trans, root, log,
2592 path, key.objectid);
2593 if (ret)
2594 break;
2595 mode = btrfs_inode_mode(eb, inode_item);
2596 if (S_ISDIR(mode)) {
2597 ret = replay_dir_deletes(wc->trans,
2598 root, log, path, key.objectid, 0);
2599 if (ret)
2600 break;
2601 }
2602 ret = overwrite_item(wc->trans, root, path,
2603 eb, i, &key);
2604 if (ret)
2605 break;
2606
2607 /*
2608 * Before replaying extents, truncate the inode to its
2609 * size. We need to do it now and not after log replay
2610 * because before an fsync we can have prealloc extents
2611 * added beyond the inode's i_size. If we did it after,
2612 * through orphan cleanup for example, we would drop
2613 * those prealloc extents just after replaying them.
2614 */
2615 if (S_ISREG(mode)) {
2616 struct inode *inode;
2617 u64 from;
2618
2619 inode = read_one_inode(root, key.objectid);
2620 if (!inode) {
2621 ret = -EIO;
2622 break;
2623 }
2624 from = ALIGN(i_size_read(inode),
2625 root->fs_info->sectorsize);
2626 ret = btrfs_drop_extents(wc->trans, root, inode,
2627 from, (u64)-1, 1);
2628 if (!ret) {
2629 /* Update the inode's nbytes. */
2630 ret = btrfs_update_inode(wc->trans,
2631 root, inode);
2632 }
2633 iput(inode);
2634 if (ret)
2635 break;
2636 }
2637
2638 ret = link_to_fixup_dir(wc->trans, root,
2639 path, key.objectid);
2640 if (ret)
2641 break;
2642 }
2643
2644 if (wc->ignore_cur_inode)
2645 continue;
2646
2647 if (key.type == BTRFS_DIR_INDEX_KEY &&
2648 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2649 ret = replay_one_dir_item(wc->trans, root, path,
2650 eb, i, &key);
2651 if (ret)
2652 break;
2653 }
2654
2655 if (wc->stage < LOG_WALK_REPLAY_ALL)
2656 continue;
2657
2658 /* these keys are simply copied */
2659 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2660 ret = overwrite_item(wc->trans, root, path,
2661 eb, i, &key);
2662 if (ret)
2663 break;
2664 } else if (key.type == BTRFS_INODE_REF_KEY ||
2665 key.type == BTRFS_INODE_EXTREF_KEY) {
2666 ret = add_inode_ref(wc->trans, root, log, path,
2667 eb, i, &key);
2668 if (ret && ret != -ENOENT)
2669 break;
2670 ret = 0;
2671 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2672 ret = replay_one_extent(wc->trans, root, path,
2673 eb, i, &key);
2674 if (ret)
2675 break;
2676 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2677 ret = replay_one_dir_item(wc->trans, root, path,
2678 eb, i, &key);
2679 if (ret)
2680 break;
2681 }
2682 }
2683 btrfs_free_path(path);
2684 return ret;
2685 }
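
/*
 * Editor's summary of the staged dispatch above; each leaf is visited once
 * per stage and every pass only acts on its own key types:
 *
 *	LOG_WALK_REPLAY_INODES:		BTRFS_INODE_ITEM_KEY
 *	LOG_WALK_REPLAY_DIR_INDEX:	BTRFS_DIR_INDEX_KEY
 *	LOG_WALK_REPLAY_ALL:		xattrs, inode refs/extrefs,
 *					file extents and dir items
 */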
2686
2687 /*
2688 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2689 */
2690 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2691 {
2692 struct btrfs_block_group *cache;
2693
2694 cache = btrfs_lookup_block_group(fs_info, start);
2695 if (!cache) {
2696 btrfs_err(fs_info, "unable to find block group for %llu", start);
2697 return;
2698 }
2699
2700 spin_lock(&cache->space_info->lock);
2701 spin_lock(&cache->lock);
2702 cache->reserved -= fs_info->nodesize;
2703 cache->space_info->bytes_reserved -= fs_info->nodesize;
2704 spin_unlock(&cache->lock);
2705 spin_unlock(&cache->space_info->lock);
2706
2707 btrfs_put_block_group(cache);
2708 }
2709
2710 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2711 struct btrfs_root *root,
2712 struct btrfs_path *path, int *level,
2713 struct walk_control *wc)
2714 {
2715 struct btrfs_fs_info *fs_info = root->fs_info;
2716 u64 bytenr;
2717 u64 ptr_gen;
2718 struct extent_buffer *next;
2719 struct extent_buffer *cur;
2720 u32 blocksize;
2721 int ret = 0;
2722
2723 while (*level > 0) {
2724 struct btrfs_key first_key;
2725
2726 cur = path->nodes[*level];
2727
2728 WARN_ON(btrfs_header_level(cur) != *level);
2729
2730 if (path->slots[*level] >=
2731 btrfs_header_nritems(cur))
2732 break;
2733
2734 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2735 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2736 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2737 blocksize = fs_info->nodesize;
2738
2739 next = btrfs_find_create_tree_block(fs_info, bytenr);
2740 if (IS_ERR(next))
2741 return PTR_ERR(next);
2742
2743 if (*level == 1) {
2744 ret = wc->process_func(root, next, wc, ptr_gen,
2745 *level - 1);
2746 if (ret) {
2747 free_extent_buffer(next);
2748 return ret;
2749 }
2750
2751 path->slots[*level]++;
2752 if (wc->free) {
2753 ret = btrfs_read_buffer(next, ptr_gen,
2754 *level - 1, &first_key);
2755 if (ret) {
2756 free_extent_buffer(next);
2757 return ret;
2758 }
2759
2760 if (trans) {
2761 btrfs_tree_lock(next);
2762 btrfs_set_lock_blocking_write(next);
2763 btrfs_clean_tree_block(next);
2764 btrfs_wait_tree_block_writeback(next);
2765 btrfs_tree_unlock(next);
2766 ret = btrfs_pin_reserved_extent(trans,
2767 bytenr, blocksize);
2768 if (ret) {
2769 free_extent_buffer(next);
2770 return ret;
2771 }
2772 } else {
2773 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2774 clear_extent_buffer_dirty(next);
2775 unaccount_log_buffer(fs_info, bytenr);
2776 }
2777 }
2778 free_extent_buffer(next);
2779 continue;
2780 }
2781 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2782 if (ret) {
2783 free_extent_buffer(next);
2784 return ret;
2785 }
2786
2787 if (path->nodes[*level-1])
2788 free_extent_buffer(path->nodes[*level-1]);
2789 path->nodes[*level-1] = next;
2790 *level = btrfs_header_level(next);
2791 path->slots[*level] = 0;
2792 cond_resched();
2793 }
2794 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2795
2796 cond_resched();
2797 return 0;
2798 }
2799
2800 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2801 struct btrfs_root *root,
2802 struct btrfs_path *path, int *level,
2803 struct walk_control *wc)
2804 {
2805 struct btrfs_fs_info *fs_info = root->fs_info;
2806 int i;
2807 int slot;
2808 int ret;
2809
2810 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2811 slot = path->slots[i];
2812 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2813 path->slots[i]++;
2814 *level = i;
2815 WARN_ON(*level == 0);
2816 return 0;
2817 } else {
2818 ret = wc->process_func(root, path->nodes[*level], wc,
2819 btrfs_header_generation(path->nodes[*level]),
2820 *level);
2821 if (ret)
2822 return ret;
2823
2824 if (wc->free) {
2825 struct extent_buffer *next;
2826
2827 next = path->nodes[*level];
2828
2829 if (trans) {
2830 btrfs_tree_lock(next);
2831 btrfs_set_lock_blocking_write(next);
2832 btrfs_clean_tree_block(next);
2833 btrfs_wait_tree_block_writeback(next);
2834 btrfs_tree_unlock(next);
2835 ret = btrfs_pin_reserved_extent(trans,
2836 path->nodes[*level]->start,
2837 path->nodes[*level]->len);
2838 if (ret)
2839 return ret;
2840 } else {
2841 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2842 clear_extent_buffer_dirty(next);
2843
2844 unaccount_log_buffer(fs_info,
2845 path->nodes[*level]->start);
2846 }
2847 }
2848 free_extent_buffer(path->nodes[*level]);
2849 path->nodes[*level] = NULL;
2850 *level = i + 1;
2851 }
2852 }
2853 return 1;
2854 }
2855
2856 /*
2857 * drop the reference count on the tree rooted at 'log'. This traverses
2858 * the tree freeing any blocks that have a ref count of zero after being
2859 * decremented.
2860 */
2861 static int walk_log_tree(struct btrfs_trans_handle *trans,
2862 struct btrfs_root *log, struct walk_control *wc)
2863 {
2864 struct btrfs_fs_info *fs_info = log->fs_info;
2865 int ret = 0;
2866 int wret;
2867 int level;
2868 struct btrfs_path *path;
2869 int orig_level;
2870
2871 path = btrfs_alloc_path();
2872 if (!path)
2873 return -ENOMEM;
2874
2875 level = btrfs_header_level(log->node);
2876 orig_level = level;
2877 path->nodes[level] = log->node;
2878 atomic_inc(&log->node->refs);
2879 path->slots[level] = 0;
2880
2881 while (1) {
2882 wret = walk_down_log_tree(trans, log, path, &level, wc);
2883 if (wret > 0)
2884 break;
2885 if (wret < 0) {
2886 ret = wret;
2887 goto out;
2888 }
2889
2890 wret = walk_up_log_tree(trans, log, path, &level, wc);
2891 if (wret > 0)
2892 break;
2893 if (wret < 0) {
2894 ret = wret;
2895 goto out;
2896 }
2897 }
2898
2899 /* was the root node processed? if not, catch it here */
2900 if (path->nodes[orig_level]) {
2901 ret = wc->process_func(log, path->nodes[orig_level], wc,
2902 btrfs_header_generation(path->nodes[orig_level]),
2903 orig_level);
2904 if (ret)
2905 goto out;
2906 if (wc->free) {
2907 struct extent_buffer *next;
2908
2909 next = path->nodes[orig_level];
2910
2911 if (trans) {
2912 btrfs_tree_lock(next);
2913 btrfs_set_lock_blocking_write(next);
2914 btrfs_clean_tree_block(next);
2915 btrfs_wait_tree_block_writeback(next);
2916 btrfs_tree_unlock(next);
2917 ret = btrfs_pin_reserved_extent(trans,
2918 next->start, next->len);
2919 if (ret)
2920 goto out;
2921 } else {
2922 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2923 clear_extent_buffer_dirty(next);
2924 unaccount_log_buffer(fs_info, next->start);
2925 }
2926 }
2927 }
2928
2929 out:
2930 btrfs_free_path(path);
2931 return ret;
2932 }
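
/*
 * Editor's sketch of a walk_control consumer (free_log_tree() below is a
 * real one): process_func decides what happens to each visited buffer and
 * wc.free selects whether the walked blocks are also released.
 *
 *	struct walk_control wc = {
 *		.process_func = replay_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.replay_dest = root,
 *		.trans = trans,
 *	};
 *	ret = walk_log_tree(trans, log, &wc);
 */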
2933
2934 /*
2935 * helper function to update the item for a given subvolume's log root
2936 * in the tree of log roots
2937 */
2938 static int update_log_root(struct btrfs_trans_handle *trans,
2939 struct btrfs_root *log,
2940 struct btrfs_root_item *root_item)
2941 {
2942 struct btrfs_fs_info *fs_info = log->fs_info;
2943 int ret;
2944
2945 if (log->log_transid == 1) {
2946 /* insert root item on the first sync */
2947 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2948 &log->root_key, root_item);
2949 } else {
2950 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2951 &log->root_key, root_item);
2952 }
2953 return ret;
2954 }
2955
2956 static void wait_log_commit(struct btrfs_root *root, int transid)
2957 {
2958 DEFINE_WAIT(wait);
2959 int index = transid % 2;
2960
2961 /*
2962 * we only allow two pending log transactions at a time,
2963 * so we know that if ours is more than 2 older than the
2964 * current transaction, we're done
2965 */
2966 for (;;) {
2967 prepare_to_wait(&root->log_commit_wait[index],
2968 &wait, TASK_UNINTERRUPTIBLE);
2969
2970 if (!(root->log_transid_committed < transid &&
2971 atomic_read(&root->log_commit[index])))
2972 break;
2973
2974 mutex_unlock(&root->log_mutex);
2975 schedule();
2976 mutex_lock(&root->log_mutex);
2977 }
2978 finish_wait(&root->log_commit_wait[index], &wait);
2979 }
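
/*
 * Editor's note: log transids alternate between two commit slots, so while
 * transid N is committing in slot N % 2, transid N + 1 can already be
 * staged in the other slot:
 *
 *	index = transid % 2;
 *	atomic_read(&root->log_commit[index]);
 *	wait on root->log_commit_wait[index];
 */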
2980
2981 static void wait_for_writer(struct btrfs_root *root)
2982 {
2983 DEFINE_WAIT(wait);
2984
2985 for (;;) {
2986 prepare_to_wait(&root->log_writer_wait, &wait,
2987 TASK_UNINTERRUPTIBLE);
2988 if (!atomic_read(&root->log_writers))
2989 break;
2990
2991 mutex_unlock(&root->log_mutex);
2992 schedule();
2993 mutex_lock(&root->log_mutex);
2994 }
2995 finish_wait(&root->log_writer_wait, &wait);
2996 }
2997
2998 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2999 struct btrfs_log_ctx *ctx)
3000 {
3001 if (!ctx)
3002 return;
3003
3004 mutex_lock(&root->log_mutex);
3005 list_del_init(&ctx->list);
3006 mutex_unlock(&root->log_mutex);
3007 }
3008
3009 /*
3010 * Invoked in log mutex context, or else be sure there is no other task
3011 * which can access the list.
3012 */
3013 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3014 int index, int error)
3015 {
3016 struct btrfs_log_ctx *ctx;
3017 struct btrfs_log_ctx *safe;
3018
3019 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3020 list_del_init(&ctx->list);
3021 ctx->log_ret = error;
3022 }
3023
3024 INIT_LIST_HEAD(&root->log_ctxs[index]);
3025 }
3026
3027 /*
3028 * btrfs_sync_log sends a given tree log down to the disk and
3029 * updates the super blocks to record it. When this call is done,
3030 * you know that any inodes previously logged are safely on disk only
3031 * if it returns 0.
3032 *
3033 * Any other return value means you need to call btrfs_commit_transaction.
3034 * Some of the edge cases for fsyncing directories that have had unlinks
3035 * or renames done in the past mean that sometimes the only safe
3036 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3037 * that has happened.
3038 */
3039 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3041 {
3042 int index1;
3043 int index2;
3044 int mark;
3045 int ret;
3046 struct btrfs_fs_info *fs_info = root->fs_info;
3047 struct btrfs_root *log = root->log_root;
3048 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3049 struct btrfs_root_item new_root_item;
3050 int log_transid = 0;
3051 struct btrfs_log_ctx root_log_ctx;
3052 struct blk_plug plug;
3053
3054 mutex_lock(&root->log_mutex);
3055 log_transid = ctx->log_transid;
3056 if (root->log_transid_committed >= log_transid) {
3057 mutex_unlock(&root->log_mutex);
3058 return ctx->log_ret;
3059 }
3060
3061 index1 = log_transid % 2;
3062 if (atomic_read(&root->log_commit[index1])) {
3063 wait_log_commit(root, log_transid);
3064 mutex_unlock(&root->log_mutex);
3065 return ctx->log_ret;
3066 }
3067 ASSERT(log_transid == root->log_transid);
3068 atomic_set(&root->log_commit[index1], 1);
3069
3070 /* wait for previous tree log sync to complete */
3071 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3072 wait_log_commit(root, log_transid - 1);
3073
3074 while (1) {
3075 int batch = atomic_read(&root->log_batch);
3076 /* when we're on an ssd, just kick the log commit out */
3077 if (!btrfs_test_opt(fs_info, SSD) &&
3078 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3079 mutex_unlock(&root->log_mutex);
3080 schedule_timeout_uninterruptible(1);
3081 mutex_lock(&root->log_mutex);
3082 }
3083 wait_for_writer(root);
3084 if (batch == atomic_read(&root->log_batch))
3085 break;
3086 }
3087
3088 /* bail out if we need to do a full commit */
3089 if (btrfs_need_log_full_commit(trans)) {
3090 ret = -EAGAIN;
3091 mutex_unlock(&root->log_mutex);
3092 goto out;
3093 }
3094
3095 if (log_transid % 2 == 0)
3096 mark = EXTENT_DIRTY;
3097 else
3098 mark = EXTENT_NEW;
3099
3100 /* we start IO on all the marked extents here, but we don't actually
3101 * wait for them until later.
3102 */
3103 blk_start_plug(&plug);
3104 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3105 if (ret) {
3106 blk_finish_plug(&plug);
3107 btrfs_abort_transaction(trans, ret);
3108 btrfs_set_log_full_commit(trans);
3109 mutex_unlock(&root->log_mutex);
3110 goto out;
3111 }
3112
3113 /*
3114 * We _must_ update under the root->log_mutex in order to make sure we
3115 * have a consistent view of the log root we are trying to commit at
3116 * this moment.
3117 *
3118 * We _must_ copy this into a local copy, because we are not holding the
3119 * log_root_tree->log_mutex yet. This is important because when we
3120 * commit the log_root_tree we must have a consistent view of the
3121 * log_root_tree when we update the super block to point at the
3122 * log_root_tree bytenr. If we update the log_root_tree here we'll race
3123 * with the commit and possibly point at the new block which we may not
3124 * have written out.
3125 */
3126 btrfs_set_root_node(&log->root_item, log->node);
3127 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3128
3129 root->log_transid++;
3130 log->log_transid = root->log_transid;
3131 root->log_start_pid = 0;
3132 /*
3133 * IO has been started, blocks of the log tree have WRITTEN flag set
3134 * in their headers. New modifications of the log will be written to
3135 * new positions. So it's safe to allow log writers to go in.
3136 */
3137 mutex_unlock(&root->log_mutex);
3138
3139 btrfs_init_log_ctx(&root_log_ctx, NULL);
3140
3141 mutex_lock(&log_root_tree->log_mutex);
3142
3143 index2 = log_root_tree->log_transid % 2;
3144 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3145 root_log_ctx.log_transid = log_root_tree->log_transid;
3146
3147 /*
3148 * Now we are safe to update the log_root_tree because we're under the
3149 * log_mutex, and we're a current writer so we're holding the commit
3150 * open until we drop the log_mutex.
3151 */
3152 ret = update_log_root(trans, log, &new_root_item);
3153 if (ret) {
3154 if (!list_empty(&root_log_ctx.list))
3155 list_del_init(&root_log_ctx.list);
3156
3157 blk_finish_plug(&plug);
3158 btrfs_set_log_full_commit(trans);
3159
3160 if (ret != -ENOSPC) {
3161 btrfs_abort_transaction(trans, ret);
3162 mutex_unlock(&log_root_tree->log_mutex);
3163 goto out;
3164 }
3165 btrfs_wait_tree_log_extents(log, mark);
3166 mutex_unlock(&log_root_tree->log_mutex);
3167 ret = -EAGAIN;
3168 goto out;
3169 }
3170
3171 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3172 blk_finish_plug(&plug);
3173 list_del_init(&root_log_ctx.list);
3174 mutex_unlock(&log_root_tree->log_mutex);
3175 ret = root_log_ctx.log_ret;
3176 goto out;
3177 }
3178
3179 index2 = root_log_ctx.log_transid % 2;
3180 if (atomic_read(&log_root_tree->log_commit[index2])) {
3181 blk_finish_plug(&plug);
3182 ret = btrfs_wait_tree_log_extents(log, mark);
3183 wait_log_commit(log_root_tree,
3184 root_log_ctx.log_transid);
3185 mutex_unlock(&log_root_tree->log_mutex);
3186 if (!ret)
3187 ret = root_log_ctx.log_ret;
3188 goto out;
3189 }
3190 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3191 atomic_set(&log_root_tree->log_commit[index2], 1);
3192
3193 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3194 wait_log_commit(log_root_tree,
3195 root_log_ctx.log_transid - 1);
3196 }
3197
3198 /*
3199 * now that we've moved on to the tree of log tree roots,
3200 * check the full commit flag again
3201 */
3202 if (btrfs_need_log_full_commit(trans)) {
3203 blk_finish_plug(&plug);
3204 btrfs_wait_tree_log_extents(log, mark);
3205 mutex_unlock(&log_root_tree->log_mutex);
3206 ret = -EAGAIN;
3207 goto out_wake_log_root;
3208 }
3209
3210 ret = btrfs_write_marked_extents(fs_info,
3211 &log_root_tree->dirty_log_pages,
3212 EXTENT_DIRTY | EXTENT_NEW);
3213 blk_finish_plug(&plug);
3214 if (ret) {
3215 btrfs_set_log_full_commit(trans);
3216 btrfs_abort_transaction(trans, ret);
3217 mutex_unlock(&log_root_tree->log_mutex);
3218 goto out_wake_log_root;
3219 }
3220 ret = btrfs_wait_tree_log_extents(log, mark);
3221 if (!ret)
3222 ret = btrfs_wait_tree_log_extents(log_root_tree,
3223 EXTENT_NEW | EXTENT_DIRTY);
3224 if (ret) {
3225 btrfs_set_log_full_commit(trans);
3226 mutex_unlock(&log_root_tree->log_mutex);
3227 goto out_wake_log_root;
3228 }
3229
3230 btrfs_set_super_log_root(fs_info->super_for_commit,
3231 log_root_tree->node->start);
3232 btrfs_set_super_log_root_level(fs_info->super_for_commit,
3233 btrfs_header_level(log_root_tree->node));
3234
3235 log_root_tree->log_transid++;
3236 mutex_unlock(&log_root_tree->log_mutex);
3237
3238 /*
3239 * Nobody else is going to jump in and write the ctree
3240 * super here because the log_commit atomic below is protecting
3241 * us. We must be called with a transaction handle pinning
3242 * the running transaction open, so a full commit can't hop
3243 * in and cause problems either.
3244 */
3245 ret = write_all_supers(fs_info, 1);
3246 if (ret) {
3247 btrfs_set_log_full_commit(trans);
3248 btrfs_abort_transaction(trans, ret);
3249 goto out_wake_log_root;
3250 }
3251
3252 mutex_lock(&root->log_mutex);
3253 if (root->last_log_commit < log_transid)
3254 root->last_log_commit = log_transid;
3255 mutex_unlock(&root->log_mutex);
3256
3257 out_wake_log_root:
3258 mutex_lock(&log_root_tree->log_mutex);
3259 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3260
3261 log_root_tree->log_transid_committed++;
3262 atomic_set(&log_root_tree->log_commit[index2], 0);
3263 mutex_unlock(&log_root_tree->log_mutex);
3264
3265 /*
3266 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3267 * all the updates above are seen by the woken threads. It might not be
3268 * necessary, but proving that seems to be hard.
3269 */
3270 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3271 out:
3272 mutex_lock(&root->log_mutex);
3273 btrfs_remove_all_log_ctxs(root, index1, ret);
3274 root->log_transid_committed++;
3275 atomic_set(&root->log_commit[index1], 0);
3276 mutex_unlock(&root->log_mutex);
3277
3278 /*
3279 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3280 * all the updates above are seen by the woken threads. It might not be
3281 * necessary, but proving that seems to be hard.
3282 */
3283 cond_wake_up(&root->log_commit_wait[index1]);
3284 return ret;
3285 }
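
/*
 * Editor's sketch of the caller contract (hypothetical fsync-style caller):
 * any nonzero return from btrfs_sync_log() means the log alone is not
 * enough and the caller must fall back to a full transaction commit.
 *
 *	ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret == 0)
 *		return btrfs_end_transaction(trans);
 *	return btrfs_commit_transaction(trans);
 */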
3286
3287 static void free_log_tree(struct btrfs_trans_handle *trans,
3288 struct btrfs_root *log)
3289 {
3290 int ret;
3291 struct walk_control wc = {
3292 .free = 1,
3293 .process_func = process_one_buffer
3294 };
3295
3296 ret = walk_log_tree(trans, log, &wc);
3297 if (ret) {
3298 if (trans)
3299 btrfs_abort_transaction(trans, ret);
3300 else
3301 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3302 }
3303
3304 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3305 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3306 extent_io_tree_release(&log->log_csum_range);
3307 btrfs_put_root(log);
3308 }
3309
3310 /*
3311 * free all the extents used by the tree log. This should be called
3312 * at commit time of the full transaction
3313 */
3314 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3315 {
3316 if (root->log_root) {
3317 free_log_tree(trans, root->log_root);
3318 root->log_root = NULL;
3319 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3320 }
3321 return 0;
3322 }
3323
3324 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3325 struct btrfs_fs_info *fs_info)
3326 {
3327 if (fs_info->log_root_tree) {
3328 free_log_tree(trans, fs_info->log_root_tree);
3329 fs_info->log_root_tree = NULL;
3330 }
3331 return 0;
3332 }
3333
3334 /*
3335 * Check if an inode was logged in the current transaction. We can't always rely
3336 * on an inode's logged_trans value, because it's an in-memory only field and
3337 * therefore not persisted. This means that its value is lost if the inode gets
3338 * evicted and loaded again from disk (in which case it has a value of 0, and
3339 * certainly it is smaller than any possible transaction ID). When that happens,
3340 * the full_sync flag is set in the inode's runtime flags, so in that case we
3341 * assume eviction happened and ignore the logged_trans value, assuming the
3342 * worst case, that the inode was logged before in the current transaction.
3343 */
3344 static bool inode_logged(struct btrfs_trans_handle *trans,
3345 struct btrfs_inode *inode)
3346 {
3347 if (inode->logged_trans == trans->transid)
3348 return true;
3349
3350 if (inode->last_trans == trans->transid &&
3351 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3352 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3353 return true;
3354
3355 return false;
3356 }
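
/*
 * Editor's worked example for the eviction case handled above: an inode is
 * fsynced (logged_trans == transid), then evicted and re-read from disk
 * (logged_trans becomes 0, full_sync gets set). If last_trans still matches
 * the running transaction, inode_logged() conservatively reports true so
 * stale log entries are still updated on a later unlink or rename.
 */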
3357
3358 /*
3359 * If both a file and directory are logged, and unlinks or renames are
3360 * mixed in, we have a few interesting corners:
3361 *
3362 * create file X in dir Y
3363 * link file X to X.link in dir Y
3364 * fsync file X
3365 * unlink file X but leave X.link
3366 * fsync dir Y
3367 *
3368 * After a crash we would expect only X.link to exist. But file X
3369 * didn't get fsync'd again so the log has back refs for X and X.link.
3370 *
3371 * We solve this by removing directory entries and inode backrefs from the
3372 * log when a file that was logged in the current transaction is
3373 * unlinked. Any later fsync will include the updated log entries, and
3374 * we'll be able to reconstruct the proper directory items from backrefs.
3375 *
3376 * This optimization allows us to avoid relogging the entire inode
3377 * or the entire directory.
3378 */
3379 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3380 struct btrfs_root *root,
3381 const char *name, int name_len,
3382 struct btrfs_inode *dir, u64 index)
3383 {
3384 struct btrfs_root *log;
3385 struct btrfs_dir_item *di;
3386 struct btrfs_path *path;
3387 int ret;
3388 int err = 0;
3389 int bytes_del = 0;
3390 u64 dir_ino = btrfs_ino(dir);
3391
3392 if (!inode_logged(trans, dir))
3393 return 0;
3394
3395 ret = join_running_log_trans(root);
3396 if (ret)
3397 return 0;
3398
3399 mutex_lock(&dir->log_mutex);
3400
3401 log = root->log_root;
3402 path = btrfs_alloc_path();
3403 if (!path) {
3404 err = -ENOMEM;
3405 goto out_unlock;
3406 }
3407
3408 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3409 name, name_len, -1);
3410 if (IS_ERR(di)) {
3411 err = PTR_ERR(di);
3412 goto fail;
3413 }
3414 if (di) {
3415 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3416 bytes_del += name_len;
3417 if (ret) {
3418 err = ret;
3419 goto fail;
3420 }
3421 }
3422 btrfs_release_path(path);
3423 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3424 index, name, name_len, -1);
3425 if (IS_ERR(di)) {
3426 err = PTR_ERR(di);
3427 goto fail;
3428 }
3429 if (di) {
3430 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3431 bytes_del += name_len;
3432 if (ret) {
3433 err = ret;
3434 goto fail;
3435 }
3436 }
3437
3438 /* update the directory size in the log to reflect the names
3439 * we have removed
3440 */
3441 if (bytes_del) {
3442 struct btrfs_key key;
3443
3444 key.objectid = dir_ino;
3445 key.offset = 0;
3446 key.type = BTRFS_INODE_ITEM_KEY;
3447 btrfs_release_path(path);
3448
3449 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3450 if (ret < 0) {
3451 err = ret;
3452 goto fail;
3453 }
3454 if (ret == 0) {
3455 struct btrfs_inode_item *item;
3456 u64 i_size;
3457
3458 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3459 struct btrfs_inode_item);
3460 i_size = btrfs_inode_size(path->nodes[0], item);
3461 if (i_size > bytes_del)
3462 i_size -= bytes_del;
3463 else
3464 i_size = 0;
3465 btrfs_set_inode_size(path->nodes[0], item, i_size);
3466 btrfs_mark_buffer_dirty(path->nodes[0]);
3467 } else
3468 ret = 0;
3469 btrfs_release_path(path);
3470 }
3471 fail:
3472 btrfs_free_path(path);
3473 out_unlock:
3474 mutex_unlock(&dir->log_mutex);
3475 if (err == -ENOSPC) {
3476 btrfs_set_log_full_commit(trans);
3477 err = 0;
3478 } else if (err < 0 && err != -ENOENT) {
3479 /* ENOENT can be returned if the entry hasn't been fsynced yet */
3480 btrfs_abort_transaction(trans, err);
3481 }
3482
3483 btrfs_end_log_trans(root);
3484
3485 return err;
3486 }
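
/*
 * Editor's note: bytes_del above accumulates name_len once for the
 * DIR_ITEM copy and once for the DIR_INDEX copy of the name, matching the
 * "name_len * 2" a directory's i_size grows by when a name is added (see
 * replay_one_name() above).
 */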
3487
3488 /* see comments for btrfs_del_dir_entries_in_log */
3489 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3490 struct btrfs_root *root,
3491 const char *name, int name_len,
3492 struct btrfs_inode *inode, u64 dirid)
3493 {
3494 struct btrfs_root *log;
3495 u64 index;
3496 int ret;
3497
3498 if (!inode_logged(trans, inode))
3499 return 0;
3500
3501 ret = join_running_log_trans(root);
3502 if (ret)
3503 return 0;
3504 log = root->log_root;
3505 mutex_lock(&inode->log_mutex);
3506
3507 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3508 dirid, &index);
3509 mutex_unlock(&inode->log_mutex);
3510 if (ret == -ENOSPC) {
3511 btrfs_set_log_full_commit(trans);
3512 ret = 0;
3513 } else if (ret < 0 && ret != -ENOENT)
3514 btrfs_abort_transaction(trans, ret);
3515 btrfs_end_log_trans(root);
3516
3517 return ret;
3518 }
3519
3520 /*
3521 * creates a range item in the log for 'dirid'. first_offset and
3522 * last_offset tell us which parts of the key space the log should
3523 * be considered authoritative for.
3524 */
3525 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3526 struct btrfs_root *log,
3527 struct btrfs_path *path,
3528 int key_type, u64 dirid,
3529 u64 first_offset, u64 last_offset)
3530 {
3531 int ret;
3532 struct btrfs_key key;
3533 struct btrfs_dir_log_item *item;
3534
3535 key.objectid = dirid;
3536 key.offset = first_offset;
3537 if (key_type == BTRFS_DIR_ITEM_KEY)
3538 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3539 else
3540 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3541 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3542 if (ret)
3543 return ret;
3544
3545 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3546 struct btrfs_dir_log_item);
3547 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3548 btrfs_mark_buffer_dirty(path->nodes[0]);
3549 btrfs_release_path(path);
3550 return 0;
3551 }
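
/*
 * Editor's illustration (hypothetical values): after logging dir index
 * items 5..9 of directory inode 257, the call
 *
 *	insert_dir_log_key(trans, log, path, BTRFS_DIR_INDEX_KEY,
 *			   257, 5, 9);
 *
 * leaves a BTRFS_DIR_LOG_INDEX_KEY item telling replay that the log is
 * authoritative for offsets [5, 9] of that directory.
 */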
3552
3553 /*
3554 * log all the items included in the current transaction for a given
3555 * directory. This also creates the range items in the log tree required
3556 * to replay anything deleted before the fsync
3557 */
3558 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3559 struct btrfs_root *root, struct btrfs_inode *inode,
3560 struct btrfs_path *path,
3561 struct btrfs_path *dst_path, int key_type,
3562 struct btrfs_log_ctx *ctx,
3563 u64 min_offset, u64 *last_offset_ret)
3564 {
3565 struct btrfs_key min_key;
3566 struct btrfs_root *log = root->log_root;
3567 struct extent_buffer *src;
3568 int err = 0;
3569 int ret;
3570 int i;
3571 int nritems;
3572 u64 first_offset = min_offset;
3573 u64 last_offset = (u64)-1;
3574 u64 ino = btrfs_ino(inode);
3575
3576 log = root->log_root;
3577
3578 min_key.objectid = ino;
3579 min_key.type = key_type;
3580 min_key.offset = min_offset;
3581
3582 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3583
3584 /*
3585 * we didn't find anything from this transaction, see if there
3586 * is anything at all
3587 */
3588 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3589 min_key.objectid = ino;
3590 min_key.type = key_type;
3591 min_key.offset = (u64)-1;
3592 btrfs_release_path(path);
3593 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3594 if (ret < 0) {
3595 btrfs_release_path(path);
3596 return ret;
3597 }
3598 ret = btrfs_previous_item(root, path, ino, key_type);
3599
3600 /* if ret == 0 there are items for this type,
3601 * create a range to tell us the last key of this type.
3602 * otherwise, there are no items in this directory after
3603 * *min_offset, and we create a range to indicate that.
3604 */
3605 if (ret == 0) {
3606 struct btrfs_key tmp;
3607 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3608 path->slots[0]);
3609 if (key_type == tmp.type)
3610 first_offset = max(min_offset, tmp.offset) + 1;
3611 }
3612 goto done;
3613 }
3614
3615 /* go backward to find any previous key */
3616 ret = btrfs_previous_item(root, path, ino, key_type);
3617 if (ret == 0) {
3618 struct btrfs_key tmp;
3619 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3620 if (key_type == tmp.type) {
3621 first_offset = tmp.offset;
3622 ret = overwrite_item(trans, log, dst_path,
3623 path->nodes[0], path->slots[0],
3624 &tmp);
3625 if (ret) {
3626 err = ret;
3627 goto done;
3628 }
3629 }
3630 }
3631 btrfs_release_path(path);
3632
3633 /*
3634 * Find the first key from this transaction again. See the note for
3635 * log_new_dir_dentries, if we're logging a directory recursively we
3636 * won't be holding its i_mutex, which means we can modify the directory
3637 * while we're logging it. If we remove an entry between our first
3638 * search and this search we'll not find the key again and can just
3639 * bail.
3640 */
3641 search:
3642 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3643 if (ret != 0)
3644 goto done;
3645
3646 /*
3647 * we have a block from this transaction, log every item in it
3648 * from our directory
3649 */
3650 while (1) {
3651 struct btrfs_key tmp;
3652 src = path->nodes[0];
3653 nritems = btrfs_header_nritems(src);
3654 for (i = path->slots[0]; i < nritems; i++) {
3655 struct btrfs_dir_item *di;
3656
3657 btrfs_item_key_to_cpu(src, &min_key, i);
3658
3659 if (min_key.objectid != ino || min_key.type != key_type)
3660 goto done;
3661
3662 if (need_resched()) {
3663 btrfs_release_path(path);
3664 cond_resched();
3665 goto search;
3666 }
3667
3668 ret = overwrite_item(trans, log, dst_path, src, i,
3669 &min_key);
3670 if (ret) {
3671 err = ret;
3672 goto done;
3673 }
3674
3675 /*
3676 * We must make sure that when we log a directory entry,
3677 * the corresponding inode, after log replay, has a
3678 * matching link count. For example:
3679 *
3680 * touch foo
3681 * mkdir mydir
3682 * sync
3683 * ln foo mydir/bar
3684 * xfs_io -c "fsync" mydir
3685 * <crash>
3686 * <mount fs and log replay>
3687 *
3688 * Would result in a fsync log that when replayed, our
3689 * file inode would have a link count of 1, but we get
3690 * two directory entries pointing to the same inode.
3691 * After removing one of the names, it would not be
3692 * possible to remove the other name, which resulted
3693 * always in stale file handle errors, and would not
3694 * be possible to rmdir the parent directory, since
3695 * its i_size could never decrement to the value
3696 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3697 */
3698 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3699 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3700 if (ctx &&
3701 (btrfs_dir_transid(src, di) == trans->transid ||
3702 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3703 tmp.type != BTRFS_ROOT_ITEM_KEY)
3704 ctx->log_new_dentries = true;
3705 }
3706 path->slots[0] = nritems;
3707
3708 /*
3709 * look ahead to the next item and see if it is also
3710 * from this directory and from this transaction
3711 */
3712 ret = btrfs_next_leaf(root, path);
3713 if (ret) {
3714 if (ret == 1)
3715 last_offset = (u64)-1;
3716 else
3717 err = ret;
3718 goto done;
3719 }
3720 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3721 if (tmp.objectid != ino || tmp.type != key_type) {
3722 last_offset = (u64)-1;
3723 goto done;
3724 }
3725 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3726 ret = overwrite_item(trans, log, dst_path,
3727 path->nodes[0], path->slots[0],
3728 &tmp);
3729 if (ret)
3730 err = ret;
3731 else
3732 last_offset = tmp.offset;
3733 goto done;
3734 }
3735 }
3736 done:
3737 btrfs_release_path(path);
3738 btrfs_release_path(dst_path);
3739
3740 if (err == 0) {
3741 *last_offset_ret = last_offset;
3742 /*
3743 * insert the log range keys to indicate where the log
3744 * is valid
3745 */
3746 ret = insert_dir_log_key(trans, log, path, key_type,
3747 ino, first_offset, last_offset);
3748 if (ret)
3749 err = ret;
3750 }
3751 return err;
3752 }
3753
3754 /*
3755  * Logging directories is very similar to logging inodes. We find all the items
3756 * from the current transaction and write them to the log.
3757 *
3758 * The recovery code scans the directory in the subvolume, and if it finds a
3759 * key in the range logged that is not present in the log tree, then it means
3760 * that dir entry was unlinked during the transaction.
3761 *
3762 * In order for that scan to work, we must include one key smaller than
3763 * the smallest logged by this transaction and one key larger than the largest
3764 * key logged by this transaction.
3765 */
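/*
 * Illustration (the offsets are made up): if this transaction logged dir
 * index items at offsets 10 and 20, the dir log item may cover a range such
 * as [5, 25]. During replay, an index found in the subvolume at offset 15
 * with no matching entry in the log is known to have been unlinked, while
 * indexes outside the logged range are left untouched.
 */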
3766 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3767 struct btrfs_root *root, struct btrfs_inode *inode,
3768 struct btrfs_path *path,
3769 struct btrfs_path *dst_path,
3770 struct btrfs_log_ctx *ctx)
3771 {
3772 u64 min_key;
3773 u64 max_key;
3774 int ret;
3775 int key_type = BTRFS_DIR_ITEM_KEY;
3776
3777 again:
3778 min_key = 0;
3779 max_key = 0;
3780 while (1) {
3781 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3782 ctx, min_key, &max_key);
3783 if (ret)
3784 return ret;
3785 if (max_key == (u64)-1)
3786 break;
3787 min_key = max_key + 1;
3788 }
3789
3790 if (key_type == BTRFS_DIR_ITEM_KEY) {
3791 key_type = BTRFS_DIR_INDEX_KEY;
3792 goto again;
3793 }
3794 return 0;
3795 }
3796
3797 /*
3798 * a helper function to drop items from the log before we relog an
3799 * inode. max_key_type indicates the highest item type to remove.
3800 * This cannot be run for file data extents because it does not
3801 * free the extents they point to.
3802 */
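/*
 * For example (illustrative): relogging an inode with max_key_type set to
 * BTRFS_XATTR_ITEM_KEY drops the previously logged inode item, inode refs
 * and xattrs for this objectid, while any logged BTRFS_EXTENT_DATA_KEY
 * items, which sort after xattrs, are preserved.
 */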
3803 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3804 struct btrfs_root *log,
3805 struct btrfs_path *path,
3806 u64 objectid, int max_key_type)
3807 {
3808 int ret;
3809 struct btrfs_key key;
3810 struct btrfs_key found_key;
3811 int start_slot;
3812
3813 key.objectid = objectid;
3814 key.type = max_key_type;
3815 key.offset = (u64)-1;
3816
3817 while (1) {
3818 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3819 BUG_ON(ret == 0); /* Logic error */
3820 if (ret < 0)
3821 break;
3822
3823 if (path->slots[0] == 0)
3824 break;
3825
3826 path->slots[0]--;
3827 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3828 path->slots[0]);
3829
3830 if (found_key.objectid != objectid)
3831 break;
3832
3833 found_key.offset = 0;
3834 found_key.type = 0;
3835 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3836 if (ret < 0)
3837 break;
3838
3839 ret = btrfs_del_items(trans, log, path, start_slot,
3840 path->slots[0] - start_slot + 1);
3841 /*
3842 * If start slot isn't 0 then we don't need to re-search, we've
3843 * found the last guy with the objectid in this tree.
3844 */
3845 if (ret || start_slot != 0)
3846 break;
3847 btrfs_release_path(path);
3848 }
3849 btrfs_release_path(path);
3850 if (ret > 0)
3851 ret = 0;
3852 return ret;
3853 }
3854
3855 static void fill_inode_item(struct btrfs_trans_handle *trans,
3856 struct extent_buffer *leaf,
3857 struct btrfs_inode_item *item,
3858 struct inode *inode, int log_inode_only,
3859 u64 logged_isize)
3860 {
3861 struct btrfs_map_token token;
3862
3863 btrfs_init_map_token(&token, leaf);
3864
3865 if (log_inode_only) {
3866 		/* Set the generation to zero so the recovery code
3867 		 * can tell the difference between logging done
3868 		 * just to say 'this inode exists' and logging done
3869 		 * to say 'update this inode with these values'
3870 		 */
3871 btrfs_set_token_inode_generation(&token, item, 0);
3872 btrfs_set_token_inode_size(&token, item, logged_isize);
3873 } else {
3874 btrfs_set_token_inode_generation(&token, item,
3875 BTRFS_I(inode)->generation);
3876 btrfs_set_token_inode_size(&token, item, inode->i_size);
3877 }
3878
3879 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3880 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3881 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3882 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3883
3884 btrfs_set_token_timespec_sec(&token, &item->atime,
3885 inode->i_atime.tv_sec);
3886 btrfs_set_token_timespec_nsec(&token, &item->atime,
3887 inode->i_atime.tv_nsec);
3888
3889 btrfs_set_token_timespec_sec(&token, &item->mtime,
3890 inode->i_mtime.tv_sec);
3891 btrfs_set_token_timespec_nsec(&token, &item->mtime,
3892 inode->i_mtime.tv_nsec);
3893
3894 btrfs_set_token_timespec_sec(&token, &item->ctime,
3895 inode->i_ctime.tv_sec);
3896 btrfs_set_token_timespec_nsec(&token, &item->ctime,
3897 inode->i_ctime.tv_nsec);
3898
3899 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3900
3901 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3902 btrfs_set_token_inode_transid(&token, item, trans->transid);
3903 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3904 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3905 btrfs_set_token_inode_block_group(&token, item, 0);
3906 }
3907
3908 static int log_inode_item(struct btrfs_trans_handle *trans,
3909 struct btrfs_root *log, struct btrfs_path *path,
3910 struct btrfs_inode *inode)
3911 {
3912 struct btrfs_inode_item *inode_item;
3913 int ret;
3914
3915 ret = btrfs_insert_empty_item(trans, log, path,
3916 &inode->location, sizeof(*inode_item));
3917 if (ret && ret != -EEXIST)
3918 return ret;
3919 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3920 struct btrfs_inode_item);
3921 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3922 0, 0);
3923 btrfs_release_path(path);
3924 return 0;
3925 }
3926
3927 static int log_csums(struct btrfs_trans_handle *trans,
3928 struct btrfs_inode *inode,
3929 struct btrfs_root *log_root,
3930 struct btrfs_ordered_sum *sums)
3931 {
3932 const u64 lock_end = sums->bytenr + sums->len - 1;
3933 struct extent_state *cached_state = NULL;
3934 int ret;
3935
3936 /*
3937 	 * If this inode was not used for reflink operations with new extents
3938 	 * in the current transaction, then take the fast path, no need to
3939 * worry about logging checksum items with overlapping ranges.
3940 */
3941 if (inode->last_reflink_trans < trans->transid)
3942 return btrfs_csum_file_blocks(trans, log_root, sums);
3943
3944 /*
3945 * Serialize logging for checksums. This is to avoid racing with the
3946 * same checksum being logged by another task that is logging another
3947 * file which happens to refer to the same extent as well. Such races
3948 * can leave checksum items in the log with overlapping ranges.
3949 */
3950 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
3951 lock_end, &cached_state);
3952 if (ret)
3953 return ret;
3954 /*
3955 * Due to extent cloning, we might have logged a csum item that covers a
3956 * subrange of a cloned extent, and later we can end up logging a csum
3957 * item for a larger subrange of the same extent or the entire range.
3958 * This would leave csum items in the log tree that cover the same range
3959 * and break the searches for checksums in the log tree, resulting in
3960 * some checksums missing in the fs/subvolume tree. So just delete (or
3961 * trim and adjust) any existing csum items in the log for this range.
3962 */
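	/*
	 * Example (the byte ranges are made up): a previous fsync may have
	 * logged csums for [1M, 1M + 64K) of a cloned extent and we are now
	 * logging csums for [1M, 1M + 128K) of the same extent; without the
	 * deletion below both csum items would end up in the log with
	 * overlapping ranges.
	 */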
3963 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
3964 if (!ret)
3965 ret = btrfs_csum_file_blocks(trans, log_root, sums);
3966
3967 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
3968 &cached_state);
3969
3970 return ret;
3971 }
3972
3973 static noinline int copy_items(struct btrfs_trans_handle *trans,
3974 struct btrfs_inode *inode,
3975 struct btrfs_path *dst_path,
3976 struct btrfs_path *src_path,
3977 int start_slot, int nr, int inode_only,
3978 u64 logged_isize)
3979 {
3980 struct btrfs_fs_info *fs_info = trans->fs_info;
3981 unsigned long src_offset;
3982 unsigned long dst_offset;
3983 struct btrfs_root *log = inode->root->log_root;
3984 struct btrfs_file_extent_item *extent;
3985 struct btrfs_inode_item *inode_item;
3986 struct extent_buffer *src = src_path->nodes[0];
3987 int ret;
3988 struct btrfs_key *ins_keys;
3989 u32 *ins_sizes;
3990 char *ins_data;
3991 int i;
3992 struct list_head ordered_sums;
3993 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3994
3995 INIT_LIST_HEAD(&ordered_sums);
3996
3997 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3998 nr * sizeof(u32), GFP_NOFS);
3999 if (!ins_data)
4000 return -ENOMEM;
4001
4002 ins_sizes = (u32 *)ins_data;
4003 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4004
4005 for (i = 0; i < nr; i++) {
4006 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4007 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4008 }
4009 ret = btrfs_insert_empty_items(trans, log, dst_path,
4010 ins_keys, ins_sizes, nr);
4011 if (ret) {
4012 kfree(ins_data);
4013 return ret;
4014 }
4015
4016 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4017 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4018 dst_path->slots[0]);
4019
4020 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4021
4022 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4023 inode_item = btrfs_item_ptr(dst_path->nodes[0],
4024 dst_path->slots[0],
4025 struct btrfs_inode_item);
4026 fill_inode_item(trans, dst_path->nodes[0], inode_item,
4027 &inode->vfs_inode,
4028 inode_only == LOG_INODE_EXISTS,
4029 logged_isize);
4030 } else {
4031 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4032 src_offset, ins_sizes[i]);
4033 }
4034
4035 /* take a reference on file data extents so that truncates
4036 * or deletes of this inode don't have to relog the inode
4037 * again
4038 */
4039 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4040 !skip_csum) {
4041 int found_type;
4042 extent = btrfs_item_ptr(src, start_slot + i,
4043 struct btrfs_file_extent_item);
4044
4045 if (btrfs_file_extent_generation(src, extent) < trans->transid)
4046 continue;
4047
4048 found_type = btrfs_file_extent_type(src, extent);
4049 if (found_type == BTRFS_FILE_EXTENT_REG) {
4050 u64 ds, dl, cs, cl;
4051 ds = btrfs_file_extent_disk_bytenr(src,
4052 extent);
4053 /* ds == 0 is a hole */
4054 if (ds == 0)
4055 continue;
4056
4057 dl = btrfs_file_extent_disk_num_bytes(src,
4058 extent);
4059 cs = btrfs_file_extent_offset(src, extent);
4060 cl = btrfs_file_extent_num_bytes(src,
4061 extent);
4062 if (btrfs_file_extent_compression(src,
4063 extent)) {
4064 cs = 0;
4065 cl = dl;
4066 }
4067
4068 ret = btrfs_lookup_csums_range(
4069 fs_info->csum_root,
4070 ds + cs, ds + cs + cl - 1,
4071 &ordered_sums, 0);
4072 if (ret)
4073 break;
4074 }
4075 }
4076 }
4077
4078 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4079 btrfs_release_path(dst_path);
4080 kfree(ins_data);
4081
4082 /*
4083 * we have to do this after the loop above to avoid changing the
4084 * log tree while trying to change the log tree.
4085 */
4086 while (!list_empty(&ordered_sums)) {
4087 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4088 struct btrfs_ordered_sum,
4089 list);
4090 if (!ret)
4091 ret = log_csums(trans, inode, log, sums);
4092 list_del(&sums->list);
4093 kfree(sums);
4094 }
4095
4096 return ret;
4097 }
4098
4099 static int extent_cmp(void *priv, const struct list_head *a,
4100 const struct list_head *b)
4101 {
4102 struct extent_map *em1, *em2;
4103
4104 em1 = list_entry(a, struct extent_map, list);
4105 em2 = list_entry(b, struct extent_map, list);
4106
4107 if (em1->start < em2->start)
4108 return -1;
4109 else if (em1->start > em2->start)
4110 return 1;
4111 return 0;
4112 }
4113
4114 static int log_extent_csums(struct btrfs_trans_handle *trans,
4115 struct btrfs_inode *inode,
4116 struct btrfs_root *log_root,
4117 const struct extent_map *em,
4118 struct btrfs_log_ctx *ctx)
4119 {
4120 struct btrfs_ordered_extent *ordered;
4121 u64 csum_offset;
4122 u64 csum_len;
4123 u64 mod_start = em->mod_start;
4124 u64 mod_len = em->mod_len;
4125 LIST_HEAD(ordered_sums);
4126 int ret = 0;
4127
4128 if (inode->flags & BTRFS_INODE_NODATASUM ||
4129 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4130 em->block_start == EXTENT_MAP_HOLE)
4131 return 0;
4132
4133 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4134 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4135 const u64 mod_end = mod_start + mod_len;
4136 struct btrfs_ordered_sum *sums;
4137
4138 if (mod_len == 0)
4139 break;
4140
4141 if (ordered_end <= mod_start)
4142 continue;
4143 if (mod_end <= ordered->file_offset)
4144 break;
4145
4146 /*
4147 * We are going to copy all the csums on this ordered extent, so
4148 * go ahead and adjust mod_start and mod_len in case this ordered
4149 * extent has already been logged.
4150 */
4151 if (ordered->file_offset > mod_start) {
4152 if (ordered_end >= mod_end)
4153 mod_len = ordered->file_offset - mod_start;
4154 /*
4155 * If we have this case
4156 *
4157 * |--------- logged extent ---------|
4158 * |----- ordered extent ----|
4159 *
4160 * Just don't mess with mod_start and mod_len, we'll
4161 * just end up logging more csums than we need and it
4162 * will be ok.
4163 */
4164 } else {
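			/*
			 * Here the ordered extent starts at or before the
			 * logged range (illustrative):
			 *
			 *        |--------- logged extent ---------|
			 *   |----- ordered extent ----|
			 *
			 * so either trim the front of the range or, if the
			 * ordered extent covers it entirely, consume it all.
			 */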
4165 if (ordered_end < mod_end) {
4166 mod_len = mod_end - ordered_end;
4167 mod_start = ordered_end;
4168 } else {
4169 mod_len = 0;
4170 }
4171 }
4172
4173 /*
4174 * To keep us from looping for the above case of an ordered
4175 * extent that falls inside of the logged extent.
4176 */
4177 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4178 continue;
4179
4180 list_for_each_entry(sums, &ordered->list, list) {
4181 ret = log_csums(trans, inode, log_root, sums);
4182 if (ret)
4183 return ret;
4184 }
4185 }
4186
4187 /* We're done, found all csums in the ordered extents. */
4188 if (mod_len == 0)
4189 return 0;
4190
4191 	/* If the extent is compressed we have to save the entire range of csums. */
4192 if (em->compress_type) {
4193 csum_offset = 0;
4194 csum_len = max(em->block_len, em->orig_block_len);
4195 } else {
4196 csum_offset = mod_start - em->start;
4197 csum_len = mod_len;
4198 }
4199
4200 /* block start is already adjusted for the file extent offset. */
4201 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4202 em->block_start + csum_offset,
4203 em->block_start + csum_offset +
4204 csum_len - 1, &ordered_sums, 0);
4205 if (ret)
4206 return ret;
4207
4208 while (!list_empty(&ordered_sums)) {
4209 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4210 struct btrfs_ordered_sum,
4211 list);
4212 if (!ret)
4213 ret = log_csums(trans, inode, log_root, sums);
4214 list_del(&sums->list);
4215 kfree(sums);
4216 }
4217
4218 return ret;
4219 }
4220
4221 static int log_one_extent(struct btrfs_trans_handle *trans,
4222 struct btrfs_inode *inode, struct btrfs_root *root,
4223 const struct extent_map *em,
4224 struct btrfs_path *path,
4225 struct btrfs_log_ctx *ctx)
4226 {
4227 struct btrfs_root *log = root->log_root;
4228 struct btrfs_file_extent_item *fi;
4229 struct extent_buffer *leaf;
4230 struct btrfs_map_token token;
4231 struct btrfs_key key;
4232 u64 extent_offset = em->start - em->orig_start;
4233 u64 block_len;
4234 int ret;
4235 int extent_inserted = 0;
4236
4237 ret = log_extent_csums(trans, inode, log, em, ctx);
4238 if (ret)
4239 return ret;
4240
4241 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
4242 em->start + em->len, NULL, 0, 1,
4243 sizeof(*fi), &extent_inserted);
4244 if (ret)
4245 return ret;
4246
4247 if (!extent_inserted) {
4248 key.objectid = btrfs_ino(inode);
4249 key.type = BTRFS_EXTENT_DATA_KEY;
4250 key.offset = em->start;
4251
4252 ret = btrfs_insert_empty_item(trans, log, path, &key,
4253 sizeof(*fi));
4254 if (ret)
4255 return ret;
4256 }
4257 leaf = path->nodes[0];
4258 btrfs_init_map_token(&token, leaf);
4259 fi = btrfs_item_ptr(leaf, path->slots[0],
4260 struct btrfs_file_extent_item);
4261
4262 btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4263 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4264 btrfs_set_token_file_extent_type(&token, fi,
4265 BTRFS_FILE_EXTENT_PREALLOC);
4266 else
4267 btrfs_set_token_file_extent_type(&token, fi,
4268 BTRFS_FILE_EXTENT_REG);
4269
4270 block_len = max(em->block_len, em->orig_block_len);
4271 if (em->compress_type != BTRFS_COMPRESS_NONE) {
4272 btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4273 em->block_start);
4274 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4275 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4276 btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4277 em->block_start -
4278 extent_offset);
4279 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4280 } else {
4281 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4282 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4283 }
4284
4285 btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4286 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4287 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4288 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4289 btrfs_set_token_file_extent_encryption(&token, fi, 0);
4290 btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4291 btrfs_mark_buffer_dirty(leaf);
4292
4293 btrfs_release_path(path);
4294
4295 return ret;
4296 }
4297
4298 /*
4299 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4300 * lose them after doing a fast fsync and replaying the log. We scan the
4301 * subvolume's root instead of iterating the inode's extent map tree because
4302 * otherwise we can log incorrect extent items based on extent map conversion.
4303 * That can happen due to the fact that extent maps are merged when they
4304 * are not in the extent map tree's list of modified extents.
4305 */
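/*
 * For example (illustrative commands), prealloc extents beyond i_size are
 * typically created by fallocate with FALLOC_FL_KEEP_SIZE:
 *
 *   xfs_io -f -c "falloc -k 0 1M" -c "pwrite 0 4K" -c "fsync" foo
 *
 * which leaves 1M of preallocated extents behind a 4K i_size that must
 * still exist after a log replay.
 */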
4306 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4307 struct btrfs_inode *inode,
4308 struct btrfs_path *path)
4309 {
4310 struct btrfs_root *root = inode->root;
4311 struct btrfs_key key;
4312 const u64 i_size = i_size_read(&inode->vfs_inode);
4313 const u64 ino = btrfs_ino(inode);
4314 struct btrfs_path *dst_path = NULL;
4315 bool dropped_extents = false;
4316 u64 truncate_offset = i_size;
4317 struct extent_buffer *leaf;
4318 int slot;
4319 int ins_nr = 0;
4320 int start_slot;
4321 int ret;
4322
4323 if (!(inode->flags & BTRFS_INODE_PREALLOC))
4324 return 0;
4325
4326 key.objectid = ino;
4327 key.type = BTRFS_EXTENT_DATA_KEY;
4328 key.offset = i_size;
4329 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4330 if (ret < 0)
4331 goto out;
4332
4333 /*
4334 * We must check if there is a prealloc extent that starts before the
4335 * i_size and crosses the i_size boundary. This is to ensure later we
4336 * truncate down to the end of that extent and not to the i_size, as
4337 * otherwise we end up losing part of the prealloc extent after a log
4338 * replay and with an implicit hole if there is another prealloc extent
4339 * that starts at an offset beyond i_size.
4340 */
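	/*
	 * Illustration (made up offsets): if a prealloc extent covers
	 * [i_size - 4K, i_size + 60K), truncating the log at i_size would
	 * lose the last 60K of that extent, so we truncate at its end
	 * offset (truncate_offset) instead.
	 */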
4341 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4342 if (ret < 0)
4343 goto out;
4344
4345 if (ret == 0) {
4346 struct btrfs_file_extent_item *ei;
4347
4348 leaf = path->nodes[0];
4349 slot = path->slots[0];
4350 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4351
4352 if (btrfs_file_extent_type(leaf, ei) ==
4353 BTRFS_FILE_EXTENT_PREALLOC) {
4354 u64 extent_end;
4355
4356 btrfs_item_key_to_cpu(leaf, &key, slot);
4357 extent_end = key.offset +
4358 btrfs_file_extent_num_bytes(leaf, ei);
4359
4360 if (extent_end > i_size)
4361 truncate_offset = extent_end;
4362 }
4363 } else {
4364 ret = 0;
4365 }
4366
4367 while (true) {
4368 leaf = path->nodes[0];
4369 slot = path->slots[0];
4370
4371 if (slot >= btrfs_header_nritems(leaf)) {
4372 if (ins_nr > 0) {
4373 ret = copy_items(trans, inode, dst_path, path,
4374 start_slot, ins_nr, 1, 0);
4375 if (ret < 0)
4376 goto out;
4377 ins_nr = 0;
4378 }
4379 ret = btrfs_next_leaf(root, path);
4380 if (ret < 0)
4381 goto out;
4382 if (ret > 0) {
4383 ret = 0;
4384 break;
4385 }
4386 continue;
4387 }
4388
4389 btrfs_item_key_to_cpu(leaf, &key, slot);
4390 if (key.objectid > ino)
4391 break;
4392 if (WARN_ON_ONCE(key.objectid < ino) ||
4393 key.type < BTRFS_EXTENT_DATA_KEY ||
4394 key.offset < i_size) {
4395 path->slots[0]++;
4396 continue;
4397 }
4398 if (!dropped_extents) {
4399 /*
4400 * Avoid logging extent items logged in past fsync calls
4401 * and leading to duplicate keys in the log tree.
4402 */
4403 do {
4404 ret = btrfs_truncate_inode_items(trans,
4405 root->log_root,
4406 &inode->vfs_inode,
4407 truncate_offset,
4408 BTRFS_EXTENT_DATA_KEY);
4409 } while (ret == -EAGAIN);
4410 if (ret)
4411 goto out;
4412 dropped_extents = true;
4413 }
4414 if (ins_nr == 0)
4415 start_slot = slot;
4416 ins_nr++;
4417 path->slots[0]++;
4418 if (!dst_path) {
4419 dst_path = btrfs_alloc_path();
4420 if (!dst_path) {
4421 ret = -ENOMEM;
4422 goto out;
4423 }
4424 }
4425 }
4426 if (ins_nr > 0)
4427 ret = copy_items(trans, inode, dst_path, path,
4428 start_slot, ins_nr, 1, 0);
4429 out:
4430 btrfs_release_path(path);
4431 btrfs_free_path(dst_path);
4432 return ret;
4433 }
4434
4435 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4436 struct btrfs_root *root,
4437 struct btrfs_inode *inode,
4438 struct btrfs_path *path,
4439 struct btrfs_log_ctx *ctx)
4440 {
4441 struct btrfs_ordered_extent *ordered;
4442 struct btrfs_ordered_extent *tmp;
4443 struct extent_map *em, *n;
4444 struct list_head extents;
4445 struct extent_map_tree *tree = &inode->extent_tree;
4446 u64 test_gen;
4447 int ret = 0;
4448 int num = 0;
4449
4450 INIT_LIST_HEAD(&extents);
4451
4452 write_lock(&tree->lock);
4453 test_gen = root->fs_info->last_trans_committed;
4454
4455 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4456 list_del_init(&em->list);
4457 /*
4458 		 * The limit is an arbitrary number. Logging gets really CPU
4459 		 * intensive once we have a lot of extents, and once we have
4460 		 * that many extents we are better off just committing the
4461 		 * transaction, since that is faster.
4462 */
4463 if (++num > 32768) {
4464 list_del_init(&tree->modified_extents);
4465 ret = -EFBIG;
4466 goto process;
4467 }
4468
4469 if (em->generation <= test_gen)
4470 continue;
4471
4472 /* We log prealloc extents beyond eof later. */
4473 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4474 em->start >= i_size_read(&inode->vfs_inode))
4475 continue;
4476
4477 /* Need a ref to keep it from getting evicted from cache */
4478 refcount_inc(&em->refs);
4479 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4480 list_add_tail(&em->list, &extents);
4481 num++;
4482 }
4483
4484 list_sort(NULL, &extents, extent_cmp);
4485 process:
4486 while (!list_empty(&extents)) {
4487 em = list_entry(extents.next, struct extent_map, list);
4488
4489 list_del_init(&em->list);
4490
4491 /*
4492 * If we had an error we just need to delete everybody from our
4493 * private list.
4494 */
4495 if (ret) {
4496 clear_em_logging(tree, em);
4497 free_extent_map(em);
4498 continue;
4499 }
4500
4501 write_unlock(&tree->lock);
4502
4503 ret = log_one_extent(trans, inode, root, em, path, ctx);
4504 write_lock(&tree->lock);
4505 clear_em_logging(tree, em);
4506 free_extent_map(em);
4507 }
4508 WARN_ON(!list_empty(&extents));
4509 write_unlock(&tree->lock);
4510
4511 btrfs_release_path(path);
4512 if (!ret)
4513 ret = btrfs_log_prealloc_extents(trans, inode, path);
4514 if (ret)
4515 return ret;
4516
4517 /*
4518 * We have logged all extents successfully, now make sure the commit of
4519 * the current transaction waits for the ordered extents to complete
4520 * before it commits and wipes out the log trees, otherwise we would
4521 	 * lose data if an ordered extent completes after the transaction
4522 * commits and a power failure happens after the transaction commit.
4523 */
4524 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4525 list_del_init(&ordered->log_list);
4526 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4527
4528 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4529 spin_lock_irq(&inode->ordered_tree.lock);
4530 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4531 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4532 atomic_inc(&trans->transaction->pending_ordered);
4533 }
4534 spin_unlock_irq(&inode->ordered_tree.lock);
4535 }
4536 btrfs_put_ordered_extent(ordered);
4537 }
4538
4539 return 0;
4540 }
4541
4542 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4543 struct btrfs_path *path, u64 *size_ret)
4544 {
4545 struct btrfs_key key;
4546 int ret;
4547
4548 key.objectid = btrfs_ino(inode);
4549 key.type = BTRFS_INODE_ITEM_KEY;
4550 key.offset = 0;
4551
4552 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4553 if (ret < 0) {
4554 return ret;
4555 } else if (ret > 0) {
4556 *size_ret = 0;
4557 } else {
4558 struct btrfs_inode_item *item;
4559
4560 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4561 struct btrfs_inode_item);
4562 *size_ret = btrfs_inode_size(path->nodes[0], item);
4563 /*
4564 		 * If the in-memory inode's i_size is smaller than the inode
4565 * size stored in the btree, return the inode's i_size, so
4566 * that we get a correct inode size after replaying the log
4567 * when before a power failure we had a shrinking truncate
4568 * followed by addition of a new name (rename / new hard link).
4569 * Otherwise return the inode size from the btree, to avoid
4570 * data loss when replaying a log due to previously doing a
4571 * write that expands the inode's size and logging a new name
4572 * immediately after.
4573 */
4574 if (*size_ret > inode->vfs_inode.i_size)
4575 *size_ret = inode->vfs_inode.i_size;
4576 }
4577
4578 btrfs_release_path(path);
4579 return 0;
4580 }
4581
4582 /*
4583 * At the moment we always log all xattrs. This is to figure out at log replay
4584  * time which xattrs must have their deletion replayed. If an xattr is missing
4585  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4586  * because if an xattr is deleted, the inode is fsynced and a power failure
4587 * happens, causing the log to be replayed the next time the fs is mounted,
4588 * we want the xattr to not exist anymore (same behaviour as other filesystems
4589 * with a journal, ext3/4, xfs, f2fs, etc).
4590 */
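/*
 * Example (illustrative): after
 *
 *   setfattr -x user.foo file
 *   xfs_io -c "fsync" file
 *   <power fail>
 *
 * the xattr "user.foo" must still be gone once the log is replayed, which
 * works because replay deletes xattrs present in the subvolume but absent
 * from the log.
 */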
4591 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4592 struct btrfs_root *root,
4593 struct btrfs_inode *inode,
4594 struct btrfs_path *path,
4595 struct btrfs_path *dst_path)
4596 {
4597 int ret;
4598 struct btrfs_key key;
4599 const u64 ino = btrfs_ino(inode);
4600 int ins_nr = 0;
4601 int start_slot = 0;
4602 bool found_xattrs = false;
4603
4604 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4605 return 0;
4606
4607 key.objectid = ino;
4608 key.type = BTRFS_XATTR_ITEM_KEY;
4609 key.offset = 0;
4610
4611 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4612 if (ret < 0)
4613 return ret;
4614
4615 while (true) {
4616 int slot = path->slots[0];
4617 struct extent_buffer *leaf = path->nodes[0];
4618 int nritems = btrfs_header_nritems(leaf);
4619
4620 if (slot >= nritems) {
4621 if (ins_nr > 0) {
4622 ret = copy_items(trans, inode, dst_path, path,
4623 start_slot, ins_nr, 1, 0);
4624 if (ret < 0)
4625 return ret;
4626 ins_nr = 0;
4627 }
4628 ret = btrfs_next_leaf(root, path);
4629 if (ret < 0)
4630 return ret;
4631 else if (ret > 0)
4632 break;
4633 continue;
4634 }
4635
4636 btrfs_item_key_to_cpu(leaf, &key, slot);
4637 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4638 break;
4639
4640 if (ins_nr == 0)
4641 start_slot = slot;
4642 ins_nr++;
4643 path->slots[0]++;
4644 found_xattrs = true;
4645 cond_resched();
4646 }
4647 if (ins_nr > 0) {
4648 ret = copy_items(trans, inode, dst_path, path,
4649 start_slot, ins_nr, 1, 0);
4650 if (ret < 0)
4651 return ret;
4652 }
4653
4654 if (!found_xattrs)
4655 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4656
4657 return 0;
4658 }
4659
4660 /*
4661 * When using the NO_HOLES feature if we punched a hole that causes the
4662 * deletion of entire leafs or all the extent items of the first leaf (the one
4663 * that contains the inode item and references) we may end up not processing
4664 * any extents, because there are no leafs with a generation matching the
4665 * current transaction that have extent items for our inode. So we need to find
4666 * if any holes exist and then log them. We also need to log holes after any
4667 * truncate operation that changes the inode's size.
4668 */
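/*
 * Example (illustrative): on a filesystem with NO_HOLES enabled,
 *
 *   xfs_io -c "fpunch 0 256K" -c "fsync" foo
 *
 * may delete all the extent items of the first leaf for the inode, leaving
 * no leaf with a matching generation to copy extents from, so the punched
 * hole has to be found and logged explicitly here.
 */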
4669 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4670 struct btrfs_root *root,
4671 struct btrfs_inode *inode,
4672 struct btrfs_path *path)
4673 {
4674 struct btrfs_fs_info *fs_info = root->fs_info;
4675 struct btrfs_key key;
4676 const u64 ino = btrfs_ino(inode);
4677 const u64 i_size = i_size_read(&inode->vfs_inode);
4678 u64 prev_extent_end = 0;
4679 int ret;
4680
4681 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4682 return 0;
4683
4684 key.objectid = ino;
4685 key.type = BTRFS_EXTENT_DATA_KEY;
4686 key.offset = 0;
4687
4688 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4689 if (ret < 0)
4690 return ret;
4691
4692 while (true) {
4693 struct extent_buffer *leaf = path->nodes[0];
4694
4695 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4696 ret = btrfs_next_leaf(root, path);
4697 if (ret < 0)
4698 return ret;
4699 if (ret > 0) {
4700 ret = 0;
4701 break;
4702 }
4703 leaf = path->nodes[0];
4704 }
4705
4706 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4707 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4708 break;
4709
4710 /* We have a hole, log it. */
4711 if (prev_extent_end < key.offset) {
4712 const u64 hole_len = key.offset - prev_extent_end;
4713
4714 /*
4715 * Release the path to avoid deadlocks with other code
4716 * paths that search the root while holding locks on
4717 * leafs from the log root.
4718 */
4719 btrfs_release_path(path);
4720 ret = btrfs_insert_file_extent(trans, root->log_root,
4721 ino, prev_extent_end, 0,
4722 0, hole_len, 0, hole_len,
4723 0, 0, 0);
4724 if (ret < 0)
4725 return ret;
4726
4727 /*
4728 * Search for the same key again in the root. Since it's
4729 * an extent item and we are holding the inode lock, the
4730 			 * key must still exist. If it doesn't, just emit a warning
4731 * and return an error to fall back to a transaction
4732 * commit.
4733 */
4734 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4735 if (ret < 0)
4736 return ret;
4737 if (WARN_ON(ret > 0))
4738 return -ENOENT;
4739 leaf = path->nodes[0];
4740 }
4741
4742 prev_extent_end = btrfs_file_extent_end(path);
4743 path->slots[0]++;
4744 cond_resched();
4745 }
4746
4747 if (prev_extent_end < i_size) {
4748 u64 hole_len;
4749
4750 btrfs_release_path(path);
4751 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4752 ret = btrfs_insert_file_extent(trans, root->log_root,
4753 ino, prev_extent_end, 0, 0,
4754 hole_len, 0, hole_len,
4755 0, 0, 0);
4756 if (ret < 0)
4757 return ret;
4758 }
4759
4760 return 0;
4761 }
4762
4763 /*
4764 * When we are logging a new inode X, check if it doesn't have a reference that
4765 * matches the reference from some other inode Y created in a past transaction
4766 * and that was renamed in the current transaction. If we don't do this, then at
4767 * log replay time we can lose inode Y (and all its files if it's a directory):
4768 *
4769 * mkdir /mnt/x
4770 * echo "hello world" > /mnt/x/foobar
4771 * sync
4772 * mv /mnt/x /mnt/y
4773 * mkdir /mnt/x # or touch /mnt/x
4774 * xfs_io -c fsync /mnt/x
4775 * <power fail>
4776 * mount fs, trigger log replay
4777 *
4778 * After the log replay procedure, we would lose the first directory and all its
4779 * files (file foobar).
4780 * For the case where inode Y is not a directory we simply end up losing it:
4781 *
4782 * echo "123" > /mnt/foo
4783 * sync
4784 * mv /mnt/foo /mnt/bar
4785 * echo "abc" > /mnt/foo
4786 * xfs_io -c fsync /mnt/foo
4787 * <power fail>
4788 *
4789 * We also need this for cases where a snapshot entry is replaced by some other
4790 * entry (file or directory) otherwise we end up with an unreplayable log due to
4791 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4792 * if it were a regular entry:
4793 *
4794 * mkdir /mnt/x
4795 * btrfs subvolume snapshot /mnt /mnt/x/snap
4796 * btrfs subvolume delete /mnt/x/snap
4797 * rmdir /mnt/x
4798 * mkdir /mnt/x
4799 * fsync /mnt/x or fsync some new file inside it
4800 * <power fail>
4801 *
4802 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4803 * the same transaction.
4804 */
4805 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4806 const int slot,
4807 const struct btrfs_key *key,
4808 struct btrfs_inode *inode,
4809 u64 *other_ino, u64 *other_parent)
4810 {
4811 int ret;
4812 struct btrfs_path *search_path;
4813 char *name = NULL;
4814 u32 name_len = 0;
4815 u32 item_size = btrfs_item_size_nr(eb, slot);
4816 u32 cur_offset = 0;
4817 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4818
4819 search_path = btrfs_alloc_path();
4820 if (!search_path)
4821 return -ENOMEM;
4822 search_path->search_commit_root = 1;
4823 search_path->skip_locking = 1;
4824
4825 while (cur_offset < item_size) {
4826 u64 parent;
4827 u32 this_name_len;
4828 u32 this_len;
4829 unsigned long name_ptr;
4830 struct btrfs_dir_item *di;
4831
4832 if (key->type == BTRFS_INODE_REF_KEY) {
4833 struct btrfs_inode_ref *iref;
4834
4835 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4836 parent = key->offset;
4837 this_name_len = btrfs_inode_ref_name_len(eb, iref);
4838 name_ptr = (unsigned long)(iref + 1);
4839 this_len = sizeof(*iref) + this_name_len;
4840 } else {
4841 struct btrfs_inode_extref *extref;
4842
4843 extref = (struct btrfs_inode_extref *)(ptr +
4844 cur_offset);
4845 parent = btrfs_inode_extref_parent(eb, extref);
4846 this_name_len = btrfs_inode_extref_name_len(eb, extref);
4847 name_ptr = (unsigned long)&extref->name;
4848 this_len = sizeof(*extref) + this_name_len;
4849 }
4850
4851 if (this_name_len > name_len) {
4852 char *new_name;
4853
4854 new_name = krealloc(name, this_name_len, GFP_NOFS);
4855 if (!new_name) {
4856 ret = -ENOMEM;
4857 goto out;
4858 }
4859 name_len = this_name_len;
4860 name = new_name;
4861 }
4862
4863 read_extent_buffer(eb, name, name_ptr, this_name_len);
4864 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4865 parent, name, this_name_len, 0);
4866 if (di && !IS_ERR(di)) {
4867 struct btrfs_key di_key;
4868
4869 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4870 di, &di_key);
4871 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4872 if (di_key.objectid != key->objectid) {
4873 ret = 1;
4874 *other_ino = di_key.objectid;
4875 *other_parent = parent;
4876 } else {
4877 ret = 0;
4878 }
4879 } else {
4880 ret = -EAGAIN;
4881 }
4882 goto out;
4883 } else if (IS_ERR(di)) {
4884 ret = PTR_ERR(di);
4885 goto out;
4886 }
4887 btrfs_release_path(search_path);
4888
4889 cur_offset += this_len;
4890 }
4891 ret = 0;
4892 out:
4893 btrfs_free_path(search_path);
4894 kfree(name);
4895 return ret;
4896 }
4897
4898 struct btrfs_ino_list {
4899 u64 ino;
4900 u64 parent;
4901 struct list_head list;
4902 };
4903
4904 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4905 struct btrfs_root *root,
4906 struct btrfs_path *path,
4907 struct btrfs_log_ctx *ctx,
4908 u64 ino, u64 parent)
4909 {
4910 struct btrfs_ino_list *ino_elem;
4911 LIST_HEAD(inode_list);
4912 int ret = 0;
4913
4914 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4915 if (!ino_elem)
4916 return -ENOMEM;
4917 ino_elem->ino = ino;
4918 ino_elem->parent = parent;
4919 list_add_tail(&ino_elem->list, &inode_list);
4920
4921 while (!list_empty(&inode_list)) {
4922 struct btrfs_fs_info *fs_info = root->fs_info;
4923 struct btrfs_key key;
4924 struct inode *inode;
4925
4926 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4927 list);
4928 ino = ino_elem->ino;
4929 parent = ino_elem->parent;
4930 list_del(&ino_elem->list);
4931 kfree(ino_elem);
4932 if (ret)
4933 continue;
4934
4935 btrfs_release_path(path);
4936
4937 inode = btrfs_iget(fs_info->sb, ino, root);
4938 /*
4939 * If the other inode that had a conflicting dir entry was
4940 * deleted in the current transaction, we need to log its parent
4941 * directory.
4942 */
4943 if (IS_ERR(inode)) {
4944 ret = PTR_ERR(inode);
4945 if (ret == -ENOENT) {
4946 inode = btrfs_iget(fs_info->sb, parent, root);
4947 if (IS_ERR(inode)) {
4948 ret = PTR_ERR(inode);
4949 } else {
4950 ret = btrfs_log_inode(trans, root,
4951 BTRFS_I(inode),
4952 LOG_OTHER_INODE_ALL,
4953 ctx);
4954 btrfs_add_delayed_iput(inode);
4955 }
4956 }
4957 continue;
4958 }
4959 /*
4960 * If the inode was already logged skip it - otherwise we can
4961 * hit an infinite loop. Example:
4962 *
4963 * From the commit root (previous transaction) we have the
4964 * following inodes:
4965 *
4966 * inode 257 a directory
4967 * inode 258 with references "zz" and "zz_link" on inode 257
4968 * inode 259 with reference "a" on inode 257
4969 *
4970 * And in the current (uncommitted) transaction we have:
4971 *
4972 * inode 257 a directory, unchanged
4973 * inode 258 with references "a" and "a2" on inode 257
4974 * inode 259 with reference "zz_link" on inode 257
4975 * inode 261 with reference "zz" on inode 257
4976 *
4977 * When logging inode 261 the following infinite loop could
4978 * happen if we don't skip already logged inodes:
4979 *
4980 * - we detect inode 258 as a conflicting inode, with inode 261
4981 * on reference "zz", and log it;
4982 *
4983 * - we detect inode 259 as a conflicting inode, with inode 258
4984 * on reference "a", and log it;
4985 *
4986 * - we detect inode 258 as a conflicting inode, with inode 259
4987 * on reference "zz_link", and log it - again! After this we
4988 * repeat the above steps forever.
4989 */
4990 spin_lock(&BTRFS_I(inode)->lock);
4991 /*
4992 * Check the inode's logged_trans only instead of
4993 * btrfs_inode_in_log(). This is because the last_log_commit of
4994 * the inode is not updated when we only log that it exists and
4995 * it has the full sync bit set (see btrfs_log_inode()).
4996 */
4997 if (BTRFS_I(inode)->logged_trans == trans->transid) {
4998 spin_unlock(&BTRFS_I(inode)->lock);
4999 btrfs_add_delayed_iput(inode);
5000 continue;
5001 }
5002 spin_unlock(&BTRFS_I(inode)->lock);
5003 /*
5004 * We are safe logging the other inode without acquiring its
5005 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5006 * are safe against concurrent renames of the other inode as
5007 * well because during a rename we pin the log and update the
5008 * log with the new name before we unpin it.
5009 */
5010 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5011 LOG_OTHER_INODE, ctx);
5012 if (ret) {
5013 btrfs_add_delayed_iput(inode);
5014 continue;
5015 }
5016
5017 key.objectid = ino;
5018 key.type = BTRFS_INODE_REF_KEY;
5019 key.offset = 0;
5020 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5021 if (ret < 0) {
5022 btrfs_add_delayed_iput(inode);
5023 continue;
5024 }
5025
5026 while (true) {
5027 struct extent_buffer *leaf = path->nodes[0];
5028 int slot = path->slots[0];
5029 u64 other_ino = 0;
5030 u64 other_parent = 0;
5031
5032 if (slot >= btrfs_header_nritems(leaf)) {
5033 ret = btrfs_next_leaf(root, path);
5034 if (ret < 0) {
5035 break;
5036 } else if (ret > 0) {
5037 ret = 0;
5038 break;
5039 }
5040 continue;
5041 }
5042
5043 btrfs_item_key_to_cpu(leaf, &key, slot);
5044 if (key.objectid != ino ||
5045 (key.type != BTRFS_INODE_REF_KEY &&
5046 key.type != BTRFS_INODE_EXTREF_KEY)) {
5047 ret = 0;
5048 break;
5049 }
5050
5051 ret = btrfs_check_ref_name_override(leaf, slot, &key,
5052 BTRFS_I(inode), &other_ino,
5053 &other_parent);
5054 if (ret < 0)
5055 break;
5056 if (ret > 0) {
5057 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5058 if (!ino_elem) {
5059 ret = -ENOMEM;
5060 break;
5061 }
5062 ino_elem->ino = other_ino;
5063 ino_elem->parent = other_parent;
5064 list_add_tail(&ino_elem->list, &inode_list);
5065 ret = 0;
5066 }
5067 path->slots[0]++;
5068 }
5069 btrfs_add_delayed_iput(inode);
5070 }
5071
5072 return ret;
5073 }
5074
5075 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5076 struct btrfs_inode *inode,
5077 struct btrfs_key *min_key,
5078 const struct btrfs_key *max_key,
5079 struct btrfs_path *path,
5080 struct btrfs_path *dst_path,
5081 const u64 logged_isize,
5082 const bool recursive_logging,
5083 const int inode_only,
5084 struct btrfs_log_ctx *ctx,
5085 bool *need_log_inode_item)
5086 {
5087 struct btrfs_root *root = inode->root;
5088 int ins_start_slot = 0;
5089 int ins_nr = 0;
5090 int ret;
5091
5092 while (1) {
5093 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5094 if (ret < 0)
5095 return ret;
5096 if (ret > 0) {
5097 ret = 0;
5098 break;
5099 }
5100 again:
5101 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5102 if (min_key->objectid != max_key->objectid)
5103 break;
5104 if (min_key->type > max_key->type)
5105 break;
5106
5107 if (min_key->type == BTRFS_INODE_ITEM_KEY)
5108 *need_log_inode_item = false;
5109
5110 if ((min_key->type == BTRFS_INODE_REF_KEY ||
5111 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5112 inode->generation == trans->transid &&
5113 !recursive_logging) {
5114 u64 other_ino = 0;
5115 u64 other_parent = 0;
5116
5117 ret = btrfs_check_ref_name_override(path->nodes[0],
5118 path->slots[0], min_key, inode,
5119 &other_ino, &other_parent);
5120 if (ret < 0) {
5121 return ret;
5122 } else if (ret > 0 && ctx &&
5123 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5124 if (ins_nr > 0) {
5125 ins_nr++;
5126 } else {
5127 ins_nr = 1;
5128 ins_start_slot = path->slots[0];
5129 }
5130 ret = copy_items(trans, inode, dst_path, path,
5131 ins_start_slot, ins_nr,
5132 inode_only, logged_isize);
5133 if (ret < 0)
5134 return ret;
5135 ins_nr = 0;
5136
5137 ret = log_conflicting_inodes(trans, root, path,
5138 ctx, other_ino, other_parent);
5139 if (ret)
5140 return ret;
5141 btrfs_release_path(path);
5142 goto next_key;
5143 }
5144 }
5145
5146 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5147 if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5148 if (ins_nr == 0)
5149 goto next_slot;
5150 ret = copy_items(trans, inode, dst_path, path,
5151 ins_start_slot,
5152 ins_nr, inode_only, logged_isize);
5153 if (ret < 0)
5154 return ret;
5155 ins_nr = 0;
5156 goto next_slot;
5157 }
5158
5159 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5160 ins_nr++;
5161 goto next_slot;
5162 } else if (!ins_nr) {
5163 ins_start_slot = path->slots[0];
5164 ins_nr = 1;
5165 goto next_slot;
5166 }
5167
5168 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5169 ins_nr, inode_only, logged_isize);
5170 if (ret < 0)
5171 return ret;
5172 ins_nr = 1;
5173 ins_start_slot = path->slots[0];
5174 next_slot:
5175 path->slots[0]++;
5176 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5177 btrfs_item_key_to_cpu(path->nodes[0], min_key,
5178 path->slots[0]);
5179 goto again;
5180 }
5181 if (ins_nr) {
5182 ret = copy_items(trans, inode, dst_path, path,
5183 ins_start_slot, ins_nr, inode_only,
5184 logged_isize);
5185 if (ret < 0)
5186 return ret;
5187 ins_nr = 0;
5188 }
5189 btrfs_release_path(path);
5190 next_key:
5191 if (min_key->offset < (u64)-1) {
5192 min_key->offset++;
5193 } else if (min_key->type < max_key->type) {
5194 min_key->type++;
5195 min_key->offset = 0;
5196 } else {
5197 break;
5198 }
5199 }
5200 if (ins_nr)
5201 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5202 ins_nr, inode_only, logged_isize);
5203
5204 return ret;
5205 }
5206
5207 /* log a single inode in the tree log.
5208 * At least one parent directory for this inode must exist in the tree
5209 * or be logged already.
5210 *
5211 * Any items from this inode changed by the current transaction are copied
5212 * to the log tree. An extra reference is taken on any extents in this
5213 * file, allowing us to avoid a whole pile of corner cases around logging
5214 * blocks that have been removed from the tree.
5215 *
5216 * See LOG_INODE_ALL and related defines for a description of what inode_only
5217 * does.
5218 *
5219 * This handles both files and directories.
5220 */
5221 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5222 struct btrfs_root *root, struct btrfs_inode *inode,
5223 int inode_only,
5224 struct btrfs_log_ctx *ctx)
5225 {
5226 struct btrfs_path *path;
5227 struct btrfs_path *dst_path;
5228 struct btrfs_key min_key;
5229 struct btrfs_key max_key;
5230 struct btrfs_root *log = root->log_root;
5231 int err = 0;
5232 int ret = 0;
5233 bool fast_search = false;
5234 u64 ino = btrfs_ino(inode);
5235 struct extent_map_tree *em_tree = &inode->extent_tree;
5236 u64 logged_isize = 0;
5237 bool need_log_inode_item = true;
5238 bool xattrs_logged = false;
5239 bool recursive_logging = false;
5240
5241 path = btrfs_alloc_path();
5242 if (!path)
5243 return -ENOMEM;
5244 dst_path = btrfs_alloc_path();
5245 if (!dst_path) {
5246 btrfs_free_path(path);
5247 return -ENOMEM;
5248 }
5249
5250 min_key.objectid = ino;
5251 min_key.type = BTRFS_INODE_ITEM_KEY;
5252 min_key.offset = 0;
5253
5254 max_key.objectid = ino;
5255
5256
5257 /* today the code can only do partial logging of directories */
5258 if (S_ISDIR(inode->vfs_inode.i_mode) ||
5259 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5260 &inode->runtime_flags) &&
5261 inode_only >= LOG_INODE_EXISTS))
5262 max_key.type = BTRFS_XATTR_ITEM_KEY;
5263 else
5264 max_key.type = (u8)-1;
5265 max_key.offset = (u64)-1;
5266
5267 /*
5268 * Only run delayed items if we are a directory. We want to make sure
5269 * all directory indexes hit the fs/subvolume tree so we can find them
5270 * and figure out which index ranges have to be logged.
5271 *
5272 * Otherwise commit the delayed inode only if the full sync flag is set,
5273 * as we want to make sure an up to date version is in the subvolume
5274 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5275 * it to the log tree. For a non full sync, we always log the inode item
5276 * based on the in-memory struct btrfs_inode which is always up to date.
5277 */
5278 if (S_ISDIR(inode->vfs_inode.i_mode))
5279 ret = btrfs_commit_inode_delayed_items(trans, inode);
5280 else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5281 ret = btrfs_commit_inode_delayed_inode(inode);
5282
5283 if (ret) {
5284 btrfs_free_path(path);
5285 btrfs_free_path(dst_path);
5286 return ret;
5287 }
5288
5289 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5290 recursive_logging = true;
5291 if (inode_only == LOG_OTHER_INODE)
5292 inode_only = LOG_INODE_EXISTS;
5293 else
5294 inode_only = LOG_INODE_ALL;
5295 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5296 } else {
5297 mutex_lock(&inode->log_mutex);
5298 }
5299
5300 /*
5301 	 * A brute force approach to making sure we get the most up to date
5302 	 * copies of everything.
5303 */
5304 if (S_ISDIR(inode->vfs_inode.i_mode)) {
5305 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5306
5307 if (inode_only == LOG_INODE_EXISTS)
5308 max_key_type = BTRFS_XATTR_ITEM_KEY;
5309 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5310 } else {
5311 if (inode_only == LOG_INODE_EXISTS) {
5312 /*
5313 * Make sure the new inode item we write to the log has
5314 * the same isize as the current one (if it exists).
5315 * This is necessary to prevent data loss after log
5316 * replay, and also to prevent doing a wrong expanding
5317 * truncate - for e.g. create file, write 4K into offset
5318 * 0, fsync, write 4K into offset 4096, add hard link,
5319 * fsync some other file (to sync log), power fail - if
5320 * we use the inode's current i_size, after log replay
5321 * we get a 8Kb file, with the last 4Kb extent as a hole
5322 * (zeroes), as if an expanding truncate happened,
5323 * instead of getting a file of 4Kb only.
5324 */
5325 err = logged_inode_size(log, inode, path, &logged_isize);
5326 if (err)
5327 goto out_unlock;
5328 }
5329 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5330 &inode->runtime_flags)) {
5331 if (inode_only == LOG_INODE_EXISTS) {
5332 max_key.type = BTRFS_XATTR_ITEM_KEY;
5333 ret = drop_objectid_items(trans, log, path, ino,
5334 max_key.type);
5335 } else {
5336 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5337 &inode->runtime_flags);
5338 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5339 &inode->runtime_flags);
5340 				while (1) {
5341 ret = btrfs_truncate_inode_items(trans,
5342 log, &inode->vfs_inode, 0, 0);
5343 if (ret != -EAGAIN)
5344 break;
5345 }
5346 }
5347 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5348 &inode->runtime_flags) ||
5349 inode_only == LOG_INODE_EXISTS) {
5350 if (inode_only == LOG_INODE_ALL)
5351 fast_search = true;
5352 max_key.type = BTRFS_XATTR_ITEM_KEY;
5353 ret = drop_objectid_items(trans, log, path, ino,
5354 max_key.type);
5355 } else {
5356 if (inode_only == LOG_INODE_ALL)
5357 fast_search = true;
5358 goto log_extents;
5359 }
5360
5361 }
5362 if (ret) {
5363 err = ret;
5364 goto out_unlock;
5365 }
5366
5367 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5368 path, dst_path, logged_isize,
5369 recursive_logging, inode_only, ctx,
5370 &need_log_inode_item);
5371 if (err)
5372 goto out_unlock;
5373
5374 btrfs_release_path(path);
5375 btrfs_release_path(dst_path);
5376 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5377 if (err)
5378 goto out_unlock;
5379 xattrs_logged = true;
5380 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5381 btrfs_release_path(path);
5382 btrfs_release_path(dst_path);
5383 err = btrfs_log_holes(trans, root, inode, path);
5384 if (err)
5385 goto out_unlock;
5386 }
5387 log_extents:
5388 btrfs_release_path(path);
5389 btrfs_release_path(dst_path);
5390 if (need_log_inode_item) {
5391 err = log_inode_item(trans, log, dst_path, inode);
5392 if (!err && !xattrs_logged) {
5393 err = btrfs_log_all_xattrs(trans, root, inode, path,
5394 dst_path);
5395 btrfs_release_path(path);
5396 }
5397 if (err)
5398 goto out_unlock;
5399 }
5400 if (fast_search) {
5401 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5402 ctx);
5403 if (ret) {
5404 err = ret;
5405 goto out_unlock;
5406 }
5407 } else if (inode_only == LOG_INODE_ALL) {
5408 struct extent_map *em, *n;
5409
5410 write_lock(&em_tree->lock);
5411 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5412 list_del_init(&em->list);
5413 write_unlock(&em_tree->lock);
5414 }
5415
5416 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5417 ret = log_directory_changes(trans, root, inode, path, dst_path,
5418 ctx);
5419 if (ret) {
5420 err = ret;
5421 goto out_unlock;
5422 }
5423 }
5424
5425 /*
5426 * If we are logging that an ancestor inode exists as part of logging a
5427 * new name from a link or rename operation, don't mark the inode as
5428 * logged - otherwise if an explicit fsync is made against an ancestor,
5429 * the fsync considers the inode in the log and doesn't sync the log,
5430 * resulting in the ancestor missing after a power failure unless the
5431 * log was synced as part of an fsync against any other unrelated inode.
5432 * So keep it simple for this case and just don't flag the ancestors as
5433 * logged.
5434 */
5435 if (!ctx ||
5436 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5437 &inode->vfs_inode != ctx->inode)) {
5438 spin_lock(&inode->lock);
5439 inode->logged_trans = trans->transid;
5440 /*
5441 * Don't update last_log_commit if we logged that an inode exists
5442 * after it was loaded to memory (full_sync bit set).
5443 * This is to prevent data loss when we do a write to the inode,
5444 * then the inode gets evicted after all delalloc was flushed,
5445 * then we log it exists (due to a rename for example) and then
5446 * fsync it. This last fsync would do nothing (not logging the
5447 * extents previously written).
5448 */
5449 if (inode_only != LOG_INODE_EXISTS ||
5450 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5451 inode->last_log_commit = inode->last_sub_trans;
5452 spin_unlock(&inode->lock);
5453 }
5454 out_unlock:
5455 mutex_unlock(&inode->log_mutex);
5456
5457 btrfs_free_path(path);
5458 btrfs_free_path(dst_path);
5459 return err;
5460 }
5461
5462 /*
5463 * Check if we must fallback to a transaction commit when logging an inode.
5464  * This must be called after logging the inode and is used only in the context
5465  * where fsyncing an inode requires logging some other inode - in which
5466  * case we can't lock the i_mutex of each other inode we need to log as that
5467 * can lead to deadlocks with concurrent fsync against other inodes (as we can
5468 * log inodes up or down in the hierarchy) or rename operations for example. So
5469 * we take the log_mutex of the inode after we have logged it and then check for
5470 * its last_unlink_trans value - this is safe because any task setting
5471 * last_unlink_trans must take the log_mutex and it must do this before it does
5472 * the actual unlink operation, so if we do this check before a concurrent task
5473 * sets last_unlink_trans it means we've logged a consistent version/state of
5474 * all the inode items, otherwise we are not sure and must do a transaction
5475 * commit (the concurrent task might have only updated last_unlink_trans before
5476 * we logged the inode or it might have also done the unlink).
5477 */
5478 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5479 struct btrfs_inode *inode)
5480 {
5481 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5482 bool ret = false;
5483
5484 mutex_lock(&inode->log_mutex);
5485 if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5486 /*
5487 * Make sure any commits to the log are forced to be full
5488 * commits.
5489 */
5490 btrfs_set_log_full_commit(trans);
5491 ret = true;
5492 }
5493 mutex_unlock(&inode->log_mutex);
5494
5495 return ret;
5496 }
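
/*
 * A minimal timeline sketch of the ordering argument above (the two tasks
 * and the transid X are hypothetical):
 *
 *            fsync task                    unlink task
 *            ----------                    -----------
 *            btrfs_log_inode()
 *                                          mutex_lock(&inode->log_mutex);
 *                                          inode->last_unlink_trans = X;
 *                                          mutex_unlock(&inode->log_mutex);
 *                                          (the actual unlink happens later)
 *            mutex_lock(&inode->log_mutex);
 *            sees last_unlink_trans == X
 *            -> force a full transaction commit
 *
 * If instead the fsync task takes the log_mutex first, the inode was logged
 * before last_unlink_trans was set, so the logged state is consistent and no
 * commit is needed.
 */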
5497
5498 /*
5499 * follow the dentry parent pointers up the chain and see if any
5500 * of the directories in it require a full commit before they can
5501 * be logged. Returns zero if nothing special needs to be done or 1 if
5502 * a full commit is required.
5503 */
5504 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5505 struct btrfs_inode *inode,
5506 struct dentry *parent,
5507 struct super_block *sb,
5508 u64 last_committed)
5509 {
5510 int ret = 0;
5511 struct dentry *old_parent = NULL;
5512
5513 /*
5514 * for regular files, if the inode is already on disk, we don't
5515 * have to worry about the parents at all. This is because
5516 * we can use the last_unlink_trans field to record renames
5517 * and other fun in this file.
5518 */
5519 if (S_ISREG(inode->vfs_inode.i_mode) &&
5520 inode->generation <= last_committed &&
5521 inode->last_unlink_trans <= last_committed)
5522 goto out;
5523
5524 if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5525 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5526 goto out;
5527 inode = BTRFS_I(d_inode(parent));
5528 }
5529
5530 while (1) {
5531 if (btrfs_must_commit_transaction(trans, inode)) {
5532 ret = 1;
5533 break;
5534 }
5535
5536 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5537 break;
5538
5539 if (IS_ROOT(parent)) {
5540 inode = BTRFS_I(d_inode(parent));
5541 if (btrfs_must_commit_transaction(trans, inode))
5542 ret = 1;
5543 break;
5544 }
5545
5546 parent = dget_parent(parent);
5547 dput(old_parent);
5548 old_parent = parent;
5549 inode = BTRFS_I(d_inode(parent));
5551 }
5552 dput(old_parent);
5553 out:
5554 return ret;
5555 }
5556
5557 struct btrfs_dir_list {
5558 u64 ino;
5559 struct list_head list;
5560 };
5561
5562 /*
5563 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5564 * details about why it is needed.
5565 * This is a recursive operation - if an existing dentry corresponds to a
5566 * directory, that directory's new entries are logged too (same behaviour as
5567 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5568 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5569 * complains about the following circular lock dependency / possible deadlock:
5570 *
5571 * CPU0 CPU1
5572 * ---- ----
5573 * lock(&type->i_mutex_dir_key#3/2);
5574 * lock(sb_internal#2);
5575 * lock(&type->i_mutex_dir_key#3/2);
5576 * lock(&sb->s_type->i_mutex_key#14);
5577 *
5578 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5579 * sb_start_intwrite() in btrfs_start_transaction().
5580 * Not locking i_mutex of the inodes is still safe because:
5581 *
5582 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5583 * that while logging the inode new references (names) are added or removed
5584 * from the inode, leaving the logged inode item with a link count that does
5585 * not match the number of logged inode reference items. This is fine because
5586 * at log replay time we compute the real number of links and correct the
5587 * link count in the inode item (see replay_one_buffer() and
5588 * link_to_fixup_dir());
5589 *
5590 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5591 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5592 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5593 * has a size that doesn't match the sum of the lengths of all the logged
5594 * names. This does not result in a problem because if a dir_item key is
5595 * logged but its matching dir_index key is not logged, at log replay time we
5596 * don't use it to replay the respective name (see replay_one_name()). On the
5597 * other hand if only the dir_index key ends up being logged, the respective
5598 * name is added to the fs/subvol tree with both the dir_item and dir_index
5599 * keys created (see replay_one_name()).
5600 * The directory's inode item with a wrong i_size is not a problem either,
5601 * since we don't use it at log replay time to set the i_size in the inode
5602 * item of the fs/subvol tree (see overwrite_item()).
5603 */
5604 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5605 struct btrfs_root *root,
5606 struct btrfs_inode *start_inode,
5607 struct btrfs_log_ctx *ctx)
5608 {
5609 struct btrfs_fs_info *fs_info = root->fs_info;
5610 struct btrfs_root *log = root->log_root;
5611 struct btrfs_path *path;
5612 LIST_HEAD(dir_list);
5613 struct btrfs_dir_list *dir_elem;
5614 int ret = 0;
5615
5616 path = btrfs_alloc_path();
5617 if (!path)
5618 return -ENOMEM;
5619
5620 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5621 if (!dir_elem) {
5622 btrfs_free_path(path);
5623 return -ENOMEM;
5624 }
5625 dir_elem->ino = btrfs_ino(start_inode);
5626 list_add_tail(&dir_elem->list, &dir_list);
5627
5628 while (!list_empty(&dir_list)) {
5629 struct extent_buffer *leaf;
5630 struct btrfs_key min_key;
5631 int nritems;
5632 int i;
5633
5634 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5635 list);
5636 if (ret)
5637 goto next_dir_inode;
5638
5639 min_key.objectid = dir_elem->ino;
5640 min_key.type = BTRFS_DIR_ITEM_KEY;
5641 min_key.offset = 0;
5642 again:
5643 btrfs_release_path(path);
5644 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5645 if (ret < 0) {
5646 goto next_dir_inode;
5647 } else if (ret > 0) {
5648 ret = 0;
5649 goto next_dir_inode;
5650 }
5651
5652 process_leaf:
5653 leaf = path->nodes[0];
5654 nritems = btrfs_header_nritems(leaf);
5655 for (i = path->slots[0]; i < nritems; i++) {
5656 struct btrfs_dir_item *di;
5657 struct btrfs_key di_key;
5658 struct inode *di_inode;
5659 struct btrfs_dir_list *new_dir_elem;
5660 int log_mode = LOG_INODE_EXISTS;
5661 int type;
5662
5663 btrfs_item_key_to_cpu(leaf, &min_key, i);
5664 if (min_key.objectid != dir_elem->ino ||
5665 min_key.type != BTRFS_DIR_ITEM_KEY)
5666 goto next_dir_inode;
5667
5668 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5669 type = btrfs_dir_type(leaf, di);
5670 if (btrfs_dir_transid(leaf, di) < trans->transid &&
5671 type != BTRFS_FT_DIR)
5672 continue;
5673 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5674 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5675 continue;
5676
5677 btrfs_release_path(path);
5678 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5679 if (IS_ERR(di_inode)) {
5680 ret = PTR_ERR(di_inode);
5681 goto next_dir_inode;
5682 }
5683
5684 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5685 btrfs_add_delayed_iput(di_inode);
5686 break;
5687 }
5688
5689 ctx->log_new_dentries = false;
5690 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5691 log_mode = LOG_INODE_ALL;
5692 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5693 log_mode, ctx);
5694 if (!ret &&
5695 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5696 ret = 1;
5697 btrfs_add_delayed_iput(di_inode);
5698 if (ret)
5699 goto next_dir_inode;
5700 if (ctx->log_new_dentries) {
5701 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5702 GFP_NOFS);
5703 if (!new_dir_elem) {
5704 ret = -ENOMEM;
5705 goto next_dir_inode;
5706 }
5707 new_dir_elem->ino = di_key.objectid;
5708 list_add_tail(&new_dir_elem->list, &dir_list);
5709 }
5710 break;
5711 }
5712 if (i == nritems) {
5713 ret = btrfs_next_leaf(log, path);
5714 if (ret < 0) {
5715 goto next_dir_inode;
5716 } else if (ret > 0) {
5717 ret = 0;
5718 goto next_dir_inode;
5719 }
5720 goto process_leaf;
5721 }
5722 if (min_key.offset < (u64)-1) {
5723 min_key.offset++;
5724 goto again;
5725 }
5726 next_dir_inode:
5727 list_del(&dir_elem->list);
5728 kfree(dir_elem);
5729 }
5730
5731 btrfs_free_path(path);
5732 return ret;
5733 }
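
/*
 * Illustration of the traversal above: dir_list acts as a FIFO queue, giving
 * a breadth-first walk over the new dentries (the directory names below are
 * made up for the example):
 *
 *   start with directory A                 queue: [A]
 *   log A's new dentries; subdirectories
 *   B and C set ctx->log_new_dentries      queue: [B, C]
 *   log B's new dentries, and so on        queue: [C, ...]
 *
 * Each directory has a single parent, so it is queued at most once, and
 * btrfs_inode_in_log() short-circuits inodes already logged in this
 * transaction, so the walk terminates.
 */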
5734
5735 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5736 struct btrfs_inode *inode,
5737 struct btrfs_log_ctx *ctx)
5738 {
5739 struct btrfs_fs_info *fs_info = trans->fs_info;
5740 int ret;
5741 struct btrfs_path *path;
5742 struct btrfs_key key;
5743 struct btrfs_root *root = inode->root;
5744 const u64 ino = btrfs_ino(inode);
5745
5746 path = btrfs_alloc_path();
5747 if (!path)
5748 return -ENOMEM;
5749 path->skip_locking = 1;
5750 path->search_commit_root = 1;
5751
5752 key.objectid = ino;
5753 key.type = BTRFS_INODE_REF_KEY;
5754 key.offset = 0;
5755 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5756 if (ret < 0)
5757 goto out;
5758
5759 while (true) {
5760 struct extent_buffer *leaf = path->nodes[0];
5761 int slot = path->slots[0];
5762 u32 cur_offset = 0;
5763 u32 item_size;
5764 unsigned long ptr;
5765
5766 if (slot >= btrfs_header_nritems(leaf)) {
5767 ret = btrfs_next_leaf(root, path);
5768 if (ret < 0)
5769 goto out;
5770 else if (ret > 0)
5771 break;
5772 continue;
5773 }
5774
5775 btrfs_item_key_to_cpu(leaf, &key, slot);
5776 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5777 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5778 break;
5779
5780 item_size = btrfs_item_size_nr(leaf, slot);
5781 ptr = btrfs_item_ptr_offset(leaf, slot);
5782 while (cur_offset < item_size) {
5783 struct btrfs_key inode_key;
5784 struct inode *dir_inode;
5785
5786 inode_key.type = BTRFS_INODE_ITEM_KEY;
5787 inode_key.offset = 0;
5788
5789 if (key.type == BTRFS_INODE_EXTREF_KEY) {
5790 struct btrfs_inode_extref *extref;
5791
5792 extref = (struct btrfs_inode_extref *)
5793 (ptr + cur_offset);
5794 inode_key.objectid = btrfs_inode_extref_parent(
5795 leaf, extref);
5796 cur_offset += sizeof(*extref);
5797 cur_offset += btrfs_inode_extref_name_len(leaf,
5798 extref);
5799 } else {
5800 inode_key.objectid = key.offset;
5801 cur_offset = item_size;
5802 }
5803
5804 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5805 root);
5806 /*
5807 * If the parent inode was deleted, return an error to
5808 * fall back to a transaction commit. This prevents the
5809 * following situation: an inode is moved from a parent A
5810 * to a parent B, its former parent A is deleted and then
5811 * the inode is fsync'ed; after a log replay the inode
5812 * would exist at both parents and A would still exist.
5813 * Example:
5814 *
5815 * mkdir /mnt/A
5816 * mkdir /mnt/B
5817 * touch /mnt/B/bar
5818 * sync
5819 * mv /mnt/B/bar /mnt/A/bar
5820 * mv -T /mnt/A /mnt/B
5821 * fsync /mnt/B/bar
5822 * <power fail>
5823 *
5824 * If we ignore the old parent B which got deleted,
5825 * after a log replay we would have file bar linked
5826 * at both parents and the old parent B would still
5827 * exist.
5828 */
5829 if (IS_ERR(dir_inode)) {
5830 ret = PTR_ERR(dir_inode);
5831 goto out;
5832 }
5833
5834 if (ctx)
5835 ctx->log_new_dentries = false;
5836 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5837 LOG_INODE_ALL, ctx);
5838 if (!ret &&
5839 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5840 ret = 1;
5841 if (!ret && ctx && ctx->log_new_dentries)
5842 ret = log_new_dir_dentries(trans, root,
5843 BTRFS_I(dir_inode), ctx);
5844 btrfs_add_delayed_iput(dir_inode);
5845 if (ret)
5846 goto out;
5847 }
5848 path->slots[0]++;
5849 }
5850 ret = 0;
5851 out:
5852 btrfs_free_path(path);
5853 return ret;
5854 }
5855
5856 static int log_new_ancestors(struct btrfs_trans_handle *trans,
5857 struct btrfs_root *root,
5858 struct btrfs_path *path,
5859 struct btrfs_log_ctx *ctx)
5860 {
5861 struct btrfs_key found_key;
5862
5863 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5864
5865 while (true) {
5866 struct btrfs_fs_info *fs_info = root->fs_info;
5867 const u64 last_committed = fs_info->last_trans_committed;
5868 struct extent_buffer *leaf = path->nodes[0];
5869 int slot = path->slots[0];
5870 struct btrfs_key search_key;
5871 struct inode *inode;
5872 u64 ino;
5873 int ret = 0;
5874
5875 btrfs_release_path(path);
5876
5877 ino = found_key.offset;
5878
5879 search_key.objectid = found_key.offset;
5880 search_key.type = BTRFS_INODE_ITEM_KEY;
5881 search_key.offset = 0;
5882 inode = btrfs_iget(fs_info->sb, ino, root);
5883 if (IS_ERR(inode))
5884 return PTR_ERR(inode);
5885
5886 if (BTRFS_I(inode)->generation > last_committed)
5887 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5888 LOG_INODE_EXISTS, ctx);
5889 btrfs_add_delayed_iput(inode);
5890 if (ret)
5891 return ret;
5892
5893 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5894 break;
5895
5896 search_key.type = BTRFS_INODE_REF_KEY;
5897 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5898 if (ret < 0)
5899 return ret;
5900
5901 leaf = path->nodes[0];
5902 slot = path->slots[0];
5903 if (slot >= btrfs_header_nritems(leaf)) {
5904 ret = btrfs_next_leaf(root, path);
5905 if (ret < 0)
5906 return ret;
5907 else if (ret > 0)
5908 return -ENOENT;
5909 leaf = path->nodes[0];
5910 slot = path->slots[0];
5911 }
5912
5913 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5914 if (found_key.objectid != search_key.objectid ||
5915 found_key.type != BTRFS_INODE_REF_KEY)
5916 return -ENOENT;
5917 }
5918 return 0;
5919 }
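
/*
 * Example of the upward walk above for a hypothetical path /mnt/a/b/foo,
 * assuming foo has inode number 259, b 258, a 257 and the subvolume root is
 * BTRFS_FIRST_FREE_OBJECTID (256):
 *
 *   INODE_REF item (259 INODE_REF 258) -> log inode 258 with LOG_INODE_EXISTS
 *   INODE_REF item (258 INODE_REF 257) -> log inode 257
 *   INODE_REF item (257 INODE_REF 256) -> log inode 256 and stop, since its
 *                                         objectid is BTRFS_FIRST_FREE_OBJECTID
 *
 * Only ancestors with a generation above the last committed transaction are
 * actually logged, the others are skipped.
 */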
5920
5921 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5922 struct btrfs_inode *inode,
5923 struct dentry *parent,
5924 struct btrfs_log_ctx *ctx)
5925 {
5926 struct btrfs_root *root = inode->root;
5927 struct btrfs_fs_info *fs_info = root->fs_info;
5928 struct dentry *old_parent = NULL;
5929 struct super_block *sb = inode->vfs_inode.i_sb;
5930 int ret = 0;
5931
5932 while (true) {
5933 if (!parent || d_really_is_negative(parent) ||
5934 sb != parent->d_sb)
5935 break;
5936
5937 inode = BTRFS_I(d_inode(parent));
5938 if (root != inode->root)
5939 break;
5940
5941 if (inode->generation > fs_info->last_trans_committed) {
5942 ret = btrfs_log_inode(trans, root, inode,
5943 LOG_INODE_EXISTS, ctx);
5944 if (ret)
5945 break;
5946 }
5947 if (IS_ROOT(parent))
5948 break;
5949
5950 parent = dget_parent(parent);
5951 dput(old_parent);
5952 old_parent = parent;
5953 }
5954 dput(old_parent);
5955
5956 return ret;
5957 }
5958
5959 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
5960 struct btrfs_inode *inode,
5961 struct dentry *parent,
5962 struct btrfs_log_ctx *ctx)
5963 {
5964 struct btrfs_root *root = inode->root;
5965 const u64 ino = btrfs_ino(inode);
5966 struct btrfs_path *path;
5967 struct btrfs_key search_key;
5968 int ret;
5969
5970 /*
5971 * For a single hard link case, go through a fast path that does not
5972 * need to iterate the fs/subvolume tree.
5973 */
5974 if (inode->vfs_inode.i_nlink < 2)
5975 return log_new_ancestors_fast(trans, inode, parent, ctx);
5976
5977 path = btrfs_alloc_path();
5978 if (!path)
5979 return -ENOMEM;
5980
5981 search_key.objectid = ino;
5982 search_key.type = BTRFS_INODE_REF_KEY;
5983 search_key.offset = 0;
5984 again:
5985 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5986 if (ret < 0)
5987 goto out;
5988 if (ret == 0)
5989 path->slots[0]++;
5990
5991 while (true) {
5992 struct extent_buffer *leaf = path->nodes[0];
5993 int slot = path->slots[0];
5994 struct btrfs_key found_key;
5995
5996 if (slot >= btrfs_header_nritems(leaf)) {
5997 ret = btrfs_next_leaf(root, path);
5998 if (ret < 0)
5999 goto out;
6000 else if (ret > 0)
6001 break;
6002 continue;
6003 }
6004
6005 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6006 if (found_key.objectid != ino ||
6007 found_key.type > BTRFS_INODE_EXTREF_KEY)
6008 break;
6009
6010 /*
6011 * Don't deal with extended references because they are rare
6012 * cases and too complex to deal with (we would need to keep
6013 * track of which subitem we are processing for each item in
6014 * this loop, etc). So just return an error to fall back to
6015 * a transaction commit.
6016 */
6017 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6018 ret = -EMLINK;
6019 goto out;
6020 }
6021
6022 /*
6023 * Logging ancestors needs to do more searches on the fs/subvol
6024 * tree, so it releases the path as needed to avoid deadlocks.
6025 * Keep track of the last inode ref key and resume from that key
6026 * after logging all new ancestors for the current hard link.
6027 */
6028 memcpy(&search_key, &found_key, sizeof(search_key));
6029
6030 ret = log_new_ancestors(trans, root, path, ctx);
6031 if (ret)
6032 goto out;
6033 btrfs_release_path(path);
6034 goto again;
6035 }
6036 ret = 0;
6037 out:
6038 btrfs_free_path(path);
6039 return ret;
6040 }
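
/*
 * Example of why the slow path above is needed when an inode has multiple
 * hard links (the names are hypothetical):
 *
 *   mkdir /mnt/A
 *   mkdir /mnt/B
 *   touch /mnt/A/foo
 *   ln /mnt/A/foo /mnt/B/bar
 *   xfs_io -c fsync /mnt/B/bar
 *
 * The inode of foo has one INODE_REF item per hard link, so both ancestor
 * chains (through A and through B) must be walked and any ancestors created
 * in the current transaction logged with LOG_INODE_EXISTS.
 */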
6041
6042 /*
6043 * helper function around btrfs_log_inode to make sure newly created
6044 * parent directories also end up in the log. A minimal, inode and backref
6045 * only logging is done for any parent directories that are newer than
6046 * the last committed transaction
6047 */
6048 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6049 struct btrfs_inode *inode,
6050 struct dentry *parent,
6051 int inode_only,
6052 struct btrfs_log_ctx *ctx)
6053 {
6054 struct btrfs_root *root = inode->root;
6055 struct btrfs_fs_info *fs_info = root->fs_info;
6056 struct super_block *sb;
6057 int ret = 0;
6058 u64 last_committed = fs_info->last_trans_committed;
6059 bool log_dentries = false;
6060
6061 sb = inode->vfs_inode.i_sb;
6062
6063 if (btrfs_test_opt(fs_info, NOTREELOG)) {
6064 ret = 1;
6065 goto end_no_trans;
6066 }
6067
6068 /*
6069 * The previous transaction commit didn't complete, we need to do a
6070 * full commit by ourselves.
6071 */
6072 if (fs_info->last_trans_log_full_commit >
6073 fs_info->last_trans_committed) {
6074 ret = 1;
6075 goto end_no_trans;
6076 }
6077
6078 if (btrfs_root_refs(&root->root_item) == 0) {
6079 ret = 1;
6080 goto end_no_trans;
6081 }
6082
6083 ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
6084 last_committed);
6085 if (ret)
6086 goto end_no_trans;
6087
6088 /*
6089 * Skip already logged inodes or inodes corresponding to tmpfiles
6090 * (since logging them is pointless, a link count of 0 means they
6091 * will never be accessible).
6092 */
6093 if ((btrfs_inode_in_log(inode, trans->transid) &&
6094 list_empty(&ctx->ordered_extents)) ||
6095 inode->vfs_inode.i_nlink == 0) {
6096 ret = BTRFS_NO_LOG_SYNC;
6097 goto end_no_trans;
6098 }
6099
6100 ret = start_log_trans(trans, root, ctx);
6101 if (ret)
6102 goto end_no_trans;
6103
6104 ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6105 if (ret)
6106 goto end_trans;
6107
6108 /*
6109 * for regular files, if the inode is already on disk, we don't
6110 * have to worry about the parents at all. This is because
6111 * we can use the last_unlink_trans field to record renames
6112 * and other fun in this file.
6113 */
6114 if (S_ISREG(inode->vfs_inode.i_mode) &&
6115 inode->generation <= last_committed &&
6116 inode->last_unlink_trans <= last_committed) {
6117 ret = 0;
6118 goto end_trans;
6119 }
6120
6121 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6122 log_dentries = true;
6123
6124 /*
6125 * On unlink we must make sure all our current and old parent directory
6126 * inodes are fully logged. This is to prevent leaving dangling
6127 * directory index entries in directories that were our parents but are
6128 * not anymore. Not doing this results in the old parent directory being
6129 * impossible to delete after log replay (rmdir will always fail with
6130 * error -ENOTEMPTY).
6131 *
6132 * Example 1:
6133 *
6134 * mkdir testdir
6135 * touch testdir/foo
6136 * ln testdir/foo testdir/bar
6137 * sync
6138 * unlink testdir/bar
6139 * xfs_io -c fsync testdir/foo
6140 * <power failure>
6141 * mount fs, triggers log replay
6142 *
6143 * If we don't log the parent directory (testdir), after log replay the
6144 * directory still has an entry pointing to the file inode using the bar
6145 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6146 * the file inode has a link count of 1.
6147 *
6148 * Example 2:
6149 *
6150 * mkdir testdir
6151 * touch foo
6152 * ln foo testdir/foo2
6153 * ln foo testdir/foo3
6154 * sync
6155 * unlink testdir/foo3
6156 * xfs_io -c fsync foo
6157 * <power failure>
6158 * mount fs, triggers log replay
6159 *
6160 * Similar to the first example, after log replay the parent directory
6161 * testdir still has an entry pointing to the inode file with name foo3
6162 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6163 * and has a link count of 2.
6164 */
6165 if (inode->last_unlink_trans > last_committed) {
6166 ret = btrfs_log_all_parents(trans, inode, ctx);
6167 if (ret)
6168 goto end_trans;
6169 }
6170
6171 ret = log_all_new_ancestors(trans, inode, parent, ctx);
6172 if (ret)
6173 goto end_trans;
6174
6175 if (log_dentries)
6176 ret = log_new_dir_dentries(trans, root, inode, ctx);
6177 else
6178 ret = 0;
6179 end_trans:
6180 if (ret < 0) {
6181 btrfs_set_log_full_commit(trans);
6182 ret = 1;
6183 }
6184
6185 if (ret)
6186 btrfs_remove_log_ctx(root, ctx);
6187 btrfs_end_log_trans(root);
6188 end_no_trans:
6189 return ret;
6190 }
6191
6192 /*
6193 * it is not safe to log a dentry if the chunk root has added new
6194 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
6195 * If this returns 1, you must commit the transaction to safely get your
6196 * data on disk.
6197 */
6198 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6199 struct dentry *dentry,
6200 struct btrfs_log_ctx *ctx)
6201 {
6202 struct dentry *parent = dget_parent(dentry);
6203 int ret;
6204
6205 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6206 LOG_INODE_ALL, ctx);
6207 dput(parent);
6208
6209 return ret;
6210 }
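
/*
 * A minimal sketch of the expected call site, modeled on the fsync path
 * (btrfs_sync_file() in file.c), with error handling and the ordered extent
 * and writeback details elided:
 *
 *	btrfs_init_log_ctx(&ctx, inode);
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == BTRFS_NO_LOG_SYNC) {
 *		ret = btrfs_end_transaction(trans);        nothing to sync
 *	} else if (ret == 0 && btrfs_sync_log(trans, root, &ctx) == 0) {
 *		ret = btrfs_end_transaction(trans);        log fsync succeeded
 *	} else {
 *		ret = btrfs_commit_transaction(trans);     fall back to commit
 *	}
 */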
6211
6212 /*
6213 * should be called during mount to recover and replay any log trees
6214 * from the FS
6215 */
6216 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6217 {
6218 int ret;
6219 struct btrfs_path *path;
6220 struct btrfs_trans_handle *trans;
6221 struct btrfs_key key;
6222 struct btrfs_key found_key;
6223 struct btrfs_root *log;
6224 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6225 struct walk_control wc = {
6226 .process_func = process_one_buffer,
6227 .stage = LOG_WALK_PIN_ONLY,
6228 };
6229
6230 path = btrfs_alloc_path();
6231 if (!path)
6232 return -ENOMEM;
6233
6234 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6235
6236 trans = btrfs_start_transaction(fs_info->tree_root, 0);
6237 if (IS_ERR(trans)) {
6238 ret = PTR_ERR(trans);
6239 goto error;
6240 }
6241
6242 wc.trans = trans;
6243 wc.pin = 1;
6244
6245 ret = walk_log_tree(trans, log_root_tree, &wc);
6246 if (ret) {
6247 btrfs_handle_fs_error(fs_info, ret,
6248 "Failed to pin buffers while recovering log root tree.");
6249 goto error;
6250 }
6251
6252 again:
6253 key.objectid = BTRFS_TREE_LOG_OBJECTID;
6254 key.offset = (u64)-1;
6255 key.type = BTRFS_ROOT_ITEM_KEY;
6256
6257 while (1) {
6258 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6259
6260 if (ret < 0) {
6261 btrfs_handle_fs_error(fs_info, ret,
6262 "Couldn't find tree log root.");
6263 goto error;
6264 }
6265 if (ret > 0) {
6266 if (path->slots[0] == 0)
6267 break;
6268 path->slots[0]--;
6269 }
6270 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6271 path->slots[0]);
6272 btrfs_release_path(path);
6273 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6274 break;
6275
6276 log = btrfs_read_tree_root(log_root_tree, &found_key);
6277 if (IS_ERR(log)) {
6278 ret = PTR_ERR(log);
6279 btrfs_handle_fs_error(fs_info, ret,
6280 "Couldn't read tree log root.");
6281 goto error;
6282 }
6283
6284 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6285 true);
6286 if (IS_ERR(wc.replay_dest)) {
6287 ret = PTR_ERR(wc.replay_dest);
6288
6289 /*
6290 * We didn't find the subvol, likely because it was
6291 * deleted. This is ok, simply skip this log and go to
6292 * the next one.
6293 *
6294 * We need to exclude the root because we can't have
6295 * other log replays overwriting this log as we'll read
6296 * it back in a few more times. This will keep our
6297 * block from being modified, and we'll just bail for
6298 * each subsequent pass.
6299 */
6300 if (ret == -ENOENT)
6301 ret = btrfs_pin_extent_for_log_replay(trans,
6302 log->node->start,
6303 log->node->len);
6304 btrfs_put_root(log);
6305
6306 if (!ret)
6307 goto next;
6308 btrfs_handle_fs_error(fs_info, ret,
6309 "Couldn't read target root for tree log recovery.");
6310 goto error;
6311 }
6312
6313 wc.replay_dest->log_root = log;
6314 btrfs_record_root_in_trans(trans, wc.replay_dest);
6315 ret = walk_log_tree(trans, log, &wc);
6316
6317 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6318 ret = fixup_inode_link_counts(trans, wc.replay_dest,
6319 path);
6320 }
6321
6322 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6323 struct btrfs_root *root = wc.replay_dest;
6324
6325 btrfs_release_path(path);
6326
6327 /*
6328 * We have just replayed everything, and the highest
6329 * objectid of fs roots probably has changed in case
6330 * some inode_items got replayed.
6331 *
6332 * root->objectid_mutex is not acquired as log replay
6333 * could only happen during mount.
6334 */
6335 ret = btrfs_find_highest_objectid(root,
6336 &root->highest_objectid);
6337 }
6338
6339 wc.replay_dest->log_root = NULL;
6340 btrfs_put_root(wc.replay_dest);
6341 btrfs_put_root(log);
6342
6343 if (ret)
6344 goto error;
6345 next:
6346 if (found_key.offset == 0)
6347 break;
6348 key.offset = found_key.offset - 1;
6349 }
6350 btrfs_release_path(path);
6351
6352 /* step one is to pin it all, step two is to replay just inodes */
6353 if (wc.pin) {
6354 wc.pin = 0;
6355 wc.process_func = replay_one_buffer;
6356 wc.stage = LOG_WALK_REPLAY_INODES;
6357 goto again;
6358 }
6359 /* step three is to replay everything */
6360 if (wc.stage < LOG_WALK_REPLAY_ALL) {
6361 wc.stage++;
6362 goto again;
6363 }
6364
6365 btrfs_free_path(path);
6366
6367 /* step 4: commit the transaction, which also unpins the blocks */
6368 ret = btrfs_commit_transaction(trans);
6369 if (ret)
6370 return ret;
6371
6372 log_root_tree->log_root = NULL;
6373 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6374 btrfs_put_root(log_root_tree);
6375
6376 return 0;
6377 error:
6378 if (wc.trans)
6379 btrfs_end_transaction(wc.trans);
6380 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6381 btrfs_free_path(path);
6382 return ret;
6383 }
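
/*
 * Sketch of the mount time caller, modeled on btrfs_replay_log() in
 * disk-io.c: the log root tree is read from the super block's log_root
 * pointer and handed to btrfs_recover_log_trees(), which takes ownership of
 * it (tree reading details elided):
 *
 *	bytenr = btrfs_super_log_root(fs_info->super_copy);
 *	log_tree_root->node = read_tree_block(fs_info, bytenr, ...);
 *	ret = btrfs_recover_log_trees(log_tree_root);
 */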
6384
6385 /*
6386 * there are some corner cases where we want to force a full
6387 * commit instead of allowing a directory to be logged.
6388 *
6389 * They revolve around files that were unlinked from the directory, and
6390 * this function updates the parent directory so that a full commit is
6391 * properly done if it is fsync'd later after the unlinks are done.
6392 *
6393 * Must be called before the unlink operations (updates to the subvolume tree,
6394 * inodes, etc) are done.
6395 */
6396 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6397 struct btrfs_inode *dir, struct btrfs_inode *inode,
6398 int for_rename)
6399 {
6400 /*
6401 * when we're logging a file, if it hasn't been renamed
6402 * or unlinked, and its inode is fully committed on disk,
6403 * we don't have to worry about walking up the directory chain
6404 * to log its parents.
6405 *
6406 * So, we use the last_unlink_trans field to put this transid
6407 * into the file. When the file is logged we check it and
6408 * don't log the parents if the file is fully on disk.
6409 */
6410 mutex_lock(&inode->log_mutex);
6411 inode->last_unlink_trans = trans->transid;
6412 mutex_unlock(&inode->log_mutex);
6413
6414 /*
6415 * if this directory was already logged any new
6416 * names for this file/dir will get recorded
6417 */
6418 if (dir->logged_trans == trans->transid)
6419 return;
6420
6421 /*
6422 * if the inode we're about to unlink was logged,
6423 * the log will be properly updated for any new names
6424 */
6425 if (inode->logged_trans == trans->transid)
6426 return;
6427
6428 /*
6429 * when renaming files across directories, if the directory
6430 * we're unlinking from gets fsync'd later on, there's
6431 * no way to find the destination directory later and fsync it
6432 * properly. So, we have to be conservative and force commits
6433 * so the new name gets discovered.
6434 */
6435 if (for_rename)
6436 goto record;
6437
6438 /* we can safely do the unlink without any special recording */
6439 return;
6440
6441 record:
6442 mutex_lock(&dir->log_mutex);
6443 dir->last_unlink_trans = trans->transid;
6444 mutex_unlock(&dir->log_mutex);
6445 }
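
/*
 * Sketch of the expected call sites, modeled on fs/btrfs/inode.c, where this
 * is called before the subvolume tree is updated:
 *
 *	in the unlink path, before btrfs_unlink_inode():
 *		btrfs_record_unlink_dir(trans, BTRFS_I(dir),
 *					BTRFS_I(d_inode(dentry)), 0);
 *
 *	in the rename path, with for_rename set:
 *		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 *					BTRFS_I(d_inode(old_dentry)), 1);
 */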
6446
6447 /*
6448 * Make sure that if someone attempts to fsync the parent directory of a deleted
6449 * snapshot, it ends up triggering a transaction commit. This is to guarantee
6450 * that after replaying the log tree of the parent directory's root we will not
6451 * see the snapshot anymore and at log replay time we will not see any log tree
6452 * corresponding to the deleted snapshot's root, which could lead to replaying
6453 * it after replaying the log tree of the parent directory (which would replay
6454 * the snapshot delete operation).
6455 *
6456 * Must be called before the actual snapshot destroy operation (updates to the
6457 * parent root and tree of tree roots trees, etc) are done.
6458 */
6459 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6460 struct btrfs_inode *dir)
6461 {
6462 mutex_lock(&dir->log_mutex);
6463 dir->last_unlink_trans = trans->transid;
6464 mutex_unlock(&dir->log_mutex);
6465 }
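
/*
 * Sketch of the expected call site, modeled on the snapshot delete ioctl in
 * fs/btrfs/ioctl.c, where dir is the parent directory of the snapshot being
 * destroyed:
 *
 *	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
 *	ret = btrfs_unlink_subvol(trans, dir, dentry);
 */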
6466
6467 /*
6468 * Call this after adding a new name for a file and it will properly
6469 * update the log to reflect the new name.
6470 */
6471 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6472 struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6473 struct dentry *parent)
6474 {
6475 struct btrfs_log_ctx ctx;
6476
6477 /*
6478 * this will force the logging code to walk the dentry chain
6479 * up for the file
6480 */
6481 if (!S_ISDIR(inode->vfs_inode.i_mode))
6482 inode->last_unlink_trans = trans->transid;
6483
6484 /*
6485 * if this inode hasn't been logged and the directory we're renaming it
6486 * from hasn't been logged, we don't need to log it
6487 */
6488 if (!inode_logged(trans, inode) &&
6489 (!old_dir || !inode_logged(trans, old_dir)))
6490 return;
6491
6492 btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6493 ctx.logging_new_name = true;
6494 /*
6495 * We don't care about the return value. If we fail to log the new name
6496 * then we know the next attempt to sync the log will fallback to a full
6497 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6498 * we don't need to worry about getting a log committed that has an
6499 * inconsistent state after a rename operation.
6500 */
6501 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6502 }
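
/*
 * Sketch of the expected call sites, modeled on fs/btrfs/inode.c, called
 * after the new name is in place in the subvolume tree:
 *
 *	for a link, no old directory is involved:
 *		btrfs_log_new_name(trans, BTRFS_I(inode), NULL,
 *				   dentry->d_parent);
 *
 *	for a rename, pass the directory the inode was renamed from:
 *		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
 *				   new_dentry->d_parent);
 */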
6503