// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories (a
 * sketch of walking the TLV log follows the list).
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
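 *
 * Illustration (editor's sketch, not a structure defined in this file): the
 * fast commit area is a flat byte stream of TLVs, so a replay pass can walk
 * it using the fc_tag/fc_len fields of struct ext4_fc_tl (both little-endian
 * 16-bit values, as the cpu_to_le16() conversions below assume):
 *
 *	u8 *cur = start;
 *	struct ext4_fc_tl tl;
 *
 *	while (cur < end) {
 *		memcpy(&tl, cur, sizeof(tl));
 *		cur += sizeof(tl);
 *		... dispatch on le16_to_cpu(tl.fc_tag); the value, of
 *		    le16_to_cpu(tl.fc_len) bytes, starts at cur ...
 *		cur += le16_to_cpu(tl.fc_len);
 *	}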
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a second in-memory
 * queue of inodes that need to be committed during a fast commit. During the
 * commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update(). A typical update path is sketched below.
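 *
 * For example (editor's sketch; the journal handle type and credit count
 * here are illustrative, not a call sequence mandated by this file):
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 *	... modify the inode and call an ext4_fc_track_*() helper ...
 *	ext4_journal_stop(handle);
 *	ext4_fc_stop_update(inode);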
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
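 *
 * For instance (editor's sketch; the reason code is illustrative, any valid
 * EXT4_FC_REASON_* value below EXT4_FC_REASON_MAX may be passed):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	... perform an operation fast commit cannot replay yet ...
 *	ext4_fc_stop_ineligible(sb);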
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher-level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
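
/*
 * Worked example for the merge logic above (editor's illustration): if
 * blocks [10, 20] are already tracked (i_fc_lblk_start = 10,
 * i_fc_lblk_len = 11) and a new update covers [5, 12], the tracked range
 * grows to [5, 20]:
 *
 *	i_fc_lblk_start = min(10, 5) = 5
 *	i_fc_lblk_len   = max(10 + 11 - 1, 12) - 5 + 1 = 16
 */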

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
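 *
 * Worked example (editor's illustration, assuming a 4k journal block and
 * the 4-byte struct ext4_fc_tl used throughout this file): with
 * off = 4000 bytes already used, a request for len = 100 fails the
 * "fits with room for a zero-length padding TLV" check, since
 * 4096 - 4000 - 1 = 95 < 100 + 4. The block is therefore closed with a
 * PAD tag of length 4096 - 4000 - 1 - 4 = 91 and the request is served
 * from the start of the next fast commit block.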
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
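 *
 * Sketch of the expected call site (editor's illustration; "tid" stands
 * for the transaction that last modified the inode being synced, as an
 * fsync-style path would supply it):
 *
 *	ret = ext4_fc_commit(EXT4_SB(sb)->s_journal, tid);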
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR we
	 * replayed this tag and crashed before the entire replay could
	 * complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
1559 
1560 /*
1561  * Record physical disk regions which are in use as per fast commit area. Our
1562  * simple replay phase allocator excludes these regions from allocation.
1563  */
ext4_fc_record_regions(struct super_block * sb,int ino,ext4_lblk_t lblk,ext4_fsblk_t pblk,int len)1564 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1565 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1566 {
1567 	struct ext4_fc_replay_state *state;
1568 	struct ext4_fc_alloc_region *region;
1569 
1570 	state = &EXT4_SB(sb)->s_fc_replay_state;
1571 	if (state->fc_regions_used == state->fc_regions_size) {
1572 		state->fc_regions_size +=
1573 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1574 		state->fc_regions = krealloc(
1575 					state->fc_regions,
1576 					state->fc_regions_size *
1577 					sizeof(struct ext4_fc_alloc_region),
1578 					GFP_KERNEL);
1579 		if (!state->fc_regions)
1580 			return -ENOMEM;
1581 	}
1582 	region = &state->fc_regions[state->fc_regions_used++];
1583 	region->ino = ino;
1584 	region->lblk = lblk;
1585 	region->pblk = pblk;
1586 	region->len = len;
1587 
1588 	return 0;
1589 }
1590 
1591 /* Replay add range tag */
ext4_fc_replay_add_range(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1592 static int ext4_fc_replay_add_range(struct super_block *sb,
1593 				    struct ext4_fc_tl *tl, u8 *val)
1594 {
1595 	struct ext4_fc_add_range fc_add_ex;
1596 	struct ext4_extent newex, *ex;
1597 	struct inode *inode;
1598 	ext4_lblk_t start, cur;
1599 	int remaining, len;
1600 	ext4_fsblk_t start_pblk;
1601 	struct ext4_map_blocks map;
1602 	struct ext4_ext_path *path = NULL;
1603 	int ret;
1604 
1605 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1606 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1607 
1608 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1609 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1610 		ext4_ext_get_actual_len(ex));
1611 
1612 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1613 	if (IS_ERR(inode)) {
1614 		jbd_debug(1, "Inode not found.");
1615 		return 0;
1616 	}
1617 
1618 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1619 
1620 	start = le32_to_cpu(ex->ee_block);
1621 	start_pblk = ext4_ext_pblock(ex);
1622 	len = ext4_ext_get_actual_len(ex);
1623 
1624 	cur = start;
1625 	remaining = len;
1626 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1627 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1628 		  inode->i_ino);
1629 
1630 	while (remaining > 0) {
1631 		map.m_lblk = cur;
1632 		map.m_len = remaining;
1633 		map.m_pblk = 0;
1634 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1635 
1636 		if (ret < 0) {
1637 			iput(inode);
1638 			return 0;
1639 		}
1640 
1641 		if (ret == 0) {
1642 			/* Range is not mapped */
1643 			path = ext4_find_extent(inode, cur, NULL, 0);
1644 			if (IS_ERR(path)) {
1645 				iput(inode);
1646 				return 0;
1647 			}
1648 			memset(&newex, 0, sizeof(newex));
1649 			newex.ee_block = cpu_to_le32(cur);
1650 			ext4_ext_store_pblock(
1651 				&newex, start_pblk + cur - start);
1652 			newex.ee_len = cpu_to_le16(map.m_len);
1653 			if (ext4_ext_is_unwritten(ex))
1654 				ext4_ext_mark_unwritten(&newex);
1655 			down_write(&EXT4_I(inode)->i_data_sem);
1656 			ret = ext4_ext_insert_extent(
1657 				NULL, inode, &path, &newex, 0);
1658 		up_write(&EXT4_I(inode)->i_data_sem);
1659 			ext4_ext_drop_refs(path);
1660 			kfree(path);
1661 			if (ret) {
1662 				iput(inode);
1663 				return 0;
1664 			}
1665 			goto next;
1666 		}
1667 
1668 		if (start_pblk + cur - start != map.m_pblk) {
1669 			/*
1670 			 * Logical to physical mapping changed. This can happen
1671 			 * if this range was removed and then reallocated to
1672 			 * map to new physical blocks during a fast commit.
1673 			 */
1674 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1675 					ext4_ext_is_unwritten(ex),
1676 					start_pblk + cur - start);
1677 			if (ret) {
1678 				iput(inode);
1679 				return 0;
1680 			}
1681 			/*
1682 			 * Mark the old blocks as free since they aren't used
1683 			 * anymore. We maintain an array of all the modified
1684 			 * inodes. In case these blocks are still used at either
1685 			 * a different logical range in the same inode or in
1686 			 * some different inode, we will mark them as allocated
1687 			 * at the end of the FC replay using our array of
1688 			 * modified inodes.
1689 			 */
1690 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1691 			goto next;
1692 		}
1693 
1694 		/* Range is mapped and needs a state change */
1695 		jbd_debug(1, "Converting from %ld to %d %lld",
1696 				map.m_flags & EXT4_MAP_UNWRITTEN,
1697 			ext4_ext_is_unwritten(ex), map.m_pblk);
1698 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1699 					ext4_ext_is_unwritten(ex), map.m_pblk);
1700 		if (ret) {
1701 			iput(inode);
1702 			return 0;
1703 		}
1704 		/*
1705 		 * We may have split the extent tree while toggling the state.
1706 		 * Try to shrink the extent tree now.
1707 		 */
1708 		ext4_ext_replay_shrink_inode(inode, start + len);
1709 next:
1710 		cur += map.m_len;
1711 		remaining -= map.m_len;
1712 	}
1713 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1714 					sb->s_blocksize_bits);
1715 	iput(inode);
1716 	return 0;
1717 }
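/*
 * A worked example of the offset arithmetic used above, with
 * hypothetical numbers: for a logged extent with ee_block == 100,
 * pblk == 5000 and len == 8, the chunk starting at logical block
 * cur == 104 is expected at physical block
 * start_pblk + cur - start == 5000 + 104 - 100 == 5004. Any other
 * m_pblk returned by ext4_map_blocks() means the logical-to-physical
 * mapping moved during the fast commit and has to be rewritten.
 */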
1718 
1719 /* Replay DEL_RANGE tag */
1720 static int
1721 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1722 			 u8 *val)
1723 {
1724 	struct inode *inode;
1725 	struct ext4_fc_del_range lrange;
1726 	struct ext4_map_blocks map;
1727 	ext4_lblk_t cur, remaining;
1728 	int ret;
1729 
1730 	memcpy(&lrange, val, sizeof(lrange));
1731 	cur = le32_to_cpu(lrange.fc_lblk);
1732 	remaining = le32_to_cpu(lrange.fc_len);
1733 
1734 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1735 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1736 
1737 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1738 	if (IS_ERR(inode)) {
1739 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1740 		return 0;
1741 	}
1742 
1743 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1744 
1745 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1746 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1747 			le32_to_cpu(lrange.fc_len));
1748 	while (remaining > 0) {
1749 		map.m_lblk = cur;
1750 		map.m_len = remaining;
1751 
1752 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1753 		if (ret < 0) {
1754 			iput(inode);
1755 			return 0;
1756 		}
1757 		if (ret > 0) {
1758 			remaining -= ret;
1759 			cur += ret;
1760 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1761 		} else {
1762 			remaining -= map.m_len;
1763 			cur += map.m_len;
1764 		}
1765 	}
1766 
1767 	ret = ext4_punch_hole(inode,
1768 		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1769 		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1770 	if (ret)
1771 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1772 	ext4_ext_replay_shrink_inode(inode,
1773 		i_size_read(inode) >> sb->s_blocksize_bits);
1774 	ext4_mark_inode_dirty(NULL, inode);
1775 	iput(inode);
1776 
1777 	return 0;
1778 }
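/*
 * The shifts above convert block numbers into the byte offset and byte
 * length that ext4_punch_hole() expects. For example, with 4 KiB blocks
 * (s_blocksize_bits == 12), fc_lblk == 3 and fc_len == 2 punch the byte
 * range [12288, 12288 + 8192). Note that shifting a 32-bit block count
 * left by the block-size bits can wrap for large ranges; arguably the
 * operands should be widened first, e.g. (a sketch, not the code above):
 *
 *	ret = ext4_punch_hole(inode,
 *		(loff_t)le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
 *		(loff_t)le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
 */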
1779 
1780 static inline const char *tag2str(u16 tag)
1781 {
1782 	switch (tag) {
1783 	case EXT4_FC_TAG_LINK:
1784 		return "TAG_ADD_ENTRY";
1785 	case EXT4_FC_TAG_UNLINK:
1786 		return "TAG_DEL_ENTRY";
1787 	case EXT4_FC_TAG_ADD_RANGE:
1788 		return "TAG_ADD_RANGE";
1789 	case EXT4_FC_TAG_CREAT:
1790 		return "TAG_CREAT_DENTRY";
1791 	case EXT4_FC_TAG_DEL_RANGE:
1792 		return "TAG_DEL_RANGE";
1793 	case EXT4_FC_TAG_INODE:
1794 		return "TAG_INODE";
1795 	case EXT4_FC_TAG_PAD:
1796 		return "TAG_PAD";
1797 	case EXT4_FC_TAG_TAIL:
1798 		return "TAG_TAIL";
1799 	case EXT4_FC_TAG_HEAD:
1800 		return "TAG_HEAD";
1801 	default:
1802 		return "TAG_ERROR";
1803 	}
1804 }
1805 
1806 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1807 {
1808 	struct ext4_fc_replay_state *state;
1809 	struct inode *inode;
1810 	struct ext4_ext_path *path = NULL;
1811 	struct ext4_map_blocks map;
1812 	int i, ret, j;
1813 	ext4_lblk_t cur, end;
1814 
1815 	state = &EXT4_SB(sb)->s_fc_replay_state;
1816 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1817 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1818 			EXT4_IGET_NORMAL);
1819 		if (IS_ERR(inode)) {
1820 			jbd_debug(1, "Inode %d not found.",
1821 				state->fc_modified_inodes[i]);
1822 			continue;
1823 		}
1824 		cur = 0;
1825 		end = EXT_MAX_BLOCKS;
1826 		while (cur < end) {
1827 			map.m_lblk = cur;
1828 			map.m_len = end - cur;
1829 
1830 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1831 			if (ret < 0)
1832 				break;
1833 
1834 			if (ret > 0) {
1835 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1836 				if (!IS_ERR(path)) {
1837 					for (j = 0; j < path->p_depth; j++)
1838 						ext4_mb_mark_bb(inode->i_sb,
1839 							path[j].p_block, 1, 1);
1840 					ext4_ext_drop_refs(path);
1841 					kfree(path);
1842 				}
1843 				cur += ret;
1844 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1845 							map.m_len, 1);
1846 			} else {
1847 				cur = cur + (map.m_len ? map.m_len : 1);
1848 			}
1849 		}
1850 		iput(inode);
1851 	}
1852 }
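/*
 * For each modified inode, the walk above marks two kinds of blocks as
 * in use in the block bitmaps: the extent tree blocks recorded along
 * the lookup path (path[j].p_block, one per level) and the data blocks
 * of the mapping itself (map.m_pblk, map.m_len). Because this runs
 * once replay has stopped, only mappings that survived replay are
 * accounted.
 */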
1853 
1854 /*
1855  * Check if a block is in the excluded regions for block allocation. The
1856  * simple allocator that runs during the replay phase calls this function
1857  * to see if it is okay to use a block.
1858  */
1859 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1860 {
1861 	int i;
1862 	struct ext4_fc_replay_state *state;
1863 
1864 	state = &EXT4_SB(sb)->s_fc_replay_state;
1865 	for (i = 0; i < state->fc_regions_valid; i++) {
1866 		if (state->fc_regions[i].ino == 0 ||
1867 			state->fc_regions[i].len == 0)
1868 			continue;
1869 		if (blk >= state->fc_regions[i].pblk &&
1870 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1871 			return true;
1872 	}
1873 	return false;
1874 }
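/*
 * A hypothetical caller in the replay-phase allocator would probe
 * candidate blocks with this helper before handing them out, along the
 * lines of (sketch only; pick_replay_block() is not a real function):
 *
 *	static ext4_fsblk_t pick_replay_block(struct super_block *sb,
 *					      ext4_fsblk_t hint)
 *	{
 *		while (ext4_fc_replay_check_excluded(sb, hint))
 *			hint++;
 *		return hint;
 *	}
 */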
1875 
1876 /* Cleanup function called after replay */
1877 void ext4_fc_replay_cleanup(struct super_block *sb)
1878 {
1879 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1880 
1881 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1882 	kfree(sbi->s_fc_replay_state.fc_regions);
1883 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1884 }
1885 
1886 /*
1887  * Recovery Scan phase handler
1888  *
1889  * This function is called during the scan phase and is responsible
1890  * for doing the following things:
1891  * - Make sure the fast commit area has valid tags for replay
1892  * - Count number of tags that need to be replayed by the replay handler
1893  * - Verify CRC
1894  * - Create a list of excluded blocks for allocation during replay phase
1895  *
1896  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1897  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1898  * to indicate that scan has finished and JBD2 can now start replay phase.
1899  * It returns a negative error code to indicate that an error occurred. At the
1900  * end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is
1901  * set to the number of tags that need to be replayed during the replay phase.
1902  */
1903 static int ext4_fc_replay_scan(journal_t *journal,
1904 				struct buffer_head *bh, int off,
1905 				tid_t expected_tid)
1906 {
1907 	struct super_block *sb = journal->j_private;
1908 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 	struct ext4_fc_replay_state *state;
1910 	int ret = JBD2_FC_REPLAY_CONTINUE;
1911 	struct ext4_fc_add_range ext;
1912 	struct ext4_fc_tl tl;
1913 	struct ext4_fc_tail tail;
1914 	__u8 *start, *end, *cur, *val;
1915 	struct ext4_fc_head head;
1916 	struct ext4_extent *ex;
1917 
1918 	state = &sbi->s_fc_replay_state;
1919 
1920 	start = (u8 *)bh->b_data;
1921 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1922 
1923 	if (state->fc_replay_expected_off == 0) {
1924 		state->fc_cur_tag = 0;
1925 		state->fc_replay_num_tags = 0;
1926 		state->fc_crc = 0;
1927 		state->fc_regions = NULL;
1928 		state->fc_regions_valid = state->fc_regions_used =
1929 			state->fc_regions_size = 0;
1930 		/* Check if we can stop early */
1931 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1932 			!= EXT4_FC_TAG_HEAD)
1933 			return 0;
1934 	}
1935 
1936 	if (off != state->fc_replay_expected_off) {
1937 		ret = -EFSCORRUPTED;
1938 		goto out_err;
1939 	}
1940 
1941 	state->fc_replay_expected_off++;
1942 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1943 		memcpy(&tl, cur, sizeof(tl));
1944 		val = cur + sizeof(tl);
1945 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1946 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1947 		switch (le16_to_cpu(tl.fc_tag)) {
1948 		case EXT4_FC_TAG_ADD_RANGE:
1949 			memcpy(&ext, val, sizeof(ext));
1950 			ex = (struct ext4_extent *)&ext.fc_ex;
1951 			ret = ext4_fc_record_regions(sb,
1952 				le32_to_cpu(ext.fc_ino),
1953 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1954 				ext4_ext_get_actual_len(ex));
1955 			if (ret < 0)
1956 				break;
1957 			ret = JBD2_FC_REPLAY_CONTINUE;
1958 			fallthrough;
1959 		case EXT4_FC_TAG_DEL_RANGE:
1960 		case EXT4_FC_TAG_LINK:
1961 		case EXT4_FC_TAG_UNLINK:
1962 		case EXT4_FC_TAG_CREAT:
1963 		case EXT4_FC_TAG_INODE:
1964 		case EXT4_FC_TAG_PAD:
1965 			state->fc_cur_tag++;
1966 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1967 					sizeof(tl) + le16_to_cpu(tl.fc_len));
1968 			break;
1969 		case EXT4_FC_TAG_TAIL:
1970 			state->fc_cur_tag++;
1971 			memcpy(&tail, val, sizeof(tail));
1972 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1973 						sizeof(tl) +
1974 						offsetof(struct ext4_fc_tail,
1975 						fc_crc));
1976 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1977 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1978 				state->fc_replay_num_tags = state->fc_cur_tag;
1979 				state->fc_regions_valid =
1980 					state->fc_regions_used;
1981 			} else {
1982 				ret = state->fc_replay_num_tags ?
1983 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1984 			}
1985 			state->fc_crc = 0;
1986 			break;
1987 		case EXT4_FC_TAG_HEAD:
1988 			memcpy(&head, val, sizeof(head));
1989 			if (le32_to_cpu(head.fc_features) &
1990 				~EXT4_FC_SUPPORTED_FEATURES) {
1991 				ret = -EOPNOTSUPP;
1992 				break;
1993 			}
1994 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
1995 				ret = JBD2_FC_REPLAY_STOP;
1996 				break;
1997 			}
1998 			state->fc_cur_tag++;
1999 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2000 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2001 			break;
2002 		default:
2003 			ret = state->fc_replay_num_tags ?
2004 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2005 		}
2006 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2007 			break;
2008 	}
2009 
2010 out_err:
2011 	trace_ext4_fc_replay_scan(sb, ret, off);
2012 	return ret;
2013 }
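/*
 * Note the asymmetry in the checksum handling above: every tag is
 * folded into the running CRC over its full TLV length
 * (sizeof(tl) + fc_len), except the TAIL tag, where only the bytes
 * preceding the fc_crc field are included, because fc_crc itself
 * carries the expected value. Schematically, for tags t1..tn followed
 * by a tail:
 *
 *	crc = ext4_chksum(sbi, 0, t1, len(t1));
 *	crc = ext4_chksum(sbi, crc, t2, len(t2));
 *	...
 *	crc = ext4_chksum(sbi, crc, tail,
 *			  sizeof(tl) + offsetof(struct ext4_fc_tail, fc_crc));
 *	commit is valid iff crc == le32_to_cpu(tail.fc_crc) &&
 *			    le32_to_cpu(tail.fc_tid) == expected_tid
 */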
2014 
2015 /*
2016  * Main recovery path entry point.
2017  * The meaning of the return codes is the same as for ext4_fc_replay_scan() above.
2018  */
2019 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2020 				enum passtype pass, int off, tid_t expected_tid)
2021 {
2022 	struct super_block *sb = journal->j_private;
2023 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2024 	struct ext4_fc_tl tl;
2025 	__u8 *start, *end, *cur, *val;
2026 	int ret = JBD2_FC_REPLAY_CONTINUE;
2027 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2028 	struct ext4_fc_tail tail;
2029 
2030 	if (pass == PASS_SCAN) {
2031 		state->fc_current_pass = PASS_SCAN;
2032 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2033 	}
2034 
2035 	if (state->fc_current_pass != pass) {
2036 		state->fc_current_pass = pass;
2037 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2038 	}
2039 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2040 		jbd_debug(1, "Replay stops\n");
2041 		ext4_fc_set_bitmaps_and_counters(sb);
2042 		return 0;
2043 	}
2044 
2045 #ifdef CONFIG_EXT4_DEBUG
2046 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2047 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2048 		return JBD2_FC_REPLAY_STOP;
2049 	}
2050 #endif
2051 
2052 	start = (u8 *)bh->b_data;
2053 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2054 
2055 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2056 		memcpy(&tl, cur, sizeof(tl));
2057 		val = cur + sizeof(tl);
2058 
2059 		if (state->fc_replay_num_tags == 0) {
2060 			ret = JBD2_FC_REPLAY_STOP;
2061 			ext4_fc_set_bitmaps_and_counters(sb);
2062 			break;
2063 		}
2064 		jbd_debug(3, "Replay phase, tag:%s\n",
2065 				tag2str(le16_to_cpu(tl.fc_tag)));
2066 		state->fc_replay_num_tags--;
2067 		switch (le16_to_cpu(tl.fc_tag)) {
2068 		case EXT4_FC_TAG_LINK:
2069 			ret = ext4_fc_replay_link(sb, &tl, val);
2070 			break;
2071 		case EXT4_FC_TAG_UNLINK:
2072 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2073 			break;
2074 		case EXT4_FC_TAG_ADD_RANGE:
2075 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2076 			break;
2077 		case EXT4_FC_TAG_CREAT:
2078 			ret = ext4_fc_replay_create(sb, &tl, val);
2079 			break;
2080 		case EXT4_FC_TAG_DEL_RANGE:
2081 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2082 			break;
2083 		case EXT4_FC_TAG_INODE:
2084 			ret = ext4_fc_replay_inode(sb, &tl, val);
2085 			break;
2086 		case EXT4_FC_TAG_PAD:
2087 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2088 					     le16_to_cpu(tl.fc_len), 0);
2089 			break;
2090 		case EXT4_FC_TAG_TAIL:
2091 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2092 					     le16_to_cpu(tl.fc_len), 0);
2093 			memcpy(&tail, val, sizeof(tail));
2094 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2095 			break;
2096 		case EXT4_FC_TAG_HEAD:
2097 			break;
2098 		default:
2099 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2100 					     le16_to_cpu(tl.fc_len), 0);
2101 			ret = -ECANCELED;
2102 			break;
2103 		}
2104 		if (ret < 0)
2105 			break;
2106 		ret = JBD2_FC_REPLAY_CONTINUE;
2107 	}
2108 	return ret;
2109 }
2110 
2111 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2112 {
2113 	/*
2114 	 * We set the replay callback even if fast commit is disabled because
2115 	 * we may still have fast commit blocks that need to be replayed even
2116 	 * if fast commit has now been turned off.
2117 	 */
2118 	journal->j_fc_replay_callback = ext4_fc_replay;
2119 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2120 		return;
2121 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2122 }
2123 
2124 static const char *fc_ineligible_reasons[] = {
2125 	"Extended attributes changed",
2126 	"Cross rename",
2127 	"Journal flag changed",
2128 	"Insufficient memory",
2129 	"Swap boot",
2130 	"Resize",
2131 	"Dir renamed",
2132 	"Falloc range op",
2133 	"Data journalling",
2134 	"FC Commit Failed"
2135 };
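/*
 * This table is indexed by the EXT4_FC_REASON_* reason codes (compare
 * the EXT4_FC_REASON_MAX bound used below), so the string order must
 * match the enum order.
 */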
2136 
2137 int ext4_fc_info_show(struct seq_file *seq, void *v)
2138 {
2139 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2140 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2141 	int i;
2142 
2143 	if (v != SEQ_START_TOKEN)
2144 		return 0;
2145 
2146 	seq_printf(seq,
2147 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2148 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2149 		   stats->fc_numblks,
2150 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2151 	seq_puts(seq, "Ineligible reasons:\n");
2152 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2153 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2154 			stats->fc_ineligible_reason_count[i]);
2155 
2156 	return 0;
2157 }
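/*
 * Example of the resulting output (illustrative numbers):
 *
 *	fc stats:
 *	123 commits
 *	4 ineligible
 *	256 numblks
 *	1250us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	"Cross rename":	0
 *	...
 */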
2158 
2159 int __init ext4_fc_init_dentry_cache(void)
2160 {
2161 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2162 					   SLAB_RECLAIM_ACCOUNT);
2163 
2164 	if (ext4_fc_dentry_cachep == NULL)
2165 		return -ENOMEM;
2166 
2167 	return 0;
2168 }
2169