1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record deltas in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
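 *
 * For reference, every TLV in the fast commit area starts with a small
 * fixed header (simplified sketch; struct ext4_fc_tl in fast_commit.h is
 * the authoritative definition):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *
 * followed by fc_len bytes of tag-specific value.
 *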
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
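 *
 * For example, a VFS callback updating an inode brackets its change like
 * this (illustrative sketch only):
 *
 *	ext4_fc_start_update(inode);	// may block on an ongoing fast commit
 *	// ... perform the inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up any waiting fast commit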
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g. extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
73 *   fall back to a full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
76 *   the fast commits happening between ext4_fc_start_ineligible() and
77 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
78 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
79 *   make one more fast commit fall back to a full commit after the stop call
80 *   so that it is guaranteed that the fast commit ineligible operation
81 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
82 *   is followed by at least 1 full commit.
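 *
 * An ineligible operation is thus bracketed like this (illustrative sketch;
 * the reason code is whichever EXT4_FC_REASON_* value applies):
 *
 *	ext4_fc_start_ineligible(sb, reason);
 *	// ... perform the operation fast commit cannot replay ...
 *	ext4_fc_stop_ineligible(sb);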
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses the "EXT4_FC_TAG_TAIL" tag, which marks a fast commit as complete. The
88 * tail tag contains the CRC of the contents and the TID of the transaction
89 * after which this fast commit should be applied. Recovery code replays fast
90 * commit logs only if there's at least 1 valid tail present. For every fast
91 * commit operation, there is 1 tail. This means we may end up with multiple
92 * tails in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * TODOs
107 * -----
108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109 *    eligible update must be protected within ext4_fc_start_update() and
110 *    ext4_fc_stop_update(). These routines are called from much higher
111 *    level routines. This can be made more fine grained by combining with
112 *    ext4_journal_start().
113 *
114 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115 *
116 * 3) Handle more ineligible cases.
117 */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124 BUFFER_TRACE(bh, "");
125 if (uptodate) {
126 ext4_debug("%s: Block %lld up-to-date",
127 __func__, bh->b_blocknr);
128 set_buffer_uptodate(bh);
129 } else {
130 ext4_debug("%s: Block %lld not up-to-date",
131 __func__, bh->b_blocknr);
132 clear_buffer_uptodate(bh);
133 }
134
135 unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140 struct ext4_inode_info *ei = EXT4_I(inode);
141
142 ei->i_fc_lblk_start = 0;
143 ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148 struct ext4_inode_info *ei = EXT4_I(inode);
149
150 ext4_fc_reset_inode(inode);
151 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 INIT_LIST_HEAD(&ei->i_fc_list);
153 init_waitqueue_head(&ei->i_fc_wait);
154 atomic_set(&ei->i_fc_updates, 0);
155 }
156
157 /* This function must be called with sbi->s_fc_lock held. */
158 static void ext4_fc_wait_committing_inode(struct inode *inode)
159 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
160 {
161 wait_queue_head_t *wq;
162 struct ext4_inode_info *ei = EXT4_I(inode);
163
164 #if (BITS_PER_LONG < 64)
165 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
166 EXT4_STATE_FC_COMMITTING);
167 wq = bit_waitqueue(&ei->i_state_flags,
168 EXT4_STATE_FC_COMMITTING);
169 #else
170 DEFINE_WAIT_BIT(wait, &ei->i_flags,
171 EXT4_STATE_FC_COMMITTING);
172 wq = bit_waitqueue(&ei->i_flags,
173 EXT4_STATE_FC_COMMITTING);
174 #endif
175 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
176 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
177 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
178 schedule();
179 finish_wait(wq, &wait.wq_entry);
180 }
181
182 /*
183 * Inform Ext4's fast commit subsystem about the start of an inode update
184 *
185 * This function is called by high level VFS callbacks before
186 * performing any inode update. This function blocks if there's an ongoing
187 * fast commit on the inode in question.
188 */
189 void ext4_fc_start_update(struct inode *inode)
190 {
191 struct ext4_inode_info *ei = EXT4_I(inode);
192
193 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
194 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
195 return;
196
197 restart:
198 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
199 if (list_empty(&ei->i_fc_list))
200 goto out;
201
202 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
203 ext4_fc_wait_committing_inode(inode);
204 goto restart;
205 }
206 out:
207 atomic_inc(&ei->i_fc_updates);
208 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
209 }
210
211 /*
212 * Stop inode update and wake up waiting fast commits if any.
213 */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216 struct ext4_inode_info *ei = EXT4_I(inode);
217
218 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220 return;
221
222 if (atomic_dec_and_test(&ei->i_fc_updates))
223 wake_up_all(&ei->i_fc_wait);
224 }
225
226 /*
227 * Remove the inode from the fast commit list. If the inode is being
228 * committed, we wait until the inode commit is done.
229 */
230 void ext4_fc_del(struct inode *inode)
231 {
232 struct ext4_inode_info *ei = EXT4_I(inode);
233
234 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
235 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
236 return;
237
238 restart:
239 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
240 if (list_empty(&ei->i_fc_list)) {
241 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
242 return;
243 }
244
245 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
246 ext4_fc_wait_committing_inode(inode);
247 goto restart;
248 }
249 list_del_init(&ei->i_fc_list);
250 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 }
252
253 /*
254 * Mark the file system as fast commit ineligible. This means that the next
255 * commit operation will result in a full jbd2 commit.
256 */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259 struct ext4_sb_info *sbi = EXT4_SB(sb);
260
261 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263 return;
264
265 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
266 WARN_ON(reason >= EXT4_FC_REASON_MAX);
267 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269
270 /*
271 * Start a fast commit ineligible update. Any commits that happen while
272 * such an operation is in progress fall back to full commits.
273 */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276 struct ext4_sb_info *sbi = EXT4_SB(sb);
277
278 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280 return;
281
282 WARN_ON(reason >= EXT4_FC_REASON_MAX);
283 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284 atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286
287 /*
288 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289 * to ensure that after stopping the ineligible update, at least one full
290 * commit takes place.
291 */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296 return;
297
298 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
299 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
301
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
305 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
306 }
307
308 /*
309 * Generic fast commit tracking function. If this is the first time we are
310 * called after a full commit, we initialize fast commit fields and then call
311 * __fc_track_fn() with update = 0. If we have already been called after a full
312 * commit, we pass update = 1. Based on that, the track function can determine
313 * if it needs to track a field for the first time or if it needs to just
314 * update the previously tracked value.
315 *
316 * If enqueue is set, this function enqueues the inode in fast commit list.
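 *
 * A __fc_track_fn() implementation thus roughly has this shape (an
 * illustrative sketch; __track_inode() and __track_range() below are the
 * real instances):
 *
 *	static int __track_foo(struct inode *inode, void *arg, bool update)
 *	{
 *		if (update) {
 *			// merge with the value tracked earlier in this tid
 *			return 0;
 *		}
 *		// record the value for the first time
 *		return 0;
 *	}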
317 */
318 static int ext4_fc_track_template(
319 handle_t *handle, struct inode *inode,
320 int (*__fc_track_fn)(struct inode *, void *, bool),
321 void *args, int enqueue)
322 {
323 bool update = false;
324 struct ext4_inode_info *ei = EXT4_I(inode);
325 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
326 tid_t tid = 0;
327 int ret;
328
329 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
330 (sbi->s_mount_state & EXT4_FC_REPLAY))
331 return -EOPNOTSUPP;
332
333 if (ext4_fc_is_ineligible(inode->i_sb))
334 return -EINVAL;
335
336 tid = handle->h_transaction->t_tid;
337 mutex_lock(&ei->i_fc_lock);
338 if (tid == ei->i_sync_tid) {
339 update = true;
340 } else {
341 ext4_fc_reset_inode(inode);
342 ei->i_sync_tid = tid;
343 }
344 ret = __fc_track_fn(inode, args, update);
345 mutex_unlock(&ei->i_fc_lock);
346
347 if (!enqueue)
348 return ret;
349
350 spin_lock(&sbi->s_fc_lock);
351 if (list_empty(&EXT4_I(inode)->i_fc_list))
352 list_add_tail(&EXT4_I(inode)->i_fc_list,
353 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
354 &sbi->s_fc_q[FC_Q_STAGING] :
355 &sbi->s_fc_q[FC_Q_MAIN]);
356 spin_unlock(&sbi->s_fc_lock);
357
358 return ret;
359 }
360
361 struct __track_dentry_update_args {
362 struct dentry *dentry;
363 int op;
364 };
365
366 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
367 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
368 {
369 struct ext4_fc_dentry_update *node;
370 struct ext4_inode_info *ei = EXT4_I(inode);
371 struct __track_dentry_update_args *dentry_update =
372 (struct __track_dentry_update_args *)arg;
373 struct dentry *dentry = dentry_update->dentry;
374 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
375
376 mutex_unlock(&ei->i_fc_lock);
377 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
378 if (!node) {
379 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
380 mutex_lock(&ei->i_fc_lock);
381 return -ENOMEM;
382 }
383
384 node->fcd_op = dentry_update->op;
385 node->fcd_parent = dentry->d_parent->d_inode->i_ino;
386 node->fcd_ino = inode->i_ino;
387 if (dentry->d_name.len > DNAME_INLINE_LEN) {
388 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
389 if (!node->fcd_name.name) {
390 kmem_cache_free(ext4_fc_dentry_cachep, node);
391 ext4_fc_mark_ineligible(inode->i_sb,
392 EXT4_FC_REASON_NOMEM);
393 mutex_lock(&ei->i_fc_lock);
394 return -ENOMEM;
395 }
396 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
397 dentry->d_name.len);
398 } else {
399 memcpy(node->fcd_iname, dentry->d_name.name,
400 dentry->d_name.len);
401 node->fcd_name.name = node->fcd_iname;
402 }
403 node->fcd_name.len = dentry->d_name.len;
404
405 spin_lock(&sbi->s_fc_lock);
406 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
407 list_add_tail(&node->fcd_list,
408 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
409 else
410 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
411 spin_unlock(&sbi->s_fc_lock);
412 mutex_lock(&ei->i_fc_lock);
413
414 return 0;
415 }
416
417 void __ext4_fc_track_unlink(handle_t *handle,
418 struct inode *inode, struct dentry *dentry)
419 {
420 struct __track_dentry_update_args args;
421 int ret;
422
423 args.dentry = dentry;
424 args.op = EXT4_FC_TAG_UNLINK;
425
426 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427 (void *)&args, 0);
428 trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430
431 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
432 {
433 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
434 }
435
436 void __ext4_fc_track_link(handle_t *handle,
437 struct inode *inode, struct dentry *dentry)
438 {
439 struct __track_dentry_update_args args;
440 int ret;
441
442 args.dentry = dentry;
443 args.op = EXT4_FC_TAG_LINK;
444
445 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446 (void *)&args, 0);
447 trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449
450 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
451 {
452 __ext4_fc_track_link(handle, d_inode(dentry), dentry);
453 }
454
455 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
456 struct dentry *dentry)
457 {
458 struct __track_dentry_update_args args;
459 int ret;
460
461 args.dentry = dentry;
462 args.op = EXT4_FC_TAG_CREAT;
463
464 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465 (void *)&args, 0);
466 trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468
469 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
470 {
471 __ext4_fc_track_create(handle, d_inode(dentry), dentry);
472 }
473
474 /* __track_fn for inode tracking */
475 static int __track_inode(struct inode *inode, void *arg, bool update)
476 {
477 if (update)
478 return -EEXIST;
479
480 EXT4_I(inode)->i_fc_lblk_len = 0;
481
482 return 0;
483 }
484
485 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
486 {
487 int ret;
488
489 if (S_ISDIR(inode->i_mode))
490 return;
491
492 if (ext4_should_journal_data(inode)) {
493 ext4_fc_mark_ineligible(inode->i_sb,
494 EXT4_FC_REASON_INODE_JOURNAL_DATA);
495 return;
496 }
497
498 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
499 trace_ext4_fc_track_inode(inode, ret);
500 }
501
502 struct __track_range_args {
503 ext4_lblk_t start, end;
504 };
505
506 /* __track_fn for tracking data updates */
507 static int __track_range(struct inode *inode, void *arg, bool update)
508 {
509 struct ext4_inode_info *ei = EXT4_I(inode);
510 ext4_lblk_t oldstart;
511 struct __track_range_args *__arg =
512 (struct __track_range_args *)arg;
513
514 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
515 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
516 return -ECANCELED;
517 }
518
519 oldstart = ei->i_fc_lblk_start;
520
521 if (update && ei->i_fc_lblk_len > 0) {
522 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
523 ei->i_fc_lblk_len =
524 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
525 ei->i_fc_lblk_start + 1;
526 } else {
527 ei->i_fc_lblk_start = __arg->start;
528 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
529 }
530
531 return 0;
532 }
533
534 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
535 ext4_lblk_t end)
536 {
537 struct __track_range_args args;
538 int ret;
539
540 if (S_ISDIR(inode->i_mode))
541 return;
542
543 args.start = start;
544 args.end = end;
545
546 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
547
548 trace_ext4_fc_track_range(inode, start, end, ret);
549 }
550
551 static void ext4_fc_submit_bh(struct super_block *sb)
552 {
553 int write_flags = REQ_SYNC;
554 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
555
556 /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
557 if (test_opt(sb, BARRIER))
558 write_flags |= REQ_FUA | REQ_PREFLUSH;
559 lock_buffer(bh);
560 set_buffer_dirty(bh);
561 set_buffer_uptodate(bh);
562 bh->b_end_io = ext4_end_buffer_io_sync;
563 submit_bh(REQ_OP_WRITE, write_flags, bh);
564 EXT4_SB(sb)->s_fc_bh = NULL;
565 }
566
567 /* Ext4 commit path routines */
568
569 /* memzero and update CRC */
570 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
571 u32 *crc)
572 {
573 void *ret;
574
575 ret = memset(dst, 0, len);
576 if (crc)
577 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
578 return ret;
579 }
580
581 /*
582 * Allocate len bytes on a fast commit buffer.
583 *
584 * During the commit time this function is used to manage fast commit
585 * block space. We don't split a fast commit log onto different
586 * blocks. So this function makes sure that if there's not enough space
587 * on the current block, the remaining space in the current block is
588 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a new
589 * block is requested from jbd2 and the CRC is updated to reflect the
590 * padding we added.
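 *
 * Schematically, a padded block then looks like this (illustrative):
 *
 *	| TLVs written so far | fc_tag=PAD, fc_len | zeroed pad bytes |
 *
 * and the requested space is handed out from the start of a fresh block.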
591 */
592 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
593 {
594 struct ext4_fc_tl *tl;
595 struct ext4_sb_info *sbi = EXT4_SB(sb);
596 struct buffer_head *bh;
597 int bsize = sbi->s_journal->j_blocksize;
598 int ret, off = sbi->s_fc_bytes % bsize;
599 int pad_len;
600
601 /*
602 * After allocating len bytes, we should still have space at least for a
603 * padding TLV with a zero-length value.
604 */
605 if (len + sizeof(struct ext4_fc_tl) > bsize)
606 return NULL;
607
608 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
609 /*
610 * Only allocate from current buffer if we have enough space for
611 * this request AND we have space to add a zero byte padding.
612 */
613 if (!sbi->s_fc_bh) {
614 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
615 if (ret)
616 return NULL;
617 sbi->s_fc_bh = bh;
618 }
619 sbi->s_fc_bytes += len;
620 return sbi->s_fc_bh->b_data + off;
621 }
622 /* Need to add PAD tag */
623 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
624 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
625 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
626 tl->fc_len = cpu_to_le16(pad_len);
627 if (crc)
628 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
629 if (pad_len > 0)
630 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
631 ext4_fc_submit_bh(sb);
632
633 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
634 if (ret)
635 return NULL;
636 sbi->s_fc_bh = bh;
637 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
638 return sbi->s_fc_bh->b_data;
639 }
640
641 /* memcpy to fc reserved space and update CRC */
642 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
643 int len, u32 *crc)
644 {
645 if (crc)
646 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
647 return memcpy(dst, src, len);
648 }
649
650 /*
651 * Complete a fast commit by writing tail tag.
652 *
653 * Writing tail tag marks the end of a fast commit. In order to guarantee
654 * atomicity, after writing the tail tag, even if there's space remaining
655 * in the block, the next commit shouldn't use it. That's why the tail tag
656 * is given the length of the remaining space in the block.
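 *
 * Schematically, the end of a fast commit looks like this (illustrative):
 *
 *	| ... TLVs ... | fc_tag=TAIL, fc_len | fc_tid | fc_crc |
 *
 * where fc_len is sized to cover the rest of the block.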
657 */
658 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
659 {
660 struct ext4_sb_info *sbi = EXT4_SB(sb);
661 struct ext4_fc_tl tl;
662 struct ext4_fc_tail tail;
663 int off, bsize = sbi->s_journal->j_blocksize;
664 u8 *dst;
665
666 /*
667 * ext4_fc_reserve_space takes care of allocating an extra block if
668 * there's not enough space in this block to accommodate the tail.
669 */
670 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
671 if (!dst)
672 return -ENOSPC;
673
674 off = sbi->s_fc_bytes % bsize;
675
676 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
677 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
678 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
679
680 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
681 dst += sizeof(tl);
682 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
683 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
684 dst += sizeof(tail.fc_tid);
685 tail.fc_crc = cpu_to_le32(crc);
686 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
687
688 ext4_fc_submit_bh(sb);
689
690 return 0;
691 }
692
693 /*
694 * Adds tag, length and value, and updates the CRC. Returns true if the TLV
695 * was added. Returns false if there's not enough space.
696 */
697 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
698 u32 *crc)
699 {
700 struct ext4_fc_tl tl;
701 u8 *dst;
702
703 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
704 if (!dst)
705 return false;
706
707 tl.fc_tag = cpu_to_le16(tag);
708 tl.fc_len = cpu_to_le16(len);
709
710 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
711 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
712
713 return true;
714 }
715
716 /* Same as above, but adds a dentry TLV. */
717 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
718 int parent_ino, int ino, int dlen,
719 const unsigned char *dname,
720 u32 *crc)
721 {
722 struct ext4_fc_dentry_info fcd;
723 struct ext4_fc_tl tl;
724 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
725 crc);
726
727 if (!dst)
728 return false;
729
730 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
731 fcd.fc_ino = cpu_to_le32(ino);
732 tl.fc_tag = cpu_to_le16(tag);
733 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
734 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
735 dst += sizeof(tl);
736 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
737 dst += sizeof(fcd);
738 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
739 dst += dlen;
740
741 return true;
742 }
743
744 /*
745 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
746 * Returns 0 on success, error on failure.
747 */
748 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
749 {
750 struct ext4_inode_info *ei = EXT4_I(inode);
751 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
752 int ret;
753 struct ext4_iloc iloc;
754 struct ext4_fc_inode fc_inode;
755 struct ext4_fc_tl tl;
756 u8 *dst;
757
758 ret = ext4_get_inode_loc(inode, &iloc);
759 if (ret)
760 return ret;
761
762 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
763 inode_len += ei->i_extra_isize;
764
765 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
766 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
767 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
768
769 dst = ext4_fc_reserve_space(inode->i_sb,
770 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
771 if (!dst)
772 return -ECANCELED;
773
774 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
775 return -ECANCELED;
776 dst += sizeof(tl);
777 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
778 return -ECANCELED;
779 dst += sizeof(fc_inode);
780 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
781 inode_len, crc))
782 return -ECANCELED;
783
784 return 0;
785 }
786
787 /*
788 * Writes updated data ranges for the inode in question. Updates CRC.
789 * Returns 0 on success, error otherwise.
790 */
791 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
792 {
793 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
794 struct ext4_inode_info *ei = EXT4_I(inode);
795 struct ext4_map_blocks map;
796 struct ext4_fc_add_range fc_ext;
797 struct ext4_fc_del_range lrange;
798 struct ext4_extent *ex;
799 int ret;
800
801 mutex_lock(&ei->i_fc_lock);
802 if (ei->i_fc_lblk_len == 0) {
803 mutex_unlock(&ei->i_fc_lock);
804 return 0;
805 }
806 old_blk_size = ei->i_fc_lblk_start;
807 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
808 ei->i_fc_lblk_len = 0;
809 mutex_unlock(&ei->i_fc_lock);
810
811 cur_lblk_off = old_blk_size;
812 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
813 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
814
815 while (cur_lblk_off <= new_blk_size) {
816 map.m_lblk = cur_lblk_off;
817 map.m_len = new_blk_size - cur_lblk_off + 1;
818 ret = ext4_map_blocks(NULL, inode, &map, 0);
819 if (ret < 0)
820 return -ECANCELED;
821
822 if (map.m_len == 0) {
823 cur_lblk_off++;
824 continue;
825 }
826
827 if (ret == 0) {
828 lrange.fc_ino = cpu_to_le32(inode->i_ino);
829 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
830 lrange.fc_len = cpu_to_le32(map.m_len);
831 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
832 sizeof(lrange), (u8 *)&lrange, crc))
833 return -ENOSPC;
834 } else {
835 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
836 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
837
838 /* Limit the number of blocks in one extent */
839 map.m_len = min(max, map.m_len);
840
841 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
842 ex = (struct ext4_extent *)&fc_ext.fc_ex;
843 ex->ee_block = cpu_to_le32(map.m_lblk);
844 ex->ee_len = cpu_to_le16(map.m_len);
845 ext4_ext_store_pblock(ex, map.m_pblk);
846 if (map.m_flags & EXT4_MAP_UNWRITTEN)
847 ext4_ext_mark_unwritten(ex);
848 else
849 ext4_ext_mark_initialized(ex);
850 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
851 sizeof(fc_ext), (u8 *)&fc_ext, crc))
852 return -ENOSPC;
853 }
854
855 cur_lblk_off += map.m_len;
856 }
857
858 return 0;
859 }
860
861
862 /* Submit data for all the fast commit inodes */
863 static int ext4_fc_submit_inode_data_all(journal_t *journal)
864 {
865 struct super_block *sb = (struct super_block *)(journal->j_private);
866 struct ext4_sb_info *sbi = EXT4_SB(sb);
867 struct ext4_inode_info *ei;
868 struct list_head *pos;
869 int ret = 0;
870
871 spin_lock(&sbi->s_fc_lock);
872 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
873 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
874 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
875 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
876 while (atomic_read(&ei->i_fc_updates)) {
877 DEFINE_WAIT(wait);
878
879 prepare_to_wait(&ei->i_fc_wait, &wait,
880 TASK_UNINTERRUPTIBLE);
881 if (atomic_read(&ei->i_fc_updates)) {
882 spin_unlock(&sbi->s_fc_lock);
883 schedule();
884 spin_lock(&sbi->s_fc_lock);
885 }
886 finish_wait(&ei->i_fc_wait, &wait);
887 }
888 spin_unlock(&sbi->s_fc_lock);
889 ret = jbd2_submit_inode_data(ei->jinode);
890 if (ret)
891 return ret;
892 spin_lock(&sbi->s_fc_lock);
893 }
894 spin_unlock(&sbi->s_fc_lock);
895
896 return ret;
897 }
898
899 /* Wait for completion of data for all the fast commit inodes */
900 static int ext4_fc_wait_inode_data_all(journal_t *journal)
901 {
902 struct super_block *sb = (struct super_block *)(journal->j_private);
903 struct ext4_sb_info *sbi = EXT4_SB(sb);
904 struct ext4_inode_info *pos, *n;
905 int ret = 0;
906
907 spin_lock(&sbi->s_fc_lock);
908 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
909 if (!ext4_test_inode_state(&pos->vfs_inode,
910 EXT4_STATE_FC_COMMITTING))
911 continue;
912 spin_unlock(&sbi->s_fc_lock);
913
914 ret = jbd2_wait_inode_data(journal, pos->jinode);
915 if (ret)
916 return ret;
917 spin_lock(&sbi->s_fc_lock);
918 }
919 spin_unlock(&sbi->s_fc_lock);
920
921 return 0;
922 }
923
924 /* Commit all the directory entry updates */
925 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
926 __acquires(&sbi->s_fc_lock)
927 __releases(&sbi->s_fc_lock)
928 {
929 struct super_block *sb = (struct super_block *)(journal->j_private);
930 struct ext4_sb_info *sbi = EXT4_SB(sb);
931 struct ext4_fc_dentry_update *fc_dentry;
932 struct inode *inode;
933 struct list_head *pos, *n, *fcd_pos, *fcd_n;
934 struct ext4_inode_info *ei;
935 int ret;
936
937 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
938 return 0;
939 list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
940 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
941 fcd_list);
942 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
943 spin_unlock(&sbi->s_fc_lock);
944 if (!ext4_fc_add_dentry_tlv(
945 sb, fc_dentry->fcd_op,
946 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
947 fc_dentry->fcd_name.len,
948 fc_dentry->fcd_name.name, crc)) {
949 ret = -ENOSPC;
950 goto lock_and_exit;
951 }
952 spin_lock(&sbi->s_fc_lock);
953 continue;
954 }
955
956 inode = NULL;
957 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
958 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
959 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
960 inode = &ei->vfs_inode;
961 break;
962 }
963 }
964 /*
965 * If we don't find the inode in our list, then it was deleted,
966 * in which case we don't need to record its create tag.
967 */
968 if (!inode)
969 continue;
970 spin_unlock(&sbi->s_fc_lock);
971
972 /*
973 * We first write the inode and then the create dirent. This
974 * allows the recovery code to create an unnamed inode first
975 * and then link it to a directory entry. This allows us
976 * to use namei.c routines almost as is and simplifies
977 * the recovery code.
978 */
979 ret = ext4_fc_write_inode(inode, crc);
980 if (ret)
981 goto lock_and_exit;
982
983 ret = ext4_fc_write_inode_data(inode, crc);
984 if (ret)
985 goto lock_and_exit;
986
987 if (!ext4_fc_add_dentry_tlv(
988 sb, fc_dentry->fcd_op,
989 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
990 fc_dentry->fcd_name.len,
991 fc_dentry->fcd_name.name, crc)) {
992 ret = -ENOSPC;
993 goto lock_and_exit;
994 }
995
996 spin_lock(&sbi->s_fc_lock);
997 }
998 return 0;
999 lock_and_exit:
1000 spin_lock(&sbi->s_fc_lock);
1001 return ret;
1002 }
1003
1004 static int ext4_fc_perform_commit(journal_t *journal)
1005 {
1006 struct super_block *sb = (struct super_block *)(journal->j_private);
1007 struct ext4_sb_info *sbi = EXT4_SB(sb);
1008 struct ext4_inode_info *iter;
1009 struct ext4_fc_head head;
1010 struct list_head *pos;
1011 struct inode *inode;
1012 struct blk_plug plug;
1013 int ret = 0;
1014 u32 crc = 0;
1015
1016 ret = ext4_fc_submit_inode_data_all(journal);
1017 if (ret)
1018 return ret;
1019
1020 ret = ext4_fc_wait_inode_data_all(journal);
1021 if (ret)
1022 return ret;
1023
1024 /*
1025 * If the file system device is different from the journal device, issue a
1026 * cache flush before we start writing fast commit blocks.
1027 */
1028 if (journal->j_fs_dev != journal->j_dev)
1029 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1030
1031 blk_start_plug(&plug);
1032 if (sbi->s_fc_bytes == 0) {
1033 /*
1034 * Add a head tag only if this is the first fast commit
1035 * in this TID.
1036 */
1037 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1038 head.fc_tid = cpu_to_le32(
1039 sbi->s_journal->j_running_transaction->t_tid);
1040 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1041 (u8 *)&head, &crc)) {
1042 ret = -ENOSPC;
1043 goto out;
1044 }
1045 }
1046
1047 spin_lock(&sbi->s_fc_lock);
1048 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1049 if (ret) {
1050 spin_unlock(&sbi->s_fc_lock);
1051 goto out;
1052 }
1053
1054 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1055 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1056 inode = &iter->vfs_inode;
1057 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1058 continue;
1059
1060 spin_unlock(&sbi->s_fc_lock);
1061 ret = ext4_fc_write_inode_data(inode, &crc);
1062 if (ret)
1063 goto out;
1064 ret = ext4_fc_write_inode(inode, &crc);
1065 if (ret)
1066 goto out;
1067 spin_lock(&sbi->s_fc_lock);
1068 }
1069 spin_unlock(&sbi->s_fc_lock);
1070
1071 ret = ext4_fc_write_tail(sb, crc);
1072
1073 out:
1074 blk_finish_plug(&plug);
1075 return ret;
1076 }
1077
1078 /*
1079 * The main commit entry point. Performs a fast commit for transaction
1080 * commit_tid if needed. If it's not possible to perform a fast commit
1081 * due to various reasons, we fall back to full commit. Returns 0
1082 * on success, error otherwise.
1083 */
1084 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1085 {
1086 struct super_block *sb = (struct super_block *)(journal->j_private);
1087 struct ext4_sb_info *sbi = EXT4_SB(sb);
1088 int nblks = 0, ret, bsize = journal->j_blocksize;
1089 int subtid = atomic_read(&sbi->s_fc_subtid);
1090 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1091 ktime_t start_time, commit_time;
1092
1093 trace_ext4_fc_commit_start(sb);
1094
1095 start_time = ktime_get();
1096
1097 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1098 (ext4_fc_is_ineligible(sb))) {
1099 reason = EXT4_FC_REASON_INELIGIBLE;
1100 goto out;
1101 }
1102
1103 restart_fc:
1104 ret = jbd2_fc_begin_commit(journal, commit_tid);
1105 if (ret == -EALREADY) {
1106 /* There was an ongoing commit, check if we need to restart */
1107 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1108 commit_tid > journal->j_commit_sequence)
1109 goto restart_fc;
1110 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1111 goto out;
1112 } else if (ret) {
1113 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1114 reason = EXT4_FC_REASON_FC_START_FAILED;
1115 goto out;
1116 }
1117
1118 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1119 ret = ext4_fc_perform_commit(journal);
1120 if (ret < 0) {
1121 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1122 reason = EXT4_FC_REASON_FC_FAILED;
1123 goto out;
1124 }
1125 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1126 ret = jbd2_fc_wait_bufs(journal, nblks);
1127 if (ret < 0) {
1128 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1129 reason = EXT4_FC_REASON_FC_FAILED;
1130 goto out;
1131 }
1132 atomic_inc(&sbi->s_fc_subtid);
1133 jbd2_fc_end_commit(journal);
1134 out:
1135 /* Has any ineligible update happened since we started? */
1136 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1137 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1138 reason = EXT4_FC_REASON_INELIGIBLE;
1139 }
1140
1141 spin_lock(&sbi->s_fc_lock);
1142 if (reason != EXT4_FC_REASON_OK &&
1143 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1144 sbi->s_fc_stats.fc_ineligible_commits++;
1145 } else {
1146 sbi->s_fc_stats.fc_num_commits++;
1147 sbi->s_fc_stats.fc_numblks += nblks;
1148 }
1149 spin_unlock(&sbi->s_fc_lock);
1150 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1151 trace_ext4_fc_commit_stop(sb, nblks, reason);
1152 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1153 /*
1154 * weight the commit time higher than the average time so we don't
1155 * react too strongly to vast changes in the commit time
1156 */
1157 if (likely(sbi->s_fc_avg_commit_time))
1158 sbi->s_fc_avg_commit_time = (commit_time +
1159 sbi->s_fc_avg_commit_time * 3) / 4;
1160 else
1161 sbi->s_fc_avg_commit_time = commit_time;
1162 jbd_debug(1,
1163 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1164 nblks, reason, subtid);
1165 if (reason == EXT4_FC_REASON_FC_FAILED)
1166 return jbd2_fc_end_commit_fallback(journal);
1167 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1168 reason == EXT4_FC_REASON_INELIGIBLE)
1169 return jbd2_complete_transaction(journal, commit_tid);
1170 return 0;
1171 }
1172
1173 /*
1174 * Fast commit cleanup routine. This is called after every fast commit and
1175 * full commit. full is true if we are called after a full commit.
1176 */
1177 static void ext4_fc_cleanup(journal_t *journal, int full)
1178 {
1179 struct super_block *sb = journal->j_private;
1180 struct ext4_sb_info *sbi = EXT4_SB(sb);
1181 struct ext4_inode_info *iter;
1182 struct ext4_fc_dentry_update *fc_dentry;
1183 struct list_head *pos, *n;
1184
1185 if (full && sbi->s_fc_bh)
1186 sbi->s_fc_bh = NULL;
1187
1188 jbd2_fc_release_bufs(journal);
1189
1190 spin_lock(&sbi->s_fc_lock);
1191 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1192 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1193 list_del_init(&iter->i_fc_list);
1194 ext4_clear_inode_state(&iter->vfs_inode,
1195 EXT4_STATE_FC_COMMITTING);
1196 ext4_fc_reset_inode(&iter->vfs_inode);
1197 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1198 smp_mb();
1199 #if (BITS_PER_LONG < 64)
1200 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1201 #else
1202 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1203 #endif
1204 }
1205
1206 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1207 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1208 struct ext4_fc_dentry_update,
1209 fcd_list);
1210 list_del_init(&fc_dentry->fcd_list);
1211 spin_unlock(&sbi->s_fc_lock);
1212
1213 if (fc_dentry->fcd_name.name &&
1214 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1215 kfree(fc_dentry->fcd_name.name);
1216 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1217 spin_lock(&sbi->s_fc_lock);
1218 }
1219
1220 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1221 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1222 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1223 &sbi->s_fc_q[FC_Q_MAIN]);
1224
1225 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1226 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1227
1228 if (full)
1229 sbi->s_fc_bytes = 0;
1230 spin_unlock(&sbi->s_fc_lock);
1231 trace_ext4_fc_stats(sb);
1232 }
1233
1234 /* Ext4 Replay Path Routines */
1235
1236 /* Helper struct for dentry replay routines */
1237 struct dentry_info_args {
1238 int parent_ino, dname_len, ino, inode_len;
1239 char *dname;
1240 };
1241
1242 static inline void tl_to_darg(struct dentry_info_args *darg,
1243 struct ext4_fc_tl *tl, u8 *val)
1244 {
1245 struct ext4_fc_dentry_info fcd;
1246
1247 memcpy(&fcd, val, sizeof(fcd));
1248
1249 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1250 darg->ino = le32_to_cpu(fcd.fc_ino);
1251 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1252 darg->dname_len = le16_to_cpu(tl->fc_len) -
1253 sizeof(struct ext4_fc_dentry_info);
1254 }
1255
1256 /* Unlink replay function */
1257 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1258 u8 *val)
1259 {
1260 struct inode *inode, *old_parent;
1261 struct qstr entry;
1262 struct dentry_info_args darg;
1263 int ret = 0;
1264
1265 tl_to_darg(&darg, tl, val);
1266
1267 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1268 darg.parent_ino, darg.dname_len);
1269
1270 entry.name = darg.dname;
1271 entry.len = darg.dname_len;
1272 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1273
1274 if (IS_ERR(inode)) {
1275 jbd_debug(1, "Inode %d not found", darg.ino);
1276 return 0;
1277 }
1278
1279 old_parent = ext4_iget(sb, darg.parent_ino,
1280 EXT4_IGET_NORMAL);
1281 if (IS_ERR(old_parent)) {
1282 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1283 iput(inode);
1284 return 0;
1285 }
1286
1287 ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1288 /* -ENOENT is ok because it might not exist anymore. */
1289 if (ret == -ENOENT)
1290 ret = 0;
1291 iput(old_parent);
1292 iput(inode);
1293 return ret;
1294 }
1295
1296 static int ext4_fc_replay_link_internal(struct super_block *sb,
1297 struct dentry_info_args *darg,
1298 struct inode *inode)
1299 {
1300 struct inode *dir = NULL;
1301 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1302 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1303 int ret = 0;
1304
1305 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1306 if (IS_ERR(dir)) {
1307 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1308 dir = NULL;
1309 goto out;
1310 }
1311
1312 dentry_dir = d_obtain_alias(dir);
1313 if (IS_ERR(dentry_dir)) {
1314 jbd_debug(1, "Failed to obtain dentry");
1315 dentry_dir = NULL;
1316 goto out;
1317 }
1318
1319 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1320 if (!dentry_inode) {
1321 jbd_debug(1, "Inode dentry not created.");
1322 ret = -ENOMEM;
1323 goto out;
1324 }
1325
1326 ret = __ext4_link(dir, inode, dentry_inode);
1327 /*
1328 * It's possible that link already existed since data blocks
1329 * for the dir in question got persisted before we crashed OR
1330 * we replayed this tag and crashed before the entire replay
1331 * could complete.
1332 */
1333 if (ret && ret != -EEXIST) {
1334 jbd_debug(1, "Failed to link\n");
1335 goto out;
1336 }
1337
1338 ret = 0;
1339 out:
1340 if (dentry_dir) {
1341 d_drop(dentry_dir);
1342 dput(dentry_dir);
1343 } else if (dir) {
1344 iput(dir);
1345 }
1346 if (dentry_inode) {
1347 d_drop(dentry_inode);
1348 dput(dentry_inode);
1349 }
1350
1351 return ret;
1352 }
1353
1354 /* Link replay function */
1355 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1356 u8 *val)
1357 {
1358 struct inode *inode;
1359 struct dentry_info_args darg;
1360 int ret = 0;
1361
1362 tl_to_darg(&darg, tl, val);
1363 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1364 darg.parent_ino, darg.dname_len);
1365
1366 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1367 if (IS_ERR(inode)) {
1368 jbd_debug(1, "Inode not found.");
1369 return 0;
1370 }
1371
1372 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1373 iput(inode);
1374 return ret;
1375 }
1376
1377 /*
1378 * Record all the modified inodes during replay. We use this later to set up
1379 * block bitmaps correctly.
1380 */
1381 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1382 {
1383 struct ext4_fc_replay_state *state;
1384 int i;
1385
1386 state = &EXT4_SB(sb)->s_fc_replay_state;
1387 for (i = 0; i < state->fc_modified_inodes_used; i++)
1388 if (state->fc_modified_inodes[i] == ino)
1389 return 0;
1390 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1391 state->fc_modified_inodes_size +=
1392 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1393 state->fc_modified_inodes = krealloc(
1394 state->fc_modified_inodes, sizeof(int) *
1395 state->fc_modified_inodes_size,
1396 GFP_KERNEL);
1397 if (!state->fc_modified_inodes)
1398 return -ENOMEM;
1399 }
1400 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1401 return 0;
1402 }
1403
1404 /*
1405 * Inode replay function
1406 */
1407 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1408 u8 *val)
1409 {
1410 struct ext4_fc_inode fc_inode;
1411 struct ext4_inode *raw_inode;
1412 struct ext4_inode *raw_fc_inode;
1413 struct inode *inode = NULL;
1414 struct ext4_iloc iloc;
1415 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1416 struct ext4_extent_header *eh;
1417
1418 memcpy(&fc_inode, val, sizeof(fc_inode));
1419
1420 ino = le32_to_cpu(fc_inode.fc_ino);
1421 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1422
1423 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1424 if (!IS_ERR(inode)) {
1425 ext4_ext_clear_bb(inode);
1426 iput(inode);
1427 }
1428 inode = NULL;
1429
1430 ext4_fc_record_modified_inode(sb, ino);
1431
1432 raw_fc_inode = (struct ext4_inode *)
1433 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1434 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1435 if (ret)
1436 goto out;
1437
1438 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1439 raw_inode = ext4_raw_inode(&iloc);
1440
1441 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1442 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1443 inode_len - offsetof(struct ext4_inode, i_generation));
1444 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1445 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1446 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1447 memset(eh, 0, sizeof(*eh));
1448 eh->eh_magic = EXT4_EXT_MAGIC;
1449 eh->eh_max = cpu_to_le16(
1450 (sizeof(raw_inode->i_block) -
1451 sizeof(struct ext4_extent_header))
1452 / sizeof(struct ext4_extent));
1453 }
1454 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1455 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1456 sizeof(raw_inode->i_block));
1457 }
1458
1459 /* Immediately update the inode on disk. */
1460 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1461 if (ret)
1462 goto out;
1463 ret = sync_dirty_buffer(iloc.bh);
1464 if (ret)
1465 goto out;
1466 ret = ext4_mark_inode_used(sb, ino);
1467 if (ret)
1468 goto out;
1469
1470 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1471 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1472 if (IS_ERR(inode)) {
1473 jbd_debug(1, "Inode not found.");
1474 return -EFSCORRUPTED;
1475 }
1476
1477 /*
1478 * Our allocator could have made different decisions than before
1479 * crashing. This should be fixed but until then, we calculate
1480 * the number of blocks the inode occupies.
1481 */
1482 ext4_ext_replay_set_iblocks(inode);
1483
1484 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1485 ext4_reset_inode_seed(inode);
1486
1487 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1488 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1489 sync_dirty_buffer(iloc.bh);
1490 brelse(iloc.bh);
1491 out:
1492 iput(inode);
1493 if (!ret)
1494 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1495
1496 return 0;
1497 }
1498
1499 /*
1500 * Dentry create replay function.
1501 *
1502 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
1503 * inode for which we are trying to create a dentry here should already have
1504 * been replayed before we start here.
1505 */
1506 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1507 u8 *val)
1508 {
1509 int ret = 0;
1510 struct inode *inode = NULL;
1511 struct inode *dir = NULL;
1512 struct dentry_info_args darg;
1513
1514 tl_to_darg(&darg, tl, val);
1515
1516 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1517 darg.parent_ino, darg.dname_len);
1518
1519 /* This takes care of updating the group descriptor and other metadata */
1520 ret = ext4_mark_inode_used(sb, darg.ino);
1521 if (ret)
1522 goto out;
1523
1524 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1525 if (IS_ERR(inode)) {
1526 jbd_debug(1, "inode %d not found.", darg.ino);
1527 inode = NULL;
1528 ret = -EINVAL;
1529 goto out;
1530 }
1531
1532 if (S_ISDIR(inode->i_mode)) {
1533 /*
1534 * If we are creating a directory, we need to make sure that the
1535 * dot and dot dot dirents are set up properly.
1536 */
1537 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1538 if (IS_ERR(dir)) {
1539 jbd_debug(1, "Dir %d not found.", darg.ino);
1540 goto out;
1541 }
1542 ret = ext4_init_new_dir(NULL, dir, inode);
1543 iput(dir);
1544 if (ret) {
1545 ret = 0;
1546 goto out;
1547 }
1548 }
1549 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1550 if (ret)
1551 goto out;
1552 set_nlink(inode, 1);
1553 ext4_mark_inode_dirty(NULL, inode);
1554 out:
1555 if (inode)
1556 iput(inode);
1557 return ret;
1558 }
1559
1560 /*
1561 * Record physical disk regions which are in use as per the fast commit area.
1562 * Our simple replay phase allocator excludes these regions from allocation.
1563 */
1564 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1565 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1566 {
1567 struct ext4_fc_replay_state *state;
1568 struct ext4_fc_alloc_region *region;
1569
1570 state = &EXT4_SB(sb)->s_fc_replay_state;
1571 if (state->fc_regions_used == state->fc_regions_size) {
1572 state->fc_regions_size +=
1573 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1574 state->fc_regions = krealloc(
1575 state->fc_regions,
1576 state->fc_regions_size *
1577 sizeof(struct ext4_fc_alloc_region),
1578 GFP_KERNEL);
1579 if (!state->fc_regions)
1580 return -ENOMEM;
1581 }
1582 region = &state->fc_regions[state->fc_regions_used++];
1583 region->ino = ino;
1584 region->lblk = lblk;
1585 region->pblk = pblk;
1586 region->len = len;
1587
1588 return 0;
1589 }
1590
1591 /* Replay add range tag */
1592 static int ext4_fc_replay_add_range(struct super_block *sb,
1593 struct ext4_fc_tl *tl, u8 *val)
1594 {
1595 struct ext4_fc_add_range fc_add_ex;
1596 struct ext4_extent newex, *ex;
1597 struct inode *inode;
1598 ext4_lblk_t start, cur;
1599 int remaining, len;
1600 ext4_fsblk_t start_pblk;
1601 struct ext4_map_blocks map;
1602 struct ext4_ext_path *path = NULL;
1603 int ret;
1604
1605 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1606 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1607
1608 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1609 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1610 ext4_ext_get_actual_len(ex));
1611
1612 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1613 if (IS_ERR(inode)) {
1614 jbd_debug(1, "Inode not found.");
1615 return 0;
1616 }
1617
1618 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1619
1620 start = le32_to_cpu(ex->ee_block);
1621 start_pblk = ext4_ext_pblock(ex);
1622 len = ext4_ext_get_actual_len(ex);
1623
1624 cur = start;
1625 remaining = len;
1626 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1627 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1628 inode->i_ino);
1629
1630 while (remaining > 0) {
1631 map.m_lblk = cur;
1632 map.m_len = remaining;
1633 map.m_pblk = 0;
1634 ret = ext4_map_blocks(NULL, inode, &map, 0);
1635
1636 if (ret < 0) {
1637 iput(inode);
1638 return 0;
1639 }
1640
1641 if (ret == 0) {
1642 /* Range is not mapped */
1643 path = ext4_find_extent(inode, cur, NULL, 0);
1644 if (IS_ERR(path)) {
1645 iput(inode);
1646 return 0;
1647 }
1648 memset(&newex, 0, sizeof(newex));
1649 newex.ee_block = cpu_to_le32(cur);
1650 ext4_ext_store_pblock(
1651 &newex, start_pblk + cur - start);
1652 newex.ee_len = cpu_to_le16(map.m_len);
1653 if (ext4_ext_is_unwritten(ex))
1654 ext4_ext_mark_unwritten(&newex);
1655 down_write(&EXT4_I(inode)->i_data_sem);
1656 ret = ext4_ext_insert_extent(
1657 NULL, inode, &path, &newex, 0);
1658 up_write((&EXT4_I(inode)->i_data_sem));
1659 ext4_ext_drop_refs(path);
1660 kfree(path);
1661 if (ret) {
1662 iput(inode);
1663 return 0;
1664 }
1665 goto next;
1666 }
1667
1668 if (start_pblk + cur - start != map.m_pblk) {
1669 /*
1670 * Logical to physical mapping changed. This can happen
1671 * if this range was removed and then reallocated to
1672 * map to new physical blocks during a fast commit.
1673 */
1674 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1675 ext4_ext_is_unwritten(ex),
1676 start_pblk + cur - start);
1677 if (ret) {
1678 iput(inode);
1679 return 0;
1680 }
1681 /*
1682 * Mark the old blocks as free since they aren't used
1683 * anymore. We maintain an array of all the modified
1684 * inodes. In case these blocks are still used at either
1685 * a different logical range in the same inode or in
1686 * some different inode, we will mark them as allocated
1687 * at the end of the FC replay using our array of
1688 * modified inodes.
1689 */
1690 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1691 goto next;
1692 }
1693
1694 /* Range is mapped and needs a state change */
1695 jbd_debug(1, "Converting from %ld to %d %lld",
1696 map.m_flags & EXT4_MAP_UNWRITTEN,
1697 ext4_ext_is_unwritten(ex), map.m_pblk);
1698 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1699 ext4_ext_is_unwritten(ex), map.m_pblk);
1700 if (ret) {
1701 iput(inode);
1702 return 0;
1703 }
1704 /*
1705 * We may have split the extent tree while toggling the state.
1706 * Try to shrink the extent tree now.
1707 */
1708 ext4_ext_replay_shrink_inode(inode, start + len);
1709 next:
1710 cur += map.m_len;
1711 remaining -= map.m_len;
1712 }
1713 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1714 sb->s_blocksize_bits);
1715 iput(inode);
1716 return 0;
1717 }
1718
1719 /* Replay DEL_RANGE tag */
1720 static int
1721 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1722 u8 *val)
1723 {
1724 struct inode *inode;
1725 struct ext4_fc_del_range lrange;
1726 struct ext4_map_blocks map;
1727 ext4_lblk_t cur, remaining;
1728 int ret;
1729
1730 memcpy(&lrange, val, sizeof(lrange));
1731 cur = le32_to_cpu(lrange.fc_lblk);
1732 remaining = le32_to_cpu(lrange.fc_len);
1733
1734 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1735 le32_to_cpu(lrange.fc_ino), cur, remaining);
1736
1737 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1738 if (IS_ERR(inode)) {
1739 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1740 return 0;
1741 }
1742
1743 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1744
1745 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1746 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1747 le32_to_cpu(lrange.fc_len));
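/*
 * The range is dropped in two steps: the loop below releases the
 * current physical blocks in the in-memory block bitmaps, and
 * ext4_punch_hole() afterwards removes the corresponding extents from
 * the inode's extent tree.
 */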
1748 while (remaining > 0) {
1749 map.m_lblk = cur;
1750 map.m_len = remaining;
1751
1752 ret = ext4_map_blocks(NULL, inode, &map, 0);
1753 if (ret < 0) {
1754 iput(inode);
1755 return 0;
1756 }
1757 if (ret > 0) {
1758 remaining -= ret;
1759 cur += ret;
1760 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1761 } else {
1762 remaining -= map.m_len;
1763 cur += map.m_len;
1764 }
1765 }
1766
1767 ret = ext4_punch_hole(inode,
1768 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1769 le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1770 if (ret)
1771 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1772 ext4_ext_replay_shrink_inode(inode,
1773 i_size_read(inode) >> sb->s_blocksize_bits);
1774 ext4_mark_inode_dirty(NULL, inode);
1775 iput(inode);
1776
1777 return 0;
1778 }
1779
1780 static inline const char *tag2str(u16 tag)
1781 {
1782 switch (tag) {
1783 case EXT4_FC_TAG_LINK:
1784 return "TAG_ADD_ENTRY";
1785 case EXT4_FC_TAG_UNLINK:
1786 return "TAG_DEL_ENTRY";
1787 case EXT4_FC_TAG_ADD_RANGE:
1788 return "TAG_ADD_RANGE";
1789 case EXT4_FC_TAG_CREAT:
1790 return "TAG_CREAT_DENTRY";
1791 case EXT4_FC_TAG_DEL_RANGE:
1792 return "TAG_DEL_RANGE";
1793 case EXT4_FC_TAG_INODE:
1794 return "TAG_INODE";
1795 case EXT4_FC_TAG_PAD:
1796 return "TAG_PAD";
1797 case EXT4_FC_TAG_TAIL:
1798 return "TAG_TAIL";
1799 case EXT4_FC_TAG_HEAD:
1800 return "TAG_HEAD";
1801 default:
1802 return "TAG_ERROR";
1803 }
1804 }
1805
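/*
 * Once all tags have been replayed, walk every inode recorded in the
 * modified-inodes array and mark both its data blocks and the blocks
 * backing its extent tree as allocated in the block bitmaps, so that the
 * on-disk allocation state matches the replayed extent trees.
 */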
1806 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1807 {
1808 struct ext4_fc_replay_state *state;
1809 struct inode *inode;
1810 struct ext4_ext_path *path = NULL;
1811 struct ext4_map_blocks map;
1812 int i, ret, j;
1813 ext4_lblk_t cur, end;
1814
1815 state = &EXT4_SB(sb)->s_fc_replay_state;
1816 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1817 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1818 EXT4_IGET_NORMAL);
1819 if (IS_ERR(inode)) {
1820 jbd_debug(1, "Inode %d not found.",
1821 state->fc_modified_inodes[i]);
1822 continue;
1823 }
1824 cur = 0;
1825 end = EXT_MAX_BLOCKS;
1826 while (cur < end) {
1827 map.m_lblk = cur;
1828 map.m_len = end - cur;
1829
1830 ret = ext4_map_blocks(NULL, inode, &map, 0);
1831 if (ret < 0)
1832 break;
1833
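/*
 * ret > 0 means 'ret' blocks starting at m_lblk are mapped:
 * first mark the extent tree blocks on the path to this
 * extent, then the data blocks themselves, as in use.
 */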
1834 if (ret > 0) {
1835 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1836 if (!IS_ERR(path)) {
1837 for (j = 0; j < path->p_depth; j++)
1838 ext4_mb_mark_bb(inode->i_sb,
1839 path[j].p_block, 1, 1);
1840 ext4_ext_drop_refs(path);
1841 kfree(path);
1842 }
1843 cur += ret;
1844 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1845 map.m_len, 1);
1846 } else {
1847 cur = cur + (map.m_len ? map.m_len : 1);
1848 }
1849 }
1850 iput(inode);
1851 }
1852 }
1853
1854 /*
1855 * Check if a block is in the excluded regions for block allocation. The
1856 * simple allocator that runs during the replay phase calls this function
1857 * to check whether it is okay to use a given block.
1858 */
1859 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1860 {
1861 int i;
1862 struct ext4_fc_replay_state *state;
1863
1864 state = &EXT4_SB(sb)->s_fc_replay_state;
1865 for (i = 0; i < state->fc_regions_valid; i++) {
1866 if (state->fc_regions[i].ino == 0 ||
1867 state->fc_regions[i].len == 0)
1868 continue;
1869 if (blk >= state->fc_regions[i].pblk &&
1870 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1871 return true;
1872 }
1873 return false;
1874 }
1875
1876 /* Cleanup function called after replay */
1877 void ext4_fc_replay_cleanup(struct super_block *sb)
1878 {
1879 struct ext4_sb_info *sbi = EXT4_SB(sb);
1880
1881 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1882 kfree(sbi->s_fc_replay_state.fc_regions);
1883 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1884 }
1885
1886 /*
1887 * Recovery Scan phase handler
1888 *
1889 * This function is called during the scan phase and is responsible
1890 * for doing the following things:
1891 * - Make sure the fast commit area has valid tags for replay
1892 * - Count number of tags that need to be replayed by the replay handler
1893 * - Verify CRC
1894 * - Create a list of excluded blocks for allocation during replay phase
1895 *
1896 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1897 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1898 * to indicate that scan has finished and JBD2 can now start replay phase.
1899 * It returns a negative error code if there was an error. At the end
1900 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1901 * to the number of tags that need to be replayed during the replay phase.
1902 */
1903 static int ext4_fc_replay_scan(journal_t *journal,
1904 struct buffer_head *bh, int off,
1905 tid_t expected_tid)
1906 {
1907 struct super_block *sb = journal->j_private;
1908 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_fc_replay_state *state;
1910 int ret = JBD2_FC_REPLAY_CONTINUE;
1911 struct ext4_fc_add_range ext;
1912 struct ext4_fc_tl tl;
1913 struct ext4_fc_tail tail;
1914 __u8 *start, *end, *cur, *val;
1915 struct ext4_fc_head head;
1916 struct ext4_extent *ex;
1917
1918 state = &sbi->s_fc_replay_state;
1919
1920 start = (__u8 *)bh->b_data;
1921 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1922
1923 if (state->fc_replay_expected_off == 0) {
1924 state->fc_cur_tag = 0;
1925 state->fc_replay_num_tags = 0;
1926 state->fc_crc = 0;
1927 state->fc_regions = NULL;
1928 state->fc_regions_valid = state->fc_regions_used =
1929 state->fc_regions_size = 0;
1930 /* Check if we can stop early: no HEAD tag at the start means there is nothing to replay */
1931 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1932 != EXT4_FC_TAG_HEAD)
1933 return 0;
1934 }
1935
1936 if (off != state->fc_replay_expected_off) {
1937 ret = -EFSCORRUPTED;
1938 goto out_err;
1939 }
1940
1941 state->fc_replay_expected_off++;
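/*
 * Walk the block as a sequence of TLVs: each entry is a struct
 * ext4_fc_tl header (tag + length) followed by fc_len bytes of tag
 * specific payload, so the next entry starts at
 * cur + sizeof(tl) + fc_len.
 */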
1942 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1943 memcpy(&tl, cur, sizeof(tl));
1944 val = cur + sizeof(tl);
1945 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1946 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1947 switch (le16_to_cpu(tl.fc_tag)) {
1948 case EXT4_FC_TAG_ADD_RANGE:
1949 memcpy(&ext, val, sizeof(ext));
1950 ex = (struct ext4_extent *)&ext.fc_ex;
1951 ret = ext4_fc_record_regions(sb,
1952 le32_to_cpu(ext.fc_ino),
1953 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1954 ext4_ext_get_actual_len(ex));
1955 if (ret < 0)
1956 break;
1957 ret = JBD2_FC_REPLAY_CONTINUE;
1958 fallthrough;
1959 case EXT4_FC_TAG_DEL_RANGE:
1960 case EXT4_FC_TAG_LINK:
1961 case EXT4_FC_TAG_UNLINK:
1962 case EXT4_FC_TAG_CREAT:
1963 case EXT4_FC_TAG_INODE:
1964 case EXT4_FC_TAG_PAD:
1965 state->fc_cur_tag++;
1966 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1967 sizeof(tl) + le16_to_cpu(tl.fc_len));
1968 break;
1969 case EXT4_FC_TAG_TAIL:
1970 state->fc_cur_tag++;
1971 memcpy(&tail, val, sizeof(tail));
1972 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1973 sizeof(tl) +
1974 offsetof(struct ext4_fc_tail,
1975 fc_crc));
1976 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1977 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1978 state->fc_replay_num_tags = state->fc_cur_tag;
1979 state->fc_regions_valid =
1980 state->fc_regions_used;
1981 } else {
1982 ret = state->fc_replay_num_tags ?
1983 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1984 }
1985 state->fc_crc = 0;
1986 break;
1987 case EXT4_FC_TAG_HEAD:
1988 memcpy(&head, val, sizeof(head));
1989 if (le32_to_cpu(head.fc_features) &
1990 ~EXT4_FC_SUPPORTED_FEATURES) {
1991 ret = -EOPNOTSUPP;
1992 break;
1993 }
1994 if (le32_to_cpu(head.fc_tid) != expected_tid) {
1995 ret = JBD2_FC_REPLAY_STOP;
1996 break;
1997 }
1998 state->fc_cur_tag++;
1999 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2000 sizeof(tl) + le16_to_cpu(tl.fc_len));
2001 break;
2002 default:
2003 ret = state->fc_replay_num_tags ?
2004 JBD2_FC_REPLAY_STOP : -ECANCELED;
2005 }
2006 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2007 break;
2008 }
2009
2010 out_err:
2011 trace_ext4_fc_replay_scan(sb, ret, off);
2012 return ret;
2013 }
2014
2015 /*
2016 * Main recovery path entry point.
2017 * The meaning of the return codes is similar to that of ext4_fc_replay_scan() above.
2018 */
2019 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2020 enum passtype pass, int off, tid_t expected_tid)
2021 {
2022 struct super_block *sb = journal->j_private;
2023 struct ext4_sb_info *sbi = EXT4_SB(sb);
2024 struct ext4_fc_tl tl;
2025 __u8 *start, *end, *cur, *val;
2026 int ret = JBD2_FC_REPLAY_CONTINUE;
2027 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2028 struct ext4_fc_tail tail;
2029
2030 if (pass == PASS_SCAN) {
2031 state->fc_current_pass = PASS_SCAN;
2032 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2033 }
2034
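/*
 * On the first block of a new pass, flag the filesystem as being in
 * fast commit replay so the rest of ext4 can tell that recovery is in
 * progress.
 */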
2035 if (state->fc_current_pass != pass) {
2036 state->fc_current_pass = pass;
2037 sbi->s_mount_state |= EXT4_FC_REPLAY;
2038 }
2039 if (!state->fc_replay_num_tags) {
2040 jbd_debug(1, "Replay stops\n");
2041 ext4_fc_set_bitmaps_and_counters(sb);
2042 return 0;
2043 }
2044
2045 #ifdef CONFIG_EXT4_DEBUG
2046 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2047 pr_warn("Dropping fc block %d because max_replay set\n", off);
2048 return JBD2_FC_REPLAY_STOP;
2049 }
2050 #endif
2051
2052 start = (__u8 *)bh->b_data;
2053 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2054
2055 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2056 memcpy(&tl, cur, sizeof(tl));
2057 val = cur + sizeof(tl);
2058
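/*
 * fc_replay_num_tags was computed by the scan phase; once all
 * counted tags have been consumed, whatever remains in the area
 * is not meant to be replayed.
 */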
2059 if (state->fc_replay_num_tags == 0) {
2060 ret = JBD2_FC_REPLAY_STOP;
2061 ext4_fc_set_bitmaps_and_counters(sb);
2062 break;
2063 }
2064 jbd_debug(3, "Replay phase, tag:%s\n",
2065 tag2str(le16_to_cpu(tl.fc_tag)));
2066 state->fc_replay_num_tags--;
2067 switch (le16_to_cpu(tl.fc_tag)) {
2068 case EXT4_FC_TAG_LINK:
2069 ret = ext4_fc_replay_link(sb, &tl, val);
2070 break;
2071 case EXT4_FC_TAG_UNLINK:
2072 ret = ext4_fc_replay_unlink(sb, &tl, val);
2073 break;
2074 case EXT4_FC_TAG_ADD_RANGE:
2075 ret = ext4_fc_replay_add_range(sb, &tl, val);
2076 break;
2077 case EXT4_FC_TAG_CREAT:
2078 ret = ext4_fc_replay_create(sb, &tl, val);
2079 break;
2080 case EXT4_FC_TAG_DEL_RANGE:
2081 ret = ext4_fc_replay_del_range(sb, &tl, val);
2082 break;
2083 case EXT4_FC_TAG_INODE:
2084 ret = ext4_fc_replay_inode(sb, &tl, val);
2085 break;
2086 case EXT4_FC_TAG_PAD:
2087 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2088 le16_to_cpu(tl.fc_len), 0);
2089 break;
2090 case EXT4_FC_TAG_TAIL:
2091 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2092 le16_to_cpu(tl.fc_len), 0);
2093 memcpy(&tail, val, sizeof(tail));
2094 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2095 break;
2096 case EXT4_FC_TAG_HEAD:
2097 break;
2098 default:
2099 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2100 le16_to_cpu(tl.fc_len), 0);
2101 ret = -ECANCELED;
2102 break;
2103 }
2104 if (ret < 0)
2105 break;
2106 ret = JBD2_FC_REPLAY_CONTINUE;
2107 }
2108 return ret;
2109 }
2110
2111 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2112 {
2113 /*
2114 * We set the replay callback even if fast commit is disabled because we
2115 * could still have fast commit blocks that need to be replayed even if
2116 * fast commit has now been turned off.
2117 */
2118 journal->j_fc_replay_callback = ext4_fc_replay;
2119 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2120 return;
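/*
 * The cleanup callback, on the other hand, is only needed when fast
 * commits are actually enabled.
 */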
2121 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2122 }
2123
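/*
 * Human readable strings for the fast commit ineligibility reasons.
 * ext4_fc_info_show() below indexes this array by the EXT4_FC_REASON_*
 * codes, so the two must be kept in sync.
 */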
2124 static const char *fc_ineligible_reasons[] = {
2125 "Extended attributes changed",
2126 "Cross rename",
2127 "Journal flag changed",
2128 "Insufficient memory",
2129 "Swap boot",
2130 "Resize",
2131 "Dir renamed",
2132 "Falloc range op",
2133 "Data journalling",
2134 "FC Commit Failed"
2135 };
2136
2137 int ext4_fc_info_show(struct seq_file *seq, void *v)
2138 {
2139 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2140 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2141 int i;
2142
2143 if (v != SEQ_START_TOKEN)
2144 return 0;
2145
2146 seq_printf(seq,
2147 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2148 stats->fc_num_commits, stats->fc_ineligible_commits,
2149 stats->fc_numblks,
2150 div_u64(sbi->s_fc_avg_commit_time, 1000));
2151 seq_puts(seq, "Ineligible reasons:\n");
2152 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2153 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2154 stats->fc_ineligible_reason_count[i]);
2155
2156 return 0;
2157 }
2158
2159 int __init ext4_fc_init_dentry_cache(void)
2160 {
2161 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2162 SLAB_RECLAIM_ACCOUNT);
2163
2164 if (ext4_fc_dentry_cachep == NULL)
2165 return -ENOMEM;
2166
2167 return 0;
2168 }
2169