1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
17 #include "xfs_trans.h"
18 #include "xfs_log.h"
19 #include "xfs_log_priv.h"
20 #include "xfs_log_recover.h"
21 #include "xfs_inode_item.h"
22 #include "xfs_extfree_item.h"
23 #include "xfs_trans_priv.h"
24 #include "xfs_alloc.h"
25 #include "xfs_ialloc.h"
26 #include "xfs_quota.h"
27 #include "xfs_trace.h"
28 #include "xfs_icache.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_error.h"
31 #include "xfs_dir2.h"
32 #include "xfs_rmap_item.h"
33 #include "xfs_buf_item.h"
34 #include "xfs_refcount_item.h"
35 #include "xfs_bmap_item.h"
36
37 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
38
39 STATIC int
40 xlog_find_zeroed(
41 struct xlog *,
42 xfs_daddr_t *);
43 STATIC int
44 xlog_clear_stale_blocks(
45 struct xlog *,
46 xfs_lsn_t);
47 #if defined(DEBUG)
48 STATIC void
49 xlog_recover_check_summary(
50 struct xlog *);
51 #else
52 #define xlog_recover_check_summary(log)
53 #endif
54 STATIC int
55 xlog_do_recovery_pass(
56 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
57
58 /*
59 * This structure is used during recovery to record the buf log items which
60 * have been canceled and should not be replayed.
61 */
62 struct xfs_buf_cancel {
63 xfs_daddr_t bc_blkno;
64 uint bc_len;
65 int bc_refcount;
66 struct list_head bc_list;
67 };
68
69 /*
70 * Sector aligned buffer routines for buffer create/read/write/access
71 */
72
73 /*
74 * Verify the log-relative block number and length in basic blocks are valid for
75 * an operation involving the given XFS log buffer. Returns true if the fields
76 * are valid, false otherwise.
77 */
78 static inline bool
79 xlog_verify_bno(
80 struct xlog *log,
81 xfs_daddr_t blk_no,
82 int bbcount)
83 {
84 if (blk_no < 0 || blk_no >= log->l_logBBsize)
85 return false;
86 if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
87 return false;
88 return true;
89 }
90
91 /*
92 * Allocate a buffer to hold log data. The buffer needs to be able to map to
93 * a range of nbblks basic blocks at any valid offset within the log.
94 */
95 static char *
96 xlog_alloc_buffer(
97 struct xlog *log,
98 int nbblks)
99 {
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
102 /*
103 * Pass log block 0 since we don't have an address yet; the buffer will be
104 * verified on read.
105 */
106 if (!xlog_verify_bno(log, 0, nbblks)) {
107 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
108 nbblks);
109 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
110 return NULL;
111 }
112
113 /*
114 * We do log I/O in units of log sectors (a power-of-2 multiple of the
115 * basic block size), so we round up the requested size to accommodate
116 * the basic blocks required for complete log sectors.
117 *
118 * In addition, the buffer may be used for a non-sector-aligned block
119 * offset, in which case an I/O of the requested size could extend
120 * beyond the end of the buffer. If the requested size is only 1 basic
121 * block it will never straddle a sector boundary, so this won't be an
122 * issue. Nor will this be a problem if the log I/O is done in basic
123 * blocks (sector size 1). But otherwise we extend the buffer by one
124 * extra log sector to ensure there's space to accommodate this
125 * possibility.
126 */
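/*
 * For illustration (hypothetical geometry): on a device with 4k log sectors,
 * l_sectBBsize is 8 basic blocks. A request for nbblks = 5 becomes
 * 5 + 8 = 13, rounded up to 16 basic blocks, i.e. an 8k allocation that can
 * hold 5 blocks starting at any sector offset.
 */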
127 if (nbblks > 1 && log->l_sectBBsize > 1)
128 nbblks += log->l_sectBBsize;
129 nbblks = round_up(nbblks, log->l_sectBBsize);
130 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
131 }
132
133 /*
134 * Return the address of the start of the given block number's data
135 * in a log buffer. The buffer covers a log sector-aligned region.
136 */
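/*
 * For illustration (hypothetical geometry): with l_sectBBsize = 8, a read of
 * block 21 lands in the sector starting at block 16, so the data for block
 * 21 begins BBTOB(21 & 7) = BBTOB(5) = 2560 bytes into the buffer.
 */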
137 static inline unsigned int
138 xlog_align(
139 struct xlog *log,
140 xfs_daddr_t blk_no)
141 {
142 return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
143 }
144
145 static int
146 xlog_do_io(
147 struct xlog *log,
148 xfs_daddr_t blk_no,
149 unsigned int nbblks,
150 char *data,
151 unsigned int op)
152 {
153 int error;
154
155 if (!xlog_verify_bno(log, blk_no, nbblks)) {
156 xfs_warn(log->l_mp,
157 "Invalid log block/length (0x%llx, 0x%x) for buffer",
158 blk_no, nbblks);
159 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
160 return -EFSCORRUPTED;
161 }
162
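/*
 * Expand the request to whole log sectors. For example (hypothetical
 * geometry, l_sectBBsize = 8): a one-block read at block 21 becomes an
 * eight-block read at block 16; callers use xlog_align() to locate their
 * data within the sector-aligned buffer.
 */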
163 blk_no = round_down(blk_no, log->l_sectBBsize);
164 nbblks = round_up(nbblks, log->l_sectBBsize);
165 ASSERT(nbblks > 0);
166
167 error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
168 BBTOB(nbblks), data, op);
169 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
170 xfs_alert(log->l_mp,
171 "log recovery %s I/O error at daddr 0x%llx len %d error %d",
172 op == REQ_OP_WRITE ? "write" : "read",
173 blk_no, nbblks, error);
174 }
175 return error;
176 }
177
178 STATIC int
179 xlog_bread_noalign(
180 struct xlog *log,
181 xfs_daddr_t blk_no,
182 int nbblks,
183 char *data)
184 {
185 return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
186 }
187
188 STATIC int
189 xlog_bread(
190 struct xlog *log,
191 xfs_daddr_t blk_no,
192 int nbblks,
193 char *data,
194 char **offset)
195 {
196 int error;
197
198 error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
199 if (!error)
200 *offset = data + xlog_align(log, blk_no);
201 return error;
202 }
203
204 STATIC int
205 xlog_bwrite(
206 struct xlog *log,
207 xfs_daddr_t blk_no,
208 int nbblks,
209 char *data)
210 {
211 return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
212 }
213
214 #ifdef DEBUG
215 /*
216 * dump debug superblock and log record information
217 */
218 STATIC void
219 xlog_header_check_dump(
220 xfs_mount_t *mp,
221 xlog_rec_header_t *head)
222 {
223 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
224 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
225 xfs_debug(mp, " log : uuid = %pU, fmt = %d",
226 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
227 }
228 #else
229 #define xlog_header_check_dump(mp, head)
230 #endif
231
232 /*
233 * check log record header for recovery
234 */
235 STATIC int
236 xlog_header_check_recover(
237 xfs_mount_t *mp,
238 xlog_rec_header_t *head)
239 {
240 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
241
242 /*
243 * IRIX doesn't write the h_fmt field and leaves it zeroed
244 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
245 * a dirty log created in IRIX.
246 */
247 if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
248 xfs_warn(mp,
249 "dirty log written in incompatible format - can't recover");
250 xlog_header_check_dump(mp, head);
251 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
252 XFS_ERRLEVEL_HIGH, mp);
253 return -EFSCORRUPTED;
254 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
255 xfs_warn(mp,
256 "dirty log entry has mismatched uuid - can't recover");
257 xlog_header_check_dump(mp, head);
258 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
259 XFS_ERRLEVEL_HIGH, mp);
260 return -EFSCORRUPTED;
261 }
262 return 0;
263 }
264
265 /*
266 * read the head block of the log and check the header
267 */
268 STATIC int
269 xlog_header_check_mount(
270 xfs_mount_t *mp,
271 xlog_rec_header_t *head)
272 {
273 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
274
275 if (uuid_is_null(&head->h_fs_uuid)) {
276 /*
277 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
278 * h_fs_uuid is null, we assume this log was last mounted
279 * by IRIX and continue.
280 */
281 xfs_warn(mp, "null uuid in log - IRIX style log");
282 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
283 xfs_warn(mp, "log has mismatched uuid - can't recover");
284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_mount",
286 XFS_ERRLEVEL_HIGH, mp);
287 return -EFSCORRUPTED;
288 }
289 return 0;
290 }
291
292 STATIC void
293 xlog_recover_iodone(
294 struct xfs_buf *bp)
295 {
296 if (bp->b_error) {
297 /*
298 * We're not going to bother about retrying
299 * this during recovery. One strike!
300 */
301 if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
302 xfs_buf_ioerror_alert(bp, __func__);
303 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
304 }
305 }
306
307 /*
308 * On v5 supers, a bli could be attached to update the metadata LSN.
309 * Clean it up.
310 */
311 if (bp->b_log_item)
312 xfs_buf_item_relse(bp);
313 ASSERT(bp->b_log_item == NULL);
314
315 bp->b_iodone = NULL;
316 xfs_buf_ioend(bp);
317 }
318
319 /*
320 * This routine finds (to an approximation) the first block in the physical
321 * log which contains the given cycle. It uses a binary search algorithm.
322 * Note that the algorithm cannot be perfect because the on-disk contents
323 * are not guaranteed to be consistent.
324 */
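/*
 * For illustration (hypothetical layout): if the cycle numbers on disk are
 * { 9, 9, 9, 9, 8, 8, 8, 8 } and we search for cycle 8 with first_blk = 0
 * and *last_blk = 7, the bisection converges with first_blk = 3 and
 * end_blk = 4, and *last_blk is set to 4, the first block of cycle 8.
 */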
325 STATIC int
326 xlog_find_cycle_start(
327 struct xlog *log,
328 char *buffer,
329 xfs_daddr_t first_blk,
330 xfs_daddr_t *last_blk,
331 uint cycle)
332 {
333 char *offset;
334 xfs_daddr_t mid_blk;
335 xfs_daddr_t end_blk;
336 uint mid_cycle;
337 int error;
338
339 end_blk = *last_blk;
340 mid_blk = BLK_AVG(first_blk, end_blk);
341 while (mid_blk != first_blk && mid_blk != end_blk) {
342 error = xlog_bread(log, mid_blk, 1, buffer, &offset);
343 if (error)
344 return error;
345 mid_cycle = xlog_get_cycle(offset);
346 if (mid_cycle == cycle)
347 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
348 else
349 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
350 mid_blk = BLK_AVG(first_blk, end_blk);
351 }
352 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
353 (mid_blk == end_blk && mid_blk-1 == first_blk));
354
355 *last_blk = end_blk;
356
357 return 0;
358 }
359
360 /*
361 * Check that a range of blocks does not contain stop_on_cycle_no.
362 * Fill in *new_blk with the block offset where such a block is
363 * found, or with -1 (an invalid block number) if there is no such
364 * block in the range. The scan needs to occur from front to back
365 * and the pointer into the region must be updated since a later
366 * routine will need to perform another test.
367 */
368 STATIC int
369 xlog_find_verify_cycle(
370 struct xlog *log,
371 xfs_daddr_t start_blk,
372 int nbblks,
373 uint stop_on_cycle_no,
374 xfs_daddr_t *new_blk)
375 {
376 xfs_daddr_t i, j;
377 uint cycle;
378 char *buffer;
379 xfs_daddr_t bufblks;
380 char *buf = NULL;
381 int error = 0;
382
383 /*
384 * Greedily allocate a buffer big enough to handle the full
385 * range of basic blocks we'll be examining. If that fails,
386 * try a smaller size. We need to be able to read at least
387 * a log sector, or we're out of luck.
388 */
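/*
 * Note: 1 << ffs(nbblks) is twice the lowest set bit of nbblks (e.g. a
 * hypothetical nbblks of 24 gives an initial bufblks of 16); the loops
 * below only shrink it further, and the read loop handles any remainder
 * in multiple passes.
 */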
389 bufblks = 1 << ffs(nbblks);
390 while (bufblks > log->l_logBBsize)
391 bufblks >>= 1;
392 while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
393 bufblks >>= 1;
394 if (bufblks < log->l_sectBBsize)
395 return -ENOMEM;
396 }
397
398 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
399 int bcount;
400
401 bcount = min(bufblks, (start_blk + nbblks - i));
402
403 error = xlog_bread(log, i, bcount, buffer, &buf);
404 if (error)
405 goto out;
406
407 for (j = 0; j < bcount; j++) {
408 cycle = xlog_get_cycle(buf);
409 if (cycle == stop_on_cycle_no) {
410 *new_blk = i+j;
411 goto out;
412 }
413
414 buf += BBSIZE;
415 }
416 }
417
418 *new_blk = -1;
419
420 out:
421 kmem_free(buffer);
422 return error;
423 }
424
425 /*
426 * Potentially backup over partial log record write.
427 *
428 * In the typical case, last_blk is the number of the block directly after
429 * a good log record. Therefore, we subtract one to get the block number
430 * of the last block in the given buffer. extra_bblks contains the number
431 * of blocks we would have read on a previous read. This happens when the
432 * last log record is split over the end of the physical log.
433 *
434 * extra_bblks is the number of blocks potentially verified on a previous
435 * call to this routine.
436 */
437 STATIC int
438 xlog_find_verify_log_record(
439 struct xlog *log,
440 xfs_daddr_t start_blk,
441 xfs_daddr_t *last_blk,
442 int extra_bblks)
443 {
444 xfs_daddr_t i;
445 char *buffer;
446 char *offset = NULL;
447 xlog_rec_header_t *head = NULL;
448 int error = 0;
449 int smallmem = 0;
450 int num_blks = *last_blk - start_blk;
451 int xhdrs;
452
453 ASSERT(start_blk != 0 || *last_blk != start_blk);
454
455 buffer = xlog_alloc_buffer(log, num_blks);
456 if (!buffer) {
457 buffer = xlog_alloc_buffer(log, 1);
458 if (!buffer)
459 return -ENOMEM;
460 smallmem = 1;
461 } else {
462 error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
463 if (error)
464 goto out;
465 offset += ((num_blks - 1) << BBSHIFT);
466 }
467
468 for (i = (*last_blk) - 1; i >= 0; i--) {
469 if (i < start_blk) {
470 /* valid log record not found */
471 xfs_warn(log->l_mp,
472 "Log inconsistent (didn't find previous header)");
473 ASSERT(0);
474 error = -EFSCORRUPTED;
475 goto out;
476 }
477
478 if (smallmem) {
479 error = xlog_bread(log, i, 1, buffer, &offset);
480 if (error)
481 goto out;
482 }
483
484 head = (xlog_rec_header_t *)offset;
485
486 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
487 break;
488
489 if (!smallmem)
490 offset -= BBSIZE;
491 }
492
493 /*
494 * We hit the beginning of the physical log & still no header. Return
495 * to the caller. If the caller can handle a return of 1, then this routine
496 * will be called again for the end of the physical log.
497 */
498 if (i == -1) {
499 error = 1;
500 goto out;
501 }
502
503 /*
504 * We have the final block of the good log (the first block
505 * of the log record _before_ the head). So we check the uuid.
506 */
507 if ((error = xlog_header_check_mount(log->l_mp, head)))
508 goto out;
509
510 /*
511 * We may have found a log record header before we expected one.
512 * last_blk will be the 1st block # with a given cycle #. We may end
513 * up reading an entire log record. In this case, we don't want to
514 * reset last_blk. Only when last_blk points in the middle of a log
515 * record do we update last_blk.
516 */
517 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
518 uint h_size = be32_to_cpu(head->h_size);
519
520 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
521 if (h_size % XLOG_HEADER_CYCLE_SIZE)
522 xhdrs++;
523 } else {
524 xhdrs = 1;
525 }
526
527 if (*last_blk - i + extra_bblks !=
528 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
529 *last_blk = i;
530
531 out:
532 kmem_free(buffer);
533 return error;
534 }
535
536 /*
537 * Head is defined to be the point of the log where the next log write
538 * could go. This means that incomplete LR writes at the end are
539 * eliminated when calculating the head. We aren't guaranteed that previous
540 * LRs have complete transactions. We only know that a cycle number of
541 * current cycle number -1 won't be present in the log if we start writing
542 * from our current block number.
543 *
544 * last_blk contains the block number of the first block with a given
545 * cycle number.
546 *
547 * Return: zero if normal, non-zero if error.
548 */
549 STATIC int
550 xlog_find_head(
551 struct xlog *log,
552 xfs_daddr_t *return_head_blk)
553 {
554 char *buffer;
555 char *offset;
556 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
557 int num_scan_bblks;
558 uint first_half_cycle, last_half_cycle;
559 uint stop_on_cycle;
560 int error, log_bbnum = log->l_logBBsize;
561
562 /* Is the end of the log device zeroed? */
563 error = xlog_find_zeroed(log, &first_blk);
564 if (error < 0) {
565 xfs_warn(log->l_mp, "empty log check failed");
566 return error;
567 }
568 if (error == 1) {
569 *return_head_blk = first_blk;
570
571 /* Is the whole lot zeroed? */
572 if (!first_blk) {
573 /* Linux XFS shouldn't generate totally zeroed logs -
574 * mkfs etc write a dummy unmount record to a fresh
575 * log so we can store the uuid in there
576 */
577 xfs_warn(log->l_mp, "totally zeroed log");
578 }
579
580 return 0;
581 }
582
583 first_blk = 0; /* get cycle # of 1st block */
584 buffer = xlog_alloc_buffer(log, 1);
585 if (!buffer)
586 return -ENOMEM;
587
588 error = xlog_bread(log, 0, 1, buffer, &offset);
589 if (error)
590 goto out_free_buffer;
591
592 first_half_cycle = xlog_get_cycle(offset);
593
594 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
595 error = xlog_bread(log, last_blk, 1, buffer, &offset);
596 if (error)
597 goto out_free_buffer;
598
599 last_half_cycle = xlog_get_cycle(offset);
600 ASSERT(last_half_cycle != 0);
601
602 /*
603 * If the 1st half cycle number is equal to the last half cycle number,
604 * then the entire log is stamped with the same cycle number. In this
605 * case, head_blk can't be set to zero (which makes sense). The below
606 * math doesn't work out properly with head_blk equal to zero. Instead,
607 * we set it to log_bbnum which is an invalid block number, but this
608 * value makes the math correct. If head_blk doesn't change through
609 * all the tests below, *head_blk is set to zero at the very end rather
610 * than log_bbnum. In a sense, log_bbnum and zero are the same block
611 * in a circular file.
612 */
613 if (first_half_cycle == last_half_cycle) {
614 /*
615 * In this case we believe that the entire log should have
616 * cycle number last_half_cycle. We need to scan backwards
617 * from the end verifying that there are no holes still
618 * containing last_half_cycle - 1. If we find such a hole,
619 * then the start of that hole will be the new head. The
620 * simple case looks like
621 * x | x ... | x - 1 | x
622 * Another case that fits this picture would be
623 * x | x + 1 | x ... | x
624 * In this case the head really is somewhere at the end of the
625 * log, as one of the latest writes at the beginning was
626 * incomplete.
627 * One more case is
628 * x | x + 1 | x ... | x - 1 | x
629 * This is really the combination of the above two cases, and
630 * the head has to end up at the start of the x-1 hole at the
631 * end of the log.
632 *
633 * In the 256k log case, we will read from the beginning to the
634 * end of the log and search for cycle numbers equal to x-1.
635 * We don't worry about the x+1 blocks that we encounter,
636 * because we know that they cannot be the head since the log
637 * started with x.
638 */
639 head_blk = log_bbnum;
640 stop_on_cycle = last_half_cycle - 1;
641 } else {
642 /*
643 * In this case we want to find the first block with cycle
644 * number matching last_half_cycle. We expect the log to be
645 * some variation on
646 * x + 1 ... | x ... | x
647 * The first block with cycle number x (last_half_cycle) will
648 * be where the new head belongs. First we do a binary search
649 * for the first occurrence of last_half_cycle. The binary
650 * search may not be totally accurate, so then we scan back
651 * from there looking for occurrences of last_half_cycle before
652 * us. If that backwards scan wraps around the beginning of
653 * the log, then we look for occurrences of last_half_cycle - 1
654 * at the end of the log. The cases we're looking for look
655 * like
656 * v binary search stopped here
657 * x + 1 ... | x | x + 1 | x ... | x
658 * ^ but we want to locate this spot
659 * or
660 * <---------> less than scan distance
661 * x + 1 ... | x ... | x - 1 | x
662 * ^ we want to locate this spot
663 */
664 stop_on_cycle = last_half_cycle;
665 error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
666 last_half_cycle);
667 if (error)
668 goto out_free_buffer;
669 }
670
671 /*
672 * Now validate the answer. Scan back some number of maximum possible
673 * blocks and make sure each one has the expected cycle number. The
674 * maximum is determined by the total possible amount of buffering
675 * in the in-core log. The following number can be made tighter if
676 * we actually look at the block size of the filesystem.
677 */
678 num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
679 if (head_blk >= num_scan_bblks) {
680 /*
681 * We are guaranteed that the entire check can be performed
682 * in one buffer.
683 */
684 start_blk = head_blk - num_scan_bblks;
685 if ((error = xlog_find_verify_cycle(log,
686 start_blk, num_scan_bblks,
687 stop_on_cycle, &new_blk)))
688 goto out_free_buffer;
689 if (new_blk != -1)
690 head_blk = new_blk;
691 } else { /* need to read 2 parts of log */
692 /*
693 * We are going to scan backwards in the log in two parts.
694 * First we scan the physical end of the log. In this part
695 * of the log, we are looking for blocks with cycle number
696 * last_half_cycle - 1.
697 * If we find one, then we know that the log starts there, as
698 * we've found a hole that didn't get written in going around
699 * the end of the physical log. The simple case for this is
700 * x + 1 ... | x ... | x - 1 | x
701 * <---------> less than scan distance
702 * If all of the blocks at the end of the log have cycle number
703 * last_half_cycle, then we check the blocks at the start of
704 * the log looking for occurrences of last_half_cycle. If we
705 * find one, then our current estimate for the location of the
706 * first occurrence of last_half_cycle is wrong and we move
707 * back to the hole we've found. This case looks like
708 * x + 1 ... | x | x + 1 | x ...
709 * ^ binary search stopped here
710 * Another case we need to handle that only occurs in 256k
711 * logs is
712 * x + 1 ... | x ... | x+1 | x ...
713 * ^ binary search stops here
714 * In a 256k log, the scan at the end of the log will see the
715 * x + 1 blocks. We need to skip past those since that is
716 * certainly not the head of the log. By searching for
717 * last_half_cycle-1 we accomplish that.
718 */
719 ASSERT(head_blk <= INT_MAX &&
720 (xfs_daddr_t) num_scan_bblks >= head_blk);
721 start_blk = log_bbnum - (num_scan_bblks - head_blk);
722 if ((error = xlog_find_verify_cycle(log, start_blk,
723 num_scan_bblks - (int)head_blk,
724 (stop_on_cycle - 1), &new_blk)))
725 goto out_free_buffer;
726 if (new_blk != -1) {
727 head_blk = new_blk;
728 goto validate_head;
729 }
730
731 /*
732 * Scan beginning of log now. The last part of the physical
733 * log is good. This scan needs to verify that it doesn't find
734 * the last_half_cycle.
735 */
736 start_blk = 0;
737 ASSERT(head_blk <= INT_MAX);
738 if ((error = xlog_find_verify_cycle(log,
739 start_blk, (int)head_blk,
740 stop_on_cycle, &new_blk)))
741 goto out_free_buffer;
742 if (new_blk != -1)
743 head_blk = new_blk;
744 }
745
746 validate_head:
747 /*
748 * Now we need to make sure head_blk is not pointing to a block in
749 * the middle of a log record.
750 */
751 num_scan_bblks = XLOG_REC_SHIFT(log);
752 if (head_blk >= num_scan_bblks) {
753 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
754
755 /* start ptr at last block ptr before head_blk */
756 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
757 if (error == 1)
758 error = -EIO;
759 if (error)
760 goto out_free_buffer;
761 } else {
762 start_blk = 0;
763 ASSERT(head_blk <= INT_MAX);
764 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
765 if (error < 0)
766 goto out_free_buffer;
767 if (error == 1) {
768 /* We hit the beginning of the log during our search */
769 start_blk = log_bbnum - (num_scan_bblks - head_blk);
770 new_blk = log_bbnum;
771 ASSERT(start_blk <= INT_MAX &&
772 (xfs_daddr_t) log_bbnum-start_blk >= 0);
773 ASSERT(head_blk <= INT_MAX);
774 error = xlog_find_verify_log_record(log, start_blk,
775 &new_blk, (int)head_blk);
776 if (error == 1)
777 error = -EIO;
778 if (error)
779 goto out_free_buffer;
780 if (new_blk != log_bbnum)
781 head_blk = new_blk;
782 } else if (error)
783 goto out_free_buffer;
784 }
785
786 kmem_free(buffer);
787 if (head_blk == log_bbnum)
788 *return_head_blk = 0;
789 else
790 *return_head_blk = head_blk;
791 /*
792 * When returning here, we have a good block number. Bad block
793 * means that during a previous crash, we didn't have a clean break
794 * from cycle number N to cycle number N-1. In this case, we need
795 * to find the first block with cycle number N-1.
796 */
797 return 0;
798
799 out_free_buffer:
800 kmem_free(buffer);
801 if (error)
802 xfs_warn(log->l_mp, "failed to find log head");
803 return error;
804 }
805
806 /*
807 * Seek backwards in the log for log record headers.
808 *
809 * Given a starting log block, walk backwards until we find the provided number
810 * of records or hit the provided tail block. The return value is the number of
811 * records encountered or a negative error code. The log block and buffer
812 * pointer of the last record seen are returned in rblk and rhead respectively.
813 */
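/*
 * For illustration (hypothetical values): with head_blk = 50, tail_blk = 200
 * and a 1000-block log, the first loop scans blocks 49 down to 0; if fewer
 * than count headers are found, the second loop wraps and scans blocks 999
 * down to 200.
 */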
814 STATIC int
815 xlog_rseek_logrec_hdr(
816 struct xlog *log,
817 xfs_daddr_t head_blk,
818 xfs_daddr_t tail_blk,
819 int count,
820 char *buffer,
821 xfs_daddr_t *rblk,
822 struct xlog_rec_header **rhead,
823 bool *wrapped)
824 {
825 int i;
826 int error;
827 int found = 0;
828 char *offset = NULL;
829 xfs_daddr_t end_blk;
830
831 *wrapped = false;
832
833 /*
834 * Walk backwards from the head block until we hit the tail or the first
835 * block in the log.
836 */
837 end_blk = head_blk > tail_blk ? tail_blk : 0;
838 for (i = (int) head_blk - 1; i >= end_blk; i--) {
839 error = xlog_bread(log, i, 1, buffer, &offset);
840 if (error)
841 goto out_error;
842
843 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
844 *rblk = i;
845 *rhead = (struct xlog_rec_header *) offset;
846 if (++found == count)
847 break;
848 }
849 }
850
851 /*
852 * If we haven't hit the tail block or the log record header count,
853 * start looking again from the end of the physical log. Note that
854 * callers can pass head == tail if the tail is not yet known.
855 */
856 if (tail_blk >= head_blk && found != count) {
857 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
858 error = xlog_bread(log, i, 1, buffer, &offset);
859 if (error)
860 goto out_error;
861
862 if (*(__be32 *)offset ==
863 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
864 *wrapped = true;
865 *rblk = i;
866 *rhead = (struct xlog_rec_header *) offset;
867 if (++found == count)
868 break;
869 }
870 }
871 }
872
873 return found;
874
875 out_error:
876 return error;
877 }
878
879 /*
880 * Seek forward in the log for log record headers.
881 *
882 * Given head and tail blocks, walk forward from the tail block until we find
883 * the provided number of records or hit the head block. The return value is the
884 * number of records encountered or a negative error code. The log block and
885 * buffer pointer of the last record seen are returned in rblk and rhead
886 * respectively.
887 */
888 STATIC int
889 xlog_seek_logrec_hdr(
890 struct xlog *log,
891 xfs_daddr_t head_blk,
892 xfs_daddr_t tail_blk,
893 int count,
894 char *buffer,
895 xfs_daddr_t *rblk,
896 struct xlog_rec_header **rhead,
897 bool *wrapped)
898 {
899 int i;
900 int error;
901 int found = 0;
902 char *offset = NULL;
903 xfs_daddr_t end_blk;
904
905 *wrapped = false;
906
907 /*
908 * Walk forward from the tail block until we hit the head or the last
909 * block in the log.
910 */
911 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
912 for (i = (int) tail_blk; i <= end_blk; i++) {
913 error = xlog_bread(log, i, 1, buffer, &offset);
914 if (error)
915 goto out_error;
916
917 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
918 *rblk = i;
919 *rhead = (struct xlog_rec_header *) offset;
920 if (++found == count)
921 break;
922 }
923 }
924
925 /*
926 * If we haven't hit the head block or the log record header count,
927 * start looking again from the start of the physical log.
928 */
929 if (tail_blk > head_blk && found != count) {
930 for (i = 0; i < (int) head_blk; i++) {
931 error = xlog_bread(log, i, 1, buffer, &offset);
932 if (error)
933 goto out_error;
934
935 if (*(__be32 *)offset ==
936 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
937 *wrapped = true;
938 *rblk = i;
939 *rhead = (struct xlog_rec_header *) offset;
940 if (++found == count)
941 break;
942 }
943 }
944 }
945
946 return found;
947
948 out_error:
949 return error;
950 }
951
952 /*
953 * Calculate distance from head to tail (i.e., unused space in the log).
954 */
955 static inline int
956 xlog_tail_distance(
957 struct xlog *log,
958 xfs_daddr_t head_blk,
959 xfs_daddr_t tail_blk)
960 {
961 if (head_blk < tail_blk)
962 return tail_blk - head_blk;
963
964 return tail_blk + (log->l_logBBsize - head_blk);
965 }
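/*
 * For illustration (hypothetical values): in a 1000-block log with
 * head_blk = 900 and tail_blk = 100, the head has wrapped past the tail, so
 * the free space is 100 + (1000 - 900) = 200 blocks. With head_blk = 100 and
 * tail_blk = 900 it is simply 900 - 100 = 800 blocks.
 */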
966
967 /*
968 * Verify the log tail. This is particularly important when torn or incomplete
969 * writes have been detected near the front of the log and the head has been
970 * walked back accordingly.
971 *
972 * We also have to handle the case where the tail was pinned and the head
973 * blocked behind the tail right before a crash. If the tail had been pushed
974 * immediately prior to the crash and the subsequent checkpoint was only
975 * partially written, it's possible it overwrote the last referenced tail in the
976 * log with garbage. This is not a coherency problem because the tail must have
977 * been pushed before it can be overwritten, but appears as log corruption to
978 * recovery because we have no way to know the tail was updated if the
979 * subsequent checkpoint didn't write successfully.
980 *
981 * Therefore, CRC check the log from tail to head. If a failure occurs and the
982 * offending record is within max iclog bufs from the head, walk the tail
983 * forward and retry until a valid tail is found or corruption is detected out
984 * of the range of a possible overwrite.
985 */
986 STATIC int
987 xlog_verify_tail(
988 struct xlog *log,
989 xfs_daddr_t head_blk,
990 xfs_daddr_t *tail_blk,
991 int hsize)
992 {
993 struct xlog_rec_header *thead;
994 char *buffer;
995 xfs_daddr_t first_bad;
996 int error = 0;
997 bool wrapped;
998 xfs_daddr_t tmp_tail;
999 xfs_daddr_t orig_tail = *tail_blk;
1000
1001 buffer = xlog_alloc_buffer(log, 1);
1002 if (!buffer)
1003 return -ENOMEM;
1004
1005 /*
1006 * Make sure the tail points to a record (returns positive count on
1007 * success).
1008 */
1009 error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
1010 &tmp_tail, &thead, &wrapped);
1011 if (error < 0)
1012 goto out;
1013 if (*tail_blk != tmp_tail)
1014 *tail_blk = tmp_tail;
1015
1016 /*
1017 * Run a CRC check from the tail to the head. We can't just check
1018 * MAX_ICLOGS records past the tail because the tail may point to stale
1019 * blocks cleared during the search for the head/tail. These blocks are
1020 * overwritten with zero-length records and thus record count is not a
1021 * reliable indicator of the iclog state before a crash.
1022 */
1023 first_bad = 0;
1024 error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1025 XLOG_RECOVER_CRCPASS, &first_bad);
1026 while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1027 int tail_distance;
1028
1029 /*
1030 * Is corruption within range of the head? If so, retry from
1031 * the next record. Otherwise return an error.
1032 */
1033 tail_distance = xlog_tail_distance(log, head_blk, first_bad);
1034 if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
1035 break;
1036
1037 /* skip to the next record; returns positive count on success */
1038 error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
1039 buffer, &tmp_tail, &thead, &wrapped);
1040 if (error < 0)
1041 goto out;
1042
1043 *tail_blk = tmp_tail;
1044 first_bad = 0;
1045 error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1046 XLOG_RECOVER_CRCPASS, &first_bad);
1047 }
1048
1049 if (!error && *tail_blk != orig_tail)
1050 xfs_warn(log->l_mp,
1051 "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1052 orig_tail, *tail_blk);
1053 out:
1054 kmem_free(buffer);
1055 return error;
1056 }
1057
1058 /*
1059 * Detect and trim torn writes from the head of the log.
1060 *
1061 * Storage without sector atomicity guarantees can result in torn writes in the
1062 * log in the event of a crash. Our only means to detect this scenario is via
1063 * CRC verification. While we can't always be certain that CRC verification
1064 * failure is due to a torn write vs. an unrelated corruption, we do know that
1065 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1066 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1067 * the log and treat failures in this range as torn writes as a matter of
1068 * policy. In the event of CRC failure, the head is walked back to the last good
1069 * record in the log and the tail is updated from that record and verified.
1070 */
1071 STATIC int
1072 xlog_verify_head(
1073 struct xlog *log,
1074 xfs_daddr_t *head_blk, /* in/out: unverified head */
1075 xfs_daddr_t *tail_blk, /* out: tail block */
1076 char *buffer,
1077 xfs_daddr_t *rhead_blk, /* start blk of last record */
1078 struct xlog_rec_header **rhead, /* ptr to last record */
1079 bool *wrapped) /* last rec. wraps phys. log */
1080 {
1081 struct xlog_rec_header *tmp_rhead;
1082 char *tmp_buffer;
1083 xfs_daddr_t first_bad;
1084 xfs_daddr_t tmp_rhead_blk;
1085 int found;
1086 int error;
1087 bool tmp_wrapped;
1088
1089 /*
1090 * Check the head of the log for torn writes. Search backwards from the
1091 * head until we hit the tail or the maximum number of log record I/Os
1092 * that could have been in flight at one time. Use a temporary buffer so
1093 * we don't trash the rhead/buffer pointers from the caller.
1094 */
1095 tmp_buffer = xlog_alloc_buffer(log, 1);
1096 if (!tmp_buffer)
1097 return -ENOMEM;
1098 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1099 XLOG_MAX_ICLOGS, tmp_buffer,
1100 &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1101 kmem_free(tmp_buffer);
1102 if (error < 0)
1103 return error;
1104
1105 /*
1106 * Now run a CRC verification pass over the records starting at the
1107 * block found above to the current head. If a CRC failure occurs, the
1108 * log block of the first bad record is saved in first_bad.
1109 */
1110 error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1111 XLOG_RECOVER_CRCPASS, &first_bad);
1112 if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1113 /*
1114 * We've hit a potential torn write. Reset the error and warn
1115 * about it.
1116 */
1117 error = 0;
1118 xfs_warn(log->l_mp,
1119 "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1120 first_bad, *head_blk);
1121
1122 /*
1123 * Get the header block and buffer pointer for the last good
1124 * record before the bad record.
1125 *
1126 * Note that xlog_find_tail() clears the blocks at the new head
1127 * (i.e., the records with invalid CRC) if the cycle number
1128 * matches the current cycle.
1129 */
1130 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1131 buffer, rhead_blk, rhead, wrapped);
1132 if (found < 0)
1133 return found;
1134 if (found == 0) /* XXX: right thing to do here? */
1135 return -EIO;
1136
1137 /*
1138 * Reset the head block to the starting block of the first bad
1139 * log record and set the tail block based on the last good
1140 * record.
1141 *
1142 * Bail out if the updated head/tail match as this indicates
1143 * possible corruption outside of the acceptable
1144 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1145 */
1146 *head_blk = first_bad;
1147 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1148 if (*head_blk == *tail_blk) {
1149 ASSERT(0);
1150 return 0;
1151 }
1152 }
1153 if (error)
1154 return error;
1155
1156 return xlog_verify_tail(log, *head_blk, tail_blk,
1157 be32_to_cpu((*rhead)->h_size));
1158 }
1159
1160 /*
1161 * We need to make sure we handle log wrapping properly, so we can't use the
1162 * calculated logbno directly. Make sure it wraps to the correct bno inside the
1163 * log.
1164 *
1165 * The log is limited to 32 bit sizes, so we use the appropriate modulus
1166 * operation here and cast it back to a 64 bit daddr on return.
1167 */
1168 static inline xfs_daddr_t
1169 xlog_wrap_logbno(
1170 struct xlog *log,
1171 xfs_daddr_t bno)
1172 {
1173 int mod;
1174
1175 div_s64_rem(bno, log->l_logBBsize, &mod);
1176 return mod;
1177 }
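/*
 * For illustration (hypothetical values): in a 1000-block log, a computed
 * block number of 1005 wraps to 5, while 995 is returned unchanged.
 */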
1178
1179 /*
1180 * Check whether the head of the log points to an unmount record. In other
1181 * words, determine whether the log is clean. If so, update the in-core state
1182 * appropriately.
1183 */
1184 static int
1185 xlog_check_unmount_rec(
1186 struct xlog *log,
1187 xfs_daddr_t *head_blk,
1188 xfs_daddr_t *tail_blk,
1189 struct xlog_rec_header *rhead,
1190 xfs_daddr_t rhead_blk,
1191 char *buffer,
1192 bool *clean)
1193 {
1194 struct xlog_op_header *op_head;
1195 xfs_daddr_t umount_data_blk;
1196 xfs_daddr_t after_umount_blk;
1197 int hblks;
1198 int error;
1199 char *offset;
1200
1201 *clean = false;
1202
1203 /*
1204 * Look for unmount record. If we find it, then we know there was a
1205 * clean unmount. Since 'i' could be the last block in the physical
1206 * log, we convert to a log block before comparing to the head_blk.
1207 *
1208 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1209 * below. We won't want to clear the unmount record if there is one, so
1210 * we pass the lsn of the unmount record rather than the block after it.
1211 */
1212 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1213 int h_size = be32_to_cpu(rhead->h_size);
1214 int h_version = be32_to_cpu(rhead->h_version);
1215
1216 if ((h_version & XLOG_VERSION_2) &&
1217 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1218 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1219 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1220 hblks++;
1221 } else {
1222 hblks = 1;
1223 }
1224 } else {
1225 hblks = 1;
1226 }
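/*
 * For illustration (assuming the usual 32k of cycle data per header block):
 * a v2 log using 128k records has h_size = 128k, so hblks = 4 header blocks
 * precede the log record data.
 */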
1227
1228 after_umount_blk = xlog_wrap_logbno(log,
1229 rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1230
1231 if (*head_blk == after_umount_blk &&
1232 be32_to_cpu(rhead->h_num_logops) == 1) {
1233 umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1234 error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1235 if (error)
1236 return error;
1237
1238 op_head = (struct xlog_op_header *)offset;
1239 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1240 /*
1241 * Set tail and last sync so that newly written log
1242 * records will point recovery to after the current
1243 * unmount record.
1244 */
1245 xlog_assign_atomic_lsn(&log->l_tail_lsn,
1246 log->l_curr_cycle, after_umount_blk);
1247 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1248 log->l_curr_cycle, after_umount_blk);
1249 *tail_blk = after_umount_blk;
1250
1251 *clean = true;
1252 }
1253 }
1254
1255 return 0;
1256 }
1257
1258 static void
1259 xlog_set_state(
1260 struct xlog *log,
1261 xfs_daddr_t head_blk,
1262 struct xlog_rec_header *rhead,
1263 xfs_daddr_t rhead_blk,
1264 bool bump_cycle)
1265 {
1266 /*
1267 * Reset log values according to the state of the log when we
1268 * crashed. In the case where head_blk == 0, we bump curr_cycle
1269 * one because the next write starts a new cycle rather than
1270 * continuing the cycle of the last good log record. At this
1271 * point we have guaranteed that all partial log records have been
1272 * accounted for. Therefore, we know that the last good log record
1273 * written was complete and ended exactly on the end boundary
1274 * of the physical log.
1275 */
1276 log->l_prev_block = rhead_blk;
1277 log->l_curr_block = (int)head_blk;
1278 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1279 if (bump_cycle)
1280 log->l_curr_cycle++;
1281 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1282 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1283 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1284 BBTOB(log->l_curr_block));
1285 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1286 BBTOB(log->l_curr_block));
1287 }
1288
1289 /*
1290 * Find the sync block number or the tail of the log.
1291 *
1292 * This will be the block number of the last record to have its
1293 * associated buffers synced to disk. Every log record header has
1294 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1295 * to get a sync block number. The only concern is to figure out which
1296 * log record header to believe.
1297 *
1298 * The following algorithm uses the log record header with the largest
1299 * lsn. The entire log record does not need to be valid. We only care
1300 * that the header is valid.
1301 *
1302 * We could speed up the search by using the current head_blk buffer, but
1303 * it is not available.
1304 */
1305 STATIC int
1306 xlog_find_tail(
1307 struct xlog *log,
1308 xfs_daddr_t *head_blk,
1309 xfs_daddr_t *tail_blk)
1310 {
1311 xlog_rec_header_t *rhead;
1312 char *offset = NULL;
1313 char *buffer;
1314 int error;
1315 xfs_daddr_t rhead_blk;
1316 xfs_lsn_t tail_lsn;
1317 bool wrapped = false;
1318 bool clean = false;
1319
1320 /*
1321 * Find previous log record
1322 */
1323 if ((error = xlog_find_head(log, head_blk)))
1324 return error;
1325 ASSERT(*head_blk < INT_MAX);
1326
1327 buffer = xlog_alloc_buffer(log, 1);
1328 if (!buffer)
1329 return -ENOMEM;
1330 if (*head_blk == 0) { /* special case */
1331 error = xlog_bread(log, 0, 1, buffer, &offset);
1332 if (error)
1333 goto done;
1334
1335 if (xlog_get_cycle(offset) == 0) {
1336 *tail_blk = 0;
1337 /* leave all other log inited values alone */
1338 goto done;
1339 }
1340 }
1341
1342 /*
1343 * Search backwards through the log looking for the log record header
1344 * block. This wraps all the way back around to the head so something is
1345 * seriously wrong if we can't find it.
1346 */
1347 error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
1348 &rhead_blk, &rhead, &wrapped);
1349 if (error < 0)
1350 goto done;
1351 if (!error) {
1352 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1353 error = -EFSCORRUPTED;
1354 goto done;
1355 }
1356 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1357
1358 /*
1359 * Set the log state based on the current head record.
1360 */
1361 xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1362 tail_lsn = atomic64_read(&log->l_tail_lsn);
1363
1364 /*
1365 * Look for an unmount record at the head of the log. This sets the log
1366 * state to determine whether recovery is necessary.
1367 */
1368 error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1369 rhead_blk, buffer, &clean);
1370 if (error)
1371 goto done;
1372
1373 /*
1374 * Verify the log head if the log is not clean (e.g., we have anything
1375 * but an unmount record at the head). This uses CRC verification to
1376 * detect and trim torn writes. If discovered, CRC failures are
1377 * considered torn writes and the log head is trimmed accordingly.
1378 *
1379 * Note that we can only run CRC verification when the log is dirty
1380 * because there's no guarantee that the log data behind an unmount
1381 * record is compatible with the current architecture.
1382 */
1383 if (!clean) {
1384 xfs_daddr_t orig_head = *head_blk;
1385
1386 error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1387 &rhead_blk, &rhead, &wrapped);
1388 if (error)
1389 goto done;
1390
1391 /* update in-core state again if the head changed */
1392 if (*head_blk != orig_head) {
1393 xlog_set_state(log, *head_blk, rhead, rhead_blk,
1394 wrapped);
1395 tail_lsn = atomic64_read(&log->l_tail_lsn);
1396 error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1397 rhead, rhead_blk, buffer,
1398 &clean);
1399 if (error)
1400 goto done;
1401 }
1402 }
1403
1404 /*
1405 * Note that the unmount was clean. If the unmount was not clean, we
1406 * need to know this to rebuild the superblock counters from the perag
1407 * headers if we have a filesystem using non-persistent counters.
1408 */
1409 if (clean)
1410 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1411
1412 /*
1413 * Make sure that there are no blocks in front of the head
1414 * with the same cycle number as the head. This can happen
1415 * because we allow multiple outstanding log writes concurrently,
1416 * and the later writes might make it out before earlier ones.
1417 *
1418 * We use the lsn from before modifying it so that we'll never
1419 * overwrite the unmount record after a clean unmount.
1420 *
1421 * Do this only if we are going to recover the filesystem
1422 *
1423 * NOTE: This used to say "if (!readonly)"
1424 * However on Linux, we can & do recover a read-only filesystem.
1425 * We only skip recovery if NORECOVERY is specified on mount,
1426 * in which case we would not be here.
1427 *
1428 * But... if the -device- itself is readonly, just skip this.
1429 * We can't recover this device anyway, so it won't matter.
1430 */
1431 if (!xfs_readonly_buftarg(log->l_targ))
1432 error = xlog_clear_stale_blocks(log, tail_lsn);
1433
1434 done:
1435 kmem_free(buffer);
1436
1437 if (error)
1438 xfs_warn(log->l_mp, "failed to locate log tail");
1439 return error;
1440 }
1441
1442 /*
1443 * Is the log zeroed at all?
1444 *
1445 * The last binary search should be changed to perform an X block read
1446 * once X becomes small enough. You can then search linearly through
1447 * the X blocks. This will cut down on the number of reads we need to do.
1448 *
1449 * If the log is partially zeroed, this routine will pass back the blkno
1450 * of the first block with cycle number 0. It won't have a complete LR
1451 * preceding it.
1452 *
1453 * Return:
1454 * 0 => the log is completely written to
1455 * 1 => use *blk_no as the first block of the log
1456 * <0 => error has occurred
1457 */
1458 STATIC int
1459 xlog_find_zeroed(
1460 struct xlog *log,
1461 xfs_daddr_t *blk_no)
1462 {
1463 char *buffer;
1464 char *offset;
1465 uint first_cycle, last_cycle;
1466 xfs_daddr_t new_blk, last_blk, start_blk;
1467 xfs_daddr_t num_scan_bblks;
1468 int error, log_bbnum = log->l_logBBsize;
1469
1470 *blk_no = 0;
1471
1472 /* check totally zeroed log */
1473 buffer = xlog_alloc_buffer(log, 1);
1474 if (!buffer)
1475 return -ENOMEM;
1476 error = xlog_bread(log, 0, 1, buffer, &offset);
1477 if (error)
1478 goto out_free_buffer;
1479
1480 first_cycle = xlog_get_cycle(offset);
1481 if (first_cycle == 0) { /* completely zeroed log */
1482 *blk_no = 0;
1483 kmem_free(buffer);
1484 return 1;
1485 }
1486
1487 /* check partially zeroed log */
1488 error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1489 if (error)
1490 goto out_free_buffer;
1491
1492 last_cycle = xlog_get_cycle(offset);
1493 if (last_cycle != 0) { /* log completely written to */
1494 kmem_free(buffer);
1495 return 0;
1496 }
1497
1498 /* we have a partially zeroed log */
1499 last_blk = log_bbnum-1;
1500 error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1501 if (error)
1502 goto out_free_buffer;
1503
1504 /*
1505 * Validate the answer. Because there is no way to guarantee that
1506 * the entire log is made up of log records which are the same size,
1507 * we scan over the defined maximum blocks. At this point, the maximum
1508 * is not chosen to mean anything special. XXXmiken
1509 */
1510 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1511 ASSERT(num_scan_bblks <= INT_MAX);
1512
1513 if (last_blk < num_scan_bblks)
1514 num_scan_bblks = last_blk;
1515 start_blk = last_blk - num_scan_bblks;
1516
1517 /*
1518 * We search for any instances of cycle number 0 that occur before
1519 * our current estimate of the head. What we're trying to detect is
1520 * 1 ... | 0 | 1 | 0...
1521 * ^ binary search ends here
1522 */
1523 if ((error = xlog_find_verify_cycle(log, start_blk,
1524 (int)num_scan_bblks, 0, &new_blk)))
1525 goto out_free_buffer;
1526 if (new_blk != -1)
1527 last_blk = new_blk;
1528
1529 /*
1530 * Potentially backup over partial log record write. We don't need
1531 * to search the end of the log because we know it is zero.
1532 */
1533 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1534 if (error == 1)
1535 error = -EIO;
1536 if (error)
1537 goto out_free_buffer;
1538
1539 *blk_no = last_blk;
1540 out_free_buffer:
1541 kmem_free(buffer);
1542 if (error)
1543 return error;
1544 return 1;
1545 }
1546
1547 /*
1548 * These are simple subroutines used by xlog_clear_stale_blocks() below
1549 * to initialize a buffer full of empty log record headers and write
1550 * them into the log.
1551 */
1552 STATIC void
1553 xlog_add_record(
1554 struct xlog *log,
1555 char *buf,
1556 int cycle,
1557 int block,
1558 int tail_cycle,
1559 int tail_block)
1560 {
1561 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1562
1563 memset(buf, 0, BBSIZE);
1564 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1565 recp->h_cycle = cpu_to_be32(cycle);
1566 recp->h_version = cpu_to_be32(
1567 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1568 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1569 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1570 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1571 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1572 }
1573
1574 STATIC int
1575 xlog_write_log_records(
1576 struct xlog *log,
1577 int cycle,
1578 int start_block,
1579 int blocks,
1580 int tail_cycle,
1581 int tail_block)
1582 {
1583 char *offset;
1584 char *buffer;
1585 int balign, ealign;
1586 int sectbb = log->l_sectBBsize;
1587 int end_block = start_block + blocks;
1588 int bufblks;
1589 int error = 0;
1590 int i, j = 0;
1591
1592 /*
1593 * Greedily allocate a buffer big enough to handle the full
1594 * range of basic blocks to be written. If that fails, try
1595 * a smaller size. We need to be able to write at least a
1596 * log sector, or we're out of luck.
1597 */
1598 bufblks = 1 << ffs(blocks);
1599 while (bufblks > log->l_logBBsize)
1600 bufblks >>= 1;
1601 while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1602 bufblks >>= 1;
1603 if (bufblks < sectbb)
1604 return -ENOMEM;
1605 }
1606
1607 /* We may need to do a read at the start to fill in part of
1608 * the buffer in the starting sector not covered by the first
1609 * write below.
1610 */
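/*
 * For illustration (hypothetical values): with sectbb = 8 and
 * start_block = 10, balign = 8 and j = 2, so the two leading blocks of the
 * first sector are preserved from disk before new record headers are
 * written into the buffer from offset xlog_align(log, 10) onwards.
 */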
1611 balign = round_down(start_block, sectbb);
1612 if (balign != start_block) {
1613 error = xlog_bread_noalign(log, start_block, 1, buffer);
1614 if (error)
1615 goto out_free_buffer;
1616
1617 j = start_block - balign;
1618 }
1619
1620 for (i = start_block; i < end_block; i += bufblks) {
1621 int bcount, endcount;
1622
1623 bcount = min(bufblks, end_block - start_block);
1624 endcount = bcount - j;
1625
1626 /* We may need to do a read at the end to fill in part of
1627 * the buffer in the final sector not covered by the write.
1628 * If this is the same sector as the above read, skip it.
1629 */
1630 ealign = round_down(end_block, sectbb);
1631 if (j == 0 && (start_block + endcount > ealign)) {
1632 error = xlog_bread_noalign(log, ealign, sectbb,
1633 buffer + BBTOB(ealign - start_block));
1634 if (error)
1635 break;
1636
1637 }
1638
1639 offset = buffer + xlog_align(log, start_block);
1640 for (; j < endcount; j++) {
1641 xlog_add_record(log, offset, cycle, i+j,
1642 tail_cycle, tail_block);
1643 offset += BBSIZE;
1644 }
1645 error = xlog_bwrite(log, start_block, endcount, buffer);
1646 if (error)
1647 break;
1648 start_block += endcount;
1649 j = 0;
1650 }
1651
1652 out_free_buffer:
1653 kmem_free(buffer);
1654 return error;
1655 }
1656
1657 /*
1658 * This routine is called to blow away any incomplete log writes out
1659 * in front of the log head. We do this so that we won't become confused
1660 * if we come up, write only a little bit more, and then crash again.
1661 * If we leave the partial log records out there, this situation could
1662 * cause us to think those partial writes are valid blocks since they
1663 * have the current cycle number. We get rid of them by overwriting them
1664 * with empty log records with the old cycle number rather than the
1665 * current one.
1666 *
1667 * The tail lsn is passed in rather than taken from
1668 * the log so that we will not write over the unmount record after a
1669 * clean unmount in a 512 block log. Doing so would leave the log without
1670 * any valid log records in it until a new one was written. If we crashed
1671 * during that time we would not be able to recover.
1672 */
1673 STATIC int
1674 xlog_clear_stale_blocks(
1675 struct xlog *log,
1676 xfs_lsn_t tail_lsn)
1677 {
1678 int tail_cycle, head_cycle;
1679 int tail_block, head_block;
1680 int tail_distance, max_distance;
1681 int distance;
1682 int error;
1683
1684 tail_cycle = CYCLE_LSN(tail_lsn);
1685 tail_block = BLOCK_LSN(tail_lsn);
1686 head_cycle = log->l_curr_cycle;
1687 head_block = log->l_curr_block;
1688
1689 /*
1690 * Figure out the distance between the new head of the log
1691 * and the tail. We want to write over any blocks beyond the
1692 * head that we may have written just before the crash, but
1693 * we don't want to overwrite the tail of the log.
1694 */
1695 if (head_cycle == tail_cycle) {
1696 /*
1697 * The tail is behind the head in the physical log,
1698 * so the distance from the head to the tail is the
1699 * distance from the head to the end of the log plus
1700 * the distance from the beginning of the log to the
1701 * tail.
1702 */
1703 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1704 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1705 XFS_ERRLEVEL_LOW, log->l_mp);
1706 return -EFSCORRUPTED;
1707 }
1708 tail_distance = tail_block + (log->l_logBBsize - head_block);
1709 } else {
1710 /*
1711 * The head is behind the tail in the physical log,
1712 * so the distance from the head to the tail is just
1713 * the tail block minus the head block.
1714 */
1715 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1716 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1717 XFS_ERRLEVEL_LOW, log->l_mp);
1718 return -EFSCORRUPTED;
1719 }
1720 tail_distance = tail_block - head_block;
1721 }
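/*
 * For illustration (hypothetical values): in a 1000-block log with
 * head_cycle == tail_cycle, head_block = 900 and tail_block = 100, the tail
 * is 100 + (1000 - 900) = 200 blocks in front of the head.
 */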
1722
1723 /*
1724 * If the head is right up against the tail, we can't clear
1725 * anything.
1726 */
1727 if (tail_distance <= 0) {
1728 ASSERT(tail_distance == 0);
1729 return 0;
1730 }
1731
1732 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1733 /*
1734 * Take the smaller of the maximum amount of outstanding I/O
1735 * we could have and the distance to the tail to clear out.
1736 * We take the smaller so that we don't overwrite the tail and
1737 * we don't waste all day writing from the head to the tail
1738 * for no reason.
1739 */
1740 max_distance = min(max_distance, tail_distance);
1741
1742 if ((head_block + max_distance) <= log->l_logBBsize) {
1743 /*
1744 * We can stomp all the blocks we need to without
1745 * wrapping around the end of the log. Just do it
1746 * in a single write. Use the cycle number of the
1747 * current cycle minus one so that the log will look like:
1748 * n ... | n - 1 ...
1749 */
1750 error = xlog_write_log_records(log, (head_cycle - 1),
1751 head_block, max_distance, tail_cycle,
1752 tail_block);
1753 if (error)
1754 return error;
1755 } else {
1756 /*
1757 * We need to wrap around the end of the physical log in
1758 * order to clear all the blocks. Do it in two separate
1759 * I/Os. The first write should be from the head to the
1760 * end of the physical log, and it should use the current
1761 * cycle number minus one just like above.
1762 */
1763 distance = log->l_logBBsize - head_block;
1764 error = xlog_write_log_records(log, (head_cycle - 1),
1765 head_block, distance, tail_cycle,
1766 tail_block);
1767
1768 if (error)
1769 return error;
1770
1771 /*
1772 * Now write the blocks at the start of the physical log.
1773 * This writes the remainder of the blocks we want to clear.
1774 * It uses the current cycle number since we're now on the
1775 * same cycle as the head so that we get:
1776 * n ... n ... | n - 1 ...
1777 * ^^^^^ blocks we're writing
1778 */
1779 distance = max_distance - (log->l_logBBsize - head_block);
1780 error = xlog_write_log_records(log, head_cycle, 0, distance,
1781 tail_cycle, tail_block);
1782 if (error)
1783 return error;
1784 }
1785
1786 return 0;
1787 }
1788
1789 /******************************************************************************
1790 *
1791 * Log recover routines
1792 *
1793 ******************************************************************************
1794 */
1795
1796 /*
1797 * Sort the log items in the transaction.
1798 *
1799 * The ordering constraints are defined by the inode allocation and unlink
1800 * behaviour. The rules are:
1801 *
1802 * 1. Every item is only logged once in a given transaction. Hence it
1803 * represents the last logged state of the item. Hence ordering is
1804 * dependent on the order in which operations need to be performed so
1805 * that required initial conditions are always met.
1806 *
1807 * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1808 * there's nothing to replay from them so we can simply cull them
1809 * from the transaction. However, we can't do that until after we've
1810 * replayed all the other items because they may be dependent on the
1811 * cancelled buffer and replaying the cancelled buffer can remove it
1812 * from the cancelled buffer table. Hence they have to be done last.
1813 *
1814 * 3. Inode allocation buffers must be replayed before inode items that
1815 * read the buffer and replay changes into it. For filesystems using the
1816 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1817 * treated the same as inode allocation buffers as they create and
1818 * initialise the buffers directly.
1819 *
1820 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1821 * This ensures that inodes are completely flushed to the inode buffer
1822 * in a "free" state before we remove the unlinked inode list pointer.
1823 *
1824 * Hence the ordering needs to be inode allocation buffers first, inode items
1825 * second, inode unlink buffers third and cancelled buffers last.
1826 *
1827 * But there's a problem with that - we can't tell an inode allocation buffer
1828 * apart from a regular buffer, so we can't separate them. We can, however,
1829 * tell an inode unlink buffer from the others, and so we can separate them out
1830 * from all the other buffers and move them to last.
1831 *
1832 * Hence, 4 lists, in order from head to tail:
1833 * - buffer_list for all buffers except cancelled/inode unlink buffers
1834 * - item_list for all non-buffer items
1835 * - inode_buffer_list for inode unlink buffers
1836 * - cancel_list for the cancelled buffers
1837 *
1838 * Note that we add objects to the tail of the lists so that first-to-last
1839 * ordering is preserved within the lists. Adding objects to the head of the
1840 * list means when we traverse from the head we walk them in last-to-first
1841 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1842 * but for all other items there may be specific ordering that we need to
1843 * preserve.
1844 */
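/*
 * For illustration only: after reordering, a transaction containing a
 * cancelled buffer C, an inode item I, an inode unlink buffer U and a
 * regular buffer B ends up on r_itemq in the order B, I, U, C, which
 * matches the replay constraints described above.
 */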
1845 STATIC int
1846 xlog_recover_reorder_trans(
1847 struct xlog *log,
1848 struct xlog_recover *trans,
1849 int pass)
1850 {
1851 xlog_recover_item_t *item, *n;
1852 int error = 0;
1853 LIST_HEAD(sort_list);
1854 LIST_HEAD(cancel_list);
1855 LIST_HEAD(buffer_list);
1856 LIST_HEAD(inode_buffer_list);
1857 LIST_HEAD(inode_list);
1858
1859 list_splice_init(&trans->r_itemq, &sort_list);
1860 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1861 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1862
1863 switch (ITEM_TYPE(item)) {
1864 case XFS_LI_ICREATE:
1865 list_move_tail(&item->ri_list, &buffer_list);
1866 break;
1867 case XFS_LI_BUF:
1868 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1869 trace_xfs_log_recover_item_reorder_head(log,
1870 trans, item, pass);
1871 list_move(&item->ri_list, &cancel_list);
1872 break;
1873 }
1874 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1875 list_move(&item->ri_list, &inode_buffer_list);
1876 break;
1877 }
1878 list_move_tail(&item->ri_list, &buffer_list);
1879 break;
1880 case XFS_LI_INODE:
1881 case XFS_LI_DQUOT:
1882 case XFS_LI_QUOTAOFF:
1883 case XFS_LI_EFD:
1884 case XFS_LI_EFI:
1885 case XFS_LI_RUI:
1886 case XFS_LI_RUD:
1887 case XFS_LI_CUI:
1888 case XFS_LI_CUD:
1889 case XFS_LI_BUI:
1890 case XFS_LI_BUD:
1891 trace_xfs_log_recover_item_reorder_tail(log,
1892 trans, item, pass);
1893 list_move_tail(&item->ri_list, &inode_list);
1894 break;
1895 default:
1896 xfs_warn(log->l_mp,
1897 "%s: unrecognized type of log operation",
1898 __func__);
1899 ASSERT(0);
1900 /*
1901 * return the remaining items back to the transaction
1902 * item list so they can be freed in caller.
1903 */
1904 if (!list_empty(&sort_list))
1905 list_splice_init(&sort_list, &trans->r_itemq);
1906 error = -EIO;
1907 goto out;
1908 }
1909 }
1910 out:
1911 ASSERT(list_empty(&sort_list));
1912 if (!list_empty(&buffer_list))
1913 list_splice(&buffer_list, &trans->r_itemq);
1914 if (!list_empty(&inode_list))
1915 list_splice_tail(&inode_list, &trans->r_itemq);
1916 if (!list_empty(&inode_buffer_list))
1917 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1918 if (!list_empty(&cancel_list))
1919 list_splice_tail(&cancel_list, &trans->r_itemq);
1920 return error;
1921 }
1922
1923 /*
1924 * Build up the table of buf cancel records so that we don't replay
1925 * cancelled data in the second pass. For buffer records that are
1926 * not cancel records, there is nothing to do here so we just return.
1927 *
1928 * If we get a cancel record which is already in the table, this indicates
1929 * that the buffer was cancelled multiple times. In order to ensure
1930 * that during pass 2 we keep the record in the table until we reach its
1931 * last occurrence in the log, we keep a reference count in the cancel
1932 * record in the table to tell us how many times we expect to see this
1933 * record during the second pass.
1934 */
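/*
 * Sketch of the refcount behaviour, assuming a buffer that is cancelled
 * twice in the region of the log being recovered: pass 1 adds one
 * xfs_buf_cancel record with bc_refcount = 1 and bumps it to 2 on the
 * second cancel item. In pass 2, xlog_check_buffer_cancelled() drops the
 * refcount once per cancel item and frees the record when it reaches
 * zero, so any later reuse of those blocks is replayed normally.
 */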
1935 STATIC int
1936 xlog_recover_buffer_pass1(
1937 struct xlog *log,
1938 struct xlog_recover_item *item)
1939 {
1940 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1941 struct list_head *bucket;
1942 struct xfs_buf_cancel *bcp;
1943
1944 /*
1945 * If this isn't a cancel buffer item, then just return.
1946 */
1947 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1948 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1949 return 0;
1950 }
1951
1952 /*
1953 * Insert an xfs_buf_cancel record into the hash table of them.
1954 * If there is already an identical record, bump its reference count.
1955 */
1956 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1957 list_for_each_entry(bcp, bucket, bc_list) {
1958 if (bcp->bc_blkno == buf_f->blf_blkno &&
1959 bcp->bc_len == buf_f->blf_len) {
1960 bcp->bc_refcount++;
1961 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1962 return 0;
1963 }
1964 }
1965
1966 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1967 bcp->bc_blkno = buf_f->blf_blkno;
1968 bcp->bc_len = buf_f->blf_len;
1969 bcp->bc_refcount = 1;
1970 list_add_tail(&bcp->bc_list, bucket);
1971
1972 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1973 return 0;
1974 }
1975
1976 /*
1977 * Check to see whether the buffer being recovered has a corresponding
1978 * entry in the buffer cancel record table. If it is, return the cancel
1979 * buffer structure to the caller.
1980 */
1981 STATIC struct xfs_buf_cancel *
1982 xlog_peek_buffer_cancelled(
1983 struct xlog *log,
1984 xfs_daddr_t blkno,
1985 uint len,
1986 unsigned short flags)
1987 {
1988 struct list_head *bucket;
1989 struct xfs_buf_cancel *bcp;
1990
1991 if (!log->l_buf_cancel_table) {
1992 /* empty table means no cancelled buffers in the log */
1993 ASSERT(!(flags & XFS_BLF_CANCEL));
1994 return NULL;
1995 }
1996
1997 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1998 list_for_each_entry(bcp, bucket, bc_list) {
1999 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
2000 return bcp;
2001 }
2002
2003 /*
2004 * We didn't find a corresponding entry in the table, so return NULL so
2005 * that the buffer is NOT cancelled.
2006 */
2007 ASSERT(!(flags & XFS_BLF_CANCEL));
2008 return NULL;
2009 }
2010
2011 /*
2012 * If the buffer is being cancelled then return 1 so that it will be cancelled,
2013 * otherwise return 0. If the buffer is actually a buffer cancel item
2014 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2015 * table and remove it from the table if this is the last reference.
2016 *
2017 * We remove the cancel record from the table when we encounter its last
2018 * occurrence in the log so that if the same buffer is re-used again after its
2019 * last cancellation we actually replay the changes made at that point.
2020 */
2021 STATIC int
2022 xlog_check_buffer_cancelled(
2023 struct xlog *log,
2024 xfs_daddr_t blkno,
2025 uint len,
2026 unsigned short flags)
2027 {
2028 struct xfs_buf_cancel *bcp;
2029
2030 bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2031 if (!bcp)
2032 return 0;
2033
2034 /*
2035 * We've got a match, so return 1 so that the recovery of this buffer
2036 * is cancelled. If this buffer is actually a buffer cancel log
2037 * item, then decrement the refcount on the one in the table and
2038 * remove it if this is the last reference.
2039 */
2040 if (flags & XFS_BLF_CANCEL) {
2041 if (--bcp->bc_refcount == 0) {
2042 list_del(&bcp->bc_list);
2043 kmem_free(bcp);
2044 }
2045 }
2046 return 1;
2047 }
2048
2049 /*
2050 * Perform recovery for a buffer full of inodes. In these buffers, the only
2051 * data which should be recovered is that which corresponds to the
2052 * di_next_unlinked pointers in the on disk inode structures. The rest of the
2053 * data for the inodes is always logged through the inodes themselves rather
2054 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2055 *
2056 * The only time when buffers full of inodes are fully recovered is when the
2057 * buffer is full of newly allocated inodes. In this case the buffer will
2058 * not be marked as an inode buffer and so will be sent to
2059 * xlog_recover_do_reg_buffer() below during recovery.
2060 */
2061 STATIC int
2062 xlog_recover_do_inode_buffer(
2063 struct xfs_mount *mp,
2064 xlog_recover_item_t *item,
2065 struct xfs_buf *bp,
2066 xfs_buf_log_format_t *buf_f)
2067 {
2068 int i;
2069 int item_index = 0;
2070 int bit = 0;
2071 int nbits = 0;
2072 int reg_buf_offset = 0;
2073 int reg_buf_bytes = 0;
2074 int next_unlinked_offset;
2075 int inodes_per_buf;
2076 xfs_agino_t *logged_nextp;
2077 xfs_agino_t *buffer_nextp;
2078
2079 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2080
2081 /*
2082 * Post recovery validation only works properly on CRC enabled
2083 * filesystems.
2084 */
2085 if (xfs_sb_version_hascrc(&mp->m_sb))
2086 bp->b_ops = &xfs_inode_buf_ops;
2087
2088 inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
2089 for (i = 0; i < inodes_per_buf; i++) {
2090 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2091 offsetof(xfs_dinode_t, di_next_unlinked);
2092
2093 while (next_unlinked_offset >=
2094 (reg_buf_offset + reg_buf_bytes)) {
2095 /*
2096 * The next di_next_unlinked field is beyond
2097 * the current logged region. Find the next
2098 * logged region that contains or is beyond
2099 * the current di_next_unlinked field.
2100 */
2101 bit += nbits;
2102 bit = xfs_next_bit(buf_f->blf_data_map,
2103 buf_f->blf_map_size, bit);
2104
2105 /*
2106 * If there are no more logged regions in the
2107 * buffer, then we're done.
2108 */
2109 if (bit == -1)
2110 return 0;
2111
2112 nbits = xfs_contig_bits(buf_f->blf_data_map,
2113 buf_f->blf_map_size, bit);
2114 ASSERT(nbits > 0);
2115 reg_buf_offset = bit << XFS_BLF_SHIFT;
2116 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2117 item_index++;
2118 }
2119
2120 /*
2121 * If the current logged region starts after the current
2122 * di_next_unlinked field, then move on to the next
2123 * di_next_unlinked field.
2124 */
2125 if (next_unlinked_offset < reg_buf_offset)
2126 continue;
2127
2128 ASSERT(item->ri_buf[item_index].i_addr != NULL);
2129 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2130 ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
2131
2132 /*
2133 * The current logged region contains a copy of the
2134 * current di_next_unlinked field. Extract its value
2135 * and copy it to the buffer copy.
2136 */
2137 logged_nextp = item->ri_buf[item_index].i_addr +
2138 next_unlinked_offset - reg_buf_offset;
2139 if (unlikely(*logged_nextp == 0)) {
2140 xfs_alert(mp,
2141 "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
2142 "Trying to replay bad (0) inode di_next_unlinked field.",
2143 item, bp);
2144 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2145 XFS_ERRLEVEL_LOW, mp);
2146 return -EFSCORRUPTED;
2147 }
2148
2149 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2150 *buffer_nextp = *logged_nextp;
2151
2152 /*
2153 * If necessary, recalculate the CRC in the on-disk inode. We
2154 * have to leave the inode in a consistent state for whoever
2155 * reads it next....
2156 */
2157 xfs_dinode_calc_crc(mp,
2158 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2159
2160 }
2161
2162 return 0;
2163 }
2164
2165 /*
2166 * V5 filesystems know the age of the buffer on disk being recovered. We can
2167 * have newer objects on disk than we are replaying, and so for these cases we
2168 * don't want to replay the current change as that will make the buffer contents
2169 * temporarily invalid on disk.
2170 *
2171 * The magic number might not match the buffer type we are going to recover
2172 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
2173 * extract the LSN of the existing object in the buffer based on its current
2174 * magic number. If we don't recognise the magic number in the buffer, then
2175 * return an LSN of -1 so that the caller knows it was an unrecognised block and
2176 * so can recover the buffer.
2177 *
2178 * Note: we cannot rely solely on magic number matches to determine that the
2179 * buffer has a valid LSN - we also need to verify that it belongs to this
2180 * filesystem, so we need to extract the object's UUID and compare it to the
2181 * one we read from the superblock. If the UUIDs don't match, then we've got a
2182 * stale metadata block from an old filesystem instance that we need to recover
2183 * over the top of.
2184 */
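/*
 * In short (an illustrative sketch, not exhaustive): a v5 AGF block whose
 * agf_lsn is, say, cycle 1 block 5 and whose agf_uuid matches sb_meta_uuid
 * returns that LSN, so the caller skips replay of any transaction at or
 * below it; the same block with a foreign UUID, or a block with an
 * unrecognised magic number, returns -1 and is recovered immediately.
 */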
2185 static xfs_lsn_t
2186 xlog_recover_get_buf_lsn(
2187 struct xfs_mount *mp,
2188 struct xfs_buf *bp)
2189 {
2190 uint32_t magic32;
2191 uint16_t magic16;
2192 uint16_t magicda;
2193 void *blk = bp->b_addr;
2194 uuid_t *uuid;
2195 xfs_lsn_t lsn = -1;
2196
2197 /* v4 filesystems always recover immediately */
2198 if (!xfs_sb_version_hascrc(&mp->m_sb))
2199 goto recover_immediately;
2200
2201 magic32 = be32_to_cpu(*(__be32 *)blk);
2202 switch (magic32) {
2203 case XFS_ABTB_CRC_MAGIC:
2204 case XFS_ABTC_CRC_MAGIC:
2205 case XFS_ABTB_MAGIC:
2206 case XFS_ABTC_MAGIC:
2207 case XFS_RMAP_CRC_MAGIC:
2208 case XFS_REFC_CRC_MAGIC:
2209 case XFS_FIBT_CRC_MAGIC:
2210 case XFS_FIBT_MAGIC:
2211 case XFS_IBT_CRC_MAGIC:
2212 case XFS_IBT_MAGIC: {
2213 struct xfs_btree_block *btb = blk;
2214
2215 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2216 uuid = &btb->bb_u.s.bb_uuid;
2217 break;
2218 }
2219 case XFS_BMAP_CRC_MAGIC:
2220 case XFS_BMAP_MAGIC: {
2221 struct xfs_btree_block *btb = blk;
2222
2223 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2224 uuid = &btb->bb_u.l.bb_uuid;
2225 break;
2226 }
2227 case XFS_AGF_MAGIC:
2228 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2229 uuid = &((struct xfs_agf *)blk)->agf_uuid;
2230 break;
2231 case XFS_AGFL_MAGIC:
2232 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2233 uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2234 break;
2235 case XFS_AGI_MAGIC:
2236 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2237 uuid = &((struct xfs_agi *)blk)->agi_uuid;
2238 break;
2239 case XFS_SYMLINK_MAGIC:
2240 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2241 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2242 break;
2243 case XFS_DIR3_BLOCK_MAGIC:
2244 case XFS_DIR3_DATA_MAGIC:
2245 case XFS_DIR3_FREE_MAGIC:
2246 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2247 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2248 break;
2249 case XFS_ATTR3_RMT_MAGIC:
2250 /*
2251 * Remote attr blocks are written synchronously, rather than
2252 * being logged. That means they do not contain a valid LSN
2253 * (i.e. transactionally ordered) in them, and hence any time we
2254 * see a buffer to replay over the top of a remote attribute
2255 * block we should simply do so.
2256 */
2257 goto recover_immediately;
2258 case XFS_SB_MAGIC:
2259 /*
2260 * superblock uuids are magic. We may or may not have a
2261 * sb_meta_uuid on disk, but it will be set in the in-core
2262 * superblock. We set the uuid pointer for verification
2263 * according to the superblock feature mask to ensure we check
2264 * the relevant UUID in the superblock.
2265 */
2266 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2267 if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2268 uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2269 else
2270 uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2271 break;
2272 default:
2273 break;
2274 }
2275
2276 if (lsn != (xfs_lsn_t)-1) {
2277 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2278 goto recover_immediately;
2279 return lsn;
2280 }
2281
2282 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2283 switch (magicda) {
2284 case XFS_DIR3_LEAF1_MAGIC:
2285 case XFS_DIR3_LEAFN_MAGIC:
2286 case XFS_DA3_NODE_MAGIC:
2287 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2288 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2289 break;
2290 default:
2291 break;
2292 }
2293
2294 if (lsn != (xfs_lsn_t)-1) {
2295 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2296 goto recover_immediately;
2297 return lsn;
2298 }
2299
2300 /*
2301 * We do individual object checks on dquot and inode buffers as they
2302 * have their own individual LSN records. Also, we could have a stale
2303 * buffer here, so we have to at least recognise these buffer types.
2304 *
2305 * A noted complexity here is inode unlinked list processing - it logs
2306 * the inode directly in the buffer, but we don't know which inodes have
2307 * been modified, and there is no global buffer LSN. Hence we need to
2308 * recover all inode buffer types immediately. This problem will be
2309 * fixed by logical logging of the unlinked list modifications.
2310 */
2311 magic16 = be16_to_cpu(*(__be16 *)blk);
2312 switch (magic16) {
2313 case XFS_DQUOT_MAGIC:
2314 case XFS_DINODE_MAGIC:
2315 goto recover_immediately;
2316 default:
2317 break;
2318 }
2319
2320 /* unknown buffer contents, recover immediately */
2321
2322 recover_immediately:
2323 return (xfs_lsn_t)-1;
2324
2325 }
2326
2327 /*
2328 * Validate that the recovered buffer is of the correct type and attach the
2329 * appropriate buffer operations to it for writeback. Magic numbers are in a
2330 * few places:
2331 * the first 16 bits of the buffer (inode buffer, dquot buffer),
2332 * the first 32 bits of the buffer (most blocks),
2333 * inside a struct xfs_da_blkinfo at the start of the buffer.
2334 */
2335 static void
2336 xlog_recover_validate_buf_type(
2337 struct xfs_mount *mp,
2338 struct xfs_buf *bp,
2339 xfs_buf_log_format_t *buf_f,
2340 xfs_lsn_t current_lsn)
2341 {
2342 struct xfs_da_blkinfo *info = bp->b_addr;
2343 uint32_t magic32;
2344 uint16_t magic16;
2345 uint16_t magicda;
2346 char *warnmsg = NULL;
2347
2348 /*
2349 * We can only do post recovery validation on items on CRC enabled
2350 * filesystems as we need to know when the buffer was written to be able
2351 * to determine if we should have replayed the item. If we replay old
2352 * metadata over a newer buffer, then it will enter a temporarily
2353 * inconsistent state resulting in verification failures. Hence for now
2354 * just avoid the verification stage for non-crc filesystems
2355 */
2356 if (!xfs_sb_version_hascrc(&mp->m_sb))
2357 return;
2358
2359 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2360 magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2361 magicda = be16_to_cpu(info->magic);
2362 switch (xfs_blft_from_flags(buf_f)) {
2363 case XFS_BLFT_BTREE_BUF:
2364 switch (magic32) {
2365 case XFS_ABTB_CRC_MAGIC:
2366 case XFS_ABTB_MAGIC:
2367 bp->b_ops = &xfs_bnobt_buf_ops;
2368 break;
2369 case XFS_ABTC_CRC_MAGIC:
2370 case XFS_ABTC_MAGIC:
2371 bp->b_ops = &xfs_cntbt_buf_ops;
2372 break;
2373 case XFS_IBT_CRC_MAGIC:
2374 case XFS_IBT_MAGIC:
2375 bp->b_ops = &xfs_inobt_buf_ops;
2376 break;
2377 case XFS_FIBT_CRC_MAGIC:
2378 case XFS_FIBT_MAGIC:
2379 bp->b_ops = &xfs_finobt_buf_ops;
2380 break;
2381 case XFS_BMAP_CRC_MAGIC:
2382 case XFS_BMAP_MAGIC:
2383 bp->b_ops = &xfs_bmbt_buf_ops;
2384 break;
2385 case XFS_RMAP_CRC_MAGIC:
2386 bp->b_ops = &xfs_rmapbt_buf_ops;
2387 break;
2388 case XFS_REFC_CRC_MAGIC:
2389 bp->b_ops = &xfs_refcountbt_buf_ops;
2390 break;
2391 default:
2392 warnmsg = "Bad btree block magic!";
2393 break;
2394 }
2395 break;
2396 case XFS_BLFT_AGF_BUF:
2397 if (magic32 != XFS_AGF_MAGIC) {
2398 warnmsg = "Bad AGF block magic!";
2399 break;
2400 }
2401 bp->b_ops = &xfs_agf_buf_ops;
2402 break;
2403 case XFS_BLFT_AGFL_BUF:
2404 if (magic32 != XFS_AGFL_MAGIC) {
2405 warnmsg = "Bad AGFL block magic!";
2406 break;
2407 }
2408 bp->b_ops = &xfs_agfl_buf_ops;
2409 break;
2410 case XFS_BLFT_AGI_BUF:
2411 if (magic32 != XFS_AGI_MAGIC) {
2412 warnmsg = "Bad AGI block magic!";
2413 break;
2414 }
2415 bp->b_ops = &xfs_agi_buf_ops;
2416 break;
2417 case XFS_BLFT_UDQUOT_BUF:
2418 case XFS_BLFT_PDQUOT_BUF:
2419 case XFS_BLFT_GDQUOT_BUF:
2420 #ifdef CONFIG_XFS_QUOTA
2421 if (magic16 != XFS_DQUOT_MAGIC) {
2422 warnmsg = "Bad DQUOT block magic!";
2423 break;
2424 }
2425 bp->b_ops = &xfs_dquot_buf_ops;
2426 #else
2427 xfs_alert(mp,
2428 "Trying to recover dquots without QUOTA support built in!");
2429 ASSERT(0);
2430 #endif
2431 break;
2432 case XFS_BLFT_DINO_BUF:
2433 if (magic16 != XFS_DINODE_MAGIC) {
2434 warnmsg = "Bad INODE block magic!";
2435 break;
2436 }
2437 bp->b_ops = &xfs_inode_buf_ops;
2438 break;
2439 case XFS_BLFT_SYMLINK_BUF:
2440 if (magic32 != XFS_SYMLINK_MAGIC) {
2441 warnmsg = "Bad symlink block magic!";
2442 break;
2443 }
2444 bp->b_ops = &xfs_symlink_buf_ops;
2445 break;
2446 case XFS_BLFT_DIR_BLOCK_BUF:
2447 if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2448 magic32 != XFS_DIR3_BLOCK_MAGIC) {
2449 warnmsg = "Bad dir block magic!";
2450 break;
2451 }
2452 bp->b_ops = &xfs_dir3_block_buf_ops;
2453 break;
2454 case XFS_BLFT_DIR_DATA_BUF:
2455 if (magic32 != XFS_DIR2_DATA_MAGIC &&
2456 magic32 != XFS_DIR3_DATA_MAGIC) {
2457 warnmsg = "Bad dir data magic!";
2458 break;
2459 }
2460 bp->b_ops = &xfs_dir3_data_buf_ops;
2461 break;
2462 case XFS_BLFT_DIR_FREE_BUF:
2463 if (magic32 != XFS_DIR2_FREE_MAGIC &&
2464 magic32 != XFS_DIR3_FREE_MAGIC) {
2465 warnmsg = "Bad dir3 free magic!";
2466 break;
2467 }
2468 bp->b_ops = &xfs_dir3_free_buf_ops;
2469 break;
2470 case XFS_BLFT_DIR_LEAF1_BUF:
2471 if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2472 magicda != XFS_DIR3_LEAF1_MAGIC) {
2473 warnmsg = "Bad dir leaf1 magic!";
2474 break;
2475 }
2476 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2477 break;
2478 case XFS_BLFT_DIR_LEAFN_BUF:
2479 if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2480 magicda != XFS_DIR3_LEAFN_MAGIC) {
2481 warnmsg = "Bad dir leafn magic!";
2482 break;
2483 }
2484 bp->b_ops = &xfs_dir3_leafn_buf_ops;
2485 break;
2486 case XFS_BLFT_DA_NODE_BUF:
2487 if (magicda != XFS_DA_NODE_MAGIC &&
2488 magicda != XFS_DA3_NODE_MAGIC) {
2489 warnmsg = "Bad da node magic!";
2490 break;
2491 }
2492 bp->b_ops = &xfs_da3_node_buf_ops;
2493 break;
2494 case XFS_BLFT_ATTR_LEAF_BUF:
2495 if (magicda != XFS_ATTR_LEAF_MAGIC &&
2496 magicda != XFS_ATTR3_LEAF_MAGIC) {
2497 warnmsg = "Bad attr leaf magic!";
2498 break;
2499 }
2500 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2501 break;
2502 case XFS_BLFT_ATTR_RMT_BUF:
2503 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2504 warnmsg = "Bad attr remote magic!";
2505 break;
2506 }
2507 bp->b_ops = &xfs_attr3_rmt_buf_ops;
2508 break;
2509 case XFS_BLFT_SB_BUF:
2510 if (magic32 != XFS_SB_MAGIC) {
2511 warnmsg = "Bad SB block magic!";
2512 break;
2513 }
2514 bp->b_ops = &xfs_sb_buf_ops;
2515 break;
2516 #ifdef CONFIG_XFS_RT
2517 case XFS_BLFT_RTBITMAP_BUF:
2518 case XFS_BLFT_RTSUMMARY_BUF:
2519 /* no magic numbers for verification of RT buffers */
2520 bp->b_ops = &xfs_rtbuf_ops;
2521 break;
2522 #endif /* CONFIG_XFS_RT */
2523 default:
2524 xfs_warn(mp, "Unknown buffer type %d!",
2525 xfs_blft_from_flags(buf_f));
2526 break;
2527 }
2528
2529 /*
2530 * Nothing else to do in the case of a NULL current LSN as this means
2531 * the buffer is more recent than the change in the log and will be
2532 * skipped.
2533 */
2534 if (current_lsn == NULLCOMMITLSN)
2535 return;
2536
2537 if (warnmsg) {
2538 xfs_warn(mp, warnmsg);
2539 ASSERT(0);
2540 }
2541
2542 /*
2543 * We must update the metadata LSN of the buffer as it is written out to
2544 * ensure that older transactions never replay over this one and corrupt
2545 * the buffer. This can occur if log recovery is interrupted at some
2546 * point after the current transaction completes, at which point a
2547 * subsequent mount starts recovery from the beginning.
2548 *
2549 * Write verifiers update the metadata LSN from log items attached to
2550 * the buffer. Therefore, initialize a bli purely to carry the LSN to
2551 * the verifier. We'll clean it up in our ->iodone() callback.
2552 */
2553 if (bp->b_ops) {
2554 struct xfs_buf_log_item *bip;
2555
2556 ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2557 bp->b_iodone = xlog_recover_iodone;
2558 xfs_buf_item_init(bp, mp);
2559 bip = bp->b_log_item;
2560 bip->bli_item.li_lsn = current_lsn;
2561 }
2562 }
2563
2564 /*
2565 * Perform a 'normal' buffer recovery. Each logged region of the
2566 * buffer should be copied over the corresponding region in the
2567 * given buffer. The bitmap in the buf log format structure indicates
2568 * where to place the logged data.
2569 */
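/*
 * A sketch of the mapping, assuming the usual XFS_BLF_CHUNK of 128 bytes
 * (XFS_BLF_SHIFT = 7): a dirty-map run starting at bit 4 with nbits = 2
 * copies item->ri_buf[i] into the buffer at byte offset 4 << 7 = 512,
 * for 2 << 7 = 256 bytes, trimmed down if the logged region itself is
 * shorter than the contiguous bit run.
 */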
2570 STATIC void
2571 xlog_recover_do_reg_buffer(
2572 struct xfs_mount *mp,
2573 xlog_recover_item_t *item,
2574 struct xfs_buf *bp,
2575 xfs_buf_log_format_t *buf_f,
2576 xfs_lsn_t current_lsn)
2577 {
2578 int i;
2579 int bit;
2580 int nbits;
2581 xfs_failaddr_t fa;
2582 const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
2583
2584 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2585
2586 bit = 0;
2587 i = 1; /* 0 is the buf format structure */
2588 while (1) {
2589 bit = xfs_next_bit(buf_f->blf_data_map,
2590 buf_f->blf_map_size, bit);
2591 if (bit == -1)
2592 break;
2593 nbits = xfs_contig_bits(buf_f->blf_data_map,
2594 buf_f->blf_map_size, bit);
2595 ASSERT(nbits > 0);
2596 ASSERT(item->ri_buf[i].i_addr != NULL);
2597 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2598 ASSERT(BBTOB(bp->b_length) >=
2599 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2600
2601 /*
2602 * The dirty regions logged in the buffer, even though
2603 * contiguous, may span multiple chunks. This is because the
2604 * dirty region may span a physical page boundary in a buffer
2605 * and hence be split into two separate vectors for writing into
2606 * the log. Hence we need to trim nbits back to the length of
2607 * the current region being copied out of the log.
2608 */
2609 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2610 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2611
2612 /*
2613 * Do a sanity check if this is a dquot buffer. Just checking
2614 * the first dquot in the buffer should do. XXX This is
2615 * probably a good thing to do for other buf types also.
2616 */
2617 fa = NULL;
2618 if (buf_f->blf_flags &
2619 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2620 if (item->ri_buf[i].i_addr == NULL) {
2621 xfs_alert(mp,
2622 "XFS: NULL dquot in %s.", __func__);
2623 goto next;
2624 }
2625 if (item->ri_buf[i].i_len < size_disk_dquot) {
2626 xfs_alert(mp,
2627 "XFS: dquot too small (%d) in %s.",
2628 item->ri_buf[i].i_len, __func__);
2629 goto next;
2630 }
2631 fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
2632 -1, 0);
2633 if (fa) {
2634 xfs_alert(mp,
2635 "dquot corrupt at %pS trying to replay into block 0x%llx",
2636 fa, bp->b_bn);
2637 goto next;
2638 }
2639 }
2640
2641 memcpy(xfs_buf_offset(bp,
2642 (uint)bit << XFS_BLF_SHIFT), /* dest */
2643 item->ri_buf[i].i_addr, /* source */
2644 nbits<<XFS_BLF_SHIFT); /* length */
2645 next:
2646 i++;
2647 bit += nbits;
2648 }
2649
2650 /* Shouldn't be any more regions */
2651 ASSERT(i == item->ri_total);
2652
2653 xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2654 }
2655
2656 /*
2657 * Perform a dquot buffer recovery.
2658 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2659 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2660 * Else, treat it as a regular buffer and do recovery.
2661 *
2662 * Return false if the buffer was tossed and true if we recovered the buffer to
2663 * indicate to the caller if the buffer needs writing.
2664 */
2665 STATIC bool
2666 xlog_recover_do_dquot_buffer(
2667 struct xfs_mount *mp,
2668 struct xlog *log,
2669 struct xlog_recover_item *item,
2670 struct xfs_buf *bp,
2671 struct xfs_buf_log_format *buf_f)
2672 {
2673 uint type;
2674
2675 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2676
2677 /*
2678 * Filesystems are required to send in quota flags at mount time.
2679 */
2680 if (!mp->m_qflags)
2681 return false;
2682
2683 type = 0;
2684 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2685 type |= XFS_DQ_USER;
2686 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2687 type |= XFS_DQ_PROJ;
2688 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2689 type |= XFS_DQ_GROUP;
2690 /*
2691 * This type of quotas was turned off, so ignore this buffer
2692 */
2693 if (log->l_quotaoffs_flag & type)
2694 return false;
2695
2696 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2697 return true;
2698 }
2699
2700 /*
2701 * This routine replays a modification made to a buffer at runtime.
2702 * There are actually two types of buffer, regular and inode, which
2703 * are handled differently. Inode buffers are handled differently
2704 * in that we only recover a specific set of data from them, namely
2705 * the inode di_next_unlinked fields. This is because all other inode
2706 * data is actually logged via inode records and any data we replay
2707 * here which overlaps that may be stale.
2708 *
2709 * When meta-data buffers are freed at run time we log a buffer item
2710 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2711 * of the buffer in the log should not be replayed at recovery time.
2712 * This is so that if the blocks covered by the buffer are reused for
2713 * file data before we crash we don't end up replaying old, freed
2714 * meta-data into a user's file.
2715 *
2716 * To handle the cancellation of buffer log items, we make two passes
2717 * over the log during recovery. During the first we build a table of
2718 * those buffers which have been cancelled, and during the second we
2719 * only replay those buffers which do not have corresponding cancel
2720 * records in the table. See xlog_recover_buffer_pass[1,2] above
2721 * for more details on the implementation of the table of cancel records.
2722 */
2723 STATIC int
2724 xlog_recover_buffer_pass2(
2725 struct xlog *log,
2726 struct list_head *buffer_list,
2727 struct xlog_recover_item *item,
2728 xfs_lsn_t current_lsn)
2729 {
2730 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2731 xfs_mount_t *mp = log->l_mp;
2732 xfs_buf_t *bp;
2733 int error;
2734 uint buf_flags;
2735 xfs_lsn_t lsn;
2736
2737 /*
2738 * In this pass we only want to recover all the buffers which have
2739 * not been cancelled and are not cancellation buffers themselves.
2740 */
2741 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2742 buf_f->blf_len, buf_f->blf_flags)) {
2743 trace_xfs_log_recover_buf_cancel(log, buf_f);
2744 return 0;
2745 }
2746
2747 trace_xfs_log_recover_buf_recover(log, buf_f);
2748
2749 buf_flags = 0;
2750 if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2751 buf_flags |= XBF_UNMAPPED;
2752
2753 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2754 buf_flags, NULL);
2755 if (!bp)
2756 return -ENOMEM;
2757 error = bp->b_error;
2758 if (error) {
2759 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2760 goto out_release;
2761 }
2762
2763 /*
2764 * Recover the buffer only if we get an LSN from it and it's less than
2765 * the lsn of the transaction we are replaying.
2766 *
2767 * Note that we have to be extremely careful of readahead here.
2768 * Readahead does not attach verifiers to the buffers, so if we don't
2769 * actually do any replay after readahead because the LSN we found in
2770 * the buffer is more recent than the current transaction, then we
2771 * need to attach the verifier directly. Failure to do so means that
2772 * future recovery actions (e.g. EFI and unlinked list recovery) could
2773 * operate on the buffers without the verifier attached. This can
2774 * lead to blocks on disk having the correct content but a stale
2775 * CRC.
2776 *
2777 * It is safe to assume these clean buffers are currently up to date.
2778 * If the buffer is dirtied by a later transaction being replayed, then
2779 * the verifier will be reset to match whatever recovery turns that
2780 * buffer into.
2781 */
2782 lsn = xlog_recover_get_buf_lsn(mp, bp);
2783 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2784 trace_xfs_log_recover_buf_skip(log, buf_f);
2785 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2786
2787 /*
2788 * We're skipping replay of this buffer log item due to the log
2789 * item LSN being behind the ondisk buffer. Verify the buffer
2790 * contents since we aren't going to run the write verifier.
2791 */
2792 if (bp->b_ops) {
2793 bp->b_ops->verify_read(bp);
2794 error = bp->b_error;
2795 }
2796 goto out_release;
2797 }
2798
2799 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2800 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2801 if (error)
2802 goto out_release;
2803 } else if (buf_f->blf_flags &
2804 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2805 bool dirty;
2806
2807 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2808 if (!dirty)
2809 goto out_release;
2810 } else {
2811 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2812 }
2813
2814 /*
2815 * Perform delayed write on the buffer. Asynchronous writes will be
2816 * slower when taking into account all the buffers to be flushed.
2817 *
2818 * Also make sure that only inode buffers with good sizes stay in
2819 * the buffer cache. The kernel moves inodes in buffers of 1 block
2820 * or inode_cluster_size bytes, whichever is bigger. The inode
2821 * buffers in the log can be a different size if the log was generated
2822 * by an older kernel using unclustered inode buffers or a newer kernel
2823 * running with a different inode cluster size. Regardless, if the
2824 * inode buffer size isn't max(blocksize, inode_cluster_size)
2825 * for *our* value of inode_cluster_size, then we need to keep
2826 * the buffer out of the buffer cache so that the buffer won't
2827 * overlap with future reads of those inodes.
2828 */
2829 if (XFS_DINODE_MAGIC ==
2830 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2831 (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
2832 xfs_buf_stale(bp);
2833 error = xfs_bwrite(bp);
2834 } else {
2835 ASSERT(bp->b_mount == mp);
2836 bp->b_iodone = xlog_recover_iodone;
2837 xfs_buf_delwri_queue(bp, buffer_list);
2838 }
2839
2840 out_release:
2841 xfs_buf_relse(bp);
2842 return error;
2843 }
2844
2845 /*
2846 * Inode fork owner changes
2847 *
2848 * If we have been told that we have to reparent the inode fork, it's because an
2849 * extent swap operation on a CRC enabled filesystem has been done and we are
2850 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2851 * owners of it.
2852 *
2853 * The complexity here is that we don't have an inode context to work with, so
2854 * after we've replayed the inode we need to instantiate one. This is where the
2855 * fun begins.
2856 *
2857 * We are in the middle of log recovery, so we can't run transactions. That
2858 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2859 * that will result in the corresponding iput() running the inode through
2860 * xfs_inactive(). If we've just replayed an inode core that changes the link
2861 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2862 * transactions (bad!).
2863 *
2864 * So, to avoid this, we instantiate an inode directly from the inode core we've
2865 * just recovered. We have the buffer still locked, and all we really need to
2866 * instantiate is the inode core and the forks being modified. We can do this
2867 * manually, then run the inode btree owner change, and then tear down the
2868 * xfs_inode without having to run any transactions at all.
2869 *
2870 * Also, because we don't have a transaction context available here, but we
2871 * need to gather all the buffers we modify for writeback, we pass the
2872 * buffer_list to the operation to use instead.
2873 */
2874
2875 STATIC int
2876 xfs_recover_inode_owner_change(
2877 struct xfs_mount *mp,
2878 struct xfs_dinode *dip,
2879 struct xfs_inode_log_format *in_f,
2880 struct list_head *buffer_list)
2881 {
2882 struct xfs_inode *ip;
2883 int error;
2884
2885 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2886
2887 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2888 if (!ip)
2889 return -ENOMEM;
2890
2891 /* instantiate the inode */
2892 ASSERT(dip->di_version >= 3);
2893 xfs_inode_from_disk(ip, dip);
2894
2895 error = xfs_iformat_fork(ip, dip);
2896 if (error)
2897 goto out_free_ip;
2898
2899 if (!xfs_inode_verify_forks(ip)) {
2900 error = -EFSCORRUPTED;
2901 goto out_free_ip;
2902 }
2903
2904 if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2905 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2906 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2907 ip->i_ino, buffer_list);
2908 if (error)
2909 goto out_free_ip;
2910 }
2911
2912 if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2913 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2914 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2915 ip->i_ino, buffer_list);
2916 if (error)
2917 goto out_free_ip;
2918 }
2919
2920 out_free_ip:
2921 xfs_inode_free(ip);
2922 return error;
2923 }
2924
2925 STATIC int
2926 xlog_recover_inode_pass2(
2927 struct xlog *log,
2928 struct list_head *buffer_list,
2929 struct xlog_recover_item *item,
2930 xfs_lsn_t current_lsn)
2931 {
2932 struct xfs_inode_log_format *in_f;
2933 xfs_mount_t *mp = log->l_mp;
2934 xfs_buf_t *bp;
2935 xfs_dinode_t *dip;
2936 int len;
2937 char *src;
2938 char *dest;
2939 int error;
2940 int attr_index;
2941 uint fields;
2942 struct xfs_log_dinode *ldip;
2943 uint isize;
2944 int need_free = 0;
2945
2946 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2947 in_f = item->ri_buf[0].i_addr;
2948 } else {
2949 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2950 need_free = 1;
2951 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2952 if (error)
2953 goto error;
2954 }
2955
2956 /*
2957 * Inode buffers can be freed; look out for that case
2958 * and do not replay the inode.
2959 */
2960 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2961 in_f->ilf_len, 0)) {
2962 error = 0;
2963 trace_xfs_log_recover_inode_cancel(log, in_f);
2964 goto error;
2965 }
2966 trace_xfs_log_recover_inode_recover(log, in_f);
2967
2968 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2969 &xfs_inode_buf_ops);
2970 if (!bp) {
2971 error = -ENOMEM;
2972 goto error;
2973 }
2974 error = bp->b_error;
2975 if (error) {
2976 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2977 goto out_release;
2978 }
2979 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2980 dip = xfs_buf_offset(bp, in_f->ilf_boffset);
2981
2982 /*
2983 * Make sure the place we're flushing out to really looks
2984 * like an inode!
2985 */
2986 if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
2987 xfs_alert(mp,
2988 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
2989 __func__, dip, bp, in_f->ilf_ino);
2990 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2991 XFS_ERRLEVEL_LOW, mp);
2992 error = -EFSCORRUPTED;
2993 goto out_release;
2994 }
2995 ldip = item->ri_buf[1].i_addr;
2996 if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
2997 xfs_alert(mp,
2998 "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
2999 __func__, item, in_f->ilf_ino);
3000 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
3001 XFS_ERRLEVEL_LOW, mp);
3002 error = -EFSCORRUPTED;
3003 goto out_release;
3004 }
3005
3006 /*
3007 * If the inode has an LSN in it, recover the inode only if it's less
3008 * than the lsn of the transaction we are replaying. Note: we still
3009 * need to replay an owner change even though the inode is more recent
3010 * than the transaction as there is no guarantee that all the btree
3011 * blocks are more recent than this transaction, too.
3012 */
3013 if (dip->di_version >= 3) {
3014 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
3015
3016 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3017 trace_xfs_log_recover_inode_skip(log, in_f);
3018 error = 0;
3019 goto out_owner_change;
3020 }
3021 }
3022
3023 /*
3024 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3025 * are transactional and if ordering is necessary we can determine that
3026 * more accurately by the LSN field in the V3 inode core. Don't trust
3027 * the inode versions as we might be changing them here - use the
3028 * superblock flag to determine whether we need to look at di_flushiter
3029 * to skip replay when the on disk inode is newer than the log one.
3030 */
3031 if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
3032 ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3033 /*
3034 * Deal with the wrap case, DI_MAX_FLUSH is less
3035 * than smaller numbers
3036 */
3037 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3038 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3039 /* do nothing */
3040 } else {
3041 trace_xfs_log_recover_inode_skip(log, in_f);
3042 error = 0;
3043 goto out_release;
3044 }
3045 }
3046
3047 /* Take the opportunity to reset the flush iteration count */
3048 ldip->di_flushiter = 0;
3049
3050 if (unlikely(S_ISREG(ldip->di_mode))) {
3051 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3052 (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3053 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3054 XFS_ERRLEVEL_LOW, mp, ldip,
3055 sizeof(*ldip));
3056 xfs_alert(mp,
3057 "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
3058 "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3059 __func__, item, dip, bp, in_f->ilf_ino);
3060 error = -EFSCORRUPTED;
3061 goto out_release;
3062 }
3063 } else if (unlikely(S_ISDIR(ldip->di_mode))) {
3064 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3065 (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3066 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3067 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3068 XFS_ERRLEVEL_LOW, mp, ldip,
3069 sizeof(*ldip));
3070 xfs_alert(mp,
3071 "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
3072 "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3073 __func__, item, dip, bp, in_f->ilf_ino);
3074 error = -EFSCORRUPTED;
3075 goto out_release;
3076 }
3077 }
3078 if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3079 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3080 XFS_ERRLEVEL_LOW, mp, ldip,
3081 sizeof(*ldip));
3082 xfs_alert(mp,
3083 "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3084 "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
3085 __func__, item, dip, bp, in_f->ilf_ino,
3086 ldip->di_nextents + ldip->di_anextents,
3087 ldip->di_nblocks);
3088 error = -EFSCORRUPTED;
3089 goto out_release;
3090 }
3091 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3092 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3093 XFS_ERRLEVEL_LOW, mp, ldip,
3094 sizeof(*ldip));
3095 xfs_alert(mp,
3096 "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3097 "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
3098 item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3099 error = -EFSCORRUPTED;
3100 goto out_release;
3101 }
3102 isize = xfs_log_dinode_size(mp);
3103 if (unlikely(item->ri_buf[1].i_len > isize)) {
3104 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3105 XFS_ERRLEVEL_LOW, mp, ldip,
3106 sizeof(*ldip));
3107 xfs_alert(mp,
3108 "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
3109 __func__, item->ri_buf[1].i_len, item);
3110 error = -EFSCORRUPTED;
3111 goto out_release;
3112 }
3113
3114 /* recover the log dinode inode into the on disk inode */
3115 xfs_log_dinode_to_disk(ldip, dip);
3116
3117 fields = in_f->ilf_fields;
3118 if (fields & XFS_ILOG_DEV)
3119 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3120
3121 if (in_f->ilf_size == 2)
3122 goto out_owner_change;
3123 len = item->ri_buf[2].i_len;
3124 src = item->ri_buf[2].i_addr;
3125 ASSERT(in_f->ilf_size <= 4);
3126 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3127 ASSERT(!(fields & XFS_ILOG_DFORK) ||
3128 (len == in_f->ilf_dsize));
3129
3130 switch (fields & XFS_ILOG_DFORK) {
3131 case XFS_ILOG_DDATA:
3132 case XFS_ILOG_DEXT:
3133 memcpy(XFS_DFORK_DPTR(dip), src, len);
3134 break;
3135
3136 case XFS_ILOG_DBROOT:
3137 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3138 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3139 XFS_DFORK_DSIZE(dip, mp));
3140 break;
3141
3142 default:
3143 /*
3144 * There are no data fork flags set.
3145 */
3146 ASSERT((fields & XFS_ILOG_DFORK) == 0);
3147 break;
3148 }
3149
3150 /*
3151 * If we logged any attribute data, recover it. There may or
3152 * may not have been any other non-core data logged in this
3153 * transaction.
3154 */
3155 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3156 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3157 attr_index = 3;
3158 } else {
3159 attr_index = 2;
3160 }
3161 len = item->ri_buf[attr_index].i_len;
3162 src = item->ri_buf[attr_index].i_addr;
3163 ASSERT(len == in_f->ilf_asize);
3164
3165 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3166 case XFS_ILOG_ADATA:
3167 case XFS_ILOG_AEXT:
3168 dest = XFS_DFORK_APTR(dip);
3169 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3170 memcpy(dest, src, len);
3171 break;
3172
3173 case XFS_ILOG_ABROOT:
3174 dest = XFS_DFORK_APTR(dip);
3175 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3176 len, (xfs_bmdr_block_t*)dest,
3177 XFS_DFORK_ASIZE(dip, mp));
3178 break;
3179
3180 default:
3181 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3182 ASSERT(0);
3183 error = -EFSCORRUPTED;
3184 goto out_release;
3185 }
3186 }
3187
3188 out_owner_change:
3189 /* Recover the swapext owner change unless inode has been deleted */
3190 if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
3191 (dip->di_mode != 0))
3192 error = xfs_recover_inode_owner_change(mp, dip, in_f,
3193 buffer_list);
3194 /* re-generate the checksum. */
3195 xfs_dinode_calc_crc(log->l_mp, dip);
3196
3197 ASSERT(bp->b_mount == mp);
3198 bp->b_iodone = xlog_recover_iodone;
3199 xfs_buf_delwri_queue(bp, buffer_list);
3200
3201 out_release:
3202 xfs_buf_relse(bp);
3203 error:
3204 if (need_free)
3205 kmem_free(in_f);
3206 return error;
3207 }
3208
3209 /*
3210 * Recover QUOTAOFF records. We simply make a note of it in the xlog
3211 * structure, so that we know not to do any dquot item or dquot buffer
3212 * recovery of that type.
3213 */
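/*
 * Example of the interplay, as a sketch: a user quotaoff item seen in
 * pass 1 sets XFS_DQ_USER in l_quotaoffs_flag, so a later user dquot
 * item or XFS_BLF_UDQUOT_BUF buffer in pass 2 is silently skipped by
 * xlog_recover_dquot_pass2() and xlog_recover_do_dquot_buffer().
 */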
3214 STATIC int
3215 xlog_recover_quotaoff_pass1(
3216 struct xlog *log,
3217 struct xlog_recover_item *item)
3218 {
3219 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
3220 ASSERT(qoff_f);
3221
3222 /*
3223 * The logitem format's flag tells us if this was user quotaoff,
3224 * group/project quotaoff or both.
3225 */
3226 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3227 log->l_quotaoffs_flag |= XFS_DQ_USER;
3228 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3229 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3230 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3231 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3232
3233 return 0;
3234 }
3235
3236 /*
3237 * Recover a dquot record
3238 */
3239 STATIC int
3240 xlog_recover_dquot_pass2(
3241 struct xlog *log,
3242 struct list_head *buffer_list,
3243 struct xlog_recover_item *item,
3244 xfs_lsn_t current_lsn)
3245 {
3246 xfs_mount_t *mp = log->l_mp;
3247 xfs_buf_t *bp;
3248 struct xfs_disk_dquot *ddq, *recddq;
3249 xfs_failaddr_t fa;
3250 int error;
3251 xfs_dq_logformat_t *dq_f;
3252 uint type;
3253
3254
3255 /*
3256 * Filesystems are required to send in quota flags at mount time.
3257 */
3258 if (mp->m_qflags == 0)
3259 return 0;
3260
3261 recddq = item->ri_buf[1].i_addr;
3262 if (recddq == NULL) {
3263 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3264 return -EFSCORRUPTED;
3265 }
3266 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
3267 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3268 item->ri_buf[1].i_len, __func__);
3269 return -EFSCORRUPTED;
3270 }
3271
3272 /*
3273 * This type of quotas was turned off, so ignore this record.
3274 */
3275 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3276 ASSERT(type);
3277 if (log->l_quotaoffs_flag & type)
3278 return 0;
3279
3280 /*
3281 * At this point we know that quota was _not_ turned off.
3282 * Since the mount flags are not indicating to us otherwise, this
3283 * must mean that quota is on, and the dquot needs to be replayed.
3284 * Remember that we may not have fully recovered the superblock yet,
3285 * so we can't do the usual trick of looking at the SB quota bits.
3286 *
3287 * The other possibility, of course, is that the quota subsystem was
3288 * removed since the last mount - ENOSYS.
3289 */
3290 dq_f = item->ri_buf[0].i_addr;
3291 ASSERT(dq_f);
3292 fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
3293 if (fa) {
3294 xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
3295 dq_f->qlf_id, fa);
3296 return -EFSCORRUPTED;
3297 }
3298 ASSERT(dq_f->qlf_len == 1);
3299
3300 /*
3301 * At this point we are assuming that the dquots have been allocated
3302 * and hence the buffer has valid dquots stamped in it. It should,
3303 * therefore, pass verifier validation. If the dquot is bad, then
3304 * we'll return an error here, so we don't need to specifically check
3305 * the dquot in the buffer after the verifier has run.
3306 */
3307 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3308 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3309 &xfs_dquot_buf_ops);
3310 if (error)
3311 return error;
3312
3313 ASSERT(bp);
3314 ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3315
3316 /*
3317 * If the dquot has an LSN in it, recover the dquot only if it's less
3318 * than the lsn of the transaction we are replaying.
3319 */
3320 if (xfs_sb_version_hascrc(&mp->m_sb)) {
3321 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3322 xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3323
3324 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3325 goto out_release;
3326 }
3327 }
3328
3329 memcpy(ddq, recddq, item->ri_buf[1].i_len);
3330 if (xfs_sb_version_hascrc(&mp->m_sb)) {
3331 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3332 XFS_DQUOT_CRC_OFF);
3333 }
3334
3335 ASSERT(dq_f->qlf_size == 2);
3336 ASSERT(bp->b_mount == mp);
3337 bp->b_iodone = xlog_recover_iodone;
3338 xfs_buf_delwri_queue(bp, buffer_list);
3339
3340 out_release:
3341 xfs_buf_relse(bp);
3342 return 0;
3343 }
3344
3345 /*
3346 * This routine is called to create an in-core extent free intent
3347 * item from the efi format structure which was logged on disk.
3348 * It allocates an in-core efi, copies the extents from the format
3349 * structure into it, and adds the efi to the AIL with the given
3350 * LSN.
3351 */
3352 STATIC int
3353 xlog_recover_efi_pass2(
3354 struct xlog *log,
3355 struct xlog_recover_item *item,
3356 xfs_lsn_t lsn)
3357 {
3358 int error;
3359 struct xfs_mount *mp = log->l_mp;
3360 struct xfs_efi_log_item *efip;
3361 struct xfs_efi_log_format *efi_formatp;
3362
3363 efi_formatp = item->ri_buf[0].i_addr;
3364
3365 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3366 error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3367 if (error) {
3368 xfs_efi_item_free(efip);
3369 return error;
3370 }
3371 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3372
3373 spin_lock(&log->l_ailp->ail_lock);
3374 /*
3375 * The EFI has two references. One for the EFD and one for EFI to ensure
3376 * it makes it into the AIL. Insert the EFI into the AIL directly and
3377 * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3378 * AIL lock.
3379 */
3380 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3381 xfs_efi_release(efip);
3382 return 0;
3383 }
3384
3385
3386 /*
3387 * This routine is called when an EFD format structure is found in a committed
3388 * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3389 * was still in the log. To do this it searches the AIL for the EFI with an id
3390 * equal to that in the EFD format structure. If we find it we drop the EFD
3391 * reference, which removes the EFI from the AIL and frees it.
3392 */
3393 STATIC int
3394 xlog_recover_efd_pass2(
3395 struct xlog *log,
3396 struct xlog_recover_item *item)
3397 {
3398 xfs_efd_log_format_t *efd_formatp;
3399 struct xfs_efi_log_item *efip = NULL;
3400 struct xfs_log_item *lip;
3401 uint64_t efi_id;
3402 struct xfs_ail_cursor cur;
3403 struct xfs_ail *ailp = log->l_ailp;
3404
3405 efd_formatp = item->ri_buf[0].i_addr;
3406 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3407 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3408 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3409 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3410 efi_id = efd_formatp->efd_efi_id;
3411
3412 /*
3413 * Search for the EFI with the id in the EFD format structure in the
3414 * AIL.
3415 */
3416 spin_lock(&ailp->ail_lock);
3417 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3418 while (lip != NULL) {
3419 if (lip->li_type == XFS_LI_EFI) {
3420 efip = (struct xfs_efi_log_item *)lip;
3421 if (efip->efi_format.efi_id == efi_id) {
3422 /*
3423 * Drop the EFD reference to the EFI. This
3424 * removes the EFI from the AIL and frees it.
3425 */
3426 spin_unlock(&ailp->ail_lock);
3427 xfs_efi_release(efip);
3428 spin_lock(&ailp->ail_lock);
3429 break;
3430 }
3431 }
3432 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3433 }
3434
3435 xfs_trans_ail_cursor_done(&cur);
3436 spin_unlock(&ailp->ail_lock);
3437
3438 return 0;
3439 }
3440
3441 /*
3442 * This routine is called to create an in-core extent rmap update
3443 * item from the rui format structure which was logged on disk.
3444 * It allocates an in-core rui, copies the extents from the format
3445 * structure into it, and adds the rui to the AIL with the given
3446 * LSN.
3447 */
3448 STATIC int
3449 xlog_recover_rui_pass2(
3450 struct xlog *log,
3451 struct xlog_recover_item *item,
3452 xfs_lsn_t lsn)
3453 {
3454 int error;
3455 struct xfs_mount *mp = log->l_mp;
3456 struct xfs_rui_log_item *ruip;
3457 struct xfs_rui_log_format *rui_formatp;
3458
3459 rui_formatp = item->ri_buf[0].i_addr;
3460
3461 ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
3462 error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
3463 if (error) {
3464 xfs_rui_item_free(ruip);
3465 return error;
3466 }
3467 atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
3468
3469 spin_lock(&log->l_ailp->ail_lock);
3470 /*
3471 * The RUI has two references. One for the RUD and one for RUI to ensure
3472 * it makes it into the AIL. Insert the RUI into the AIL directly and
3473 * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3474 * AIL lock.
3475 */
3476 xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
3477 xfs_rui_release(ruip);
3478 return 0;
3479 }
3480
3481
3482 /*
3483 * This routine is called when an RUD format structure is found in a committed
3484 * transaction in the log. Its purpose is to cancel the corresponding RUI if it
3485 * was still in the log. To do this it searches the AIL for the RUI with an id
3486 * equal to that in the RUD format structure. If we find it we drop the RUD
3487 * reference, which removes the RUI from the AIL and frees it.
3488 */
3489 STATIC int
3490 xlog_recover_rud_pass2(
3491 struct xlog *log,
3492 struct xlog_recover_item *item)
3493 {
3494 struct xfs_rud_log_format *rud_formatp;
3495 struct xfs_rui_log_item *ruip = NULL;
3496 struct xfs_log_item *lip;
3497 uint64_t rui_id;
3498 struct xfs_ail_cursor cur;
3499 struct xfs_ail *ailp = log->l_ailp;
3500
3501 rud_formatp = item->ri_buf[0].i_addr;
3502 ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
3503 rui_id = rud_formatp->rud_rui_id;
3504
3505 /*
3506 * Search for the RUI with the id in the RUD format structure in the
3507 * AIL.
3508 */
3509 spin_lock(&ailp->ail_lock);
3510 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3511 while (lip != NULL) {
3512 if (lip->li_type == XFS_LI_RUI) {
3513 ruip = (struct xfs_rui_log_item *)lip;
3514 if (ruip->rui_format.rui_id == rui_id) {
3515 /*
3516 * Drop the RUD reference to the RUI. This
3517 * removes the RUI from the AIL and frees it.
3518 */
3519 spin_unlock(&ailp->ail_lock);
3520 xfs_rui_release(ruip);
3521 spin_lock(&ailp->ail_lock);
3522 break;
3523 }
3524 }
3525 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3526 }
3527
3528 xfs_trans_ail_cursor_done(&cur);
3529 spin_unlock(&ailp->ail_lock);
3530
3531 return 0;
3532 }
3533
3534 /*
3535 * Copy a CUI format buffer from the given buf, and into the destination
3536 * CUI format structure. The CUI/CUD items were designed not to need any
3537 * special alignment handling.
3538 */
3539 static int
3540 xfs_cui_copy_format(
3541 struct xfs_log_iovec *buf,
3542 struct xfs_cui_log_format *dst_cui_fmt)
3543 {
3544 struct xfs_cui_log_format *src_cui_fmt;
3545 uint len;
3546
3547 src_cui_fmt = buf->i_addr;
3548 len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
3549
3550 if (buf->i_len == len) {
3551 memcpy(dst_cui_fmt, src_cui_fmt, len);
3552 return 0;
3553 }
3554 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
3555 return -EFSCORRUPTED;
3556 }
3557
3558 /*
3559 * This routine is called to create an in-core extent refcount update
3560 * item from the cui format structure which was logged on disk.
3561 * It allocates an in-core cui, copies the extents from the format
3562 * structure into it, and adds the cui to the AIL with the given
3563 * LSN.
3564 */
3565 STATIC int
3566 xlog_recover_cui_pass2(
3567 struct xlog *log,
3568 struct xlog_recover_item *item,
3569 xfs_lsn_t lsn)
3570 {
3571 int error;
3572 struct xfs_mount *mp = log->l_mp;
3573 struct xfs_cui_log_item *cuip;
3574 struct xfs_cui_log_format *cui_formatp;
3575
3576 cui_formatp = item->ri_buf[0].i_addr;
3577
3578 cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
3579 error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
3580 if (error) {
3581 xfs_cui_item_free(cuip);
3582 return error;
3583 }
3584 atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
3585
3586 spin_lock(&log->l_ailp->ail_lock);
3587 /*
3588 * The CUI has two references. One for the CUD and one for CUI to ensure
3589 * it makes it into the AIL. Insert the CUI into the AIL directly and
3590 * drop the CUI reference. Note that xfs_trans_ail_update() drops the
3591 * AIL lock.
3592 */
3593 xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
3594 xfs_cui_release(cuip);
3595 return 0;
3596 }
3597
3598
3599 /*
3600 * This routine is called when a CUD format structure is found in a committed
3601 * transaction in the log. Its purpose is to cancel the corresponding CUI if it
3602 * was still in the log. To do this it searches the AIL for the CUI with an id
3603 * equal to that in the CUD format structure. If we find it we drop the CUD
3604 * reference, which removes the CUI from the AIL and frees it.
3605 */
3606 STATIC int
3607 xlog_recover_cud_pass2(
3608 struct xlog *log,
3609 struct xlog_recover_item *item)
3610 {
3611 struct xfs_cud_log_format *cud_formatp;
3612 struct xfs_cui_log_item *cuip = NULL;
3613 struct xfs_log_item *lip;
3614 uint64_t cui_id;
3615 struct xfs_ail_cursor cur;
3616 struct xfs_ail *ailp = log->l_ailp;
3617
3618 cud_formatp = item->ri_buf[0].i_addr;
3619 if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
3620 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3621 return -EFSCORRUPTED;
3622 }
3623 cui_id = cud_formatp->cud_cui_id;
3624
3625 /*
3626 * Search for the CUI with the id in the CUD format structure in the
3627 * AIL.
3628 */
3629 spin_lock(&ailp->ail_lock);
3630 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3631 while (lip != NULL) {
3632 if (lip->li_type == XFS_LI_CUI) {
3633 cuip = (struct xfs_cui_log_item *)lip;
3634 if (cuip->cui_format.cui_id == cui_id) {
3635 /*
3636 * Drop the CUD reference to the CUI. This
3637 * removes the CUI from the AIL and frees it.
3638 */
3639 spin_unlock(&ailp->ail_lock);
3640 xfs_cui_release(cuip);
3641 spin_lock(&ailp->ail_lock);
3642 break;
3643 }
3644 }
3645 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3646 }
3647
3648 xfs_trans_ail_cursor_done(&cur);
3649 spin_unlock(&ailp->ail_lock);
3650
3651 return 0;
3652 }
3653
3654 /*
3655 * Copy a BUI format buffer from the given buf, and into the destination
3656 * BUI format structure. The BUI/BUD items were designed not to need any
3657 * special alignment handling.
3658 */
3659 static int
3660 xfs_bui_copy_format(
3661 struct xfs_log_iovec *buf,
3662 struct xfs_bui_log_format *dst_bui_fmt)
3663 {
3664 struct xfs_bui_log_format *src_bui_fmt;
3665 uint len;
3666
3667 src_bui_fmt = buf->i_addr;
3668 len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
3669
3670 if (buf->i_len == len) {
3671 memcpy(dst_bui_fmt, src_bui_fmt, len);
3672 return 0;
3673 }
3674 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
3675 return -EFSCORRUPTED;
3676 }
3677
3678 /*
3679 * This routine is called to create an in-core extent bmap update
3680 * item from the bui format structure which was logged on disk.
3681 * It allocates an in-core bui, copies the extents from the format
3682 * structure into it, and adds the bui to the AIL with the given
3683 * LSN.
3684 */
3685 STATIC int
3686 xlog_recover_bui_pass2(
3687 struct xlog *log,
3688 struct xlog_recover_item *item,
3689 xfs_lsn_t lsn)
3690 {
3691 int error;
3692 struct xfs_mount *mp = log->l_mp;
3693 struct xfs_bui_log_item *buip;
3694 struct xfs_bui_log_format *bui_formatp;
3695
3696 bui_formatp = item->ri_buf[0].i_addr;
3697
3698 if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
3699 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3700 return -EFSCORRUPTED;
3701 }
3702 buip = xfs_bui_init(mp);
3703 error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
3704 if (error) {
3705 xfs_bui_item_free(buip);
3706 return error;
3707 }
3708 atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
3709
3710 spin_lock(&log->l_ailp->ail_lock);
3711 /*
3712 * The BUI has two references. One for the BUD and one for BUI to ensure
3713 * it makes it into the AIL. Insert the BUI into the AIL directly and
3714 * drop the BUI reference. Note that xfs_trans_ail_update() drops the
3715 * AIL lock.
3716 */
3717 xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
3718 xfs_bui_release(buip);
3719 return 0;
3720 }
3721
3722
3723 /*
3724 * This routine is called when a BUD format structure is found in a committed
3725 * transaction in the log. Its purpose is to cancel the corresponding BUI if it
3726 * was still in the log. To do this it searches the AIL for the BUI with an id
3727 * equal to that in the BUD format structure. If we find it we drop the BUD
3728 * reference, which removes the BUI from the AIL and frees it.
3729 */
3730 STATIC int
3731 xlog_recover_bud_pass2(
3732 struct xlog *log,
3733 struct xlog_recover_item *item)
3734 {
3735 struct xfs_bud_log_format *bud_formatp;
3736 struct xfs_bui_log_item *buip = NULL;
3737 struct xfs_log_item *lip;
3738 uint64_t bui_id;
3739 struct xfs_ail_cursor cur;
3740 struct xfs_ail *ailp = log->l_ailp;
3741
3742 bud_formatp = item->ri_buf[0].i_addr;
3743 if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
3744 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3745 return -EFSCORRUPTED;
3746 }
3747 bui_id = bud_formatp->bud_bui_id;
3748
3749 /*
3750 * Search for the BUI with the id in the BUD format structure in the
3751 * AIL.
3752 */
3753 spin_lock(&ailp->ail_lock);
3754 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3755 while (lip != NULL) {
3756 if (lip->li_type == XFS_LI_BUI) {
3757 buip = (struct xfs_bui_log_item *)lip;
3758 if (buip->bui_format.bui_id == bui_id) {
3759 /*
3760 * Drop the BUD reference to the BUI. This
3761 * removes the BUI from the AIL and frees it.
3762 */
3763 spin_unlock(&ailp->ail_lock);
3764 xfs_bui_release(buip);
3765 spin_lock(&ailp->ail_lock);
3766 break;
3767 }
3768 }
3769 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3770 }
3771
3772 xfs_trans_ail_cursor_done(&cur);
3773 spin_unlock(&ailp->ail_lock);
3774
3775 return 0;
3776 }
3777
3778 /*
3779 * This routine is called when an inode create format structure is found in a
3780 * committed transaction in the log. Its purpose is to initialise the inodes
3781 * being allocated on disk. This requires us to get inode cluster buffers that
3782 * match the range to be initialised, stamped with inode templates and written
3783 * by delayed write so that subsequent modifications will hit the cached buffer
3784 * and only need writing out at the end of recovery.
3785 */
3786 STATIC int
3787 xlog_recover_do_icreate_pass2(
3788 struct xlog *log,
3789 struct list_head *buffer_list,
3790 xlog_recover_item_t *item)
3791 {
3792 struct xfs_mount *mp = log->l_mp;
3793 struct xfs_icreate_log *icl;
3794 struct xfs_ino_geometry *igeo = M_IGEO(mp);
3795 xfs_agnumber_t agno;
3796 xfs_agblock_t agbno;
3797 unsigned int count;
3798 unsigned int isize;
3799 xfs_agblock_t length;
3800 int bb_per_cluster;
3801 int cancel_count;
3802 int nbufs;
3803 int i;
3804
3805 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3806 if (icl->icl_type != XFS_LI_ICREATE) {
3807 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3808 return -EINVAL;
3809 }
3810
3811 if (icl->icl_size != 1) {
3812 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3813 return -EINVAL;
3814 }
3815
3816 agno = be32_to_cpu(icl->icl_ag);
3817 if (agno >= mp->m_sb.sb_agcount) {
3818 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3819 return -EINVAL;
3820 }
3821 agbno = be32_to_cpu(icl->icl_agbno);
3822 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3823 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3824 return -EINVAL;
3825 }
3826 isize = be32_to_cpu(icl->icl_isize);
3827 if (isize != mp->m_sb.sb_inodesize) {
3828 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3829 return -EINVAL;
3830 }
3831 count = be32_to_cpu(icl->icl_count);
3832 if (!count) {
3833 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3834 return -EINVAL;
3835 }
3836 length = be32_to_cpu(icl->icl_length);
3837 if (!length || length >= mp->m_sb.sb_agblocks) {
3838 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3839 return -EINVAL;
3840 }
3841
3842 /*
3843 * The inode chunk is either full or sparse and we only support
3844 * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
3845 */
3846 if (length != igeo->ialloc_blks &&
3847 length != igeo->ialloc_min_blks) {
3848 xfs_warn(log->l_mp,
3849 "%s: unsupported chunk length", __FUNCTION__);
3850 return -EINVAL;
3851 }
3852
3853 /* verify inode count is consistent with extent length */
3854 if ((count >> mp->m_sb.sb_inopblog) != length) {
3855 xfs_warn(log->l_mp,
3856 "%s: inconsistent inode count and chunk length",
3857 __func__);
3858 return -EINVAL;
3859 }
3860
3861 /*
3862 * The icreate transaction can cover multiple cluster buffers and these
3863 * buffers could have been freed and reused. Check the individual
3864 * buffers for cancellation so we don't overwrite anything written after
3865 * a cancellation.
3866 */
3867 bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
3868 nbufs = length / igeo->blocks_per_cluster;
3869 for (i = 0, cancel_count = 0; i < nbufs; i++) {
3870 xfs_daddr_t daddr;
3871
3872 daddr = XFS_AGB_TO_DADDR(mp, agno,
3873 agbno + i * igeo->blocks_per_cluster);
3874 if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3875 cancel_count++;
3876 }
3877
3878 /*
3879 * We currently only use icreate for a single allocation at a time. This
3880 * means we should expect either all or none of the buffers to be
3881 * cancelled. Be conservative and skip replay if at least one buffer is
3882 * cancelled, but warn the user that something is awry if the buffers
3883 * are not consistent.
3884 *
3885 * XXX: This must be refined to only skip cancelled clusters once we use
3886 * icreate for multiple chunk allocations.
3887 */
3888 ASSERT(!cancel_count || cancel_count == nbufs);
3889 if (cancel_count) {
3890 if (cancel_count != nbufs)
3891 xfs_warn(mp,
3892 "WARNING: partial inode chunk cancellation, skipped icreate.");
3893 trace_xfs_log_recover_icreate_cancel(log, icl);
3894 return 0;
3895 }
3896
3897 trace_xfs_log_recover_icreate_recover(log, icl);
3898 return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3899 length, be32_to_cpu(icl->icl_gen));
3900 }
3901
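/*
 * Start readahead for the buffer referenced by a buf log item so it is likely
 * to be in memory by the time pass 2 replays it. Cancelled buffers are
 * skipped, and the readahead is issued without a verifier.
 */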
3902 STATIC void
3903 xlog_recover_buffer_ra_pass2(
3904 struct xlog *log,
3905 struct xlog_recover_item *item)
3906 {
3907 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
3908 struct xfs_mount *mp = log->l_mp;
3909
3910 if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3911 buf_f->blf_len, buf_f->blf_flags)) {
3912 return;
3913 }
3914
3915 xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3916 buf_f->blf_len, NULL);
3917 }
3918
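/*
 * Start readahead for the inode cluster buffer backing an inode log item.
 * Older on-disk log formats are converted to the current in-memory format
 * first so the block number and length can be read; cancelled buffers are
 * skipped.
 */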
3919 STATIC void
3920 xlog_recover_inode_ra_pass2(
3921 struct xlog *log,
3922 struct xlog_recover_item *item)
3923 {
3924 struct xfs_inode_log_format ilf_buf;
3925 struct xfs_inode_log_format *ilfp;
3926 struct xfs_mount *mp = log->l_mp;
3927 int error;
3928
3929 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3930 ilfp = item->ri_buf[0].i_addr;
3931 } else {
3932 ilfp = &ilf_buf;
3933 memset(ilfp, 0, sizeof(*ilfp));
3934 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3935 if (error)
3936 return;
3937 }
3938
3939 if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3940 return;
3941
3942 xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3943 ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3944 }
3945
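/*
 * Start readahead for the buffer containing a logged dquot. Nothing to do if
 * quotas are not enabled, if the logged region is too small to hold a dquot,
 * if a quotaoff for this quota type has been seen, or if the buffer has been
 * cancelled.
 */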
3946 STATIC void
3947 xlog_recover_dquot_ra_pass2(
3948 struct xlog *log,
3949 struct xlog_recover_item *item)
3950 {
3951 struct xfs_mount *mp = log->l_mp;
3952 struct xfs_disk_dquot *recddq;
3953 struct xfs_dq_logformat *dq_f;
3954 uint type;
3955 int len;
3956
3957
3958 if (mp->m_qflags == 0)
3959 return;
3960
3961 recddq = item->ri_buf[1].i_addr;
3962 if (recddq == NULL)
3963 return;
3964 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3965 return;
3966
3967 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3968 ASSERT(type);
3969 if (log->l_quotaoffs_flag & type)
3970 return;
3971
3972 dq_f = item->ri_buf[0].i_addr;
3973 ASSERT(dq_f);
3974 ASSERT(dq_f->qlf_len == 1);
3975
3976 len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3977 if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3978 return;
3979
3980 xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3981 &xfs_dquot_buf_ra_ops);
3982 }
3983
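/*
 * Dispatch pass 2 readahead based on the log item type. Only buffer, inode
 * and dquot items reference metadata buffers worth reading ahead; everything
 * else is a no-op.
 */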
3984 STATIC void
3985 xlog_recover_ra_pass2(
3986 struct xlog *log,
3987 struct xlog_recover_item *item)
3988 {
3989 switch (ITEM_TYPE(item)) {
3990 case XFS_LI_BUF:
3991 xlog_recover_buffer_ra_pass2(log, item);
3992 break;
3993 case XFS_LI_INODE:
3994 xlog_recover_inode_ra_pass2(log, item);
3995 break;
3996 case XFS_LI_DQUOT:
3997 xlog_recover_dquot_ra_pass2(log, item);
3998 break;
3999 case XFS_LI_EFI:
4000 case XFS_LI_EFD:
4001 case XFS_LI_QUOTAOFF:
4002 case XFS_LI_RUI:
4003 case XFS_LI_RUD:
4004 case XFS_LI_CUI:
4005 case XFS_LI_CUD:
4006 case XFS_LI_BUI:
4007 case XFS_LI_BUD:
4008 default:
4009 break;
4010 }
4011 }
4012
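/*
 * Recover a single log item in pass 1. Only buffer and quotaoff items need
 * work here; all other known item types are deferred to pass 2, and unknown
 * item types are treated as corruption.
 */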
4013 STATIC int
4014 xlog_recover_commit_pass1(
4015 struct xlog *log,
4016 struct xlog_recover *trans,
4017 struct xlog_recover_item *item)
4018 {
4019 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
4020
4021 switch (ITEM_TYPE(item)) {
4022 case XFS_LI_BUF:
4023 return xlog_recover_buffer_pass1(log, item);
4024 case XFS_LI_QUOTAOFF:
4025 return xlog_recover_quotaoff_pass1(log, item);
4026 case XFS_LI_INODE:
4027 case XFS_LI_EFI:
4028 case XFS_LI_EFD:
4029 case XFS_LI_DQUOT:
4030 case XFS_LI_ICREATE:
4031 case XFS_LI_RUI:
4032 case XFS_LI_RUD:
4033 case XFS_LI_CUI:
4034 case XFS_LI_CUD:
4035 case XFS_LI_BUI:
4036 case XFS_LI_BUD:
4037 /* nothing to do in pass 1 */
4038 return 0;
4039 default:
4040 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4041 __func__, ITEM_TYPE(item));
4042 ASSERT(0);
4043 return -EFSCORRUPTED;
4044 }
4045 }
4046
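/*
 * Recover a single log item in pass 2, dispatching on the item type. Items
 * that modify metadata (buffers, inodes, dquots, icreate) are replayed into
 * delwri-queued buffers; intent and intent-done items are added to or removed
 * from the AIL; quotaoff needs no pass 2 work. Unknown item types are treated
 * as corruption.
 */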
4047 STATIC int
4048 xlog_recover_commit_pass2(
4049 struct xlog *log,
4050 struct xlog_recover *trans,
4051 struct list_head *buffer_list,
4052 struct xlog_recover_item *item)
4053 {
4054 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
4055
4056 switch (ITEM_TYPE(item)) {
4057 case XFS_LI_BUF:
4058 return xlog_recover_buffer_pass2(log, buffer_list, item,
4059 trans->r_lsn);
4060 case XFS_LI_INODE:
4061 return xlog_recover_inode_pass2(log, buffer_list, item,
4062 trans->r_lsn);
4063 case XFS_LI_EFI:
4064 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
4065 case XFS_LI_EFD:
4066 return xlog_recover_efd_pass2(log, item);
4067 case XFS_LI_RUI:
4068 return xlog_recover_rui_pass2(log, item, trans->r_lsn);
4069 case XFS_LI_RUD:
4070 return xlog_recover_rud_pass2(log, item);
4071 case XFS_LI_CUI:
4072 return xlog_recover_cui_pass2(log, item, trans->r_lsn);
4073 case XFS_LI_CUD:
4074 return xlog_recover_cud_pass2(log, item);
4075 case XFS_LI_BUI:
4076 return xlog_recover_bui_pass2(log, item, trans->r_lsn);
4077 case XFS_LI_BUD:
4078 return xlog_recover_bud_pass2(log, item);
4079 case XFS_LI_DQUOT:
4080 return xlog_recover_dquot_pass2(log, buffer_list, item,
4081 trans->r_lsn);
4082 case XFS_LI_ICREATE:
4083 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
4084 case XFS_LI_QUOTAOFF:
4085 /* nothing to do in pass2 */
4086 return 0;
4087 default:
4088 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4089 __func__, ITEM_TYPE(item));
4090 ASSERT(0);
4091 return -EFSCORRUPTED;
4092 }
4093 }
4094
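/*
 * Run pass 2 recovery on every item on the given list, stopping at the first
 * error.
 */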
4095 STATIC int
4096 xlog_recover_items_pass2(
4097 struct xlog *log,
4098 struct xlog_recover *trans,
4099 struct list_head *buffer_list,
4100 struct list_head *item_list)
4101 {
4102 struct xlog_recover_item *item;
4103 int error = 0;
4104
4105 list_for_each_entry(item, item_list, ri_list) {
4106 error = xlog_recover_commit_pass2(log, trans,
4107 buffer_list, item);
4108 if (error)
4109 return error;
4110 }
4111
4112 return error;
4113 }
4114
4115 /*
4116 * Perform the transaction.
4117 *
4118 * If the transaction modifies a buffer or inode, do it now. Otherwise,
4119 * EFIs and EFDs get queued up by adding entries into the AIL for them.
4120 */
4121 STATIC int
4122 xlog_recover_commit_trans(
4123 struct xlog *log,
4124 struct xlog_recover *trans,
4125 int pass,
4126 struct list_head *buffer_list)
4127 {
4128 int error = 0;
4129 int items_queued = 0;
4130 struct xlog_recover_item *item;
4131 struct xlog_recover_item *next;
4132 LIST_HEAD (ra_list);
4133 LIST_HEAD (done_list);
4134
4135 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
4136
4137 hlist_del_init(&trans->r_list);
4138
4139 error = xlog_recover_reorder_trans(log, trans, pass);
4140 if (error)
4141 return error;
4142
4143 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
4144 switch (pass) {
4145 case XLOG_RECOVER_PASS1:
4146 error = xlog_recover_commit_pass1(log, trans, item);
4147 break;
4148 case XLOG_RECOVER_PASS2:
4149 xlog_recover_ra_pass2(log, item);
4150 list_move_tail(&item->ri_list, &ra_list);
4151 items_queued++;
4152 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
4153 error = xlog_recover_items_pass2(log, trans,
4154 buffer_list, &ra_list);
4155 list_splice_tail_init(&ra_list, &done_list);
4156 items_queued = 0;
4157 }
4158
4159 break;
4160 default:
4161 ASSERT(0);
4162 }
4163
4164 if (error)
4165 goto out;
4166 }
4167
4168 out:
4169 if (!list_empty(&ra_list)) {
4170 if (!error)
4171 error = xlog_recover_items_pass2(log, trans,
4172 buffer_list, &ra_list);
4173 list_splice_tail_init(&ra_list, &done_list);
4174 }
4175
4176 if (!list_empty(&done_list))
4177 list_splice_init(&done_list, &trans->r_itemq);
4178
4179 return error;
4180 }
4181
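/*
 * Allocate an empty recovery item and add it to the tail of the transaction's
 * item list.
 */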
4182 STATIC void
4183 xlog_recover_add_item(
4184 struct list_head *head)
4185 {
4186 xlog_recover_item_t *item;
4187
4188 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4189 INIT_LIST_HEAD(&item->ri_list);
4190 list_add_tail(&item->ri_list, head);
4191 }
4192
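/*
 * Append continuation data to the last region of the last item in the
 * transaction. If the transaction has no items yet, the data completes a
 * transaction header that was split across log records.
 */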
4193 STATIC int
4194 xlog_recover_add_to_cont_trans(
4195 struct xlog *log,
4196 struct xlog_recover *trans,
4197 char *dp,
4198 int len)
4199 {
4200 xlog_recover_item_t *item;
4201 char *ptr, *old_ptr;
4202 int old_len;
4203
4204 /*
4205 * If the transaction is empty, the header was split across this and the
4206 * previous record. Copy the rest of the header.
4207 */
4208 if (list_empty(&trans->r_itemq)) {
4209 ASSERT(len <= sizeof(struct xfs_trans_header));
4210 if (len > sizeof(struct xfs_trans_header)) {
4211 xfs_warn(log->l_mp, "%s: bad header length", __func__);
4212 return -EFSCORRUPTED;
4213 }
4214
4215 xlog_recover_add_item(&trans->r_itemq);
4216 ptr = (char *)&trans->r_theader +
4217 sizeof(struct xfs_trans_header) - len;
4218 memcpy(ptr, dp, len);
4219 return 0;
4220 }
4221
4222 /* take the tail entry */
4223 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
4224
4225 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4226 old_len = item->ri_buf[item->ri_cnt-1].i_len;
4227
4228 ptr = kmem_realloc(old_ptr, len + old_len, 0);
4229 memcpy(&ptr[old_len], dp, len);
4230 item->ri_buf[item->ri_cnt-1].i_len += len;
4231 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
4232 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
4233 return 0;
4234 }
4235
4236 /*
4237 * The next region to add is the start of a new region. It could be
4238 * a whole region or it could be the first part of a new region. Because
4239 * of this, the assumption here is that the type and size fields of all
4240 * format structures fit into the first 32 bits of the structure.
4241 *
4242 * This works because all regions must be 32 bit aligned. Therefore, we
4243 * either have both fields or we have neither field. In the case we have
4244 * neither field, the data part of the region is zero length. We only have
4245 * a log_op_header and can throw away the header since a new one will appear
4246 * later. If we have at least 4 bytes, then we can determine how many regions
4247 * will appear in the current log item.
4248 */
4249 STATIC int
4250 xlog_recover_add_to_trans(
4251 struct xlog *log,
4252 struct xlog_recover *trans,
4253 char *dp,
4254 int len)
4255 {
4256 struct xfs_inode_log_format *in_f; /* any will do */
4257 xlog_recover_item_t *item;
4258 char *ptr;
4259
4260 if (!len)
4261 return 0;
4262 if (list_empty(&trans->r_itemq)) {
4263 /* we need to catch log corruptions here */
4264 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
4265 xfs_warn(log->l_mp, "%s: bad header magic number",
4266 __func__);
4267 ASSERT(0);
4268 return -EFSCORRUPTED;
4269 }
4270
4271 if (len > sizeof(struct xfs_trans_header)) {
4272 xfs_warn(log->l_mp, "%s: bad header length", __func__);
4273 ASSERT(0);
4274 return -EFSCORRUPTED;
4275 }
4276
4277 /*
4278 * The transaction header can be arbitrarily split across op
4279 * records. If we don't have the whole thing here, copy what we
4280 * do have and handle the rest in the next record.
4281 */
4282 if (len == sizeof(struct xfs_trans_header))
4283 xlog_recover_add_item(&trans->r_itemq);
4284 memcpy(&trans->r_theader, dp, len);
4285 return 0;
4286 }
4287
4288 ptr = kmem_alloc(len, 0);
4289 memcpy(ptr, dp, len);
4290 in_f = (struct xfs_inode_log_format *)ptr;
4291
4292 /* take the tail entry */
4293 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
4294 if (item->ri_total != 0 &&
4295 item->ri_total == item->ri_cnt) {
4296 /* tail item is in use, get a new one */
4297 xlog_recover_add_item(&trans->r_itemq);
4298 item = list_entry(trans->r_itemq.prev,
4299 xlog_recover_item_t, ri_list);
4300 }
4301
4302 if (item->ri_total == 0) { /* first region to be added */
4303 if (in_f->ilf_size == 0 ||
4304 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
4305 xfs_warn(log->l_mp,
4306 "bad number of regions (%d) in inode log format",
4307 in_f->ilf_size);
4308 ASSERT(0);
4309 kmem_free(ptr);
4310 return -EFSCORRUPTED;
4311 }
4312
4313 item->ri_total = in_f->ilf_size;
4314 item->ri_buf =
4315 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4316 0);
4317 }
4318
4319 if (item->ri_total <= item->ri_cnt) {
4320 xfs_warn(log->l_mp,
4321 "log item region count (%d) overflowed size (%d)",
4322 item->ri_cnt, item->ri_total);
4323 ASSERT(0);
4324 kmem_free(ptr);
4325 return -EFSCORRUPTED;
4326 }
4327
4328 /* Description region is ri_buf[0] */
4329 item->ri_buf[item->ri_cnt].i_addr = ptr;
4330 item->ri_buf[item->ri_cnt].i_len = len;
4331 item->ri_cnt++;
4332 trace_xfs_log_recover_item_add(log, trans, item, 0);
4333 return 0;
4334 }
4335
4336 /*
4337 * Free up any resources allocated by the transaction
4338 *
4339 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
4340 */
4341 STATIC void
4342 xlog_recover_free_trans(
4343 struct xlog_recover *trans)
4344 {
4345 xlog_recover_item_t *item, *n;
4346 int i;
4347
4348 hlist_del_init(&trans->r_list);
4349
4350 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
4351 /* Free the regions in the item. */
4352 list_del(&item->ri_list);
4353 for (i = 0; i < item->ri_cnt; i++)
4354 kmem_free(item->ri_buf[i].i_addr);
4355 /* Free the item itself */
4356 kmem_free(item->ri_buf);
4357 kmem_free(item);
4358 }
4359 /* Free the transaction recover structure */
4360 kmem_free(trans);
4361 }
4362
4363 /*
4364 * On error or completion, trans is freed.
4365 */
4366 STATIC int
4367 xlog_recovery_process_trans(
4368 struct xlog *log,
4369 struct xlog_recover *trans,
4370 char *dp,
4371 unsigned int len,
4372 unsigned int flags,
4373 int pass,
4374 struct list_head *buffer_list)
4375 {
4376 int error = 0;
4377 bool freeit = false;
4378
4379 /* mask off ophdr transaction container flags */
4380 flags &= ~XLOG_END_TRANS;
4381 if (flags & XLOG_WAS_CONT_TRANS)
4382 flags &= ~XLOG_CONTINUE_TRANS;
4383
4384 /*
4385 * Callees must not free the trans structure. We'll decide if we need to
4386 * free it or not based on the operation being done and its result.
4387 */
4388 switch (flags) {
4389 /* expected flag values */
4390 case 0:
4391 case XLOG_CONTINUE_TRANS:
4392 error = xlog_recover_add_to_trans(log, trans, dp, len);
4393 break;
4394 case XLOG_WAS_CONT_TRANS:
4395 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
4396 break;
4397 case XLOG_COMMIT_TRANS:
4398 error = xlog_recover_commit_trans(log, trans, pass,
4399 buffer_list);
4400 /* success or fail, we are now done with this transaction. */
4401 freeit = true;
4402 break;
4403
4404 /* unexpected flag values */
4405 case XLOG_UNMOUNT_TRANS:
4406 /* just skip trans */
4407 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
4408 freeit = true;
4409 break;
4410 case XLOG_START_TRANS:
4411 default:
4412 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
4413 ASSERT(0);
4414 error = -EFSCORRUPTED;
4415 break;
4416 }
4417 if (error || freeit)
4418 xlog_recover_free_trans(trans);
4419 return error;
4420 }
4421
4422 /*
4423 * Lookup the transaction recovery structure associated with the ID in the
4424 * current ophdr. If the transaction doesn't exist and the start flag is set in
4425 * the ophdr, then allocate a new transaction for future ID matches to find.
4426 * Either way, return what we found during the lookup - an existing transaction
4427 * or nothing.
4428 */
4429 STATIC struct xlog_recover *
4430 xlog_recover_ophdr_to_trans(
4431 struct hlist_head rhash[],
4432 struct xlog_rec_header *rhead,
4433 struct xlog_op_header *ohead)
4434 {
4435 struct xlog_recover *trans;
4436 xlog_tid_t tid;
4437 struct hlist_head *rhp;
4438
4439 tid = be32_to_cpu(ohead->oh_tid);
4440 rhp = &rhash[XLOG_RHASH(tid)];
4441 hlist_for_each_entry(trans, rhp, r_list) {
4442 if (trans->r_log_tid == tid)
4443 return trans;
4444 }
4445
4446 /*
4447 * skip over non-start transaction headers - we could be
4448 * processing slack space before the next transaction starts
4449 */
4450 if (!(ohead->oh_flags & XLOG_START_TRANS))
4451 return NULL;
4452
4453 ASSERT(be32_to_cpu(ohead->oh_len) == 0);
4454
4455 /*
4456 * This is a new transaction so allocate a new recovery container to
4457 * hold the recovery ops that will follow.
4458 */
4459 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4460 trans->r_log_tid = tid;
4461 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4462 INIT_LIST_HEAD(&trans->r_itemq);
4463 INIT_HLIST_NODE(&trans->r_list);
4464 hlist_add_head(&trans->r_list, rhp);
4465
4466 /*
4467 * Nothing more to do for this ophdr. Items to be added to this new
4468 * transaction will be in subsequent ophdr containers.
4469 */
4470 return NULL;
4471 }
4472
4473 STATIC int
4474 xlog_recover_process_ophdr(
4475 struct xlog *log,
4476 struct hlist_head rhash[],
4477 struct xlog_rec_header *rhead,
4478 struct xlog_op_header *ohead,
4479 char *dp,
4480 char *end,
4481 int pass,
4482 struct list_head *buffer_list)
4483 {
4484 struct xlog_recover *trans;
4485 unsigned int len;
4486 int error;
4487
4488 /* Do we understand who wrote this op? */
4489 if (ohead->oh_clientid != XFS_TRANSACTION &&
4490 ohead->oh_clientid != XFS_LOG) {
4491 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
4492 __func__, ohead->oh_clientid);
4493 ASSERT(0);
4494 return -EFSCORRUPTED;
4495 }
4496
4497 /*
4498 * Check the ophdr contains all the data it is supposed to contain.
4499 */
4500 len = be32_to_cpu(ohead->oh_len);
4501 if (dp + len > end) {
4502 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
4503 WARN_ON(1);
4504 return -EFSCORRUPTED;
4505 }
4506
4507 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
4508 if (!trans) {
4509 /* nothing to do, so skip over this ophdr */
4510 return 0;
4511 }
4512
4513 /*
4514 * The recovered buffer queue is drained only once we know that all
4515 * recovery items for the current LSN have been processed. This is
4516 * required because:
4517 *
4518 * - Buffer write submission updates the metadata LSN of the buffer.
4519 * - Log recovery skips items with a metadata LSN >= the current LSN of
4520 * the recovery item.
4521 * - Separate recovery items against the same metadata buffer can share
4522 * a current LSN. I.e., consider that the LSN of a recovery item is
4523 * defined as the starting LSN of the first record in which its
4524 * transaction appears, that a record can hold multiple transactions,
4525 * and/or that a transaction can span multiple records.
4526 *
4527 * In other words, we are allowed to submit a buffer from log recovery
4528 * once per current LSN. Otherwise, we may incorrectly skip recovery
4529 * items and cause corruption.
4530 *
4531 * We don't know up front whether buffers are updated multiple times per
4532 * LSN. Therefore, track the current LSN of each commit log record as it
4533 * is processed and drain the queue when it changes. Use commit records
4534 * because they are ordered correctly by the logging code.
4535 */
4536 if (log->l_recovery_lsn != trans->r_lsn &&
4537 ohead->oh_flags & XLOG_COMMIT_TRANS) {
4538 error = xfs_buf_delwri_submit(buffer_list);
4539 if (error)
4540 return error;
4541 log->l_recovery_lsn = trans->r_lsn;
4542 }
4543
4544 return xlog_recovery_process_trans(log, trans, dp, len,
4545 ohead->oh_flags, pass, buffer_list);
4546 }
4547
4548 /*
4549 * There are two valid states of the r_state field. 0 indicates that the
4550 * transaction structure is in a normal state. We have either seen the
4551 * start of the transaction or the last operation we added was not a partial
4552 * operation. If the last operation we added to the transaction was a
4553 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
4554 *
4555 * NOTE: skip LRs with 0 data length.
4556 */
4557 STATIC int
4558 xlog_recover_process_data(
4559 struct xlog *log,
4560 struct hlist_head rhash[],
4561 struct xlog_rec_header *rhead,
4562 char *dp,
4563 int pass,
4564 struct list_head *buffer_list)
4565 {
4566 struct xlog_op_header *ohead;
4567 char *end;
4568 int num_logops;
4569 int error;
4570
4571 end = dp + be32_to_cpu(rhead->h_len);
4572 num_logops = be32_to_cpu(rhead->h_num_logops);
4573
4574 /* check the log format matches our own - else we can't recover */
4575 if (xlog_header_check_recover(log->l_mp, rhead))
4576 return -EIO;
4577
4578 trace_xfs_log_recover_record(log, rhead, pass);
4579 while ((dp < end) && num_logops) {
4580
4581 ohead = (struct xlog_op_header *)dp;
4582 dp += sizeof(*ohead);
4583 ASSERT(dp <= end);
4584
4585 /* errors will abort recovery */
4586 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4587 dp, end, pass, buffer_list);
4588 if (error)
4589 return error;
4590
4591 dp += be32_to_cpu(ohead->oh_len);
4592 num_logops--;
4593 }
4594 return 0;
4595 }
4596
4597 /* Recover the EFI if necessary. */
4598 STATIC int
4599 xlog_recover_process_efi(
4600 struct xfs_ail *ailp,
4601 struct xfs_log_item *lip,
4602 struct list_head *capture_list)
4603 {
4604 struct xfs_efi_log_item *efip;
4605 int error;
4606
4607 /*
4608 * Skip EFIs that we've already processed.
4609 */
4610 efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4611 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4612 return 0;
4613
4614 spin_unlock(&ailp->ail_lock);
4615 error = xfs_efi_recover(efip, capture_list);
4616 spin_lock(&ailp->ail_lock);
4617
4618 return error;
4619 }
4620
4621 /* Release the EFI since we're cancelling everything. */
4622 STATIC void
4623 xlog_recover_cancel_efi(
4624 struct xfs_mount *mp,
4625 struct xfs_ail *ailp,
4626 struct xfs_log_item *lip)
4627 {
4628 struct xfs_efi_log_item *efip;
4629
4630 efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4631
4632 spin_unlock(&ailp->ail_lock);
4633 xfs_efi_release(efip);
4634 spin_lock(&ailp->ail_lock);
4635 }
4636
4637 /* Recover the RUI if necessary. */
4638 STATIC int
4639 xlog_recover_process_rui(
4640 struct xfs_ail *ailp,
4641 struct xfs_log_item *lip,
4642 struct list_head *capture_list)
4643 {
4644 struct xfs_rui_log_item *ruip;
4645 int error;
4646
4647 /*
4648 * Skip RUIs that we've already processed.
4649 */
4650 ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4651 if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
4652 return 0;
4653
4654 spin_unlock(&ailp->ail_lock);
4655 error = xfs_rui_recover(ruip, capture_list);
4656 spin_lock(&ailp->ail_lock);
4657
4658 return error;
4659 }
4660
4661 /* Release the RUI since we're cancelling everything. */
4662 STATIC void
4663 xlog_recover_cancel_rui(
4664 struct xfs_mount *mp,
4665 struct xfs_ail *ailp,
4666 struct xfs_log_item *lip)
4667 {
4668 struct xfs_rui_log_item *ruip;
4669
4670 ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4671
4672 spin_unlock(&ailp->ail_lock);
4673 xfs_rui_release(ruip);
4674 spin_lock(&ailp->ail_lock);
4675 }
4676
4677 /* Recover the CUI if necessary. */
4678 STATIC int
4679 xlog_recover_process_cui(
4680 struct xfs_ail *ailp,
4681 struct xfs_log_item *lip,
4682 struct list_head *capture_list)
4683 {
4684 struct xfs_cui_log_item *cuip;
4685 int error;
4686
4687 /*
4688 * Skip CUIs that we've already processed.
4689 */
4690 cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4691 if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
4692 return 0;
4693
4694 spin_unlock(&ailp->ail_lock);
4695 error = xfs_cui_recover(cuip, capture_list);
4696 spin_lock(&ailp->ail_lock);
4697
4698 return error;
4699 }
4700
4701 /* Release the CUI since we're cancelling everything. */
4702 STATIC void
4703 xlog_recover_cancel_cui(
4704 struct xfs_mount *mp,
4705 struct xfs_ail *ailp,
4706 struct xfs_log_item *lip)
4707 {
4708 struct xfs_cui_log_item *cuip;
4709
4710 cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4711
4712 spin_unlock(&ailp->ail_lock);
4713 xfs_cui_release(cuip);
4714 spin_lock(&ailp->ail_lock);
4715 }
4716
4717 /* Recover the BUI if necessary. */
4718 STATIC int
4719 xlog_recover_process_bui(
4720 struct xfs_ail *ailp,
4721 struct xfs_log_item *lip,
4722 struct list_head *capture_list)
4723 {
4724 struct xfs_bui_log_item *buip;
4725 int error;
4726
4727 /*
4728 * Skip BUIs that we've already processed.
4729 */
4730 buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4731 if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
4732 return 0;
4733
4734 spin_unlock(&ailp->ail_lock);
4735 error = xfs_bui_recover(buip, capture_list);
4736 spin_lock(&ailp->ail_lock);
4737
4738 return error;
4739 }
4740
4741 /* Release the BUI since we're cancelling everything. */
4742 STATIC void
4743 xlog_recover_cancel_bui(
4744 struct xfs_mount *mp,
4745 struct xfs_ail *ailp,
4746 struct xfs_log_item *lip)
4747 {
4748 struct xfs_bui_log_item *buip;
4749
4750 buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4751
4752 spin_unlock(&ailp->ail_lock);
4753 xfs_bui_release(buip);
4754 spin_lock(&ailp->ail_lock);
4755 }
4756
4757 /* Is this log item a deferred action intent? */
4758 static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4759 {
4760 switch (lip->li_type) {
4761 case XFS_LI_EFI:
4762 case XFS_LI_RUI:
4763 case XFS_LI_CUI:
4764 case XFS_LI_BUI:
4765 return true;
4766 default:
4767 return false;
4768 }
4769 }
4770
4771 /* Take all the collected deferred ops and finish them in order. */
4772 static int
4773 xlog_finish_defer_ops(
4774 struct xfs_mount *mp,
4775 struct list_head *capture_list)
4776 {
4777 struct xfs_defer_capture *dfc, *next;
4778 struct xfs_trans *tp;
4779 struct xfs_inode *ip;
4780 int error = 0;
4781
4782 list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
4783 struct xfs_trans_res resv;
4784
4785 /*
4786 * Create a new transaction reservation from the captured
4787 * information. Set logcount to 1 to force the new transaction
4788 * to regrant every roll so that we can make forward progress
4789 * in recovery no matter how full the log might be.
4790 */
4791 resv.tr_logres = dfc->dfc_logres;
4792 resv.tr_logcount = 1;
4793 resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
4794
4795 error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
4796 dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
4797 if (error)
4798 return error;
4799
4800 /*
4801 * Transfer to this new transaction all the dfops we captured
4802 * from recovering a single intent item.
4803 */
4804 list_del_init(&dfc->dfc_list);
4805 xfs_defer_ops_continue(dfc, tp, &ip);
4806
4807 error = xfs_trans_commit(tp);
4808 if (ip) {
4809 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4810 xfs_irele(ip);
4811 }
4812 if (error)
4813 return error;
4814 }
4815
4816 ASSERT(list_empty(capture_list));
4817 return 0;
4818 }
4819
4820 /* Release all the captured defer ops and capture structures in this list. */
4821 static void
4822 xlog_abort_defer_ops(
4823 struct xfs_mount *mp,
4824 struct list_head *capture_list)
4825 {
4826 struct xfs_defer_capture *dfc;
4827 struct xfs_defer_capture *next;
4828
4829 list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
4830 list_del_init(&dfc->dfc_list);
4831 xfs_defer_ops_release(mp, dfc);
4832 }
4833 }
4834 /*
4835 * When this is called, all of the log intent items which did not have
4836 * corresponding log done items should be in the AIL. What we do now
4837 * is update the data structures associated with each one.
4838 *
4839 * Since we process the log intent items in normal transactions, they
4840 * will be removed at some point after the commit. This prevents us
4841 * from just walking down the list processing each one. We'll use a
4842 * flag in the intent item to skip those that we've already processed
4843 * and use the AIL iteration mechanism's generation count to try to
4844 * speed this up at least a bit.
4845 *
4846 * When we start, we know that the intents are the only things in the
4847 * AIL. As we process them, however, other items are added to the
4848 * AIL.
4849 */
4850 STATIC int
4851 xlog_recover_process_intents(
4852 struct xlog *log)
4853 {
4854 LIST_HEAD(capture_list);
4855 struct xfs_ail_cursor cur;
4856 struct xfs_log_item *lip;
4857 struct xfs_ail *ailp;
4858 int error = 0;
4859 #if defined(DEBUG) || defined(XFS_WARN)
4860 xfs_lsn_t last_lsn;
4861 #endif
4862
4863 ailp = log->l_ailp;
4864 spin_lock(&ailp->ail_lock);
4865 #if defined(DEBUG) || defined(XFS_WARN)
4866 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
4867 #endif
4868 for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4869 lip != NULL;
4870 lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
4871 /*
4872 * We're done when we see something other than an intent.
4873 * There should be no intents left in the AIL now.
4874 */
4875 if (!xlog_item_is_intent(lip)) {
4876 #ifdef DEBUG
4877 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4878 ASSERT(!xlog_item_is_intent(lip));
4879 #endif
4880 break;
4881 }
4882
4883 /*
4884 * We should never see a redo item with an LSN higher than
4885 * the last transaction we found in the log at the start
4886 * of recovery.
4887 */
4888 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
4889
4890 /*
4891 * NOTE: If your intent processing routine can create more
4892 * deferred ops, you /must/ attach them to the capture list in
4893 * the recover routine or else those subsequent intents will be
4894 * replayed in the wrong order!
4895 */
4896 switch (lip->li_type) {
4897 case XFS_LI_EFI:
4898 error = xlog_recover_process_efi(ailp, lip, &capture_list);
4899 break;
4900 case XFS_LI_RUI:
4901 error = xlog_recover_process_rui(ailp, lip, &capture_list);
4902 break;
4903 case XFS_LI_CUI:
4904 error = xlog_recover_process_cui(ailp, lip, &capture_list);
4905 break;
4906 case XFS_LI_BUI:
4907 error = xlog_recover_process_bui(ailp, lip, &capture_list);
4908 break;
4909 }
4910 if (error)
4911 break;
4912 }
4913
4914 xfs_trans_ail_cursor_done(&cur);
4915 spin_unlock(&ailp->ail_lock);
4916 if (error)
4917 goto err;
4918
4919 error = xlog_finish_defer_ops(log->l_mp, &capture_list);
4920 if (error)
4921 goto err;
4922
4923 return 0;
4924 err:
4925 xlog_abort_defer_ops(log->l_mp, &capture_list);
4926 return error;
4927 }
4928
4929 /*
4930 * A cancel occurs when the mount has failed and we're bailing out.
4931 * Release all pending log intent items so they don't pin the AIL.
4932 */
4933 STATIC void
4934 xlog_recover_cancel_intents(
4935 struct xlog *log)
4936 {
4937 struct xfs_log_item *lip;
4938 struct xfs_ail_cursor cur;
4939 struct xfs_ail *ailp;
4940
4941 ailp = log->l_ailp;
4942 spin_lock(&ailp->ail_lock);
4943 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4944 while (lip != NULL) {
4945 /*
4946 * We're done when we see something other than an intent.
4947 * There should be no intents left in the AIL now.
4948 */
4949 if (!xlog_item_is_intent(lip)) {
4950 #ifdef DEBUG
4951 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4952 ASSERT(!xlog_item_is_intent(lip));
4953 #endif
4954 break;
4955 }
4956
4957 switch (lip->li_type) {
4958 case XFS_LI_EFI:
4959 xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4960 break;
4961 case XFS_LI_RUI:
4962 xlog_recover_cancel_rui(log->l_mp, ailp, lip);
4963 break;
4964 case XFS_LI_CUI:
4965 xlog_recover_cancel_cui(log->l_mp, ailp, lip);
4966 break;
4967 case XFS_LI_BUI:
4968 xlog_recover_cancel_bui(log->l_mp, ailp, lip);
4969 break;
4970 }
4971
4972 lip = xfs_trans_ail_cursor_next(ailp, &cur);
4973 }
4974
4975 xfs_trans_ail_cursor_done(&cur);
4976 spin_unlock(&ailp->ail_lock);
4977 }
4978
4979 /*
4980 * This routine performs a transaction to null out a bad inode pointer
4981 * in an agi unlinked inode hash bucket.
4982 */
4983 STATIC void
4984 xlog_recover_clear_agi_bucket(
4985 xfs_mount_t *mp,
4986 xfs_agnumber_t agno,
4987 int bucket)
4988 {
4989 xfs_trans_t *tp;
4990 xfs_agi_t *agi;
4991 xfs_buf_t *agibp;
4992 int offset;
4993 int error;
4994
4995 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
4996 if (error)
4997 goto out_error;
4998
4999 error = xfs_read_agi(mp, tp, agno, &agibp);
5000 if (error)
5001 goto out_abort;
5002
5003 agi = XFS_BUF_TO_AGI(agibp);
5004 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
5005 offset = offsetof(xfs_agi_t, agi_unlinked) +
5006 (sizeof(xfs_agino_t) * bucket);
5007 xfs_trans_log_buf(tp, agibp, offset,
5008 (offset + sizeof(xfs_agino_t) - 1));
5009
5010 error = xfs_trans_commit(tp);
5011 if (error)
5012 goto out_error;
5013 return;
5014
5015 out_abort:
5016 xfs_trans_cancel(tp);
5017 out_error:
5018 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
5019 return;
5020 }
5021
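/*
 * Process a single inode from an AGI unlinked bucket. The on-disk inode is
 * read to find the next inode in the bucket chain, then releasing the in-core
 * inode reference lets the normal inactivation path truncate and free the
 * unlinked inode. Returns the next agino, or NULLAGINO after clearing the
 * bucket on failure.
 */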
5022 STATIC xfs_agino_t
5023 xlog_recover_process_one_iunlink(
5024 struct xfs_mount *mp,
5025 xfs_agnumber_t agno,
5026 xfs_agino_t agino,
5027 int bucket)
5028 {
5029 struct xfs_buf *ibp;
5030 struct xfs_dinode *dip;
5031 struct xfs_inode *ip;
5032 xfs_ino_t ino;
5033 int error;
5034
5035 ino = XFS_AGINO_TO_INO(mp, agno, agino);
5036 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
5037 if (error)
5038 goto fail;
5039
5040 /*
5041 * Get the on disk inode to find the next inode in the bucket.
5042 */
5043 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
5044 if (error)
5045 goto fail_iput;
5046
5047 xfs_iflags_clear(ip, XFS_IRECOVERY);
5048 ASSERT(VFS_I(ip)->i_nlink == 0);
5049 ASSERT(VFS_I(ip)->i_mode != 0);
5050
5051 /* setup for the next pass */
5052 agino = be32_to_cpu(dip->di_next_unlinked);
5053 xfs_buf_relse(ibp);
5054
5055 /*
5056 * Prevent any DMAPI event from being sent when the reference on
5057 * the inode is dropped.
5058 */
5059 ip->i_d.di_dmevmask = 0;
5060
5061 xfs_irele(ip);
5062 return agino;
5063
5064 fail_iput:
5065 xfs_irele(ip);
5066 fail:
5067 /*
5068 * We can't read in the inode this bucket points to, or this inode
5069 * is messed up. Just ditch this bucket of inodes. We will lose
5070 * some inodes and space, but at least we won't hang.
5071 *
5072 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
5073 * clear the inode pointer in the bucket.
5074 */
5075 xlog_recover_clear_agi_bucket(mp, agno, bucket);
5076 return NULLAGINO;
5077 }
5078
5079 /*
5080 * Recover AGI unlinked lists
5081 *
5082 * This is called during recovery to process any inodes which we unlinked but
5083 * not freed when the system crashed. These inodes will be on the lists in the
5084 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5085 * any inodes found on the lists. Each inode is removed from the lists when it
5086 * has been fully truncated and is freed. The freeing of the inode and its
5087 * removal from the list must be atomic.
5088 *
5089 * If everything we touch in the agi processing loop is already in memory, this
5090 * loop can hold the cpu for a long time. It runs without lock contention,
5091 * memory allocation contention, the need to wait for IO, etc, and so will run
5092 * until we either run out of inodes to process, run low on memory or we run out
5093 * of log space.
5094 *
5095 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
5096 * and can prevent other filesystem work (such as CIL pushes) from running. This
5097 * can lead to deadlocks if the recovery process runs out of log reservation
5098 * space. Hence we need to yield the CPU when there is other kernel work
5099 * scheduled on this CPU to ensure other scheduled work can run without undue
5100 * latency.
5101 */
5102 STATIC void
5103 xlog_recover_process_iunlinks(
5104 struct xlog *log)
5105 {
5106 xfs_mount_t *mp;
5107 xfs_agnumber_t agno;
5108 xfs_agi_t *agi;
5109 xfs_buf_t *agibp;
5110 xfs_agino_t agino;
5111 int bucket;
5112 int error;
5113
5114 mp = log->l_mp;
5115
5116 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5117 /*
5118 * Find the agi for this ag.
5119 */
5120 error = xfs_read_agi(mp, NULL, agno, &agibp);
5121 if (error) {
5122 /*
5123 * AGI is b0rked. Don't process it.
5124 *
5125 * We should probably mark the filesystem as corrupt
5126 * after we've recovered all the ag's we can....
5127 */
5128 continue;
5129 }
5130 /*
5131 * Unlock the buffer so that it can be acquired in the normal
5132 * course of the transaction to truncate and free each inode.
5133 * Because we are not racing with anyone else here for the AGI
5134 * buffer, we don't even need to hold it locked to read the
5135 * initial unlinked bucket entries out of the buffer. We keep
5136 * a buffer reference, though, so that it stays pinned in memory
5137 * while we need the buffer.
5138 */
5139 agi = XFS_BUF_TO_AGI(agibp);
5140 xfs_buf_unlock(agibp);
5141
5142 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
5143 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
5144 while (agino != NULLAGINO) {
5145 agino = xlog_recover_process_one_iunlink(mp,
5146 agno, agino, bucket);
5147 cond_resched();
5148 }
5149 }
5150 xfs_buf_rele(agibp);
5151 }
5152 }
5153
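/*
 * Restore the original data in the first word of each basic block of the log
 * record payload. The cycle number was stashed in the record header (and in
 * the extended headers for v2 logs) when the record was written and
 * overwrote this data.
 */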
5154 STATIC void
5155 xlog_unpack_data(
5156 struct xlog_rec_header *rhead,
5157 char *dp,
5158 struct xlog *log)
5159 {
5160 int i, j, k;
5161
5162 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
5163 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
5164 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
5165 dp += BBSIZE;
5166 }
5167
5168 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5169 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
5170 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
5171 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5172 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5173 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
5174 dp += BBSIZE;
5175 }
5176 }
5177 }
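
/*
 * Illustrative sketch only -- the helper below is not called by the recovery
 * code. When a record is written, the first four bytes of every basic block
 * are overwritten with the cycle number and the original words are stashed
 * in the record header (and, for v2 logs, in its extended headers);
 * xlog_unpack_data() above restores them. With 512-byte basic blocks and a
 * 32k cycle area, each header holds XLOG_HEADER_CYCLE_SIZE / BBSIZE == 64
 * cycle words, so e.g. basic block index 100 of a record maps to extended
 * header xhdr[1] (the first extension), slot 36.
 */
static inline void
xlog_example_cycle_slot(
	int	i,	/* basic block index within the log record */
	int	*j,	/* which header holds the saved cycle word */
	int	*k)	/* slot within that header's cycle data array */
{
	*j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
	*k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
}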
5178
5179 /*
5180 * CRC check, unpack and process a log record.
5181 */
5182 STATIC int
5183 xlog_recover_process(
5184 struct xlog *log,
5185 struct hlist_head rhash[],
5186 struct xlog_rec_header *rhead,
5187 char *dp,
5188 int pass,
5189 struct list_head *buffer_list)
5190 {
5191 __le32 old_crc = rhead->h_crc;
5192 __le32 crc;
5193
5194 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
5195
5196 /*
5197 * Nothing else to do if this is a CRC verification pass. Just return
5198 * if this is a record with a non-zero crc. Unfortunately, mkfs always
5199 * sets old_crc to 0 so we must consider this valid even on v5 supers.
5200 * Otherwise, return EFSBADCRC on failure so the callers up the stack
5201 * know precisely what failed.
5202 */
5203 if (pass == XLOG_RECOVER_CRCPASS) {
5204 if (old_crc && crc != old_crc)
5205 return -EFSBADCRC;
5206 return 0;
5207 }
5208
5209 /*
5210 * We're in the normal recovery path. On a CRC mismatch, issue an advisory
5211 * warning unless the CRC in the header is zero and the filesystem does not
5212 * have CRCs enabled. The zero CRC check prevents warnings from being emitted
5213 * when upgrading the kernel from one that does not add CRCs by default.
5214 */
5215 if (crc != old_crc) {
5216 if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
5217 xfs_alert(log->l_mp,
5218 "log record CRC mismatch: found 0x%x, expected 0x%x.",
5219 le32_to_cpu(old_crc),
5220 le32_to_cpu(crc));
5221 xfs_hex_dump(dp, 32);
5222 }
5223
5224 /*
5225 * If the filesystem is CRC enabled, this mismatch becomes a
5226 * fatal log corruption failure.
5227 */
5228 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
5229 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
5230 return -EFSCORRUPTED;
5231 }
5232 }
5233
5234 xlog_unpack_data(rhead, dp, log);
5235
5236 return xlog_recover_process_data(log, rhash, rhead, dp, pass,
5237 buffer_list);
5238 }
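
/*
 * A condensed sketch of the CRC policy implemented by xlog_recover_process()
 * above (illustrative only, never called): during the CRC verification pass
 * a mismatch is reported as -EFSBADCRC, but only if the on-disk CRC was
 * actually set (mkfs may leave it zero). During normal recovery a mismatch
 * is fatal (-EFSCORRUPTED) only on CRC-enabled filesystems; otherwise it is
 * at most the advisory warning emitted above.
 */
static inline int
xlog_example_crc_disposition(
	bool	crc_pass,	/* pass == XLOG_RECOVER_CRCPASS */
	bool	mismatch,	/* computed CRC != on-disk CRC */
	bool	old_crc_zero,	/* on-disk CRC was never set */
	bool	has_crc)	/* v5 superblock with CRCs enabled */
{
	if (crc_pass)
		return (mismatch && !old_crc_zero) ? -EFSBADCRC : 0;
	if (mismatch && has_crc)
		return -EFSCORRUPTED;
	return 0;
}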
5239
5240 STATIC int
5241 xlog_valid_rec_header(
5242 struct xlog *log,
5243 struct xlog_rec_header *rhead,
5244 xfs_daddr_t blkno)
5245 {
5246 int hlen;
5247
5248 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
5249 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
5250 XFS_ERRLEVEL_LOW, log->l_mp);
5251 return -EFSCORRUPTED;
5252 }
5253 if (unlikely(
5254 (!rhead->h_version ||
5255 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
5256 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
5257 __func__, be32_to_cpu(rhead->h_version));
5258 return -EFSCORRUPTED;
5259 }
5260
5261 /* LR body must have data or it wouldn't have been written */
5262 hlen = be32_to_cpu(rhead->h_len);
5263 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
5264 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
5265 XFS_ERRLEVEL_LOW, log->l_mp);
5266 return -EFSCORRUPTED;
5267 }
5268 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
5269 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
5270 XFS_ERRLEVEL_LOW, log->l_mp);
5271 return -EFSCORRUPTED;
5272 }
5273 return 0;
5274 }
5275
5276 /*
5277 * Read the log from tail to head and process the log records found.
5278 * Handle the two cases where the tail and head are in the same cycle
5279 * and where the active portion of the log wraps around the end of
5280 * the physical log separately. The pass parameter is passed through
5281 * to the routines called to process the data and is not looked at
5282 * here.
5283 */
5284 STATIC int
5285 xlog_do_recovery_pass(
5286 struct xlog *log,
5287 xfs_daddr_t head_blk,
5288 xfs_daddr_t tail_blk,
5289 int pass,
5290 xfs_daddr_t *first_bad) /* out: first bad log rec */
5291 {
5292 xlog_rec_header_t *rhead;
5293 xfs_daddr_t blk_no, rblk_no;
5294 xfs_daddr_t rhead_blk;
5295 char *offset;
5296 char *hbp, *dbp;
5297 int error = 0, h_size, h_len;
5298 int error2 = 0;
5299 int bblks, split_bblks;
5300 int hblks, split_hblks, wrapped_hblks;
5301 int i;
5302 struct hlist_head rhash[XLOG_RHASH_SIZE];
5303 LIST_HEAD (buffer_list);
5304
5305 ASSERT(head_blk != tail_blk);
5306 blk_no = rhead_blk = tail_blk;
5307
5308 for (i = 0; i < XLOG_RHASH_SIZE; i++)
5309 INIT_HLIST_HEAD(&rhash[i]);
5310
5311 /*
5312 * Read the header of the tail block and get the iclog buffer size from
5313 * h_size. Use this to tell how many sectors make up the log header.
5314 */
5315 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5316 /*
5317 * When using variable length iclogs, read first sector of
5318 * iclog header and extract the header size from it. Get a
5319 * new hbp that is the correct size.
5320 */
5321 hbp = xlog_alloc_buffer(log, 1);
5322 if (!hbp)
5323 return -ENOMEM;
5324
5325 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
5326 if (error)
5327 goto bread_err1;
5328
5329 rhead = (xlog_rec_header_t *)offset;
5330 error = xlog_valid_rec_header(log, rhead, tail_blk);
5331 if (error)
5332 goto bread_err1;
5333
5334 /*
5335 * xfsprogs has a bug where record length is based on lsunit but
5336 * h_size (iclog size) is hardcoded to 32k. Now that we
5337 * unconditionally CRC verify the unmount record, this means the
5338 * log buffer can be too small for the record and cause an
5339 * overrun.
5340 *
5341 * Detect this condition here. Use lsunit for the buffer size as
5342 * long as this looks like the mkfs case. Otherwise, return an
5343 * error to avoid a buffer overrun.
5344 */
5345 h_size = be32_to_cpu(rhead->h_size);
5346 h_len = be32_to_cpu(rhead->h_len);
5347 if (h_len > h_size) {
5348 if (h_len <= log->l_mp->m_logbsize &&
5349 be32_to_cpu(rhead->h_num_logops) == 1) {
5350 xfs_warn(log->l_mp,
5351 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
5352 h_size, log->l_mp->m_logbsize);
5353 h_size = log->l_mp->m_logbsize;
5354 } else {
5355 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
5356 log->l_mp);
5357 error = -EFSCORRUPTED;
5358 goto bread_err1;
5359 }
5360 }
5361
5362 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
5363 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
5364 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
5365 if (h_size % XLOG_HEADER_CYCLE_SIZE)
5366 hblks++;
5367 kmem_free(hbp);
5368 hbp = xlog_alloc_buffer(log, hblks);
5369 } else {
5370 hblks = 1;
5371 }
5372 } else {
5373 ASSERT(log->l_sectBBsize == 1);
5374 hblks = 1;
5375 hbp = xlog_alloc_buffer(log, 1);
5376 h_size = XLOG_BIG_RECORD_BSIZE;
5377 }
5378
5379 if (!hbp)
5380 return -ENOMEM;
5381 dbp = xlog_alloc_buffer(log, BTOBB(h_size));
5382 if (!dbp) {
5383 kmem_free(hbp);
5384 return -ENOMEM;
5385 }
5386
5387 memset(rhash, 0, sizeof(rhash));
5388 if (tail_blk > head_blk) {
5389 /*
5390 * Perform recovery around the end of the physical log.
5391 * When the head is not on the same cycle number as the tail,
5392 * we can't do a sequential recovery.
5393 */
5394 while (blk_no < log->l_logBBsize) {
5395 /*
5396 * Check for header wrapping around physical end-of-log
5397 */
5398 offset = hbp;
5399 split_hblks = 0;
5400 wrapped_hblks = 0;
5401 if (blk_no + hblks <= log->l_logBBsize) {
5402 /* Read header in one read */
5403 error = xlog_bread(log, blk_no, hblks, hbp,
5404 &offset);
5405 if (error)
5406 goto bread_err2;
5407 } else {
5408 /* This LR is split across physical log end */
5409 if (blk_no != log->l_logBBsize) {
5410 /* some data before physical log end */
5411 ASSERT(blk_no <= INT_MAX);
5412 split_hblks = log->l_logBBsize - (int)blk_no;
5413 ASSERT(split_hblks > 0);
5414 error = xlog_bread(log, blk_no,
5415 split_hblks, hbp,
5416 &offset);
5417 if (error)
5418 goto bread_err2;
5419 }
5420
5421 /*
5422 * Note: this black magic still works with
5423 * large sector sizes (non-512) only because:
5424 * - we increased the buffer size originally
5425 * by 1 sector giving us enough extra space
5426 * for the second read;
5427 * - the log start is guaranteed to be sector
5428 * aligned;
5429 * - we read the log end (LR header start)
5430 * _first_, then the log start (LR header end)
5431 * - order is important.
5432 */
5433 wrapped_hblks = hblks - split_hblks;
5434 error = xlog_bread_noalign(log, 0,
5435 wrapped_hblks,
5436 offset + BBTOB(split_hblks));
5437 if (error)
5438 goto bread_err2;
5439 }
5440 rhead = (xlog_rec_header_t *)offset;
5441 error = xlog_valid_rec_header(log, rhead,
5442 split_hblks ? blk_no : 0);
5443 if (error)
5444 goto bread_err2;
5445
5446 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5447 blk_no += hblks;
5448
5449 /*
5450 * Read the log record data in multiple reads if it
5451 * wraps around the end of the log. Note that if the
5452 * header already wrapped, blk_no could point past the
5453 * end of the log. The record data is contiguous in
5454 * that case.
5455 */
5456 if (blk_no + bblks <= log->l_logBBsize ||
5457 blk_no >= log->l_logBBsize) {
5458 rblk_no = xlog_wrap_logbno(log, blk_no);
5459 error = xlog_bread(log, rblk_no, bblks, dbp,
5460 &offset);
5461 if (error)
5462 goto bread_err2;
5463 } else {
5464 /* This log record is split across the
5465 * physical end of log */
5466 offset = dbp;
5467 split_bblks = 0;
5468 if (blk_no != log->l_logBBsize) {
5469 /* some data is before the physical
5470 * end of log */
5471 ASSERT(!wrapped_hblks);
5472 ASSERT(blk_no <= INT_MAX);
5473 split_bblks =
5474 log->l_logBBsize - (int)blk_no;
5475 ASSERT(split_bblks > 0);
5476 error = xlog_bread(log, blk_no,
5477 split_bblks, dbp,
5478 &offset);
5479 if (error)
5480 goto bread_err2;
5481 }
5482
5483 /*
5484 * Note: this black magic still works with
5485 * large sector sizes (non-512) only because:
5486 * - we increased the buffer size originally
5487 * by 1 sector giving us enough extra space
5488 * for the second read;
5489 * - the log start is guaranteed to be sector
5490 * aligned;
5491 * - we read the log end (LR header start)
5492 * _first_, then the log start (LR header end)
5493 * - order is important.
5494 */
5495 error = xlog_bread_noalign(log, 0,
5496 bblks - split_bblks,
5497 offset + BBTOB(split_bblks));
5498 if (error)
5499 goto bread_err2;
5500 }
5501
5502 error = xlog_recover_process(log, rhash, rhead, offset,
5503 pass, &buffer_list);
5504 if (error)
5505 goto bread_err2;
5506
5507 blk_no += bblks;
5508 rhead_blk = blk_no;
5509 }
5510
5511 ASSERT(blk_no >= log->l_logBBsize);
5512 blk_no -= log->l_logBBsize;
5513 rhead_blk = blk_no;
5514 }
5515
5516 /* read first part of physical log */
5517 while (blk_no < head_blk) {
5518 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
5519 if (error)
5520 goto bread_err2;
5521
5522 rhead = (xlog_rec_header_t *)offset;
5523 error = xlog_valid_rec_header(log, rhead, blk_no);
5524 if (error)
5525 goto bread_err2;
5526
5527 /* blocks in data section */
5528 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5529 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
5530 &offset);
5531 if (error)
5532 goto bread_err2;
5533
5534 error = xlog_recover_process(log, rhash, rhead, offset, pass,
5535 &buffer_list);
5536 if (error)
5537 goto bread_err2;
5538
5539 blk_no += bblks + hblks;
5540 rhead_blk = blk_no;
5541 }
5542
5543 bread_err2:
5544 kmem_free(dbp);
5545 bread_err1:
5546 kmem_free(hbp);
5547
5548 /*
5549 * Submit buffers that have been added from the last record processed,
5550 * regardless of error status.
5551 */
5552 if (!list_empty(&buffer_list))
5553 error2 = xfs_buf_delwri_submit(&buffer_list);
5554
5555 if (error && first_bad)
5556 *first_bad = rhead_blk;
5557
5558 /*
5559 * Transactions are freed at commit time but transactions without commit
5560 * records on disk are never committed. Free any that may be left in the
5561 * hash table.
5562 */
5563 for (i = 0; i < XLOG_RHASH_SIZE; i++) {
5564 struct hlist_node *tmp;
5565 struct xlog_recover *trans;
5566
5567 hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
5568 xlog_recover_free_trans(trans);
5569 }
5570
5571 return error ? error : error2;
5572 }
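
/*
 * Worked example of the wrap-around split reads done in
 * xlog_do_recovery_pass() above (a sketch only, never called): a region that
 * straddles the physical end of the log is read in two pieces, first the
 * blocks up to l_logBBsize and then the remainder from block 0, with the
 * second read landing straight after the first in the buffer. For instance,
 * a 4-block header starting at block 998 of a 1000-block log splits into 2
 * blocks read from the end and 2 blocks read from the start, the latter
 * copied to offset + BBTOB(2).
 */
static inline void
xlog_example_split_read(
	xfs_daddr_t	log_bbsize,	/* log->l_logBBsize */
	xfs_daddr_t	blk_no,		/* start block of the wrapped region */
	int		nbblks,		/* total blocks to read */
	int		*split,		/* blocks before the physical log end */
	int		*wrapped)	/* blocks read from the log start */
{
	*split = (blk_no == log_bbsize) ? 0 : (int)(log_bbsize - blk_no);
	*wrapped = nbblks - *split;
}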
5573
5574 /*
5575 * Do the recovery of the log. We actually do this in two phases.
5576 * The two passes are necessary in order to implement the function
5577 * of cancelling a record written into the log. The first pass
5578 * determines those things which have been cancelled, and the
5579 * second pass replays log items normally except for those which
5580 * have been cancelled. The handling of the replay and cancellations
5581 * takes place in the log item type specific routines.
5582 *
5583 * The table of items which have cancel records in the log is allocated
5584 * and freed at this level, since only here do we know when all of
5585 * the log recovery has been completed.
5586 */
5587 STATIC int
5588 xlog_do_log_recovery(
5589 struct xlog *log,
5590 xfs_daddr_t head_blk,
5591 xfs_daddr_t tail_blk)
5592 {
5593 int error, i;
5594
5595 ASSERT(head_blk != tail_blk);
5596
5597 /*
5598 * First do a pass to find all of the cancelled buf log items.
5599 * Store them in the buf_cancel_table for use in the second pass.
5600 */
5601 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5602 sizeof(struct list_head),
5603 0);
5604 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5605 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5606
5607 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5608 XLOG_RECOVER_PASS1, NULL);
5609 if (error != 0) {
5610 kmem_free(log->l_buf_cancel_table);
5611 log->l_buf_cancel_table = NULL;
5612 return error;
5613 }
5614 /*
5615 * Then do a second pass to actually recover the items in the log.
5616 * When it is complete free the table of buf cancel items.
5617 */
5618 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5619 XLOG_RECOVER_PASS2, NULL);
5620 #ifdef DEBUG
5621 if (!error) {
5622 int i;
5623
5624 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5625 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
5626 }
5627 #endif /* DEBUG */
5628
5629 kmem_free(log->l_buf_cancel_table);
5630 log->l_buf_cancel_table = NULL;
5631
5632 return error;
5633 }
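
/*
 * Sketch of how the cancel table built above is consulted during pass 2
 * (illustrative only -- the real lookup lives with the buffer log item
 * recovery code): pass 1 hashes each cancelled buffer by block number into
 * l_buf_cancel_table, and pass 2 skips replay of buffers whose block number
 * and length match an entry.
 */
static inline struct xfs_buf_cancel *
xlog_example_find_buf_cancel(
	struct xlog	*log,
	xfs_daddr_t	blkno,
	uint		len)
{
	struct list_head	*bucket;
	struct xfs_buf_cancel	*bcp;

	bucket = &log->l_buf_cancel_table[(uint64_t)blkno % XLOG_BC_TABLE_SIZE];
	list_for_each_entry(bcp, bucket, bc_list) {
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			return bcp;
	}
	return NULL;
}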
5634
5635 /*
5636 * Do the actual recovery
5637 */
5638 STATIC int
5639 xlog_do_recover(
5640 struct xlog *log,
5641 xfs_daddr_t head_blk,
5642 xfs_daddr_t tail_blk)
5643 {
5644 struct xfs_mount *mp = log->l_mp;
5645 int error;
5646 xfs_buf_t *bp;
5647 xfs_sb_t *sbp;
5648
5649 trace_xfs_log_recover(log, head_blk, tail_blk);
5650
5651 /*
5652 * First replay the images in the log.
5653 */
5654 error = xlog_do_log_recovery(log, head_blk, tail_blk);
5655 if (error)
5656 return error;
5657
5658 /*
5659 * If IO errors happened during recovery, bail out.
5660 */
5661 if (XFS_FORCED_SHUTDOWN(mp)) {
5662 return -EIO;
5663 }
5664
5665 /*
5666 * We now update the tail_lsn since much of the recovery has completed
5667 * and there may be space available to use. If there were no extent
5668 * frees or iunlinks, we can free up the entire log and set the tail_lsn to
5669 * be the last_sync_lsn. This was set in xlog_find_tail to be the
5670 * lsn of the last known good LR on disk. If there are extent frees
5671 * or iunlinks they will have some entries in the AIL; so we look at
5672 * the AIL to determine how to set the tail_lsn.
5673 */
5674 xlog_assign_tail_lsn(mp);
5675
5676 /*
5677 * Now that we've finished replaying all buffer and inode
5678 * updates, re-read in the superblock and reverify it.
5679 */
5680 bp = xfs_getsb(mp);
5681 bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5682 ASSERT(!(bp->b_flags & XBF_WRITE));
5683 bp->b_flags |= XBF_READ;
5684 bp->b_ops = &xfs_sb_buf_ops;
5685
5686 error = xfs_buf_submit(bp);
5687 if (error) {
5688 if (!XFS_FORCED_SHUTDOWN(mp)) {
5689 xfs_buf_ioerror_alert(bp, __func__);
5690 ASSERT(0);
5691 }
5692 xfs_buf_relse(bp);
5693 return error;
5694 }
5695
5696 /* Convert superblock from on-disk format */
5697 sbp = &mp->m_sb;
5698 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
5699 xfs_buf_relse(bp);
5700
5701 /* re-initialise in-core superblock and geometry structures */
5702 xfs_reinit_percpu_counters(mp);
5703 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
5704 if (error) {
5705 xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
5706 return error;
5707 }
5708 mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
5709
5710 xlog_recover_check_summary(log);
5711
5712 /* Normal transactions can now occur */
5713 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
5714 return 0;
5715 }
5716
5717 /*
5718 * Perform recovery and re-initialize some log variables in xlog_find_tail.
5719 *
5720 * Return error or zero.
5721 */
5722 int
5723 xlog_recover(
5724 struct xlog *log)
5725 {
5726 xfs_daddr_t head_blk, tail_blk;
5727 int error;
5728
5729 /* find the tail of the log */
5730 error = xlog_find_tail(log, &head_blk, &tail_blk);
5731 if (error)
5732 return error;
5733
5734 /*
5735 * The superblock was read before the log was available and thus the LSN
5736 * could not be verified. Check the superblock LSN against the current
5737 * LSN now that it's known.
5738 */
5739 if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
5740 !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
5741 return -EINVAL;
5742
5743 if (tail_blk != head_blk) {
5744 /* There used to be a comment here:
5745 *
5746 * disallow recovery on read-only mounts. note -- mount
5747 * checks for ENOSPC and turns it into an intelligent
5748 * error message.
5749 * ...but this is no longer true. Now, unless you specify
5750 * NORECOVERY (in which case this function would never be
5751 * called), we just go ahead and recover. We do this all
5752 * under the vfs layer, so we can get away with it unless
5753 * the device itself is read-only, in which case we fail.
5754 */
5755 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
5756 return error;
5757 }
5758
5759 /*
5760 * Version 5 superblock log feature mask validation. We know the
5761 * log is dirty so check if there are any unknown log features
5762 * in what we need to recover. If there are unknown features
5763 * (e.g. unsupported transactions), then simply reject the
5764 * attempt at recovery before touching anything.
5765 */
5766 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
5767 xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
5768 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
5769 xfs_warn(log->l_mp,
5770 "Superblock has unknown incompatible log features (0x%x) enabled.",
5771 (log->l_mp->m_sb.sb_features_log_incompat &
5772 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
5773 xfs_warn(log->l_mp,
5774 "The log can not be fully and/or safely recovered by this kernel.");
5775 xfs_warn(log->l_mp,
5776 "Please recover the log on a kernel that supports the unknown features.");
5777 return -EINVAL;
5778 }
5779
5780 /*
5781 * Delay log recovery if the debug hook is set. This is debug
5782 * instrumentation to coordinate simulation of I/O failures with
5783 * log recovery.
5784 */
5785 if (xfs_globals.log_recovery_delay) {
5786 xfs_notice(log->l_mp,
5787 "Delaying log recovery for %d seconds.",
5788 xfs_globals.log_recovery_delay);
5789 msleep(xfs_globals.log_recovery_delay * 1000);
5790 }
5791
5792 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
5793 log->l_mp->m_logname ? log->l_mp->m_logname
5794 : "internal");
5795
5796 error = xlog_do_recover(log, head_blk, tail_blk);
5797 log->l_flags |= XLOG_RECOVERY_NEEDED;
5798 }
5799 return error;
5800 }
5801
5802 /*
5803 * In the first part of recovery we replay inodes and buffers and build
5804 * up the list of extent free items which need to be processed. Here
5805 * we process the extent free items and clean up the on disk unlinked
5806 * inode lists. This is separated from the first part of recovery so
5807 * that the root and real-time bitmap inodes can be read in from disk in
5808 * between the two stages. This is necessary so that we can free space
5809 * in the real-time portion of the file system.
5810 */
5811 int
5812 xlog_recover_finish(
5813 struct xlog *log)
5814 {
5815 /*
5816 * Now we're ready to do the transactions needed for the
5817 * rest of recovery. Start with completing all the extent
5818 * free intent records and then process the unlinked inode
5819 * lists. At this point, we essentially run in normal mode
5820 * except that we're still performing recovery actions
5821 * rather than accepting new requests.
5822 */
5823 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
5824 int error;
5825 error = xlog_recover_process_intents(log);
5826 if (error) {
5827 xfs_alert(log->l_mp, "Failed to recover intents");
5828 return error;
5829 }
5830
5831 /*
5832 * Sync the log to get all the intents out of the AIL.
5833 * This isn't absolutely necessary, but it helps in
5834 * case the unlink transactions would have problems
5835 * pushing the intents out of the way.
5836 */
5837 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
5838
5839 xlog_recover_process_iunlinks(log);
5840
5841 xlog_recover_check_summary(log);
5842
5843 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
5844 log->l_mp->m_logname ? log->l_mp->m_logname
5845 : "internal");
5846 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
5847 } else {
5848 xfs_info(log->l_mp, "Ending clean mount");
5849 }
5850 return 0;
5851 }
5852
5853 void
5854 xlog_recover_cancel(
5855 struct xlog *log)
5856 {
5857 if (log->l_flags & XLOG_RECOVERY_NEEDED)
5858 xlog_recover_cancel_intents(log);
5859 }
5860
5861 #if defined(DEBUG)
5862 /*
5863 * Read all of the agf and agi counters and check that they
5864 * are consistent with the superblock counters.
5865 */
5866 STATIC void
5867 xlog_recover_check_summary(
5868 struct xlog *log)
5869 {
5870 xfs_mount_t *mp;
5871 xfs_agf_t *agfp;
5872 xfs_buf_t *agfbp;
5873 xfs_buf_t *agibp;
5874 xfs_agnumber_t agno;
5875 uint64_t freeblks;
5876 uint64_t itotal;
5877 uint64_t ifree;
5878 int error;
5879
5880 mp = log->l_mp;
5881
5882 freeblks = 0LL;
5883 itotal = 0LL;
5884 ifree = 0LL;
5885 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5886 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
5887 if (error) {
5888 xfs_alert(mp, "%s agf read failed agno %d error %d",
5889 __func__, agno, error);
5890 } else {
5891 agfp = XFS_BUF_TO_AGF(agfbp);
5892 freeblks += be32_to_cpu(agfp->agf_freeblks) +
5893 be32_to_cpu(agfp->agf_flcount);
5894 xfs_buf_relse(agfbp);
5895 }
5896
5897 error = xfs_read_agi(mp, NULL, agno, &agibp);
5898 if (error) {
5899 xfs_alert(mp, "%s agi read failed agno %d error %d",
5900 __func__, agno, error);
5901 } else {
5902 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
5903
5904 itotal += be32_to_cpu(agi->agi_count);
5905 ifree += be32_to_cpu(agi->agi_freecount);
5906 xfs_buf_relse(agibp);
5907 }
5908 }
5909 }
5910 #endif /* DEBUG */
5911