1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * This file is part of UBIFS.
4 *
5 * Copyright (C) 2006-2008 Nokia Corporation
6 *
7 * Authors: Adrian Hunter
8 * Artem Bityutskiy (Битюцкий Артём)
9 */
10
11 /*
12 * This file implements functions needed to recover from unclean un-mounts.
13 * When UBIFS is mounted, it checks a flag on the master node to determine if
14 * an un-mount was completed successfully. If not, the process of mounting
15 * incorporates additional checking and fixing of on-flash data structures.
16 * UBIFS always cleans away all remnants of an unclean un-mount, so that
17 * errors do not accumulate. However UBIFS defers recovery if it is mounted
18 * read-only, and the flash is not modified in that case.
19 *
20 * The general UBIFS approach to the recovery is that it recovers from
21 * corruptions which could be caused by power cuts, but it refuses to recover
22 * from corruption caused by other reasons. And UBIFS tries to distinguish
23 * between these 2 reasons of corruptions and silently recover in the former
24 * case and loudly complain in the latter case.
25 *
26 * UBIFS writes only to erased LEBs, so it writes only to the flash space
27 * containing only 0xFFs. UBIFS also always writes strictly from the beginning
28 * of the LEB to the end. And UBIFS assumes that the underlying flash media
29 * writes in @c->max_write_size bytes at a time.
30 *
31 * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
32 * I/O unit corresponding to offset X to contain corrupted data, all the
33 * following min. I/O units have to contain empty space (all 0xFFs). If this is
34 * not true, the corruption cannot be the result of a power cut, and UBIFS
35 * refuses to mount.
36 */
37
38 #include <linux/crc32.h>
39 #include <linux/slab.h>
40 #include "ubifs.h"
41
42 #define RECOVERY_TWO 2
43 #define RECOVERY_SEVEN 7
44 #define RECOVERY_EIGHT 8
45
46 /**
47 * is_empty - determine whether a buffer is empty (contains all 0xff).
48 * @buf: buffer to clean
49 * @len: length of buffer
50 *
51 * This function returns %1 if the buffer is empty (contains all 0xff) otherwise
52 * %0 is returned.
53 */
is_empty(void * buf,int len)54 static int is_empty(void *buf, int len)
55 {
56 uint8_t *p = buf;
57 int i;
58
59 for (i = 0; i < len; i++) {
60 if (*p++ != 0xff) {
61 return 0;
62 }
63 }
64 return 1;
65 }
66
67 /**
68 * first_non_ff - find offset of the first non-0xff byte.
69 * @buf: buffer to search in
70 * @len: length of buffer
71 *
72 * This function returns offset of the first non-0xff byte in @buf or %-1 if
73 * the buffer contains only 0xff bytes.
74 */
first_non_ff(void * buf,int len)75 static int first_non_ff(void *buf, int len)
76 {
77 uint8_t *p = buf;
78 int i;
79
80 for (i = 0; i < len; i++) {
81 if (*p++ != 0xff) {
82 return i;
83 }
84 }
85 return -1;
86 }
87
88 /**
89 * get_master_node - get the last valid master node allowing for corruption.
90 * @c: UBIFS file-system description object
91 * @lnum: LEB number
92 * @pbuf: buffer containing the LEB read, is returned here
93 * @mst: master node, if found, is returned here
94 * @cor: corruption, if found, is returned here
95 *
96 * This function allocates a buffer, reads the LEB into it, and finds and
97 * returns the last valid master node allowing for one area of corruption.
98 * The corrupt area, if there is one, must be consistent with the assumption
99 * that it is the result of an unclean unmount while the master node was being
100 * written. Under those circumstances, it is valid to use the previously written
101 * master node.
102 *
103 * This function returns %0 on success and a negative error code on failure.
104 */
get_master_node(const struct ubifs_info * c,int lnum,void ** pbuf,struct ubifs_mst_node ** mst,void ** cor)105 static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, struct ubifs_mst_node **mst, void **cor)
106 {
107 const int sz = c->mst_node_alsz;
108 int err, offs, len;
109 void *sbuf, *buf;
110
111 sbuf = vmalloc(c->leb_size);
112 if (!sbuf) {
113 return -ENOMEM;
114 }
115
116 err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0);
117 if (err && err != -EBADMSG) {
118 goto out_free;
119 }
120
121 /* Find the first position that is definitely not a node */
122 offs = 0;
123 buf = sbuf;
124 len = c->leb_size;
125 while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
126 struct ubifs_ch *ch = buf;
127
128 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
129 break;
130 }
131 offs += sz;
132 buf += sz;
133 len -= sz;
134 }
135 /* See if there was a valid master node before that */
136 if (offs) {
137 int ret;
138
139 offs -= sz;
140 buf -= sz;
141 len += sz;
142 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
143 if (ret != SCANNED_A_NODE && offs) {
144 /* Could have been corruption so check one place back */
145 offs -= sz;
146 buf -= sz;
147 len += sz;
148 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
149 if (ret != SCANNED_A_NODE) {
150 /*
151 * We accept only one area of corruption because
152 * we are assuming that it was caused while
153 * trying to write a master node.
154 */
155 goto out_err;
156 }
157 }
158 if (ret == SCANNED_A_NODE) {
159 struct ubifs_ch *ch = buf;
160
161 if (ch->node_type != UBIFS_MST_NODE) {
162 goto out_err;
163 }
164 dbg_rcvry("found a master node at %d:%d", lnum, offs);
165 *mst = buf;
166 offs += sz;
167 buf += sz;
168 len -= sz;
169 }
170 }
171 /* Check for corruption */
172 if (offs < c->leb_size) {
173 if (!is_empty(buf, min_t(int, len, sz))) {
174 *cor = buf;
175 dbg_rcvry("found corruption at %d:%d", lnum, offs);
176 }
177 offs += sz;
178 buf += sz;
179 len -= sz;
180 }
181 /* Check remaining empty space */
182 if (offs < c->leb_size) {
183 if (!is_empty(buf, len)) {
184 goto out_err;
185 }
186 }
187 *pbuf = sbuf;
188 return 0;
189
190 out_err:
191 err = -EINVAL;
192 out_free:
193 vfree(sbuf);
194 *mst = NULL;
195 *cor = NULL;
196 return err;
197 }
198
199 /**
200 * write_rcvrd_mst_node - write recovered master node.
201 * @c: UBIFS file-system description object
202 * @mst: master node
203 *
204 * This function returns %0 on success and a negative error code on failure.
205 */
write_rcvrd_mst_node(struct ubifs_info * c,struct ubifs_mst_node * mst)206 static int write_rcvrd_mst_node(struct ubifs_info *c, struct ubifs_mst_node *mst)
207 {
208 int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
209 __le32 save_flags;
210
211 dbg_rcvry("recovery");
212
213 save_flags = mst->flags;
214 mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
215
216 err = ubifs_prepare_node_hmac(c, mst, UBIFS_MST_NODE_SZ, offsetof(struct ubifs_mst_node, hmac), 1);
217 if (err) {
218 goto out;
219 }
220 err = ubifs_leb_change(c, lnum, mst, sz);
221 if (err) {
222 goto out;
223 }
224 err = ubifs_leb_change(c, lnum + 1, mst, sz);
225 if (err) {
226 goto out;
227 }
228 out:
229 mst->flags = save_flags;
230 return err;
231 }
232
233 /**
234 * ubifs_recover_master_node - recover the master node.
235 * @c: UBIFS file-system description object
236 *
237 * This function recovers the master node from corruption that may occur due to
238 * an unclean unmount.
239 *
240 * This function returns %0 on success and a negative error code on failure.
241 */
ubifs_recover_master_node(struct ubifs_info * c)242 int ubifs_recover_master_node(struct ubifs_info *c)
243 {
244 void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
245 struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
246 const int sz = c->mst_node_alsz;
247 int err, offs1, offs2;
248
249 dbg_rcvry("recovery");
250
251 err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
252 if (err) {
253 goto out_free;
254 }
255
256 err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
257 if (err) {
258 goto out_free;
259 }
260
261 if (mst1) {
262 offs1 = (void *)mst1 - buf1;
263 if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && (offs1 == 0 && !cor1)) {
264 /*
265 * mst1 was written by recovery at offset 0 with no
266 * corruption.
267 */
268 dbg_rcvry("recovery recovery");
269 mst = mst1;
270 } else if (mst2) {
271 offs2 = (void *)mst2 - buf2;
272 if (offs1 == offs2) {
273 /* Same offset, so must be the same */
274 if (ubifs_compare_master_node(c, mst1, mst2)) {
275 goto out_err;
276 }
277 mst = mst1;
278 } else if (offs2 + sz == offs1) {
279 /* 1st LEB was written, 2nd was not */
280 if (cor1) {
281 goto out_err;
282 }
283 mst = mst1;
284 } else if (offs1 == 0 && c->leb_size - offs2 - sz < sz) {
285 /* 1st LEB was unmapped and written, 2nd not */
286 if (cor1) {
287 goto out_err;
288 }
289 mst = mst1;
290 } else {
291 goto out_err;
292 }
293 } else {
294 /*
295 * 2nd LEB was unmapped and about to be written, so
296 * there must be only one master node in the first LEB
297 * and no corruption.
298 */
299 if (offs1 != 0 || cor1) {
300 goto out_err;
301 }
302 mst = mst1;
303 }
304 } else {
305 if (!mst2) {
306 goto out_err;
307 }
308 /*
309 * 1st LEB was unmapped and about to be written, so there must
310 * be no room left in 2nd LEB.
311 */
312 offs2 = (void *)mst2 - buf2;
313 if (offs2 + sz + sz <= c->leb_size) {
314 goto out_err;
315 }
316 mst = mst2;
317 }
318
319 ubifs_msg(c, "recovered master node from LEB %d", (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
320
321 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
322
323 if (c->ro_mount) {
324 /* Read-only mode. Keep a copy for switching to rw mode */
325 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
326 if (!c->rcvrd_mst_node) {
327 err = -ENOMEM;
328 goto out_free;
329 }
330 memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
331
332 /*
333 * We had to recover the master node, which means there was an
334 * unclean reboot. However, it is possible that the master node
335 * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set.
336 * E.g., consider the following chain of events:
337 *
338 * 1. UBIFS was cleanly unmounted, so the master node is clean
339 * 2. UBIFS is being mounted R/W and starts changing the master
340 * node in the first (%UBIFS_MST_LNUM). A power cut happens,
341 * so this LEB ends up with some amount of garbage at the
342 * end.
343 * 3. UBIFS is being mounted R/O. We reach this place and
344 * recover the master node from the second LEB
345 * (%UBIFS_MST_LNUM + 1). But we cannot update the media
346 * because we are being mounted R/O. We have to defer the
347 * operation.
348 * 4. However, this master node (@c->mst_node) is marked as
349 * clean (since the step 1). And if we just return, the
350 * mount code will be confused and won't recover the master
351 * node when it is re-mounter R/W later.
352 *
353 * Thus, to force the recovery by marking the master node as
354 * dirty.
355 */
356 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
357 } else {
358 /* Write the recovered master node */
359 c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
360 err = write_rcvrd_mst_node(c, c->mst_node);
361 if (err) {
362 goto out_free;
363 }
364 }
365
366 vfree(buf2);
367 vfree(buf1);
368
369 return 0;
370
371 out_err:
372 err = -EINVAL;
373 out_free:
374 ubifs_err(c, "failed to recover master node");
375 if (mst1) {
376 ubifs_err(c, "dumping first master node");
377 ubifs_dump_node(c, mst1, c->leb_size - ((void *)mst1 - buf1));
378 }
379 if (mst2) {
380 ubifs_err(c, "dumping second master node");
381 ubifs_dump_node(c, mst2, c->leb_size - ((void *)mst2 - buf2));
382 }
383 vfree(buf2);
384 vfree(buf1);
385 return err;
386 }
387
388 /**
389 * ubifs_write_rcvrd_mst_node - write the recovered master node.
390 * @c: UBIFS file-system description object
391 *
392 * This function writes the master node that was recovered during mounting in
393 * read-only mode and must now be written because we are remounting rw.
394 *
395 * This function returns %0 on success and a negative error code on failure.
396 */
ubifs_write_rcvrd_mst_node(struct ubifs_info * c)397 int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
398 {
399 int err;
400
401 if (!c->rcvrd_mst_node) {
402 return 0;
403 }
404 c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
405 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
406 err = write_rcvrd_mst_node(c, c->rcvrd_mst_node);
407 if (err) {
408 return err;
409 }
410 kfree(c->rcvrd_mst_node);
411 c->rcvrd_mst_node = NULL;
412 return 0;
413 }
414
415 /**
416 * is_last_write - determine if an offset was in the last write to a LEB.
417 * @c: UBIFS file-system description object
418 * @buf: buffer to check
419 * @offs: offset to check
420 *
421 * This function returns %1 if @offs was in the last write to the LEB whose data
422 * is in @buf, otherwise %0 is returned. The determination is made by checking
423 * for subsequent empty space starting from the next @c->max_write_size
424 * boundary.
425 */
is_last_write(const struct ubifs_info * c,void * buf,int offs)426 static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
427 {
428 int empty_offs, check_len;
429 uint8_t *p;
430
431 /*
432 * Round up to the next @c->max_write_size boundary i.e. @offs is in
433 * the last wbuf written. After that should be empty space.
434 */
435 empty_offs = ALIGN(offs + 1, c->max_write_size);
436 check_len = c->leb_size - empty_offs;
437 p = buf + empty_offs - offs;
438 return is_empty(p, check_len);
439 }
440
441 /**
442 * clean_buf - clean the data from an LEB sitting in a buffer.
443 * @c: UBIFS file-system description object
444 * @buf: buffer to clean
445 * @lnum: LEB number to clean
446 * @offs: offset from which to clean
447 * @len: length of buffer
448 *
449 * This function pads up to the next min_io_size boundary (if there is one) and
450 * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
451 * @c->min_io_size boundary.
452 */
clean_buf(const struct ubifs_info * c,void ** buf,int lnum,int * offs,int * len)453 static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, int *offs, int *len)
454 {
455 int empty_offs, pad_len;
456
457 dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
458
459 ubifs_assert(c, !(*offs & RECOVERY_SEVEN));
460 empty_offs = ALIGN(*offs, c->min_io_size);
461 pad_len = empty_offs - *offs;
462 ubifs_pad(c, *buf, pad_len);
463 *offs += pad_len;
464 *buf += pad_len;
465 *len -= pad_len;
466 memset(*buf, 0xff, c->leb_size - empty_offs);
467 }
468
469 /**
470 * no_more_nodes - determine if there are no more nodes in a buffer.
471 * @c: UBIFS file-system description object
472 * @buf: buffer to check
473 * @len: length of buffer
474 * @lnum: LEB number of the LEB from which @buf was read
475 * @offs: offset from which @buf was read
476 *
477 * This function ensures that the corrupted node at @offs is the last thing
478 * written to a LEB. This function returns %1 if more data is not found and
479 * %0 if more data is found.
480 */
no_more_nodes(const struct ubifs_info * c,void * buf,int len,int lnum,int offs)481 static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int lnum, int offs)
482 {
483 struct ubifs_ch *ch = buf;
484 int skip, dlen = le32_to_cpu(ch->len);
485
486 /* Check for empty space after the corrupt node's common header */
487 skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
488 if (is_empty(buf + skip, len - skip)) {
489 return 1;
490 }
491 /*
492 * The area after the common header size is not empty, so the common
493 * header must be intact. Check it.
494 */
495 if (ubifs_check_node(c, buf, len, lnum, offs, 1, 0) != -EUCLEAN) {
496 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
497 return 0;
498 }
499 /* Now we know the corrupt node's length we can skip over it */
500 skip = ALIGN(offs + dlen, c->max_write_size) - offs;
501 /* After which there should be empty space */
502 if (is_empty(buf + skip, len - skip)) {
503 return 1;
504 }
505 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
506 return 0;
507 }
508
509 /**
510 * fix_unclean_leb - fix an unclean LEB.
511 * @c: UBIFS file-system description object
512 * @sleb: scanned LEB information
513 * @start: offset where scan started
514 */
fix_unclean_leb(struct ubifs_info * c,struct ubifs_scan_leb * sleb,int start)515 static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, int start)
516 {
517 int lnum = sleb->lnum, endpt = start;
518
519 /* Get the end offset of the last node we are keeping */
520 if (!list_empty(&sleb->nodes)) {
521 struct ubifs_scan_node *snod;
522
523 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
524 endpt = snod->offs + snod->len;
525 }
526
527 if (c->ro_mount && !c->remounting_rw) {
528 /* Add to recovery list */
529 struct ubifs_unclean_leb *ucleb;
530
531 dbg_rcvry("need to fix LEB %d start %d endpt %d", lnum, start, sleb->endpt);
532 ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
533 if (!ucleb) {
534 return -ENOMEM;
535 }
536 ucleb->lnum = lnum;
537 ucleb->endpt = endpt;
538 list_add_tail(&ucleb->list, &c->unclean_leb_list);
539 } else {
540 /* Write the fixed LEB back to flash */
541 int err;
542
543 dbg_rcvry("fixing LEB %d start %d endpt %d", lnum, start, sleb->endpt);
544 if (endpt == 0) {
545 err = ubifs_leb_unmap(c, lnum);
546 if (err) {
547 return err;
548 }
549 } else {
550 int len = ALIGN(endpt, c->min_io_size);
551
552 if (start) {
553 err = ubifs_leb_read(c, lnum, sleb->buf, 0, start, 1);
554 if (err) {
555 return err;
556 }
557 }
558 /* Pad to min_io_size */
559 if (len > endpt) {
560 int pad_len = len - ALIGN(endpt, RECOVERY_EIGHT);
561 if (pad_len > 0) {
562 void *buf = sleb->buf + len - pad_len;
563
564 ubifs_pad(c, buf, pad_len);
565 }
566 }
567 err = ubifs_leb_change(c, lnum, sleb->buf, len);
568 if (err) {
569 return err;
570 }
571 }
572 }
573 return 0;
574 }
575
576 /**
577 * drop_last_group - drop the last group of nodes.
578 * @sleb: scanned LEB information
579 * @offs: offset of dropped nodes is returned here
580 *
581 * This is a helper function for 'ubifs_recover_leb()' which drops the last
582 * group of nodes of the scanned LEB.
583 */
drop_last_group(struct ubifs_scan_leb * sleb,int * offs)584 static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
585 {
586 while (!list_empty(&sleb->nodes)) {
587 struct ubifs_scan_node *snod;
588 struct ubifs_ch *ch;
589
590 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
591 ch = snod->node;
592 if (ch->group_type != UBIFS_IN_NODE_GROUP) {
593 break;
594 }
595
596 dbg_rcvry("dropping grouped node at %d:%d", sleb->lnum, snod->offs);
597 *offs = snod->offs;
598 list_del(&snod->list);
599 kfree(snod);
600 sleb->nodes_cnt -= 1;
601 }
602 }
603
604 /**
605 * drop_last_node - drop the last node.
606 * @sleb: scanned LEB information
607 * @offs: offset of dropped nodes is returned here
608 *
609 * This is a helper function for 'ubifs_recover_leb()' which drops the last
610 * node of the scanned LEB.
611 */
drop_last_node(struct ubifs_scan_leb * sleb,int * offs)612 static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
613 {
614 struct ubifs_scan_node *snod;
615
616 if (!list_empty(&sleb->nodes)) {
617 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
618
619 dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
620 *offs = snod->offs;
621 list_del(&snod->list);
622 kfree(snod);
623 sleb->nodes_cnt -= 1;
624 }
625 }
626
627 /**
628 * ubifs_recover_leb - scan and recover a LEB.
629 * @c: UBIFS file-system description object
630 * @lnum: LEB number
631 * @offs: offset
632 * @sbuf: LEB-sized buffer to use
633 * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
634 * belong to any journal head)
635 *
636 * This function does a scan of a LEB, but caters for errors that might have
637 * been caused by the unclean unmount from which we are attempting to recover.
638 * Returns the scanned information on success and a negative error code on
639 * failure.
640 */
ubifs_recover_leb(struct ubifs_info * c,int lnum,int offs,void * sbuf,int jhead)641 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int jhead)
642 {
643 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
644 int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
645 struct ubifs_scan_leb *sleb;
646 void *buf = sbuf + offs;
647
648 dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
649
650 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
651 if (IS_ERR(sleb)) {
652 return sleb;
653 }
654
655 ubifs_assert(c, len >= RECOVERY_EIGHT);
656 while (len >= RECOVERY_EIGHT) {
657 dbg_scan("look at LEB %d:%d (%d bytes left)", lnum, offs, len);
658
659 cond_resched();
660
661 /*
662 * Scan quietly until there is an error from which we cannot
663 * recover
664 */
665 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
666 if (ret == SCANNED_A_NODE) {
667 /* A valid node, and not a padding node */
668 struct ubifs_ch *ch = buf;
669 int node_len;
670
671 err = ubifs_add_snod(c, sleb, buf, offs);
672 if (err) {
673 goto error;
674 }
675 node_len = ALIGN(le32_to_cpu(ch->len), RECOVERY_EIGHT);
676 offs += node_len;
677 buf += node_len;
678 len -= node_len;
679 } else if (ret > 0) {
680 /* Padding bytes or a valid padding node */
681 offs += ret;
682 buf += ret;
683 len -= ret;
684 } else if (ret == SCANNED_A_CORRUPT_NODE) {
685 dbg_rcvry("found corruption (%d) at %d:%d", ret, lnum, offs);
686 if (ubifs_check_node(c, buf, len, lnum, offs, 1, 1) == -EUCLEAN &&
687 !no_more_nodes(c, buf, len, lnum, offs)) {
688 int skip;
689 struct ubifs_ch *ch = buf;
690
691 /*
692 * If the flash voltage power down suddenly in the programming
693 * process, it may lead to abnormal data written by the flash
694 * in the low-voltage operation process, and the last data
695 * should be discarded.
696 */
697 ubifs_msg(c, "recovery corrupt node\n");
698 skip = ALIGN(offs + le32_to_cpu(ch->len), c->max_write_size) - offs;
699 memset(buf + skip, 0xff, len - skip);
700 }
701
702 break;
703 } else if (ret == SCANNED_EMPTY_SPACE) {
704 dbg_rcvry("found corruption (%d) at %d:%d", ret, lnum, offs);
705 if (!is_empty(buf, len) && !is_last_write(c, buf, offs)) {
706 /*
707 * If the flash voltage power down suddenly in the programming
708 * process, it may lead to the data was programmed to the wroge
709 * page written by the flash in the low-voltage operation process,
710 * and the data should be discarded.
711 */
712 ubifs_msg(c, "recovery empty space\n");
713 memset(buf, 0xff, len);
714 }
715
716 break;
717 } else if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
718 dbg_rcvry("found corruption (%d) at %d:%d", ret, lnum, offs);
719 break;
720 } else {
721 ubifs_err(c, "unexpected return value %d", ret);
722 err = -EINVAL;
723 goto error;
724 }
725 }
726
727 if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
728 if (!is_last_write(c, buf, offs)) {
729 goto corrupted_rescan;
730 }
731 } else if (ret == SCANNED_A_CORRUPT_NODE) {
732 if (!no_more_nodes(c, buf, len, lnum, offs)) {
733 goto corrupted_rescan;
734 }
735 } else if (!is_empty(buf, len)) {
736 if (!is_last_write(c, buf, offs)) {
737 int corruption = first_non_ff(buf, len);
738
739 /*
740 * See header comment for this file for more
741 * explanations about the reasons we have this check.
742 */
743 ubifs_err(c, "corrupt empty space LEB %d:%d, corruption starts at %d", lnum, offs, corruption);
744 /* Make sure we dump interesting non-0xFF data */
745 offs += corruption;
746 buf += corruption;
747 goto corrupted;
748 }
749 }
750
751 min_io_unit = round_down(offs, c->min_io_size);
752 if (grouped) {
753 /*
754 * If nodes are grouped, always drop the incomplete group at
755 * the end.
756 */
757 drop_last_group(sleb, &offs);
758 }
759
760 if (jhead == GCHD) {
761 /*
762 * If this LEB belongs to the GC head then while we are in the
763 * middle of the same min. I/O unit keep dropping nodes. So
764 * basically, what we want is to make sure that the last min.
765 * I/O unit where we saw the corruption is dropped completely
766 * with all the uncorrupted nodes which may possibly sit there.
767 *
768 * In other words, let's name the min. I/O unit where the
769 * corruption starts B, and the previous min. I/O unit A. The
770 * below code tries to deal with a situation when half of B
771 * contains valid nodes or the end of a valid node, and the
772 * second half of B contains corrupted data or garbage. This
773 * means that UBIFS had been writing to B just before the power
774 * cut happened. I do not know how realistic is this scenario
775 * that half of the min. I/O unit had been written successfully
776 * and the other half not, but this is possible in our 'failure
777 * mode emulation' infrastructure at least.
778 *
779 * So what is the problem, why we need to drop those nodes? Why
780 * can't we just clean-up the second half of B by putting a
781 * padding node there? We can, and this works fine with one
782 * exception which was reproduced with power cut emulation
783 * testing and happens extremely rarely.
784 *
785 * Imagine the file-system is full, we run GC which starts
786 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
787 * the current GC head LEB). The @c->gc_lnum is -1, which means
788 * that GC will retain LEB X and will try to continue. Imagine
789 * that LEB X is currently the dirtiest LEB, and the amount of
790 * used space in LEB Y is exactly the same as amount of free
791 * space in LEB X.
792 *
793 * And a power cut happens when nodes are moved from LEB X to
794 * LEB Y. We are here trying to recover LEB Y which is the GC
795 * head LEB. We find the min. I/O unit B as described above.
796 * Then we clean-up LEB Y by padding min. I/O unit. And later
797 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
798 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
799 * does not match because the amount of valid nodes there does
800 * not fit the free space in LEB Y any more! And this is
801 * because of the padding node which we added to LEB Y. The
802 * user-visible effect of this which I once observed and
803 * analysed is that we cannot mount the file-system with
804 * -ENOSPC error.
805 *
806 * So obviously, to make sure that situation does not happen we
807 * should free min. I/O unit B in LEB Y completely and the last
808 * used min. I/O unit in LEB Y should be A. This is basically
809 * what the below code tries to do.
810 */
811 while (offs > min_io_unit) {
812 drop_last_node(sleb, &offs);
813 }
814 }
815
816 buf = sbuf + offs;
817 len = c->leb_size - offs;
818
819 clean_buf(c, &buf, lnum, &offs, &len);
820 ubifs_end_scan(c, sleb, lnum, offs);
821
822 err = fix_unclean_leb(c, sleb, start);
823 if (err) {
824 goto error;
825 }
826
827 return sleb;
828
829 corrupted_rescan:
830 /* Re-scan the corrupted data with verbose messages */
831 ubifs_err(c, "corruption %d", ret);
832 ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
833 corrupted:
834 ubifs_scanned_corruption(c, lnum, offs, buf);
835 err = -EUCLEAN;
836 error:
837 ubifs_err(c, "LEB %d scanning failed", lnum);
838 ubifs_scan_destroy(sleb);
839 return ERR_PTR(err);
840 }
841
842 /**
843 * get_cs_sqnum - get commit start sequence number.
844 * @c: UBIFS file-system description object
845 * @lnum: LEB number of commit start node
846 * @offs: offset of commit start node
847 * @cs_sqnum: commit start sequence number is returned here
848 *
849 * This function returns %0 on success and a negative error code on failure.
850 */
get_cs_sqnum(struct ubifs_info * c,int lnum,int offs,unsigned long long * cs_sqnum)851 static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, unsigned long long *cs_sqnum)
852 {
853 struct ubifs_cs_node *cs_node = NULL;
854 int err, ret;
855
856 dbg_rcvry("at %d:%d", lnum, offs);
857 cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
858 if (!cs_node) {
859 return -ENOMEM;
860 }
861 if (c->leb_size - offs < UBIFS_CS_NODE_SZ) {
862 goto out_err;
863 }
864 err = ubifs_leb_read(c, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ, 0);
865 if (err && err != -EBADMSG) {
866 goto out_free;
867 }
868 ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
869 if (ret != SCANNED_A_NODE) {
870 ubifs_err(c, "Not a valid node");
871 goto out_err;
872 }
873 if (cs_node->ch.node_type != UBIFS_CS_NODE) {
874 ubifs_err(c, "Not a CS node, type is %d", cs_node->ch.node_type);
875 goto out_err;
876 }
877 if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
878 ubifs_err(c, "CS node cmt_no %llu != current cmt_no %llu", (unsigned long long)le64_to_cpu(cs_node->cmt_no),
879 c->cmt_no);
880 goto out_err;
881 }
882 *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
883 dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
884 kfree(cs_node);
885 return 0;
886
887 out_err:
888 err = -EINVAL;
889 out_free:
890 ubifs_err(c, "failed to get CS sqnum");
891 kfree(cs_node);
892 return err;
893 }
894
895 /**
896 * ubifs_recover_log_leb - scan and recover a log LEB.
897 * @c: UBIFS file-system description object
898 * @lnum: LEB number
899 * @offs: offset
900 * @sbuf: LEB-sized buffer to use
901 *
902 * This function does a scan of a LEB, but caters for errors that might have
903 * been caused by unclean reboots from which we are attempting to recover
904 * (assume that only the last log LEB can be corrupted by an unclean reboot).
905 *
906 * This function returns %0 on success and a negative error code on failure.
907 */
ubifs_recover_log_leb(struct ubifs_info * c,int lnum,int offs,void * sbuf)908 struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
909 {
910 struct ubifs_scan_leb *sleb;
911 int next_lnum;
912
913 dbg_rcvry("LEB %d", lnum);
914 next_lnum = lnum + 1;
915 if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
916 next_lnum = UBIFS_LOG_LNUM;
917 }
918 if (next_lnum != c->ltail_lnum) {
919 /*
920 * We can only recover at the end of the log, so check that the
921 * next log LEB is empty or out of date.
922 */
923 sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
924 if (IS_ERR(sleb)) {
925 return sleb;
926 }
927 if (sleb->nodes_cnt) {
928 struct ubifs_scan_node *snod;
929 unsigned long long cs_sqnum = c->cs_sqnum;
930
931 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
932 if (cs_sqnum == 0) {
933 int err;
934
935 err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
936 if (err) {
937 ubifs_scan_destroy(sleb);
938 return ERR_PTR(err);
939 }
940 }
941 if (snod->sqnum > cs_sqnum) {
942 ubifs_err(c, "unrecoverable log corruption in LEB %d", lnum);
943 ubifs_scan_destroy(sleb);
944 return ERR_PTR(-EUCLEAN);
945 }
946 }
947 ubifs_scan_destroy(sleb);
948 }
949 return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
950 }
951
952 /**
953 * recover_head - recover a head.
954 * @c: UBIFS file-system description object
955 * @lnum: LEB number of head to recover
956 * @offs: offset of head to recover
957 * @sbuf: LEB-sized buffer to use
958 *
959 * This function ensures that there is no data on the flash at a head location.
960 *
961 * This function returns %0 on success and a negative error code on failure.
962 */
recover_head(struct ubifs_info * c,int lnum,int offs,void * sbuf)963 static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf)
964 {
965 int len = c->max_write_size, err;
966
967 if (offs + len > c->leb_size) {
968 len = c->leb_size - offs;
969 }
970
971 if (!len) {
972 return 0;
973 }
974
975 /* Read at the head location and check it is empty flash */
976 err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1);
977 if (err || !is_empty(sbuf, len)) {
978 dbg_rcvry("cleaning head at %d:%d", lnum, offs);
979 if (offs == 0) {
980 return ubifs_leb_unmap(c, lnum);
981 }
982 err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1);
983 if (err) {
984 return err;
985 }
986 return ubifs_leb_change(c, lnum, sbuf, offs);
987 }
988
989 return 0;
990 }
991
992 /**
993 * ubifs_recover_inl_heads - recover index and LPT heads.
994 * @c: UBIFS file-system description object
995 * @sbuf: LEB-sized buffer to use
996 *
997 * This function ensures that there is no data on the flash at the index and
998 * LPT head locations.
999 *
1000 * This deals with the recovery of a half-completed journal commit. UBIFS is
1001 * careful never to overwrite the last version of the index or the LPT. Because
1002 * the index and LPT are wandering trees, data from a half-completed commit will
1003 * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
1004 * assumed to be empty and will be unmapped anyway before use, or in the index
1005 * and LPT heads.
1006 *
1007 * This function returns %0 on success and a negative error code on failure.
1008 */
ubifs_recover_inl_heads(struct ubifs_info * c,void * sbuf)1009 int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
1010 {
1011 int err;
1012
1013 ubifs_assert(c, !c->ro_mount || c->remounting_rw);
1014
1015 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
1016 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
1017 if (err) {
1018 return err;
1019 }
1020
1021 dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
1022
1023 return recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
1024 }
1025
1026 /**
1027 * clean_an_unclean_leb - read and write a LEB to remove corruption.
1028 * @c: UBIFS file-system description object
1029 * @ucleb: unclean LEB information
1030 * @sbuf: LEB-sized buffer to use
1031 *
1032 * This function reads a LEB up to a point pre-determined by the mount recovery,
1033 * checks the nodes, and writes the result back to the flash, thereby cleaning
1034 * off any following corruption, or non-fatal ECC errors.
1035 *
1036 * This function returns %0 on success and a negative error code on failure.
1037 */
clean_an_unclean_leb(struct ubifs_info * c,struct ubifs_unclean_leb * ucleb,void * sbuf)1038 static int clean_an_unclean_leb(struct ubifs_info *c, struct ubifs_unclean_leb *ucleb, void *sbuf)
1039 {
1040 int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
1041 void *buf = sbuf;
1042
1043 dbg_rcvry("LEB %d len %d", lnum, len);
1044
1045 if (len == 0) {
1046 /* Nothing to read, just unmap it */
1047 return ubifs_leb_unmap(c, lnum);
1048 }
1049
1050 err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
1051 if (err && err != -EBADMSG) {
1052 return err;
1053 }
1054
1055 while (len >= RECOVERY_EIGHT) {
1056 int ret;
1057
1058 cond_resched();
1059
1060 /* Scan quietly until there is an error */
1061 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
1062 if (ret == SCANNED_A_NODE) {
1063 /* A valid node, and not a padding node */
1064 struct ubifs_ch *ch = buf;
1065 int node_len;
1066
1067 node_len = ALIGN(le32_to_cpu(ch->len), RECOVERY_EIGHT);
1068 offs += node_len;
1069 buf += node_len;
1070 len -= node_len;
1071 continue;
1072 }
1073
1074 if (ret > 0) {
1075 /* Padding bytes or a valid padding node */
1076 offs += ret;
1077 buf += ret;
1078 len -= ret;
1079 continue;
1080 }
1081
1082 if (ret == SCANNED_EMPTY_SPACE) {
1083 ubifs_err(c, "unexpected empty space at %d:%d", lnum, offs);
1084 return -EUCLEAN;
1085 }
1086
1087 if (quiet) {
1088 /* Redo the last scan but noisily */
1089 quiet = 0;
1090 continue;
1091 }
1092
1093 ubifs_scanned_corruption(c, lnum, offs, buf);
1094 return -EUCLEAN;
1095 }
1096
1097 /* Pad to min_io_size */
1098 len = ALIGN(ucleb->endpt, c->min_io_size);
1099 if (len > ucleb->endpt) {
1100 int pad_len = len - ALIGN(ucleb->endpt, RECOVERY_EIGHT);
1101 if (pad_len > 0) {
1102 buf = c->sbuf + len - pad_len;
1103 ubifs_pad(c, buf, pad_len);
1104 }
1105 }
1106
1107 /* Write back the LEB atomically */
1108 err = ubifs_leb_change(c, lnum, sbuf, len);
1109 if (err) {
1110 return err;
1111 }
1112
1113 dbg_rcvry("cleaned LEB %d", lnum);
1114
1115 return 0;
1116 }
1117
1118 /**
1119 * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
1120 * @c: UBIFS file-system description object
1121 * @sbuf: LEB-sized buffer to use
1122 *
1123 * This function cleans a LEB identified during recovery that needs to be
1124 * written but was not because UBIFS was mounted read-only. This happens when
1125 * remounting to read-write mode.
1126 *
1127 * This function returns %0 on success and a negative error code on failure.
1128 */
ubifs_clean_lebs(struct ubifs_info * c,void * sbuf)1129 int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf)
1130 {
1131 dbg_rcvry("recovery");
1132 while (!list_empty(&c->unclean_leb_list)) {
1133 struct ubifs_unclean_leb *ucleb;
1134 int err;
1135
1136 ucleb = list_entry(c->unclean_leb_list.next, struct ubifs_unclean_leb, list);
1137 err = clean_an_unclean_leb(c, ucleb, sbuf);
1138 if (err) {
1139 return err;
1140 }
1141 list_del(&ucleb->list);
1142 kfree(ucleb);
1143 }
1144 return 0;
1145 }
1146
1147 /**
1148 * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
1149 * @c: UBIFS file-system description object
1150 *
1151 * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
1152 * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
1153 * zero in case of success and a negative error code in case of failure.
1154 */
grab_empty_leb(struct ubifs_info * c)1155 static int grab_empty_leb(struct ubifs_info *c)
1156 {
1157 int lnum, err;
1158
1159 /*
1160 * Note, it is very important to first search for an empty LEB and then
1161 * run the commit, not vice-versa. The reason is that there might be
1162 * only one empty LEB at the moment, the one which has been the
1163 * @c->gc_lnum just before the power cut happened. During the regular
1164 * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
1165 * one but GC can grab it. But at this moment this single empty LEB is
1166 * not marked as taken, so if we run commit - what happens? Right, the
1167 * commit will grab it and write the index there. Remember that the
1168 * index always expands as long as there is free space, and it only
1169 * starts consolidating when we run out of space.
1170 *
1171 * IOW, if we run commit now, we might not be able to find a free LEB
1172 * after this.
1173 */
1174 lnum = ubifs_find_free_leb_for_idx(c);
1175 if (lnum < 0) {
1176 ubifs_err(c, "could not find an empty LEB");
1177 ubifs_dump_lprops(c);
1178 ubifs_dump_budg(c, &c->bi);
1179 return lnum;
1180 }
1181
1182 /* Reset the index flag */
1183 err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, LPROPS_INDEX, 0);
1184 if (err) {
1185 return err;
1186 }
1187
1188 c->gc_lnum = lnum;
1189 dbg_rcvry("found empty LEB %d, run commit", lnum);
1190
1191 return ubifs_run_commit(c);
1192 }
1193
1194 /**
1195 * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
1196 * @c: UBIFS file-system description object
1197 *
1198 * Out-of-place garbage collection requires always one empty LEB with which to
1199 * start garbage collection. The LEB number is recorded in c->gc_lnum and is
1200 * written to the master node on unmounting. In the case of an unclean unmount
1201 * the value of gc_lnum recorded in the master node is out of date and cannot
1202 * be used. Instead, recovery must allocate an empty LEB for this purpose.
1203 * However, there may not be enough empty space, in which case it must be
1204 * possible to GC the dirtiest LEB into the GC head LEB.
1205 *
1206 * This function also runs the commit which causes the TNC updates from
1207 * size-recovery and orphans to be written to the flash. That is important to
1208 * ensure correct replay order for subsequent mounts.
1209 *
1210 * This function returns %0 on success and a negative error code on failure.
1211 */
ubifs_rcvry_gc_commit(struct ubifs_info * c)1212 int ubifs_rcvry_gc_commit(struct ubifs_info *c)
1213 {
1214 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
1215 struct ubifs_lprops lp;
1216 int err;
1217
1218 dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
1219
1220 c->gc_lnum = -1;
1221 if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) {
1222 return grab_empty_leb(c);
1223 }
1224
1225 err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, RECOVERY_TWO);
1226 if (err) {
1227 if (err != -ENOSPC) {
1228 return err;
1229 }
1230
1231 dbg_rcvry("could not find a dirty LEB");
1232 return grab_empty_leb(c);
1233 }
1234
1235 ubifs_assert(c, !(lp.flags & LPROPS_INDEX));
1236 ubifs_assert(c, lp.free + lp.dirty >= wbuf->offs);
1237
1238 /*
1239 * We run the commit before garbage collection otherwise subsequent
1240 * mounts will see the GC and orphan deletion in a different order.
1241 */
1242 dbg_rcvry("committing");
1243 err = ubifs_run_commit(c);
1244 if (err) {
1245 return err;
1246 }
1247
1248 dbg_rcvry("GC'ing LEB %d", lp.lnum);
1249 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
1250 err = ubifs_garbage_collect_leb(c, &lp);
1251 if (err >= 0) {
1252 int err2 = ubifs_wbuf_sync_nolock(wbuf);
1253 if (err2) {
1254 err = err2;
1255 }
1256 }
1257 mutex_unlock(&wbuf->io_mutex);
1258 if (err < 0) {
1259 ubifs_err(c, "GC failed, error %d", err);
1260 if (err == -EAGAIN) {
1261 err = -EINVAL;
1262 }
1263 return err;
1264 }
1265
1266 ubifs_assert(c, err == LEB_RETAINED);
1267 if (err != LEB_RETAINED) {
1268 return -EINVAL;
1269 }
1270
1271 err = ubifs_leb_unmap(c, c->gc_lnum);
1272 if (err) {
1273 return err;
1274 }
1275
1276 dbg_rcvry("allocated LEB %d for GC", lp.lnum);
1277 return 0;
1278 }
1279
1280 /**
1281 * struct size_entry - inode size information for recovery.
1282 * @rb: link in the RB-tree of sizes
1283 * @inum: inode number
1284 * @i_size: size on inode
1285 * @d_size: maximum size based on data nodes
1286 * @exists: indicates whether the inode exists
1287 * @inode: inode if pinned in memory awaiting rw mode to fix it
1288 */
1289 struct size_entry {
1290 struct rb_node rb;
1291 ino_t inum;
1292 loff_t i_size;
1293 loff_t d_size;
1294 int exists;
1295 struct inode *inode;
1296 };
1297
1298 /**
1299 * add_ino - add an entry to the size tree.
1300 * @c: UBIFS file-system description object
1301 * @inum: inode number
1302 * @i_size: size on inode
1303 * @d_size: maximum size based on data nodes
1304 * @exists: indicates whether the inode exists
1305 */
add_ino(struct ubifs_info * c,ino_t inum,loff_t i_size,loff_t d_size,int exists)1306 static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, loff_t d_size, int exists)
1307 {
1308 struct rb_node **p = &c->size_tree.rb_node, *parent = NULL;
1309 struct size_entry *e;
1310
1311 while (*p) {
1312 parent = *p;
1313 e = rb_entry(parent, struct size_entry, rb);
1314 if (inum < e->inum) {
1315 p = &(*p)->rb_left;
1316 } else {
1317 p = &(*p)->rb_right;
1318 }
1319 }
1320
1321 e = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
1322 if (!e) {
1323 return -ENOMEM;
1324 }
1325
1326 e->inum = inum;
1327 e->i_size = i_size;
1328 e->d_size = d_size;
1329 e->exists = exists;
1330
1331 rb_link_node(&e->rb, parent, p);
1332 rb_insert_color(&e->rb, &c->size_tree);
1333
1334 return 0;
1335 }
1336
1337 /**
1338 * find_ino - find an entry on the size tree.
1339 * @c: UBIFS file-system description object
1340 * @inum: inode number
1341 */
find_ino(struct ubifs_info * c,ino_t inum)1342 static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
1343 {
1344 struct rb_node *p = c->size_tree.rb_node;
1345 struct size_entry *e;
1346
1347 while (p) {
1348 e = rb_entry(p, struct size_entry, rb);
1349 if (inum < e->inum) {
1350 p = p->rb_left;
1351 } else if (inum > e->inum) {
1352 p = p->rb_right;
1353 } else {
1354 return e;
1355 }
1356 }
1357 return NULL;
1358 }
1359
1360 /**
1361 * remove_ino - remove an entry from the size tree.
1362 * @c: UBIFS file-system description object
1363 * @inum: inode number
1364 */
remove_ino(struct ubifs_info * c,ino_t inum)1365 static void remove_ino(struct ubifs_info *c, ino_t inum)
1366 {
1367 struct size_entry *e = find_ino(c, inum);
1368
1369 if (!e) {
1370 return;
1371 }
1372 rb_erase(&e->rb, &c->size_tree);
1373 kfree(e);
1374 }
1375
1376 /**
1377 * ubifs_destroy_size_tree - free resources related to the size tree.
1378 * @c: UBIFS file-system description object
1379 */
ubifs_destroy_size_tree(struct ubifs_info * c)1380 void ubifs_destroy_size_tree(struct ubifs_info *c)
1381 {
1382 struct size_entry *e, *n;
1383
1384 rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb)
1385 {
1386 iput(e->inode);
1387 kfree(e);
1388 }
1389
1390 c->size_tree = RB_ROOT;
1391 }
1392
1393 /**
1394 * ubifs_recover_size_accum - accumulate inode sizes for recovery.
1395 * @c: UBIFS file-system description object
1396 * @key: node key
1397 * @deletion: node is for a deletion
1398 * @new_size: inode size
1399 *
1400 * This function has two purposes:
1401 * 1) to ensure there are no data nodes that fall outside the inode size
1402 * 2) to ensure there are no data nodes for inodes that do not exist
1403 * To accomplish those purposes, a rb-tree is constructed containing an entry
1404 * for each inode number in the journal that has not been deleted, and recording
1405 * the size from the inode node, the maximum size of any data node (also altered
1406 * by truncations) and a flag indicating a inode number for which no inode node
1407 * was present in the journal.
1408 *
1409 * Note that there is still the possibility that there are data nodes that have
1410 * been committed that are beyond the inode size, however the only way to find
1411 * them would be to scan the entire index. Alternatively, some provision could
1412 * be made to record the size of inodes at the start of commit, which would seem
1413 * very cumbersome for a scenario that is quite unlikely and the only negative
1414 * consequence of which is wasted space.
1415 *
1416 * This functions returns %0 on success and a negative error code on failure.
1417 */
ubifs_recover_size_accum(struct ubifs_info * c,union ubifs_key * key,int deletion,loff_t new_size)1418 int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, int deletion, loff_t new_size)
1419 {
1420 ino_t inum = key_inum(c, key);
1421 struct size_entry *e;
1422 int err;
1423
1424 switch (key_type(c, key)) {
1425 case UBIFS_INO_KEY:
1426 if (deletion) {
1427 remove_ino(c, inum);
1428 } else {
1429 e = find_ino(c, inum);
1430 if (e) {
1431 e->i_size = new_size;
1432 e->exists = 1;
1433 } else {
1434 err = add_ino(c, inum, new_size, 0, 1);
1435 if (err) {
1436 return err;
1437 }
1438 }
1439 }
1440 break;
1441 case UBIFS_DATA_KEY:
1442 e = find_ino(c, inum);
1443 if (e) {
1444 if (new_size > e->d_size) {
1445 e->d_size = new_size;
1446 }
1447 } else {
1448 err = add_ino(c, inum, 0, new_size, 0);
1449 if (err) {
1450 return err;
1451 }
1452 }
1453 break;
1454 case UBIFS_TRUN_KEY:
1455 e = find_ino(c, inum);
1456 if (e) {
1457 e->d_size = new_size;
1458 }
1459 break;
1460 default:
1461 break;
1462 }
1463 return 0;
1464 }
1465
1466 /**
1467 * fix_size_in_place - fix inode size in place on flash.
1468 * @c: UBIFS file-system description object
1469 * @e: inode size information for recovery
1470 */
fix_size_in_place(struct ubifs_info * c,struct size_entry * e)1471 static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
1472 {
1473 struct ubifs_ino_node *ino = c->sbuf;
1474 unsigned char *p;
1475 union ubifs_key key;
1476 int err, lnum, offs, len;
1477 loff_t i_size;
1478 uint32_t crc;
1479
1480 /* Locate the inode node LEB number and offset */
1481 ino_key_init(c, &key, e->inum);
1482 err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
1483 if (err) {
1484 goto out;
1485 }
1486 /*
1487 * If the size recorded on the inode node is greater than the size that
1488 * was calculated from nodes in the journal then don't change the inode.
1489 */
1490 i_size = le64_to_cpu(ino->size);
1491 if (i_size >= e->d_size) {
1492 return 0;
1493 }
1494 /* Read the LEB */
1495 err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1);
1496 if (err) {
1497 goto out;
1498 }
1499 /* Change the size field and recalculate the CRC */
1500 ino = c->sbuf + offs;
1501 ino->size = cpu_to_le64(e->d_size);
1502 len = le32_to_cpu(ino->ch.len);
1503 crc = crc32(UBIFS_CRC32_INIT, (void *)ino + RECOVERY_EIGHT, len - RECOVERY_EIGHT);
1504 ino->ch.crc = cpu_to_le32(crc);
1505 /* Work out where data in the LEB ends and free space begins */
1506 p = c->sbuf;
1507 len = c->leb_size - 1;
1508 while (p[len] == 0xff) {
1509 len -= 1;
1510 }
1511 len = ALIGN(len + 1, c->min_io_size);
1512 /* Atomically write the fixed LEB back again */
1513 err = ubifs_leb_change(c, lnum, c->sbuf, len);
1514 if (err) {
1515 goto out;
1516 }
1517 dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
1518 return 0;
1519
1520 out:
1521 ubifs_warn(c, "inode %lu failed to fix size %lld -> %lld error %d", (unsigned long)e->inum, e->i_size, e->d_size,
1522 err);
1523 return err;
1524 }
1525
1526 /**
1527 * inode_fix_size - fix inode size
1528 * @c: UBIFS file-system description object
1529 * @e: inode size information for recovery
1530 */
inode_fix_size(struct ubifs_info * c,struct size_entry * e)1531 static int inode_fix_size(struct ubifs_info *c, struct size_entry *e)
1532 {
1533 struct inode *inode;
1534 struct ubifs_inode *ui;
1535 int err;
1536
1537 if (c->ro_mount) {
1538 ubifs_assert(c, !e->inode);
1539 }
1540
1541 if (e->inode) {
1542 /* Remounting rw, pick up inode we stored earlier */
1543 inode = e->inode;
1544 } else {
1545 inode = ubifs_iget(c->vfs_sb, e->inum);
1546 if (IS_ERR(inode)) {
1547 return PTR_ERR(inode);
1548 }
1549
1550 if (inode->i_size >= e->d_size) {
1551 /*
1552 * The original inode in the index already has a size
1553 * big enough, nothing to do
1554 */
1555 iput(inode);
1556 return 0;
1557 }
1558
1559 dbg_rcvry("ino %lu size %lld -> %lld", (unsigned long)e->inum, inode->i_size, e->d_size);
1560
1561 ui = ubifs_inode(inode);
1562
1563 inode->i_size = e->d_size;
1564 ui->ui_size = e->d_size;
1565 ui->synced_i_size = e->d_size;
1566
1567 e->inode = inode;
1568 }
1569
1570 /*
1571 * In readonly mode just keep the inode pinned in memory until we go
1572 * readwrite. In readwrite mode write the inode to the journal with the
1573 * fixed size.
1574 */
1575 if (c->ro_mount) {
1576 return 0;
1577 }
1578
1579 err = ubifs_jnl_write_inode(c, inode);
1580
1581 iput(inode);
1582
1583 if (err) {
1584 return err;
1585 }
1586
1587 rb_erase(&e->rb, &c->size_tree);
1588 kfree(e);
1589
1590 return 0;
1591 }
1592
1593 /**
1594 * ubifs_recover_size - recover inode size.
1595 * @c: UBIFS file-system description object
1596 * @in_place: If true, do a in-place size fixup
1597 *
1598 * This function attempts to fix inode size discrepancies identified by the
1599 * 'ubifs_recover_size_accum()' function.
1600 *
1601 * This functions returns %0 on success and a negative error code on failure.
1602 */
ubifs_recover_size(struct ubifs_info * c,bool in_place)1603 int ubifs_recover_size(struct ubifs_info *c, bool in_place)
1604 {
1605 struct rb_node *this = rb_first(&c->size_tree);
1606
1607 while (this) {
1608 struct size_entry *e;
1609 int err;
1610
1611 e = rb_entry(this, struct size_entry, rb);
1612
1613 this = rb_next(this);
1614
1615 if (!e->exists) {
1616 union ubifs_key key;
1617
1618 ino_key_init(c, &key, e->inum);
1619 err = ubifs_tnc_lookup(c, &key, c->sbuf);
1620 if (err && err != -ENOENT) {
1621 return err;
1622 }
1623 if (err == -ENOENT) {
1624 /* Remove data nodes that have no inode */
1625 dbg_rcvry("removing ino %lu", (unsigned long)e->inum);
1626 err = ubifs_tnc_remove_ino(c, e->inum);
1627 if (err) {
1628 return err;
1629 }
1630 } else {
1631 struct ubifs_ino_node *ino = c->sbuf;
1632
1633 e->exists = 1;
1634 e->i_size = le64_to_cpu(ino->size);
1635 }
1636 }
1637
1638 if (e->exists && e->i_size < e->d_size) {
1639 ubifs_assert(c, !(c->ro_mount && in_place));
1640
1641 /*
1642 * We found data that is outside the found inode size,
1643 * fixup the inode size
1644 */
1645
1646 if (in_place) {
1647 err = fix_size_in_place(c, e);
1648 if (err) {
1649 return err;
1650 }
1651 iput(e->inode);
1652 } else {
1653 err = inode_fix_size(c, e);
1654 if (err) {
1655 return err;
1656 }
1657 continue;
1658 }
1659 }
1660
1661 rb_erase(&e->rb, &c->size_tree);
1662 kfree(e);
1663 }
1664
1665 return 0;
1666 }
1667