• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <fcntl.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20 
21 extern "C" {
22     #include <fec.h>
23 }
24 
25 #include "fec_private.h"
26 
27 using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>;
28 
29 /* prints a hexdump of `data' using warn(...) */
dump(const char * name,uint64_t value,const uint8_t * data,size_t size)30 static void dump(const char *name, uint64_t value, const uint8_t *data,
31         size_t size)
32 {
33     const int bytes_per_line = 16;
34     char hex[bytes_per_line * 3 + 1];
35     char prn[bytes_per_line + 1];
36 
37     warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size);
38 
39     if (!data) {
40         warn("    (null)");
41         return;
42     }
43 
44     for (size_t n = 0; n < size; n += bytes_per_line) {
45         memset(hex, 0, sizeof(hex));
46         memset(prn, 0, sizeof(prn));
47 
48         for (size_t m = 0; m < bytes_per_line; ++m) {
49             if (n + m < size) {
50                 ptrdiff_t offset = &hex[m * 3] - hex;
51                 snprintf(hex + offset, sizeof(hex) - offset, "%02x ",
52                          data[n + m]);
53 
54                 if (isprint(data[n + m])) {
55                     prn[m] = data[n + m];
56                 } else {
57                     prn[m] = '.';
58                 }
59             } else {
60                 strcpy(&hex[m * 3], "   ");
61             }
62         }
63 
64         warn("    %04zu   %s  %s", n, hex, prn);
65     }
66 }
67 
68 /* checks if `offset' is within a corrupted block */
is_erasure(fec_handle * f,uint64_t offset,const uint8_t * data)69 static inline bool is_erasure(fec_handle *f, uint64_t offset,
70         const uint8_t *data)
71 {
72     if (unlikely(offset >= f->data_size)) {
73         return false;
74     }
75 
76     /* ideally, we would like to know if a specific byte on this block has
77        been corrupted, but knowing whether any of them is can be useful as
78        well, because often the entire block is corrupted */
79 
80     uint64_t n = offset / FEC_BLOCKSIZE;
81 
82     return !f->hashtree().check_block_hash_with_index(n, data);
83 }
84 
85 /* check if `offset' is within a block expected to contain zeros */
is_zero(fec_handle * f,uint64_t offset)86 static inline bool is_zero(fec_handle *f, uint64_t offset)
87 {
88     auto hashtree = f->hashtree();
89 
90     if (hashtree.hash_data.empty() || unlikely(offset >= f->data_size)) {
91         return false;
92     }
93 
94     uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH;
95 
96     if (unlikely(hash_offset >
97                  hashtree.hash_data.size() - SHA256_DIGEST_LENGTH)) {
98         return false;
99     }
100 
101     return !memcmp(hashtree.zero_hash.data(), &hashtree.hash_data[hash_offset],
102                    SHA256_DIGEST_LENGTH);
103 }
104 
105 /* reads and decodes a single block starting from `offset', returns the number
106    of bytes corrected in `errors' */
__ecc_read(fec_handle * f,void * rs,uint8_t * dest,uint64_t offset,bool use_erasures,uint8_t * ecc_data,size_t * errors)107 static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset,
108         bool use_erasures, uint8_t *ecc_data, size_t *errors)
109 {
110     check(offset % FEC_BLOCKSIZE == 0);
111     ecc_info *e = &f->ecc;
112 
113     /* reverse interleaving: calculate the RS block that includes the requested
114        offset */
115     uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
116                         e->rounds * FEC_BLOCKSIZE;
117     int data_index = -1;
118     int erasures[e->rsn];
119     int neras = 0;
120 
121     /* verity is required to check for erasures */
122     check(!use_erasures || !f->hashtree().hash_data.empty());
123 
124     for (int i = 0; i < e->rsn; ++i) {
125         uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
126                                     e->rounds);
127 
128         if (interleaved == offset) {
129             data_index = i;
130         }
131 
132         /* to improve our chances of correcting IO errors, initialize the
133            buffer to zeros even if we are going to read to it later */
134         uint8_t bbuf[FEC_BLOCKSIZE] = {0};
135 
136         if (likely(interleaved < e->start) && !is_zero(f, interleaved)) {
137             /* copy raw data to reconstruct the RS block */
138             if (!raw_pread(f->fd, bbuf, FEC_BLOCKSIZE, interleaved)) {
139                 warn("failed to read: %s", strerror(errno));
140 
141                 /* treat errors as corruption */
142                 if (use_erasures && neras <= e->roots) {
143                     erasures[neras++] = i;
144                 }
145             } else if (use_erasures && neras <= e->roots &&
146                        is_erasure(f, interleaved, bbuf)) {
147                 erasures[neras++] = i;
148             }
149         }
150 
151         for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
152             ecc_data[j * FEC_RSM + i] = bbuf[j];
153         }
154     }
155 
156     check(data_index >= 0);
157 
158     size_t nerrs = 0;
159     uint8_t copy[FEC_RSM];
160 
161     for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
162         /* copy parity data */
163         if (!raw_pread(f->fd, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
164                        e->start + (i + rsb) * e->roots)) {
165             error("failed to read ecc data: %s", strerror(errno));
166             return -1;
167         }
168 
169         /* for debugging decoding failures, because decode_rs_char can mangle
170            ecc_data */
171         if (unlikely(use_erasures)) {
172             memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
173         }
174 
175         /* decode */
176         int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);
177 
178         if (unlikely(rc < 0)) {
179             if (use_erasures) {
180                 error("RS block %" PRIu64 ": decoding failed (%d erasures)",
181                     rsb, neras);
182                 dump("raw RS block", rsb, copy, FEC_RSM);
183             } else if (f->hashtree().hash_data.empty()) {
184                 warn("RS block %" PRIu64 ": decoding failed", rsb);
185             } else {
186                 debug("RS block %" PRIu64 ": decoding failed", rsb);
187             }
188 
189             errno = EIO;
190             return -1;
191         } else if (unlikely(rc > 0)) {
192             check(rc <= (use_erasures ? e->roots : e->roots / 2));
193             nerrs += rc;
194         }
195 
196         dest[i] = ecc_data[i * FEC_RSM + data_index];
197     }
198 
199     if (nerrs) {
200         warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
201         *errors += nerrs;
202     }
203 
204     return FEC_BLOCKSIZE;
205 }
206 
207 /* initializes RS decoder and allocates memory for interleaving */
ecc_init(fec_handle * f,rs_unique_ptr & rs,std::unique_ptr<uint8_t[]> & ecc_data)208 static int ecc_init(fec_handle *f, rs_unique_ptr& rs,
209         std::unique_ptr<uint8_t[]>& ecc_data)
210 {
211     check(f);
212 
213     rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots)));
214 
215     if (unlikely(!rs)) {
216         error("failed to initialize RS");
217         errno = ENOMEM;
218         return -1;
219     }
220 
221     ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]);
222 
223     if (unlikely(!ecc_data)) {
224         error("failed to allocate ecc buffer");
225         errno = ENOMEM;
226         return -1;
227     }
228 
229     return 0;
230 }
231 
232 /* reads `count' bytes from `offset' and corrects possible errors without
233    erasure detection, returning the number of corrected bytes in `errors' */
ecc_read(fec_handle * f,uint8_t * dest,size_t count,uint64_t offset,size_t * errors)234 static ssize_t ecc_read(fec_handle *f, uint8_t *dest, size_t count,
235         uint64_t offset, size_t *errors)
236 {
237     check(f);
238     check(dest);
239     check(offset < f->data_size);
240     check(offset + count <= f->data_size);
241     check(errors);
242 
243     debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
244 
245     rs_unique_ptr rs(NULL, free_rs_char);
246     std::unique_ptr<uint8_t[]> ecc_data;
247 
248     if (ecc_init(f, rs, ecc_data) == -1) {
249         return -1;
250     }
251 
252     uint64_t curr = offset / FEC_BLOCKSIZE;
253     size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
254     size_t left = count;
255 
256     uint8_t data[FEC_BLOCKSIZE];
257 
258     while (left > 0) {
259         /* there's no erasure detection without verity metadata */
260         if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false,
261                 ecc_data.get(), errors) == -1) {
262             return -1;
263         }
264 
265         size_t copy = FEC_BLOCKSIZE - coff;
266 
267         if (copy > left) {
268             copy = left;
269         }
270 
271         memcpy(dest, &data[coff], copy);
272 
273         dest += copy;
274         left -= copy;
275         coff = 0;
276         ++curr;
277     }
278 
279     return count;
280 }
281 
282 /* reads `count' bytes from `offset', corrects possible errors with
283    erasure detection, and verifies the integrity of read data using
284    verity hash tree; returns the number of corrections in `errors' */
verity_read(fec_handle * f,uint8_t * dest,size_t count,uint64_t offset,size_t * errors)285 static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count,
286         uint64_t offset, size_t *errors)
287 {
288     check(f);
289     check(dest);
290     check(offset < f->data_size);
291     check(offset + count <= f->data_size);
292     check(!f->hashtree().hash_data.empty());
293     check(errors);
294 
295     debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
296 
297     rs_unique_ptr rs(NULL, free_rs_char);
298     std::unique_ptr<uint8_t[]> ecc_data;
299 
300     if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) {
301         return -1;
302     }
303 
304     uint64_t curr = offset / FEC_BLOCKSIZE;
305     size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
306     size_t left = count;
307     uint8_t data[FEC_BLOCKSIZE];
308 
309     uint64_t max_hash_block =
310         (f->hashtree().hash_data.size() - SHA256_DIGEST_LENGTH) /
311         SHA256_DIGEST_LENGTH;
312 
313     while (left > 0) {
314         check(curr <= max_hash_block);
315         uint64_t curr_offset = curr * FEC_BLOCKSIZE;
316 
317         bool expect_zeros = is_zero(f, curr_offset);
318 
319         /* if we are in read-only mode and expect to read a zero block,
320            skip reading and just return zeros */
321         if ((f->mode & O_ACCMODE) == O_RDONLY && expect_zeros) {
322             memset(data, 0, FEC_BLOCKSIZE);
323             goto valid;
324         }
325 
326         /* copy raw data without error correction */
327         if (!raw_pread(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
328             if (errno == EIO) {
329                 warn("I/O error encounter when reading, attempting to recover using fec");
330             } else {
331                 error("failed to read: %s", strerror(errno));
332                 return -1;
333             }
334         }
335 
336         if (likely(f->hashtree().check_block_hash_with_index(curr, data))) {
337             goto valid;
338         }
339 
340         /* we know the block is supposed to contain zeros, so return zeros
341            instead of trying to correct it */
342         if (expect_zeros) {
343             memset(data, 0, FEC_BLOCKSIZE);
344             goto corrected;
345         }
346 
347         if (!f->ecc.start) {
348             /* fatal error without ecc */
349             error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
350                 offset, offset + count, curr);
351             return -1;
352         } else {
353             debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
354                 offset, offset + count, curr);
355         }
356 
357         /* try to correct without erasures first, because checking for
358            erasure locations is slower */
359         if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(),
360                        errors) == FEC_BLOCKSIZE &&
361             f->hashtree().check_block_hash_with_index(curr, data)) {
362             goto corrected;
363         }
364 
365         /* try to correct with erasures */
366         if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(),
367                        errors) == FEC_BLOCKSIZE &&
368             f->hashtree().check_block_hash_with_index(curr, data)) {
369             goto corrected;
370         }
371 
372         error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64
373             " (offset %" PRIu64 ") cannot be recovered",
374             offset, offset + count, curr, curr_offset);
375         dump("decoded block", curr, data, FEC_BLOCKSIZE);
376 
377         errno = EIO;
378         return -1;
379 
380 corrected:
381         /* update the corrected block to the file if we are in r/w mode */
382         if (f->mode & O_RDWR &&
383             !raw_pwrite(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
384             error("failed to write: %s", strerror(errno));
385             return -1;
386         }
387 
388 valid:
389         size_t copy = FEC_BLOCKSIZE - coff;
390 
391         if (copy > left) {
392             copy = left;
393         }
394 
395         memcpy(dest, &data[coff], copy);
396 
397         dest += copy;
398         left -= copy;
399         coff = 0;
400         ++curr;
401     }
402 
403     return count;
404 }
405 
406 /* sets the internal file position to `offset' relative to `whence' */
fec_seek(struct fec_handle * f,int64_t offset,int whence)407 int fec_seek(struct fec_handle *f, int64_t offset, int whence)
408 {
409     check(f);
410 
411     if (whence == SEEK_SET) {
412         if (offset < 0) {
413             errno = EOVERFLOW;
414             return -1;
415         }
416 
417         f->pos = offset;
418     } else if (whence == SEEK_CUR) {
419         if (offset < 0 && f->pos < (uint64_t)-offset) {
420             errno = EOVERFLOW;
421             return -1;
422         } else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) {
423             errno = EOVERFLOW;
424             return -1;
425         }
426 
427         f->pos += offset;
428     } else if (whence == SEEK_END) {
429         if (offset >= 0) {
430             errno = ENXIO;
431             return -1;
432         } else if ((uint64_t)-offset > f->size) {
433             errno = EOVERFLOW;
434             return -1;
435         }
436 
437         f->pos = f->size + offset;
438     } else {
439         errno = EINVAL;
440         return -1;
441     }
442 
443     return 0;
444 }
445 
446 /* reads up to `count' bytes starting from the internal file position using
447    error correction and integrity validation, if available */
fec_read(struct fec_handle * f,void * buf,size_t count)448 ssize_t fec_read(struct fec_handle *f, void *buf, size_t count)
449 {
450     ssize_t rc = fec_pread(f, buf, count, f->pos);
451 
452     if (rc > 0) {
453         check(f->pos < UINT64_MAX - rc);
454         f->pos += rc;
455     }
456 
457     return rc;
458 }
459 
460 /* for a file with size `max', returns the number of bytes we can read starting
461    from `offset', up to `count' bytes */
get_max_count(uint64_t offset,size_t count,uint64_t max)462 static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max)
463 {
464     if (offset >= max) {
465         return 0;
466     } else if (offset > max - count) {
467         return (size_t)(max - offset);
468     }
469 
470     return count;
471 }
472 
473 /* reads `count' bytes from `f->fd' starting from `offset', and copies the
474    data to `buf' */
raw_pread(int fd,void * buf,size_t count,uint64_t offset)475 bool raw_pread(int fd, void *buf, size_t count, uint64_t offset) {
476     check(buf);
477 
478     uint8_t *p = (uint8_t *)buf;
479     size_t remaining = count;
480 
481     while (remaining > 0) {
482         ssize_t n = TEMP_FAILURE_RETRY(pread64(fd, p, remaining, offset));
483 
484         if (n <= 0) {
485             return false;
486         }
487 
488         p += n;
489         remaining -= n;
490         offset += n;
491     }
492 
493     return true;
494 }
495 
496 /* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */
raw_pwrite(int fd,const void * buf,size_t count,uint64_t offset)497 bool raw_pwrite(int fd, const void *buf, size_t count, uint64_t offset) {
498     check(buf);
499 
500     const uint8_t *p = (const uint8_t *)buf;
501     size_t remaining = count;
502 
503     while (remaining > 0) {
504         ssize_t n = TEMP_FAILURE_RETRY(pwrite64(fd, p, remaining, offset));
505 
506         if (n <= 0) {
507             return false;
508         }
509 
510         p += n;
511         remaining -= n;
512         offset += n;
513     }
514 
515     return true;
516 }
517 
518 /* reads up to `count' bytes starting from `offset' using error correction and
519    integrity validation, if available */
fec_pread(struct fec_handle * f,void * buf,size_t count,uint64_t offset)520 ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count,
521         uint64_t offset)
522 {
523     check(f);
524     check(buf);
525 
526     if (unlikely(offset > UINT64_MAX - count)) {
527         errno = EOVERFLOW;
528         return -1;
529     }
530 
531     if (!f->hashtree().hash_data.empty()) {
532         return process(f, (uint8_t *)buf,
533                        get_max_count(offset, count, f->data_size), offset,
534                        verity_read);
535     } else if (f->ecc.start) {
536         check(f->ecc.start < f->size);
537 
538         count = get_max_count(offset, count, f->data_size);
539         ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read);
540 
541         if (rc >= 0) {
542             return rc;
543         }
544 
545         /* return raw data if pure ecc read fails; due to interleaving
546            the specific blocks the caller wants may still be fine */
547     } else {
548         count = get_max_count(offset, count, f->size);
549     }
550 
551     if (raw_pread(f->fd, buf, count, offset)) {
552         return count;
553     }
554 
555     return -1;
556 }
557