• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block_int.h"
27 #include "block/qcow2.h"
28 
29 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
30 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
31                             int64_t offset, int64_t length,
32                             int addend);
33 
34 
35 static int cache_refcount_updates = 0;
36 
write_refcount_block(BlockDriverState * bs)37 static int write_refcount_block(BlockDriverState *bs)
38 {
39     BDRVQcowState *s = bs->opaque;
40     size_t size = s->cluster_size;
41 
42     if (s->refcount_block_cache_offset == 0) {
43         return 0;
44     }
45 
46     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE);
47     if (bdrv_pwrite_sync(bs->file, s->refcount_block_cache_offset,
48             s->refcount_block_cache, size) < 0)
49     {
50         return -EIO;
51     }
52 
53     return 0;
54 }
55 
56 /*********************************************************/
57 /* refcount handling */
58 
qcow2_refcount_init(BlockDriverState * bs)59 int qcow2_refcount_init(BlockDriverState *bs)
60 {
61     BDRVQcowState *s = bs->opaque;
62     int ret, refcount_table_size2, i;
63 
64     s->refcount_block_cache = qemu_malloc(s->cluster_size);
65     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
66     s->refcount_table = qemu_malloc(refcount_table_size2);
67     if (s->refcount_table_size > 0) {
68         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
69         ret = bdrv_pread(bs->file, s->refcount_table_offset,
70                          s->refcount_table, refcount_table_size2);
71         if (ret != refcount_table_size2)
72             goto fail;
73         for(i = 0; i < s->refcount_table_size; i++)
74             be64_to_cpus(&s->refcount_table[i]);
75     }
76     return 0;
77  fail:
78     return -ENOMEM;
79 }
80 
qcow2_refcount_close(BlockDriverState * bs)81 void qcow2_refcount_close(BlockDriverState *bs)
82 {
83     BDRVQcowState *s = bs->opaque;
84     qemu_free(s->refcount_block_cache);
85     qemu_free(s->refcount_table);
86 }
87 
88 
load_refcount_block(BlockDriverState * bs,int64_t refcount_block_offset)89 static int load_refcount_block(BlockDriverState *bs,
90                                int64_t refcount_block_offset)
91 {
92     BDRVQcowState *s = bs->opaque;
93     int ret;
94 
95     if (cache_refcount_updates) {
96         ret = write_refcount_block(bs);
97         if (ret < 0) {
98             return ret;
99         }
100     }
101 
102     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
103     ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache,
104                      s->cluster_size);
105     if (ret < 0) {
106         return ret;
107     }
108 
109     s->refcount_block_cache_offset = refcount_block_offset;
110     return 0;
111 }
112 
113 /*
114  * Returns the refcount of the cluster given by its index. Any non-negative
115  * return value is the refcount of the cluster, negative values are -errno
116  * and indicate an error.
117  */
get_refcount(BlockDriverState * bs,int64_t cluster_index)118 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
119 {
120     BDRVQcowState *s = bs->opaque;
121     int refcount_table_index, block_index;
122     int64_t refcount_block_offset;
123     int ret;
124 
125     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
126     if (refcount_table_index >= s->refcount_table_size)
127         return 0;
128     refcount_block_offset = s->refcount_table[refcount_table_index];
129     if (!refcount_block_offset)
130         return 0;
131     if (refcount_block_offset != s->refcount_block_cache_offset) {
132         /* better than nothing: return allocated if read error */
133         ret = load_refcount_block(bs, refcount_block_offset);
134         if (ret < 0) {
135             return ret;
136         }
137     }
138     block_index = cluster_index &
139         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
140     return be16_to_cpu(s->refcount_block_cache[block_index]);
141 }
142 
143 /*
144  * Rounds the refcount table size up to avoid growing the table for each single
145  * refcount block that is allocated.
146  */
next_refcount_table_size(BDRVQcowState * s,unsigned int min_size)147 static unsigned int next_refcount_table_size(BDRVQcowState *s,
148     unsigned int min_size)
149 {
150     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
151     unsigned int refcount_table_clusters =
152         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
153 
154     while (min_clusters > refcount_table_clusters) {
155         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
156     }
157 
158     return refcount_table_clusters << (s->cluster_bits - 3);
159 }
160 
161 
162 /* Checks if two offsets are described by the same refcount block */
in_same_refcount_block(BDRVQcowState * s,uint64_t offset_a,uint64_t offset_b)163 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
164     uint64_t offset_b)
165 {
166     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
167     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
168 
169     return (block_a == block_b);
170 }
171 
172 /*
173  * Loads a refcount block. If it doesn't exist yet, it is allocated first
174  * (including growing the refcount table if needed).
175  *
176  * Returns the offset of the refcount block on success or -errno in error case
177  */
alloc_refcount_block(BlockDriverState * bs,int64_t cluster_index)178 static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
179 {
180     BDRVQcowState *s = bs->opaque;
181     unsigned int refcount_table_index;
182     int ret;
183 
184     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
185 
186     /* Find the refcount block for the given cluster */
187     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
188 
189     if (refcount_table_index < s->refcount_table_size) {
190 
191         uint64_t refcount_block_offset =
192             s->refcount_table[refcount_table_index];
193 
194         /* If it's already there, we're done */
195         if (refcount_block_offset) {
196             if (refcount_block_offset != s->refcount_block_cache_offset) {
197                 ret = load_refcount_block(bs, refcount_block_offset);
198                 if (ret < 0) {
199                     return ret;
200                 }
201             }
202             return refcount_block_offset;
203         }
204     }
205 
206     /*
207      * If we came here, we need to allocate something. Something is at least
208      * a cluster for the new refcount block. It may also include a new refcount
209      * table if the old refcount table is too small.
210      *
211      * Note that allocating clusters here needs some special care:
212      *
213      * - We can't use the normal qcow2_alloc_clusters(), it would try to
214      *   increase the refcount and very likely we would end up with an endless
215      *   recursion. Instead we must place the refcount blocks in a way that
216      *   they can describe them themselves.
217      *
218      * - We need to consider that at this point we are inside update_refcounts
219      *   and doing the initial refcount increase. This means that some clusters
220      *   have already been allocated by the caller, but their refcount isn't
221      *   accurate yet. free_cluster_index tells us where this allocation ends
222      *   as long as we don't overwrite it by freeing clusters.
223      *
224      * - alloc_clusters_noref and qcow2_free_clusters may load a different
225      *   refcount block into the cache
226      */
227 
228     if (cache_refcount_updates) {
229         ret = write_refcount_block(bs);
230         if (ret < 0) {
231             return ret;
232         }
233     }
234 
235     /* Allocate the refcount block itself and mark it as used */
236     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
237     if (new_block < 0) {
238         return new_block;
239     }
240 
241 #ifdef DEBUG_ALLOC2
242     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
243         " at %" PRIx64 "\n",
244         refcount_table_index, cluster_index << s->cluster_bits, new_block);
245 #endif
246 
247     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
248         /* Zero the new refcount block before updating it */
249         memset(s->refcount_block_cache, 0, s->cluster_size);
250         s->refcount_block_cache_offset = new_block;
251 
252         /* The block describes itself, need to update the cache */
253         int block_index = (new_block >> s->cluster_bits) &
254             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
255         s->refcount_block_cache[block_index] = cpu_to_be16(1);
256     } else {
257         /* Described somewhere else. This can recurse at most twice before we
258          * arrive at a block that describes itself. */
259         ret = update_refcount(bs, new_block, s->cluster_size, 1);
260         if (ret < 0) {
261             goto fail_block;
262         }
263 
264         /* Initialize the new refcount block only after updating its refcount,
265          * update_refcount uses the refcount cache itself */
266         memset(s->refcount_block_cache, 0, s->cluster_size);
267         s->refcount_block_cache_offset = new_block;
268     }
269 
270     /* Now the new refcount block needs to be written to disk */
271     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
272     ret = bdrv_pwrite_sync(bs->file, new_block, s->refcount_block_cache,
273         s->cluster_size);
274     if (ret < 0) {
275         goto fail_block;
276     }
277 
278     /* If the refcount table is big enough, just hook the block up there */
279     if (refcount_table_index < s->refcount_table_size) {
280         uint64_t data64 = cpu_to_be64(new_block);
281         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
282         ret = bdrv_pwrite_sync(bs->file,
283             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
284             &data64, sizeof(data64));
285         if (ret < 0) {
286             goto fail_block;
287         }
288 
289         s->refcount_table[refcount_table_index] = new_block;
290         return new_block;
291     }
292 
293     /*
294      * If we come here, we need to grow the refcount table. Again, a new
295      * refcount table needs some space and we can't simply allocate to avoid
296      * endless recursion.
297      *
298      * Therefore let's grab new refcount blocks at the end of the image, which
299      * will describe themselves and the new refcount table. This way we can
300      * reference them only in the new table and do the switch to the new
301      * refcount table at once without producing an inconsistent state in
302      * between.
303      */
304     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
305 
306     /* Calculate the number of refcount blocks needed so far */
307     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
308     uint64_t blocks_used = (s->free_cluster_index +
309         refcount_block_clusters - 1) / refcount_block_clusters;
310 
311     /* And now we need at least one block more for the new metadata */
312     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
313     uint64_t last_table_size;
314     uint64_t blocks_clusters;
315     do {
316         uint64_t table_clusters = size_to_clusters(s, table_size);
317         blocks_clusters = 1 +
318             ((table_clusters + refcount_block_clusters - 1)
319             / refcount_block_clusters);
320         uint64_t meta_clusters = table_clusters + blocks_clusters;
321 
322         last_table_size = table_size;
323         table_size = next_refcount_table_size(s, blocks_used +
324             ((meta_clusters + refcount_block_clusters - 1)
325             / refcount_block_clusters));
326 
327     } while (last_table_size != table_size);
328 
329 #ifdef DEBUG_ALLOC2
330     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
331         s->refcount_table_size, table_size);
332 #endif
333 
334     /* Create the new refcount table and blocks */
335     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
336         s->cluster_size;
337     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
338     uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size);
339     uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t));
340 
341     assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
342 
343     /* Fill the new refcount table */
344     memcpy(new_table, s->refcount_table,
345         s->refcount_table_size * sizeof(uint64_t));
346     new_table[refcount_table_index] = new_block;
347 
348     int i;
349     for (i = 0; i < blocks_clusters; i++) {
350         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
351     }
352 
353     /* Fill the refcount blocks */
354     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
355     int block = 0;
356     for (i = 0; i < table_clusters + blocks_clusters; i++) {
357         new_blocks[block++] = cpu_to_be16(1);
358     }
359 
360     /* Write refcount blocks to disk */
361     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
362     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
363         blocks_clusters * s->cluster_size);
364     qemu_free(new_blocks);
365     if (ret < 0) {
366         goto fail_table;
367     }
368 
369     /* Write refcount table to disk */
370     for(i = 0; i < table_size; i++) {
371         cpu_to_be64s(&new_table[i]);
372     }
373 
374     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
375     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
376         table_size * sizeof(uint64_t));
377     if (ret < 0) {
378         goto fail_table;
379     }
380 
381     for(i = 0; i < table_size; i++) {
382         cpu_to_be64s(&new_table[i]);
383     }
384 
385     /* Hook up the new refcount table in the qcow2 header */
386     uint8_t data[12];
387     cpu_to_be64w((uint64_t*)data, table_offset);
388     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
389     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
390     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
391         data, sizeof(data));
392     if (ret < 0) {
393         goto fail_table;
394     }
395 
396     /* And switch it in memory */
397     uint64_t old_table_offset = s->refcount_table_offset;
398     uint64_t old_table_size = s->refcount_table_size;
399 
400     qemu_free(s->refcount_table);
401     s->refcount_table = new_table;
402     s->refcount_table_size = table_size;
403     s->refcount_table_offset = table_offset;
404 
405     /* Free old table. Remember, we must not change free_cluster_index */
406     uint64_t old_free_cluster_index = s->free_cluster_index;
407     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
408     s->free_cluster_index = old_free_cluster_index;
409 
410     ret = load_refcount_block(bs, new_block);
411     if (ret < 0) {
412         goto fail_block;
413     }
414 
415     return new_block;
416 
417 fail_table:
418     qemu_free(new_table);
419 fail_block:
420     s->refcount_block_cache_offset = 0;
421     return ret;
422 }
423 
424 #define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
write_refcount_block_entries(BlockDriverState * bs,int64_t refcount_block_offset,int first_index,int last_index)425 static int write_refcount_block_entries(BlockDriverState *bs,
426     int64_t refcount_block_offset, int first_index, int last_index)
427 {
428     BDRVQcowState *s = bs->opaque;
429     size_t size;
430     int ret;
431 
432     if (cache_refcount_updates) {
433         return 0;
434     }
435 
436     if (first_index < 0) {
437         return 0;
438     }
439 
440     first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
441     last_index = (last_index + REFCOUNTS_PER_SECTOR)
442         & ~(REFCOUNTS_PER_SECTOR - 1);
443 
444     size = (last_index - first_index) << REFCOUNT_SHIFT;
445 
446     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
447     ret = bdrv_pwrite_sync(bs->file,
448         refcount_block_offset + (first_index << REFCOUNT_SHIFT),
449         &s->refcount_block_cache[first_index], size);
450     if (ret < 0) {
451         return ret;
452     }
453 
454     return 0;
455 }
456 
457 /* XXX: cache several refcount block clusters ? */
update_refcount(BlockDriverState * bs,int64_t offset,int64_t length,int addend)458 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
459     int64_t offset, int64_t length, int addend)
460 {
461     BDRVQcowState *s = bs->opaque;
462     int64_t start, last, cluster_offset;
463     int64_t refcount_block_offset = 0;
464     int64_t table_index = -1, old_table_index;
465     int first_index = -1, last_index = -1;
466     int ret;
467 
468 #ifdef DEBUG_ALLOC2
469     printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
470            offset, length, addend);
471 #endif
472     if (length < 0) {
473         return -EINVAL;
474     } else if (length == 0) {
475         return 0;
476     }
477 
478     start = offset & ~(s->cluster_size - 1);
479     last = (offset + length - 1) & ~(s->cluster_size - 1);
480     for(cluster_offset = start; cluster_offset <= last;
481         cluster_offset += s->cluster_size)
482     {
483         int block_index, refcount;
484         int64_t cluster_index = cluster_offset >> s->cluster_bits;
485         int64_t new_block;
486 
487         /* Only write refcount block to disk when we are done with it */
488         old_table_index = table_index;
489         table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
490         if ((old_table_index >= 0) && (table_index != old_table_index)) {
491 
492             ret = write_refcount_block_entries(bs, refcount_block_offset,
493                 first_index, last_index);
494             if (ret < 0) {
495                 return ret;
496             }
497 
498             first_index = -1;
499             last_index = -1;
500         }
501 
502         /* Load the refcount block and allocate it if needed */
503         new_block = alloc_refcount_block(bs, cluster_index);
504         if (new_block < 0) {
505             ret = new_block;
506             goto fail;
507         }
508         refcount_block_offset = new_block;
509 
510         /* we can update the count and save it */
511         block_index = cluster_index &
512             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
513         if (first_index == -1 || block_index < first_index) {
514             first_index = block_index;
515         }
516         if (block_index > last_index) {
517             last_index = block_index;
518         }
519 
520         refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
521         refcount += addend;
522         if (refcount < 0 || refcount > 0xffff) {
523             ret = -EINVAL;
524             goto fail;
525         }
526         if (refcount == 0 && cluster_index < s->free_cluster_index) {
527             s->free_cluster_index = cluster_index;
528         }
529         s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
530     }
531 
532     ret = 0;
533 fail:
534 
535     /* Write last changed block to disk */
536     if (refcount_block_offset != 0) {
537         int wret;
538         wret = write_refcount_block_entries(bs, refcount_block_offset,
539             first_index, last_index);
540         if (wret < 0) {
541             return ret < 0 ? ret : wret;
542         }
543     }
544 
545     /*
546      * Try do undo any updates if an error is returned (This may succeed in
547      * some cases like ENOSPC for allocating a new refcount block)
548      */
549     if (ret < 0) {
550         int dummy;
551         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
552     }
553 
554     return ret;
555 }
556 
557 /*
558  * Increases or decreases the refcount of a given cluster by one.
559  * addend must be 1 or -1.
560  *
561  * If the return value is non-negative, it is the new refcount of the cluster.
562  * If it is negative, it is -errno and indicates an error.
563  */
update_cluster_refcount(BlockDriverState * bs,int64_t cluster_index,int addend)564 static int update_cluster_refcount(BlockDriverState *bs,
565                                    int64_t cluster_index,
566                                    int addend)
567 {
568     BDRVQcowState *s = bs->opaque;
569     int ret;
570 
571     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
572     if (ret < 0) {
573         return ret;
574     }
575 
576     return get_refcount(bs, cluster_index);
577 }
578 
579 
580 
581 /*********************************************************/
582 /* cluster allocation functions */
583 
584 
585 
586 /* return < 0 if error */
alloc_clusters_noref(BlockDriverState * bs,int64_t size)587 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
588 {
589     BDRVQcowState *s = bs->opaque;
590     int i, nb_clusters, refcount;
591 
592     nb_clusters = size_to_clusters(s, size);
593 retry:
594     for(i = 0; i < nb_clusters; i++) {
595         int64_t next_cluster_index = s->free_cluster_index++;
596         refcount = get_refcount(bs, next_cluster_index);
597 
598         if (refcount < 0) {
599             return refcount;
600         } else if (refcount != 0) {
601             goto retry;
602         }
603     }
604 #ifdef DEBUG_ALLOC2
605     printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
606             size,
607             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
608 #endif
609     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
610 }
611 
qcow2_alloc_clusters(BlockDriverState * bs,int64_t size)612 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
613 {
614     int64_t offset;
615     int ret;
616 
617     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
618     offset = alloc_clusters_noref(bs, size);
619     if (offset < 0) {
620         return offset;
621     }
622 
623     ret = update_refcount(bs, offset, size, 1);
624     if (ret < 0) {
625         return ret;
626     }
627     return offset;
628 }
629 
630 /* only used to allocate compressed sectors. We try to allocate
631    contiguous sectors. size must be <= cluster_size */
qcow2_alloc_bytes(BlockDriverState * bs,int size)632 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
633 {
634     BDRVQcowState *s = bs->opaque;
635     int64_t offset, cluster_offset;
636     int free_in_cluster;
637 
638     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
639     assert(size > 0 && size <= s->cluster_size);
640     if (s->free_byte_offset == 0) {
641         s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
642         if (s->free_byte_offset < 0) {
643             return s->free_byte_offset;
644         }
645     }
646  redo:
647     free_in_cluster = s->cluster_size -
648         (s->free_byte_offset & (s->cluster_size - 1));
649     if (size <= free_in_cluster) {
650         /* enough space in current cluster */
651         offset = s->free_byte_offset;
652         s->free_byte_offset += size;
653         free_in_cluster -= size;
654         if (free_in_cluster == 0)
655             s->free_byte_offset = 0;
656         if ((offset & (s->cluster_size - 1)) != 0)
657             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
658     } else {
659         offset = qcow2_alloc_clusters(bs, s->cluster_size);
660         if (offset < 0) {
661             return offset;
662         }
663         cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
664         if ((cluster_offset + s->cluster_size) == offset) {
665             /* we are lucky: contiguous data */
666             offset = s->free_byte_offset;
667             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
668             s->free_byte_offset += size;
669         } else {
670             s->free_byte_offset = offset;
671             goto redo;
672         }
673     }
674     return offset;
675 }
676 
qcow2_free_clusters(BlockDriverState * bs,int64_t offset,int64_t size)677 void qcow2_free_clusters(BlockDriverState *bs,
678                           int64_t offset, int64_t size)
679 {
680     int ret;
681 
682     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
683     ret = update_refcount(bs, offset, size, -1);
684     if (ret < 0) {
685         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
686         /* TODO Remember the clusters to free them later and avoid leaking */
687     }
688 }
689 
690 /*
691  * free_any_clusters
692  *
693  * free clusters according to its type: compressed or not
694  *
695  */
696 
qcow2_free_any_clusters(BlockDriverState * bs,uint64_t cluster_offset,int nb_clusters)697 void qcow2_free_any_clusters(BlockDriverState *bs,
698     uint64_t cluster_offset, int nb_clusters)
699 {
700     BDRVQcowState *s = bs->opaque;
701 
702     /* free the cluster */
703 
704     if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
705         int nb_csectors;
706         nb_csectors = ((cluster_offset >> s->csize_shift) &
707                        s->csize_mask) + 1;
708         qcow2_free_clusters(bs,
709             (cluster_offset & s->cluster_offset_mask) & ~511,
710             nb_csectors * 512);
711         return;
712     }
713 
714     qcow2_free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
715 
716     return;
717 }
718 
719 
720 
721 /*********************************************************/
722 /* snapshots and image creation */
723 
724 
725 
qcow2_create_refcount_update(QCowCreateState * s,int64_t offset,int64_t size)726 void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
727     int64_t size)
728 {
729     int refcount;
730     int64_t start, last, cluster_offset;
731     uint16_t *p;
732 
733     start = offset & ~(s->cluster_size - 1);
734     last = (offset + size - 1)  & ~(s->cluster_size - 1);
735     for(cluster_offset = start; cluster_offset <= last;
736         cluster_offset += s->cluster_size) {
737         p = &s->refcount_block[cluster_offset >> s->cluster_bits];
738         refcount = be16_to_cpu(*p);
739         refcount++;
740         *p = cpu_to_be16(refcount);
741     }
742 }
743 
744 /* update the refcounts of snapshots and the copied flag */
qcow2_update_snapshot_refcount(BlockDriverState * bs,int64_t l1_table_offset,int l1_size,int addend)745 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
746     int64_t l1_table_offset, int l1_size, int addend)
747 {
748     BDRVQcowState *s = bs->opaque;
749     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
750     int64_t old_offset, old_l2_offset;
751     int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
752 
753     qcow2_l2_cache_reset(bs);
754     cache_refcount_updates = 1;
755 
756     l2_table = NULL;
757     l1_table = NULL;
758     l1_size2 = l1_size * sizeof(uint64_t);
759     if (l1_table_offset != s->l1_table_offset) {
760         if (l1_size2 != 0) {
761             l1_table = qemu_mallocz(align_offset(l1_size2, 512));
762         } else {
763             l1_table = NULL;
764         }
765         l1_allocated = 1;
766         if (bdrv_pread(bs->file, l1_table_offset,
767                        l1_table, l1_size2) != l1_size2)
768             goto fail;
769         for(i = 0;i < l1_size; i++)
770             be64_to_cpus(&l1_table[i]);
771     } else {
772         assert(l1_size == s->l1_size);
773         l1_table = s->l1_table;
774         l1_allocated = 0;
775     }
776 
777     l2_size = s->l2_size * sizeof(uint64_t);
778     l2_table = qemu_malloc(l2_size);
779     l1_modified = 0;
780     for(i = 0; i < l1_size; i++) {
781         l2_offset = l1_table[i];
782         if (l2_offset) {
783             old_l2_offset = l2_offset;
784             l2_offset &= ~QCOW_OFLAG_COPIED;
785             l2_modified = 0;
786             if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
787                 goto fail;
788             for(j = 0; j < s->l2_size; j++) {
789                 offset = be64_to_cpu(l2_table[j]);
790                 if (offset != 0) {
791                     old_offset = offset;
792                     offset &= ~QCOW_OFLAG_COPIED;
793                     if (offset & QCOW_OFLAG_COMPRESSED) {
794                         nb_csectors = ((offset >> s->csize_shift) &
795                                        s->csize_mask) + 1;
796                         if (addend != 0) {
797                             int ret;
798                             ret = update_refcount(bs,
799                                 (offset & s->cluster_offset_mask) & ~511,
800                                 nb_csectors * 512, addend);
801                             if (ret < 0) {
802                                 goto fail;
803                             }
804                         }
805                         /* compressed clusters are never modified */
806                         refcount = 2;
807                     } else {
808                         if (addend != 0) {
809                             refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
810                         } else {
811                             refcount = get_refcount(bs, offset >> s->cluster_bits);
812                         }
813 
814                         if (refcount < 0) {
815                             goto fail;
816                         }
817                     }
818 
819                     if (refcount == 1) {
820                         offset |= QCOW_OFLAG_COPIED;
821                     }
822                     if (offset != old_offset) {
823                         l2_table[j] = cpu_to_be64(offset);
824                         l2_modified = 1;
825                     }
826                 }
827             }
828             if (l2_modified) {
829                 if (bdrv_pwrite_sync(bs->file,
830                                 l2_offset, l2_table, l2_size) < 0)
831                     goto fail;
832             }
833 
834             if (addend != 0) {
835                 refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
836             } else {
837                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
838             }
839             if (refcount < 0) {
840                 goto fail;
841             } else if (refcount == 1) {
842                 l2_offset |= QCOW_OFLAG_COPIED;
843             }
844             if (l2_offset != old_l2_offset) {
845                 l1_table[i] = l2_offset;
846                 l1_modified = 1;
847             }
848         }
849     }
850     if (l1_modified) {
851         for(i = 0; i < l1_size; i++)
852             cpu_to_be64s(&l1_table[i]);
853         if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
854                         l1_size2) < 0)
855             goto fail;
856         for(i = 0; i < l1_size; i++)
857             be64_to_cpus(&l1_table[i]);
858     }
859     if (l1_allocated)
860         qemu_free(l1_table);
861     qemu_free(l2_table);
862     cache_refcount_updates = 0;
863     write_refcount_block(bs);
864     return 0;
865  fail:
866     if (l1_allocated)
867         qemu_free(l1_table);
868     qemu_free(l2_table);
869     cache_refcount_updates = 0;
870     write_refcount_block(bs);
871     return -EIO;
872 }
873 
874 
875 
876 
877 /*********************************************************/
878 /* refcount checking functions */
879 
880 
881 
882 /*
883  * Increases the refcount for a range of clusters in a given refcount table.
884  * This is used to construct a temporary refcount table out of L1 and L2 tables
885  * which can be compared the the refcount table saved in the image.
886  *
887  * Modifies the number of errors in res.
888  */
inc_refcounts(BlockDriverState * bs,BdrvCheckResult * res,uint16_t * refcount_table,int refcount_table_size,int64_t offset,int64_t size)889 static void inc_refcounts(BlockDriverState *bs,
890                           BdrvCheckResult *res,
891                           uint16_t *refcount_table,
892                           int refcount_table_size,
893                           int64_t offset, int64_t size)
894 {
895     BDRVQcowState *s = bs->opaque;
896     int64_t start, last, cluster_offset;
897     int k;
898 
899     if (size <= 0)
900         return;
901 
902     start = offset & ~(s->cluster_size - 1);
903     last = (offset + size - 1) & ~(s->cluster_size - 1);
904     for(cluster_offset = start; cluster_offset <= last;
905         cluster_offset += s->cluster_size) {
906         k = cluster_offset >> s->cluster_bits;
907         if (k < 0) {
908             fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
909                 cluster_offset);
910             res->corruptions++;
911         } else if (k >= refcount_table_size) {
912             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
913                 "the end of the image file, can't properly check refcounts.\n",
914                 cluster_offset);
915             res->check_errors++;
916         } else {
917             if (++refcount_table[k] == 0) {
918                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
919                     "\n", cluster_offset);
920                 res->corruptions++;
921             }
922         }
923     }
924 }
925 
926 /*
927  * Increases the refcount in the given refcount table for the all clusters
928  * referenced in the L2 table. While doing so, performs some checks on L2
929  * entries.
930  *
931  * Returns the number of errors found by the checks or -errno if an internal
932  * error occurred.
933  */
check_refcounts_l2(BlockDriverState * bs,BdrvCheckResult * res,uint16_t * refcount_table,int refcount_table_size,int64_t l2_offset,int check_copied)934 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
935     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
936     int check_copied)
937 {
938     BDRVQcowState *s = bs->opaque;
939     uint64_t *l2_table, offset;
940     int i, l2_size, nb_csectors, refcount;
941 
942     /* Read L2 table from disk */
943     l2_size = s->l2_size * sizeof(uint64_t);
944     l2_table = qemu_malloc(l2_size);
945 
946     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
947         goto fail;
948 
949     /* Do the actual checks */
950     for(i = 0; i < s->l2_size; i++) {
951         offset = be64_to_cpu(l2_table[i]);
952         if (offset != 0) {
953             if (offset & QCOW_OFLAG_COMPRESSED) {
954                 /* Compressed clusters don't have QCOW_OFLAG_COPIED */
955                 if (offset & QCOW_OFLAG_COPIED) {
956                     fprintf(stderr, "ERROR: cluster %" PRId64 ": "
957                         "copied flag must never be set for compressed "
958                         "clusters\n", offset >> s->cluster_bits);
959                     offset &= ~QCOW_OFLAG_COPIED;
960                     res->corruptions++;
961                 }
962 
963                 /* Mark cluster as used */
964                 nb_csectors = ((offset >> s->csize_shift) &
965                                s->csize_mask) + 1;
966                 offset &= s->cluster_offset_mask;
967                 inc_refcounts(bs, res, refcount_table, refcount_table_size,
968                     offset & ~511, nb_csectors * 512);
969             } else {
970                 /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
971                 if (check_copied) {
972                     uint64_t entry = offset;
973                     offset &= ~QCOW_OFLAG_COPIED;
974                     refcount = get_refcount(bs, offset >> s->cluster_bits);
975                     if (refcount < 0) {
976                         fprintf(stderr, "Can't get refcount for offset %"
977                             PRIx64 ": %s\n", entry, strerror(-refcount));
978                         goto fail;
979                     }
980                     if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
981                         fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
982                             PRIx64 " refcount=%d\n", entry, refcount);
983                         res->corruptions++;
984                     }
985                 }
986 
987                 /* Mark cluster as used */
988                 offset &= ~QCOW_OFLAG_COPIED;
989                 inc_refcounts(bs, res, refcount_table,refcount_table_size,
990                     offset, s->cluster_size);
991 
992                 /* Correct offsets are cluster aligned */
993                 if (offset & (s->cluster_size - 1)) {
994                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
995                         "properly aligned; L2 entry corrupted.\n", offset);
996                     res->corruptions++;
997                 }
998             }
999         }
1000     }
1001 
1002     qemu_free(l2_table);
1003     return 0;
1004 
1005 fail:
1006     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1007     qemu_free(l2_table);
1008     return -EIO;
1009 }
1010 
1011 /*
1012  * Increases the refcount for the L1 table, its L2 tables and all referenced
1013  * clusters in the given refcount table. While doing so, performs some checks
1014  * on L1 and L2 entries.
1015  *
1016  * Returns the number of errors found by the checks or -errno if an internal
1017  * error occurred.
1018  */
check_refcounts_l1(BlockDriverState * bs,BdrvCheckResult * res,uint16_t * refcount_table,int refcount_table_size,int64_t l1_table_offset,int l1_size,int check_copied)1019 static int check_refcounts_l1(BlockDriverState *bs,
1020                               BdrvCheckResult *res,
1021                               uint16_t *refcount_table,
1022                               int refcount_table_size,
1023                               int64_t l1_table_offset, int l1_size,
1024                               int check_copied)
1025 {
1026     BDRVQcowState *s = bs->opaque;
1027     uint64_t *l1_table, l2_offset, l1_size2;
1028     int i, refcount, ret;
1029 
1030     l1_size2 = l1_size * sizeof(uint64_t);
1031 
1032     /* Mark L1 table as used */
1033     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1034         l1_table_offset, l1_size2);
1035 
1036     /* Read L1 table entries from disk */
1037     if (l1_size2 == 0) {
1038         l1_table = NULL;
1039     } else {
1040         l1_table = qemu_malloc(l1_size2);
1041         if (bdrv_pread(bs->file, l1_table_offset,
1042                        l1_table, l1_size2) != l1_size2)
1043             goto fail;
1044         for(i = 0;i < l1_size; i++)
1045             be64_to_cpus(&l1_table[i]);
1046     }
1047 
1048     /* Do the actual checks */
1049     for(i = 0; i < l1_size; i++) {
1050         l2_offset = l1_table[i];
1051         if (l2_offset) {
1052             /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
1053             if (check_copied) {
1054                 refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
1055                     >> s->cluster_bits);
1056                 if (refcount < 0) {
1057                     fprintf(stderr, "Can't get refcount for l2_offset %"
1058                         PRIx64 ": %s\n", l2_offset, strerror(-refcount));
1059                     goto fail;
1060                 }
1061                 if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
1062                     fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
1063                         " refcount=%d\n", l2_offset, refcount);
1064                     res->corruptions++;
1065                 }
1066             }
1067 
1068             /* Mark L2 table as used */
1069             l2_offset &= ~QCOW_OFLAG_COPIED;
1070             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1071                 l2_offset, s->cluster_size);
1072 
1073             /* L2 tables are cluster aligned */
1074             if (l2_offset & (s->cluster_size - 1)) {
1075                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1076                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1077                 res->corruptions++;
1078             }
1079 
1080             /* Process and check L2 entries */
1081             ret = check_refcounts_l2(bs, res, refcount_table,
1082                 refcount_table_size, l2_offset, check_copied);
1083             if (ret < 0) {
1084                 goto fail;
1085             }
1086         }
1087     }
1088     qemu_free(l1_table);
1089     return 0;
1090 
1091 fail:
1092     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1093     res->check_errors++;
1094     qemu_free(l1_table);
1095     return -EIO;
1096 }
1097 
1098 /*
1099  * Checks an image for refcount consistency.
1100  *
1101  * Returns 0 if no errors are found, the number of errors in case the image is
1102  * detected as corrupted, and -errno when an internal error occured.
1103  */
qcow2_check_refcounts(BlockDriverState * bs,BdrvCheckResult * res)1104 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
1105 {
1106     BDRVQcowState *s = bs->opaque;
1107     int64_t size;
1108     int nb_clusters, refcount1, refcount2, i;
1109     QCowSnapshot *sn;
1110     uint16_t *refcount_table;
1111     int ret;
1112 
1113     size = bdrv_getlength(bs->file);
1114     nb_clusters = size_to_clusters(s, size);
1115     refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
1116 
1117     /* header */
1118     inc_refcounts(bs, res, refcount_table, nb_clusters,
1119         0, s->cluster_size);
1120 
1121     /* current L1 table */
1122     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1123                        s->l1_table_offset, s->l1_size, 1);
1124     if (ret < 0) {
1125         return ret;
1126     }
1127 
1128     /* snapshots */
1129     for(i = 0; i < s->nb_snapshots; i++) {
1130         sn = s->snapshots + i;
1131         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1132             sn->l1_table_offset, sn->l1_size, 0);
1133         if (ret < 0) {
1134             return ret;
1135         }
1136     }
1137     inc_refcounts(bs, res, refcount_table, nb_clusters,
1138         s->snapshots_offset, s->snapshots_size);
1139 
1140     /* refcount data */
1141     inc_refcounts(bs, res, refcount_table, nb_clusters,
1142         s->refcount_table_offset,
1143         s->refcount_table_size * sizeof(uint64_t));
1144 
1145     for(i = 0; i < s->refcount_table_size; i++) {
1146         uint64_t offset, cluster;
1147         offset = s->refcount_table[i];
1148         cluster = offset >> s->cluster_bits;
1149 
1150         /* Refcount blocks are cluster aligned */
1151         if (offset & (s->cluster_size - 1)) {
1152             fprintf(stderr, "ERROR refcount block %d is not "
1153                 "cluster aligned; refcount table entry corrupted\n", i);
1154             res->corruptions++;
1155             continue;
1156         }
1157 
1158         if (cluster >= nb_clusters) {
1159             fprintf(stderr, "ERROR refcount block %d is outside image\n", i);
1160             res->corruptions++;
1161             continue;
1162         }
1163 
1164         if (offset != 0) {
1165             inc_refcounts(bs, res, refcount_table, nb_clusters,
1166                 offset, s->cluster_size);
1167             if (refcount_table[cluster] != 1) {
1168                 fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
1169                     i, refcount_table[cluster]);
1170                 res->corruptions++;
1171             }
1172         }
1173     }
1174 
1175     /* compare ref counts */
1176     for(i = 0; i < nb_clusters; i++) {
1177         refcount1 = get_refcount(bs, i);
1178         if (refcount1 < 0) {
1179             fprintf(stderr, "Can't get refcount for cluster %d: %s\n",
1180                 i, strerror(-refcount1));
1181             res->check_errors++;
1182             continue;
1183         }
1184 
1185         refcount2 = refcount_table[i];
1186         if (refcount1 != refcount2) {
1187             fprintf(stderr, "%s cluster %d refcount=%d reference=%d\n",
1188                    refcount1 < refcount2 ? "ERROR" : "Leaked",
1189                    i, refcount1, refcount2);
1190             if (refcount1 < refcount2) {
1191                 res->corruptions++;
1192             } else {
1193                 res->leaks++;
1194             }
1195         }
1196     }
1197 
1198     qemu_free(refcount_table);
1199 
1200     return 0;
1201 }
1202 
1203