1 /*
2 * Copyright (C) 2011-2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7 #include "dm-thin-metadata.h"
8 #include "persistent-data/dm-btree.h"
9 #include "persistent-data/dm-space-map.h"
10 #include "persistent-data/dm-space-map-disk.h"
11 #include "persistent-data/dm-transaction-manager.h"
12
13 #include <linux/list.h>
14 #include <linux/device-mapper.h>
15 #include <linux/workqueue.h>
16
17 /*--------------------------------------------------------------------------
18 * As far as the metadata goes, there is:
19 *
20 * - A superblock in block zero, taking up fewer than 512 bytes for
21 * atomic writes.
22 *
23 * - A space map managing the metadata blocks.
24 *
25 * - A space map managing the data blocks.
26 *
27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28 *
29 * - A hierarchical btree, with 2 levels which effectively maps (thin
30 * dev id, virtual block) -> block_time. Block time is a 64-bit
31 * field holding the time in the low 24 bits, and block in the top 48
32 * bits.
33 *
34 * BTrees consist solely of btree_nodes, that fill a block. Some are
35 * internal nodes, as such their values are a __le64 pointing to other
36 * nodes. Leaf nodes can store data of any reasonable size (ie. much
37 * smaller than the block size). The nodes consist of the header,
38 * followed by an array of keys, followed by an array of values. We have
39 * to binary search on the keys so they're all held together to help the
40 * cpu cache.
41 *
42 * Space maps have 2 btrees:
43 *
44 * - One maps a uint64_t onto a struct index_entry. Which points to a
45 * bitmap block, and has some details about how many free entries there
46 * are etc.
47 *
48 * - The bitmap blocks have a header (for the checksum). Then the rest
49 * of the block is pairs of bits. With the meaning being:
50 *
51 * 0 - ref count is 0
52 * 1 - ref count is 1
53 * 2 - ref count is 2
54 * 3 - ref count is higher than 2
55 *
56 * - If the count is higher than 2 then the ref count is entered in a
57 * second btree that directly maps the block_address to a uint32_t ref
58 * count.
59 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has one single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata supports data devices that are hundreds of terabytes.
65 *
66 * The space maps allocate space linearly from front to back. Space that
67 * is freed in a transaction is never recycled within that transaction.
68 * To try and avoid fragmenting _free_ space the allocator always goes
69 * back and fills in gaps.
70 *
71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72 * from the block manager.
73 *--------------------------------------------------------------------------*/
74
/* Message prefix; presumably picked up by the DMERR/DMWARN logging macros. */
#define DM_MSG_PREFIX "thin metadata"

/* Value stored in, and checked against, the superblock's magic field. */
#define THIN_SUPERBLOCK_MAGIC 27022010
/* Block number at which the superblock is locked, read and written. */
#define THIN_SUPERBLOCK_LOCATION 0
/* Version number written into a freshly formatted superblock. */
#define THIN_VERSION 2
/* Cache size argument handed to dm_block_manager_create(). */
#define THIN_METADATA_CACHE_SIZE 64
/* Right shift that turns a count of 512-byte sectors into metadata blocks. */
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For btree insert:
 * 3 for btree insert +
 * 2 for btree lookup used within space map
 * For btree remove:
 * 2 for shadow spine +
 * 4 for rebalance 3 child node
 *
 * The value below is the larger of the two totals (5 vs. 6).
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128
95
/*
 * Little endian on-disk superblock and device details.
 *
 * The layout is an on-disk format: field order and sizes must not change.
 * __commit_transaction() asserts at build time that it fits in 512 bytes.
 */
struct thin_disk_superblock {
	__le32 csum; /* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr; /* This block number, dm_block_t. */

	__u8 uuid[16]; /* Zeroed when the metadata is formatted. */
	__le64 magic; /* Must equal THIN_SUPERBLOCK_MAGIC (see sb_check()). */
	__le32 version; /* THIN_VERSION at format time. */
	__le32 time; /* Pool time counter, reloaded at the start of each transaction. */

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	/* Opaque space map roots, copied in by copy_sm_roots(). */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size; /* In 512-byte sectors. */

	__le32 metadata_block_size; /* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	/* Feature flags; the incompat and compat_ro sets are checked on open. */
	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;
138
/*
 * On-disk value type of the device details btree (keyed by thin device id).
 * All fields are little endian; the in-core counterpart is struct
 * dm_thin_device.
 */
struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id; /* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;
145
/*
 * In-core state for one pool's metadata device.  Allocated by
 * dm_pool_metadata_open() and freed by dm_pool_metadata_close().
 */
struct dm_pool_metadata {
	struct hlist_node hash; /* Not referenced in the code visible here. */

	struct block_device *bdev; /* Metadata device given to dm_pool_metadata_open(). */
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm; /* Non-blocking clone of tm. */

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock; /* Taken by the public dm_pool_*() entry points. */
	uint32_t time; /* Loaded from the superblock; bumped when a snapshot is created. */
	dm_block_t root; /* Root of the data mapping btree. */
	dm_block_t details_root; /* Root of the device details btree. */
	struct list_head thin_devices; /* List of struct dm_thin_device (via ->list). */
	uint64_t trans_id;
	unsigned long flags; /* Mirrors the superblock flags field. */
	sector_t data_block_size; /* Compared against the superblock value, which is in sectors. */
	bool read_only:1; /* When set, close skips the final commit. */

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed. The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Reading the space map roots can fail, so we read it into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
207
/*
 * In-core record for one thin device, linked on dm_pool_metadata.thin_devices.
 * The last four fields mirror struct disk_device_details in cpu byte order.
 */
struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd; /* Owning pool metadata. */
	dm_thin_id id;

	int open_count; /* Incremented by __open_device(), decremented by __close_device(). */
	bool changed:1; /* Details need writing back at the next commit. */
	bool aborted_with_changes:1; /* Initialised to false; other users are outside this view. */
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};
221
222 /*----------------------------------------------------------------
223 * superblock validator
224 *--------------------------------------------------------------*/
225
226 #define SUPERBLOCK_CSUM_XOR 160774
227
sb_prepare_for_write(struct dm_block_validator * v,struct dm_block * b,size_t block_size)228 static void sb_prepare_for_write(struct dm_block_validator *v,
229 struct dm_block *b,
230 size_t block_size)
231 {
232 struct thin_disk_superblock *disk_super = dm_block_data(b);
233
234 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
235 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
236 block_size - sizeof(__le32),
237 SUPERBLOCK_CSUM_XOR));
238 }
239
sb_check(struct dm_block_validator * v,struct dm_block * b,size_t block_size)240 static int sb_check(struct dm_block_validator *v,
241 struct dm_block *b,
242 size_t block_size)
243 {
244 struct thin_disk_superblock *disk_super = dm_block_data(b);
245 __le32 csum_le;
246
247 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
248 DMERR("sb_check failed: blocknr %llu: "
249 "wanted %llu", le64_to_cpu(disk_super->blocknr),
250 (unsigned long long)dm_block_location(b));
251 return -ENOTBLK;
252 }
253
254 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
255 DMERR("sb_check failed: magic %llu: "
256 "wanted %llu", le64_to_cpu(disk_super->magic),
257 (unsigned long long)THIN_SUPERBLOCK_MAGIC);
258 return -EILSEQ;
259 }
260
261 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
262 block_size - sizeof(__le32),
263 SUPERBLOCK_CSUM_XOR));
264 if (csum_le != disk_super->csum) {
265 DMERR("sb_check failed: csum %u: wanted %u",
266 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
267 return -EILSEQ;
268 }
269
270 return 0;
271 }
272
273 static struct dm_block_validator sb_validator = {
274 .name = "superblock",
275 .prepare_for_write = sb_prepare_for_write,
276 .check = sb_check
277 };
278
279 /*----------------------------------------------------------------
280 * Methods for the btree value types
281 *--------------------------------------------------------------*/
282
/*
 * Combine a data block number and a time into one 64-bit value: the block
 * is shifted into the bits above the low 24, and the time is OR-ed in.
 */
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	uint64_t packed = b << 24;

	packed |= t;

	return packed;
}
287
/*
 * Inverse of pack_block_time(): the low 24 bits of @v go to *@t and the
 * remaining upper bits go to *@b.
 */
static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	const uint64_t time_mask = (1 << 24) - 1;

	*t = v & time_mask;
	*b = v >> 24;
}
293
data_block_inc(void * context,const void * value_le)294 static void data_block_inc(void *context, const void *value_le)
295 {
296 struct dm_space_map *sm = context;
297 __le64 v_le;
298 uint64_t b;
299 uint32_t t;
300
301 memcpy(&v_le, value_le, sizeof(v_le));
302 unpack_block_time(le64_to_cpu(v_le), &b, &t);
303 dm_sm_inc_block(sm, b);
304 }
305
data_block_dec(void * context,const void * value_le)306 static void data_block_dec(void *context, const void *value_le)
307 {
308 struct dm_space_map *sm = context;
309 __le64 v_le;
310 uint64_t b;
311 uint32_t t;
312
313 memcpy(&v_le, value_le, sizeof(v_le));
314 unpack_block_time(le64_to_cpu(v_le), &b, &t);
315 dm_sm_dec_block(sm, b);
316 }
317
data_block_equal(void * context,const void * value1_le,const void * value2_le)318 static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
319 {
320 __le64 v1_le, v2_le;
321 uint64_t b1, b2;
322 uint32_t t;
323
324 memcpy(&v1_le, value1_le, sizeof(v1_le));
325 memcpy(&v2_le, value2_le, sizeof(v2_le));
326 unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
327 unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
328
329 return b1 == b2;
330 }
331
subtree_inc(void * context,const void * value)332 static void subtree_inc(void *context, const void *value)
333 {
334 struct dm_btree_info *info = context;
335 __le64 root_le;
336 uint64_t root;
337
338 memcpy(&root_le, value, sizeof(root_le));
339 root = le64_to_cpu(root_le);
340 dm_tm_inc(info->tm, root);
341 }
342
subtree_dec(void * context,const void * value)343 static void subtree_dec(void *context, const void *value)
344 {
345 struct dm_btree_info *info = context;
346 __le64 root_le;
347 uint64_t root;
348
349 memcpy(&root_le, value, sizeof(root_le));
350 root = le64_to_cpu(root_le);
351 if (dm_btree_del(info, root))
352 DMERR("btree delete failed\n");
353 }
354
subtree_equal(void * context,const void * value1_le,const void * value2_le)355 static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
356 {
357 __le64 v1_le, v2_le;
358 memcpy(&v1_le, value1_le, sizeof(v1_le));
359 memcpy(&v2_le, value2_le, sizeof(v2_le));
360
361 return v1_le == v2_le;
362 }
363
364 /*----------------------------------------------------------------*/
365
superblock_lock_zero(struct dm_pool_metadata * pmd,struct dm_block ** sblock)366 static int superblock_lock_zero(struct dm_pool_metadata *pmd,
367 struct dm_block **sblock)
368 {
369 return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
370 &sb_validator, sblock);
371 }
372
superblock_lock(struct dm_pool_metadata * pmd,struct dm_block ** sblock)373 static int superblock_lock(struct dm_pool_metadata *pmd,
374 struct dm_block **sblock)
375 {
376 return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
377 &sb_validator, sblock);
378 }
379
__superblock_all_zeroes(struct dm_block_manager * bm,int * result)380 static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
381 {
382 int r;
383 unsigned i;
384 struct dm_block *b;
385 __le64 *data_le, zero = cpu_to_le64(0);
386 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
387
388 /*
389 * We can't use a validator here - it may be all zeroes.
390 */
391 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
392 if (r)
393 return r;
394
395 data_le = dm_block_data(b);
396 *result = 1;
397 for (i = 0; i < block_size; i++) {
398 if (data_le[i] != zero) {
399 *result = 0;
400 break;
401 }
402 }
403
404 return dm_bm_unlock(b);
405 }
406
__setup_btree_details(struct dm_pool_metadata * pmd)407 static void __setup_btree_details(struct dm_pool_metadata *pmd)
408 {
409 pmd->info.tm = pmd->tm;
410 pmd->info.levels = 2;
411 pmd->info.value_type.context = pmd->data_sm;
412 pmd->info.value_type.size = sizeof(__le64);
413 pmd->info.value_type.inc = data_block_inc;
414 pmd->info.value_type.dec = data_block_dec;
415 pmd->info.value_type.equal = data_block_equal;
416
417 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
418 pmd->nb_info.tm = pmd->nb_tm;
419
420 pmd->tl_info.tm = pmd->tm;
421 pmd->tl_info.levels = 1;
422 pmd->tl_info.value_type.context = &pmd->bl_info;
423 pmd->tl_info.value_type.size = sizeof(__le64);
424 pmd->tl_info.value_type.inc = subtree_inc;
425 pmd->tl_info.value_type.dec = subtree_dec;
426 pmd->tl_info.value_type.equal = subtree_equal;
427
428 pmd->bl_info.tm = pmd->tm;
429 pmd->bl_info.levels = 1;
430 pmd->bl_info.value_type.context = pmd->data_sm;
431 pmd->bl_info.value_type.size = sizeof(__le64);
432 pmd->bl_info.value_type.inc = data_block_inc;
433 pmd->bl_info.value_type.dec = data_block_dec;
434 pmd->bl_info.value_type.equal = data_block_equal;
435
436 pmd->details_info.tm = pmd->tm;
437 pmd->details_info.levels = 1;
438 pmd->details_info.value_type.context = NULL;
439 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
440 pmd->details_info.value_type.inc = NULL;
441 pmd->details_info.value_type.dec = NULL;
442 pmd->details_info.value_type.equal = NULL;
443 }
444
save_sm_roots(struct dm_pool_metadata * pmd)445 static int save_sm_roots(struct dm_pool_metadata *pmd)
446 {
447 int r;
448 size_t len;
449
450 r = dm_sm_root_size(pmd->metadata_sm, &len);
451 if (r < 0)
452 return r;
453
454 r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
455 if (r < 0)
456 return r;
457
458 r = dm_sm_root_size(pmd->data_sm, &len);
459 if (r < 0)
460 return r;
461
462 return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
463 }
464
copy_sm_roots(struct dm_pool_metadata * pmd,struct thin_disk_superblock * disk)465 static void copy_sm_roots(struct dm_pool_metadata *pmd,
466 struct thin_disk_superblock *disk)
467 {
468 memcpy(&disk->metadata_space_map_root,
469 &pmd->metadata_space_map_root,
470 sizeof(pmd->metadata_space_map_root));
471
472 memcpy(&disk->data_space_map_root,
473 &pmd->data_space_map_root,
474 sizeof(pmd->data_space_map_root));
475 }
476
__write_initial_superblock(struct dm_pool_metadata * pmd)477 static int __write_initial_superblock(struct dm_pool_metadata *pmd)
478 {
479 int r;
480 struct dm_block *sblock;
481 struct thin_disk_superblock *disk_super;
482 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
483
484 if (bdev_size > THIN_METADATA_MAX_SECTORS)
485 bdev_size = THIN_METADATA_MAX_SECTORS;
486
487 r = dm_sm_commit(pmd->data_sm);
488 if (r < 0)
489 return r;
490
491 r = dm_tm_pre_commit(pmd->tm);
492 if (r < 0)
493 return r;
494
495 r = save_sm_roots(pmd);
496 if (r < 0)
497 return r;
498
499 r = superblock_lock_zero(pmd, &sblock);
500 if (r)
501 return r;
502
503 disk_super = dm_block_data(sblock);
504 disk_super->flags = 0;
505 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
506 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
507 disk_super->version = cpu_to_le32(THIN_VERSION);
508 disk_super->time = 0;
509 disk_super->trans_id = 0;
510 disk_super->held_root = 0;
511
512 copy_sm_roots(pmd, disk_super);
513
514 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
515 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
516 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
517 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
518 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
519
520 return dm_tm_commit(pmd->tm, sblock);
521 }
522
__format_metadata(struct dm_pool_metadata * pmd)523 static int __format_metadata(struct dm_pool_metadata *pmd)
524 {
525 int r;
526
527 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
528 &pmd->tm, &pmd->metadata_sm);
529 if (r < 0) {
530 DMERR("tm_create_with_sm failed");
531 return r;
532 }
533
534 pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
535 if (IS_ERR(pmd->data_sm)) {
536 DMERR("sm_disk_create failed");
537 r = PTR_ERR(pmd->data_sm);
538 goto bad_cleanup_tm;
539 }
540
541 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
542 if (!pmd->nb_tm) {
543 DMERR("could not create non-blocking clone tm");
544 r = -ENOMEM;
545 goto bad_cleanup_data_sm;
546 }
547
548 __setup_btree_details(pmd);
549
550 r = dm_btree_empty(&pmd->info, &pmd->root);
551 if (r < 0)
552 goto bad_cleanup_nb_tm;
553
554 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
555 if (r < 0) {
556 DMERR("couldn't create devices root");
557 goto bad_cleanup_nb_tm;
558 }
559
560 r = __write_initial_superblock(pmd);
561 if (r)
562 goto bad_cleanup_nb_tm;
563
564 return 0;
565
566 bad_cleanup_nb_tm:
567 dm_tm_destroy(pmd->nb_tm);
568 bad_cleanup_data_sm:
569 dm_sm_destroy(pmd->data_sm);
570 bad_cleanup_tm:
571 dm_tm_destroy(pmd->tm);
572 dm_sm_destroy(pmd->metadata_sm);
573
574 return r;
575 }
576
__check_incompat_features(struct thin_disk_superblock * disk_super,struct dm_pool_metadata * pmd)577 static int __check_incompat_features(struct thin_disk_superblock *disk_super,
578 struct dm_pool_metadata *pmd)
579 {
580 uint32_t features;
581
582 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
583 if (features) {
584 DMERR("could not access metadata due to unsupported optional features (%lx).",
585 (unsigned long)features);
586 return -EINVAL;
587 }
588
589 /*
590 * Check for read-only metadata to skip the following RDWR checks.
591 */
592 if (get_disk_ro(pmd->bdev->bd_disk))
593 return 0;
594
595 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
596 if (features) {
597 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
598 (unsigned long)features);
599 return -EINVAL;
600 }
601
602 return 0;
603 }
604
__open_metadata(struct dm_pool_metadata * pmd)605 static int __open_metadata(struct dm_pool_metadata *pmd)
606 {
607 int r;
608 struct dm_block *sblock;
609 struct thin_disk_superblock *disk_super;
610
611 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
612 &sb_validator, &sblock);
613 if (r < 0) {
614 DMERR("couldn't read superblock");
615 return r;
616 }
617
618 disk_super = dm_block_data(sblock);
619
620 /* Verify the data block size hasn't changed */
621 if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
622 DMERR("changing the data block size (from %u to %llu) is not supported",
623 le32_to_cpu(disk_super->data_block_size),
624 (unsigned long long)pmd->data_block_size);
625 r = -EINVAL;
626 goto bad_unlock_sblock;
627 }
628
629 r = __check_incompat_features(disk_super, pmd);
630 if (r < 0)
631 goto bad_unlock_sblock;
632
633 r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
634 disk_super->metadata_space_map_root,
635 sizeof(disk_super->metadata_space_map_root),
636 &pmd->tm, &pmd->metadata_sm);
637 if (r < 0) {
638 DMERR("tm_open_with_sm failed");
639 goto bad_unlock_sblock;
640 }
641
642 pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
643 sizeof(disk_super->data_space_map_root));
644 if (IS_ERR(pmd->data_sm)) {
645 DMERR("sm_disk_open failed");
646 r = PTR_ERR(pmd->data_sm);
647 goto bad_cleanup_tm;
648 }
649
650 pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
651 if (!pmd->nb_tm) {
652 DMERR("could not create non-blocking clone tm");
653 r = -ENOMEM;
654 goto bad_cleanup_data_sm;
655 }
656
657 __setup_btree_details(pmd);
658 return dm_bm_unlock(sblock);
659
660 bad_cleanup_data_sm:
661 dm_sm_destroy(pmd->data_sm);
662 bad_cleanup_tm:
663 dm_tm_destroy(pmd->tm);
664 dm_sm_destroy(pmd->metadata_sm);
665 bad_unlock_sblock:
666 dm_bm_unlock(sblock);
667
668 return r;
669 }
670
__open_or_format_metadata(struct dm_pool_metadata * pmd,bool format_device)671 static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
672 {
673 int r, unformatted;
674
675 r = __superblock_all_zeroes(pmd->bm, &unformatted);
676 if (r)
677 return r;
678
679 if (unformatted)
680 return format_device ? __format_metadata(pmd) : -EPERM;
681
682 return __open_metadata(pmd);
683 }
684
__create_persistent_data_objects(struct dm_pool_metadata * pmd,bool format_device)685 static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
686 {
687 int r;
688
689 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
690 THIN_METADATA_CACHE_SIZE,
691 THIN_MAX_CONCURRENT_LOCKS);
692 if (IS_ERR(pmd->bm)) {
693 DMERR("could not create block manager");
694 return PTR_ERR(pmd->bm);
695 }
696
697 r = __open_or_format_metadata(pmd, format_device);
698 if (r)
699 dm_block_manager_destroy(pmd->bm);
700
701 return r;
702 }
703
/*
 * Tear down everything set up by __create_persistent_data_objects(): both
 * space maps, both transaction managers and finally the block manager.
 */
static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	/* The block manager goes last, after the objects created on top of it. */
	dm_block_manager_destroy(pmd->bm);
}
712
__begin_transaction(struct dm_pool_metadata * pmd)713 static int __begin_transaction(struct dm_pool_metadata *pmd)
714 {
715 int r;
716 struct thin_disk_superblock *disk_super;
717 struct dm_block *sblock;
718
719 /*
720 * We re-read the superblock every time. Shouldn't need to do this
721 * really.
722 */
723 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
724 &sb_validator, &sblock);
725 if (r)
726 return r;
727
728 disk_super = dm_block_data(sblock);
729 pmd->time = le32_to_cpu(disk_super->time);
730 pmd->root = le64_to_cpu(disk_super->data_mapping_root);
731 pmd->details_root = le64_to_cpu(disk_super->device_details_root);
732 pmd->trans_id = le64_to_cpu(disk_super->trans_id);
733 pmd->flags = le32_to_cpu(disk_super->flags);
734 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
735
736 dm_bm_unlock(sblock);
737 return 0;
738 }
739
__write_changed_details(struct dm_pool_metadata * pmd)740 static int __write_changed_details(struct dm_pool_metadata *pmd)
741 {
742 int r;
743 struct dm_thin_device *td, *tmp;
744 struct disk_device_details details;
745 uint64_t key;
746
747 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
748 if (!td->changed)
749 continue;
750
751 key = td->id;
752
753 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
754 details.transaction_id = cpu_to_le64(td->transaction_id);
755 details.creation_time = cpu_to_le32(td->creation_time);
756 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
757 __dm_bless_for_disk(&details);
758
759 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
760 &key, &details, &pmd->details_root);
761 if (r)
762 return r;
763
764 if (td->open_count)
765 td->changed = 0;
766 else {
767 list_del(&td->list);
768 kfree(td);
769 }
770 }
771
772 return 0;
773 }
774
__commit_transaction(struct dm_pool_metadata * pmd)775 static int __commit_transaction(struct dm_pool_metadata *pmd)
776 {
777 int r;
778 size_t metadata_len, data_len;
779 struct thin_disk_superblock *disk_super;
780 struct dm_block *sblock;
781
782 /*
783 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
784 */
785 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
786
787 r = __write_changed_details(pmd);
788 if (r < 0)
789 return r;
790
791 r = dm_sm_commit(pmd->data_sm);
792 if (r < 0)
793 return r;
794
795 r = dm_tm_pre_commit(pmd->tm);
796 if (r < 0)
797 return r;
798
799 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
800 if (r < 0)
801 return r;
802
803 r = dm_sm_root_size(pmd->data_sm, &data_len);
804 if (r < 0)
805 return r;
806
807 r = save_sm_roots(pmd);
808 if (r < 0)
809 return r;
810
811 r = superblock_lock(pmd, &sblock);
812 if (r)
813 return r;
814
815 disk_super = dm_block_data(sblock);
816 disk_super->time = cpu_to_le32(pmd->time);
817 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
818 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
819 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
820 disk_super->flags = cpu_to_le32(pmd->flags);
821
822 copy_sm_roots(pmd, disk_super);
823
824 return dm_tm_commit(pmd->tm, sblock);
825 }
826
/*
 * Allocate and initialise the in-core metadata object for a pool.
 *
 * @bdev:            metadata block device
 * @data_block_size: stored in the pmd and compared against the superblock
 *                   value when existing metadata is opened
 * @format_device:   whether a blank device may be formatted
 *
 * Returns the new object, or an ERR_PTR() on failure.  If beginning the
 * first transaction fails, the object is released through
 * dm_pool_metadata_close().
 */
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);

	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->time = 0;
	pmd->read_only = false;
	pmd->fail_io = false;
	init_rwsem(&pmd->root_lock);
	INIT_LIST_HEAD(&pmd->thin_devices);

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r >= 0)
		return pmd;

	if (dm_pool_metadata_close(pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	return ERR_PTR(r);
}
863
dm_pool_metadata_close(struct dm_pool_metadata * pmd)864 int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
865 {
866 int r;
867 unsigned open_devices = 0;
868 struct dm_thin_device *td, *tmp;
869
870 down_read(&pmd->root_lock);
871 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
872 if (td->open_count)
873 open_devices++;
874 else {
875 list_del(&td->list);
876 kfree(td);
877 }
878 }
879 up_read(&pmd->root_lock);
880
881 if (open_devices) {
882 DMERR("attempt to close pmd when %u device(s) are still open",
883 open_devices);
884 return -EBUSY;
885 }
886
887 if (!pmd->read_only && !pmd->fail_io) {
888 r = __commit_transaction(pmd);
889 if (r < 0)
890 DMWARN("%s: __commit_transaction() failed, error = %d",
891 __func__, r);
892 }
893
894 if (!pmd->fail_io)
895 __destroy_persistent_data_objects(pmd);
896
897 kfree(pmd);
898 return 0;
899 }
900
901 /*
902 * __open_device: Returns @td corresponding to device with id @dev,
903 * creating it if @create is set and incrementing @td->open_count.
904 * On failure, @td is undefined.
905 */
__open_device(struct dm_pool_metadata * pmd,dm_thin_id dev,int create,struct dm_thin_device ** td)906 static int __open_device(struct dm_pool_metadata *pmd,
907 dm_thin_id dev, int create,
908 struct dm_thin_device **td)
909 {
910 int r, changed = 0;
911 struct dm_thin_device *td2;
912 uint64_t key = dev;
913 struct disk_device_details details_le;
914
915 /*
916 * If the device is already open, return it.
917 */
918 list_for_each_entry(td2, &pmd->thin_devices, list)
919 if (td2->id == dev) {
920 /*
921 * May not create an already-open device.
922 */
923 if (create)
924 return -EEXIST;
925
926 td2->open_count++;
927 *td = td2;
928 return 0;
929 }
930
931 /*
932 * Check the device exists.
933 */
934 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
935 &key, &details_le);
936 if (r) {
937 if (r != -ENODATA || !create)
938 return r;
939
940 /*
941 * Create new device.
942 */
943 changed = 1;
944 details_le.mapped_blocks = 0;
945 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
946 details_le.creation_time = cpu_to_le32(pmd->time);
947 details_le.snapshotted_time = cpu_to_le32(pmd->time);
948 }
949
950 *td = kmalloc(sizeof(**td), GFP_NOIO);
951 if (!*td)
952 return -ENOMEM;
953
954 (*td)->pmd = pmd;
955 (*td)->id = dev;
956 (*td)->open_count = 1;
957 (*td)->changed = changed;
958 (*td)->aborted_with_changes = false;
959 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
960 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
961 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
962 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
963
964 list_add(&(*td)->list, &pmd->thin_devices);
965
966 return 0;
967 }
968
__close_device(struct dm_thin_device * td)969 static void __close_device(struct dm_thin_device *td)
970 {
971 --td->open_count;
972 }
973
/*
 * Create a new, empty thin device with id @dev.
 *
 * Fails with -EEXIST if the details btree already has an entry for @dev.
 * Otherwise creates an empty mapping btree, inserts its root into the
 * top-level mapping tree and opens and closes the device once with
 * create set.  Each failure undoes the earlier steps.
 *
 * Returns 0 on success or a negative error.
 */
static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	dm_block_t dev_root;
	__le64 root_le;
	struct dm_thin_device *td;
	struct disk_device_details details_le;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	root_le = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&root_le);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &root_le, &pmd->root);
	if (r)
		goto err_del_subtree;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto err_remove_mapping;

	__close_device(td);

	return 0;

err_remove_mapping:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
err_del_subtree:
	dm_btree_del(&pmd->bl_info, dev_root);

	return r;
}
1017
/*
 * Locked wrapper around __create_thin(): takes root_lock for writing.
 * Returns -EINVAL without doing anything if the pmd is in fail_io state.
 */
int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;

	down_write(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : __create_thin(pmd, dev);
	up_write(&pmd->root_lock);

	return r;
}
1029
/*
 * Record a snapshot taken at @time on both devices involved: the origin
 * gets the new snapshotted_time and is marked changed; @snap gets the same
 * snapshotted_time and inherits the origin's mapped_blocks count.
 *
 * Returns 0, or the error from opening the origin device.
 */
static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *origin_td;

	r = __open_device(pmd, origin, 0, &origin_td);
	if (r)
		return r;

	origin_td->snapshotted_time = time;
	origin_td->changed = 1;

	snap->snapshotted_time = time;
	snap->mapped_blocks = origin_td->mapped_blocks;

	__close_device(origin_td);

	return 0;
}
1050
/*
 * Create thin device @dev as a snapshot of @origin.
 *
 * Fails with -EEXIST if the details btree already has an entry for @dev.
 * The snapshot shares the origin's mapping tree: a reference is taken on
 * the origin's root with dm_tm_inc() and that root is inserted into the
 * top-level tree under @dev.  The pool time is then advanced and recorded
 * as the snapshot time on both devices.
 *
 * Returns 0 on success.  On a late failure the entries for @dev are
 * removed from the top-level and details trees.
 */
static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	uint64_t origin_key = origin;
	uint64_t dev_key = dev;
	dm_block_t origin_root;
	__le64 root_le;
	struct dm_thin_device *td;
	struct disk_device_details details_le;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, &details_le);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &origin_key, &root_le);
	if (r)
		return r;
	origin_root = le64_to_cpu(root_le);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	root_le = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&root_le);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &dev_key, &root_le, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto err_remove;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);
	if (!r)
		return 0;

err_remove:
	dm_btree_remove(&pmd->tl_info, pmd->root, &dev_key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&dev_key, &pmd->details_root);

	return r;
}
1106
/*
 * Locked wrapper around __create_snap().
 *
 * Holds root_lock for writing.  Returns -EINVAL without doing anything
 * when pmd->fail_io is set, otherwise the result of __create_snap().
 */
int dm_pool_create_snap(struct dm_pool_metadata *pmd,
			dm_thin_id dev,
			dm_thin_id origin)
{
	int ret = -EINVAL;

	down_write(&pmd->root_lock);
	if (pmd->fail_io)
		goto out;

	ret = __create_snap(pmd, dev, origin);
out:
	up_write(&pmd->root_lock);
	return ret;
}
1120
/*
 * Delete device @dev.
 *
 * The device is opened first; if its open_count then exceeds 1 it is
 * closed again and -EBUSY is returned.  Otherwise the in-core device is
 * unlinked and freed, and @dev is removed from the details tree and then
 * from the top-level mapping tree.
 *
 * Returns 0 on success or the first error encountered.
 */
static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t dev_key = dev;
	struct dm_thin_device *victim;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &victim);
	if (r)
		return r;

	if (victim->open_count > 1) {
		__close_device(victim);
		return -EBUSY;
	}

	list_del(&victim->list);
	kfree(victim);

	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &dev_key, &pmd->details_root);
	if (!r)
		r = dm_btree_remove(&pmd->tl_info, pmd->root,
				    &dev_key, &pmd->root);

	return r;
}
1150
/*
 * Locked wrapper around __delete_device().
 *
 * Holds root_lock for writing.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of __delete_device().
 */
int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int ret;

	down_write(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL : __delete_device(pmd, dev);
	up_write(&pmd->root_lock);

	return ret;
}
1163
/*
 * Replace the in-core transaction id with @new_id, provided the current
 * one equals @current_id.
 *
 * Returns 0 on success.  Returns -EINVAL when pmd->fail_io is set, or
 * (after logging an error) when @current_id does not match.
 */
int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	down_write(&pmd->root_lock);

	if (!pmd->fail_io) {
		if (pmd->trans_id == current_id) {
			pmd->trans_id = new_id;
			r = 0;
		} else {
			DMERR("mismatched transaction id");
		}
	}

	up_write(&pmd->root_lock);

	return r;
}
1188
/*
 * Copy the in-core transaction id into @result.
 *
 * Holds root_lock for reading.  Returns 0 on success; returns -EINVAL and
 * leaves @result untouched when pmd->fail_io is set.
 */
int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = 0;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		r = -EINVAL;
	else
		*result = pmd->trans_id;
	up_read(&pmd->root_lock);

	return r;
}
1203
__reserve_metadata_snap(struct dm_pool_metadata * pmd)1204 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1205 {
1206 int r, inc;
1207 struct thin_disk_superblock *disk_super;
1208 struct dm_block *copy, *sblock;
1209 dm_block_t held_root;
1210
1211 /*
1212 * Copy the superblock.
1213 */
1214 dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1215 r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1216 &sb_validator, ©, &inc);
1217 if (r)
1218 return r;
1219
1220 BUG_ON(!inc);
1221
1222 held_root = dm_block_location(copy);
1223 disk_super = dm_block_data(copy);
1224
1225 if (le64_to_cpu(disk_super->held_root)) {
1226 DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1227
1228 dm_tm_dec(pmd->tm, held_root);
1229 dm_tm_unlock(pmd->tm, copy);
1230 return -EBUSY;
1231 }
1232
1233 /*
1234 * Wipe the spacemap since we're not publishing this.
1235 */
1236 memset(&disk_super->data_space_map_root, 0,
1237 sizeof(disk_super->data_space_map_root));
1238 memset(&disk_super->metadata_space_map_root, 0,
1239 sizeof(disk_super->metadata_space_map_root));
1240
1241 /*
1242 * Increment the data structures that need to be preserved.
1243 */
1244 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1245 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1246 dm_tm_unlock(pmd->tm, copy);
1247
1248 /*
1249 * Write the held root into the superblock.
1250 */
1251 r = superblock_lock(pmd, &sblock);
1252 if (r) {
1253 dm_tm_dec(pmd->tm, held_root);
1254 return r;
1255 }
1256
1257 disk_super = dm_block_data(sblock);
1258 disk_super->held_root = cpu_to_le64(held_root);
1259 dm_bm_unlock(sblock);
1260 return 0;
1261 }
1262
dm_pool_reserve_metadata_snap(struct dm_pool_metadata * pmd)1263 int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1264 {
1265 int r = -EINVAL;
1266
1267 down_write(&pmd->root_lock);
1268 if (!pmd->fail_io)
1269 r = __reserve_metadata_snap(pmd);
1270 up_write(&pmd->root_lock);
1271
1272 return r;
1273 }
1274
__release_metadata_snap(struct dm_pool_metadata * pmd)1275 static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1276 {
1277 int r;
1278 struct thin_disk_superblock *disk_super;
1279 struct dm_block *sblock, *copy;
1280 dm_block_t held_root;
1281
1282 r = superblock_lock(pmd, &sblock);
1283 if (r)
1284 return r;
1285
1286 disk_super = dm_block_data(sblock);
1287 held_root = le64_to_cpu(disk_super->held_root);
1288 disk_super->held_root = cpu_to_le64(0);
1289
1290 dm_bm_unlock(sblock);
1291
1292 if (!held_root) {
1293 DMWARN("No pool metadata snapshot found: nothing to release.");
1294 return -EINVAL;
1295 }
1296
1297 r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©);
1298 if (r)
1299 return r;
1300
1301 disk_super = dm_block_data(copy);
1302 dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
1303 dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
1304 dm_sm_dec_block(pmd->metadata_sm, held_root);
1305
1306 return dm_tm_unlock(pmd->tm, copy);
1307 }
1308
dm_pool_release_metadata_snap(struct dm_pool_metadata * pmd)1309 int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1310 {
1311 int r = -EINVAL;
1312
1313 down_write(&pmd->root_lock);
1314 if (!pmd->fail_io)
1315 r = __release_metadata_snap(pmd);
1316 up_write(&pmd->root_lock);
1317
1318 return r;
1319 }
1320
/*
 * Read the held_root field of the on-disk superblock into @result.
 *
 * Returns the error from dm_bm_read_lock() if the superblock cannot be
 * read-locked (leaving @result untouched), otherwise the result of
 * dm_bm_unlock().
 */
static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	struct dm_block *sblock;
	struct thin_disk_superblock *sb;
	int r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, &sblock);

	if (r)
		return r;

	sb = dm_block_data(sblock);
	*result = le64_to_cpu(sb->held_root);

	return dm_bm_unlock(sblock);
}
1338
/*
 * Locked wrapper around __get_metadata_snap().
 *
 * Holds root_lock for reading.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of __get_metadata_snap().
 */
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int ret = -EINVAL;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		goto out;

	ret = __get_metadata_snap(pmd, result);
out:
	up_read(&pmd->root_lock);
	return ret;
}
1351
/*
 * Locked wrapper around __open_device() (called with 0 as its third
 * argument).
 *
 * Holds root_lock for writing.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of __open_device(), which fills in @td.
 */
int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int ret;

	down_write(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL : __open_device(pmd, dev, 0, td);
	up_write(&pmd->root_lock);

	return ret;
}
1364
dm_pool_close_thin_device(struct dm_thin_device * td)1365 int dm_pool_close_thin_device(struct dm_thin_device *td)
1366 {
1367 down_write(&td->pmd->root_lock);
1368 __close_device(td);
1369 up_write(&td->pmd->root_lock);
1370
1371 return 0;
1372 }
1373
dm_thin_dev_id(struct dm_thin_device * td)1374 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1375 {
1376 return td->id;
1377 }
1378
1379 /*
1380 * Check whether @time (of block creation) is older than @td's last snapshot.
1381 * If so then the associated block is shared with the last snapshot device.
1382 * Any block on a device created *after* the device last got snapshotted is
1383 * necessarily not shared.
1384 */
__snapshotted_since(struct dm_thin_device * td,uint32_t time)1385 static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1386 {
1387 return td->snapshotted_time > time;
1388 }
1389
/*
 * Look up the mapping for virtual block @block of @td.
 *
 * When @can_block is non-zero, root_lock is taken with down_read() and
 * the lookup uses pmd->info.  Otherwise the lock is only tried: if it is
 * unavailable -EWOULDBLOCK is returned, else the lookup uses
 * pmd->nb_info.
 *
 * On success (0) @result->block holds the mapped data block and
 * @result->shared says whether @td was snapshotted after the mapping was
 * created.  Returns -EINVAL when pmd->fail_io is set, otherwise the
 * result of dm_btree_lookup().  @result is only written on success.
 */
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_block, struct dm_thin_lookup_result *result)
{
	int r = -EINVAL;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_block) {
		down_read(&pmd->root_lock);
		info = &pmd->info;
	} else if (down_read_trylock(&pmd->root_lock)) {
		info = &pmd->nb_info;
	} else {
		return -EWOULDBLOCK;
	}

	if (pmd->fail_io)
		goto out;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r) {
		dm_block_t exception_block;
		uint32_t exception_time;

		unpack_block_time(le64_to_cpu(value), &exception_block,
				  &exception_time);
		result->block = exception_block;
		/*
		 * td->snapshotted_time is modified under root_lock held
		 * for writing, so it must be sampled before we unlock.
		 */
		result->shared = __snapshotted_since(td, exception_time);
	}

out:
	up_read(&pmd->root_lock);

	return r;
}
1429
/*
 * Map virtual block @block of @td to @data_block.
 *
 * The stored value packs @data_block together with the pool's current
 * time.  On success the device is marked as changed, and mapped_blocks is
 * incremented when dm_btree_insert_notify() reports a new insertion.
 *
 * Returns 0 on success or the error from dm_btree_insert_notify().
 */
static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	__le64 packed;
	int was_new;
	int r;

	packed = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&packed);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &packed,
				   &pmd->root, &was_new);
	if (r)
		return r;

	if (was_new)
		td->mapped_blocks++;
	td->changed = 1;

	return 0;
}
1452
/*
 * Locked wrapper around __insert().
 *
 * Holds the owning pool's root_lock for writing.  Returns -EINVAL when
 * the pool's fail_io is set, otherwise the result of __insert().
 */
int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int ret = -EINVAL;

	down_write(&td->pmd->root_lock);
	if (td->pmd->fail_io)
		goto out;

	ret = __insert(td, block, data_block);
out:
	up_write(&td->pmd->root_lock);
	return ret;
}
1465
/*
 * Remove the mapping for virtual block @block of @td.
 *
 * On success mapped_blocks is decremented and the device is marked as
 * changed.  Returns 0 or the error from dm_btree_remove().
 */
static int __remove(struct dm_thin_device *td, dm_block_t block)
{
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	int r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);

	if (!r) {
		td->mapped_blocks--;
		td->changed = 1;
	}

	return r;
}
1481
/*
 * Locked wrapper around __remove().
 *
 * Holds the owning pool's root_lock for writing.  Returns -EINVAL when
 * the pool's fail_io is set, otherwise the result of __remove().
 */
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
	int ret = -EINVAL;

	down_write(&td->pmd->root_lock);
	if (td->pmd->fail_io)
		goto out;

	ret = __remove(td, block);
out:
	up_write(&td->pmd->root_lock);
	return ret;
}
1493
/*
 * Report whether data block @b has a non-zero reference count in the
 * data space map.
 *
 * Holds root_lock for reading.  On success returns 0 and sets @result.
 * Returns -EINVAL without touching the space map when pmd->fail_io is
 * set, matching the other data_sm accessors in this file; fail_io is set
 * when dm_pool_abort_metadata() could not recreate the persistent-data
 * objects.  Otherwise returns the error from dm_sm_get_count().
 */
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r = -EINVAL;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
		if (!r)
			*result = (ref_count != 0);
	}
	up_read(&pmd->root_lock);

	return r;
}
1507
dm_thin_changed_this_transaction(struct dm_thin_device * td)1508 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1509 {
1510 int r;
1511
1512 down_read(&td->pmd->root_lock);
1513 r = td->changed;
1514 up_read(&td->pmd->root_lock);
1515
1516 return r;
1517 }
1518
dm_pool_changed_this_transaction(struct dm_pool_metadata * pmd)1519 bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1520 {
1521 bool r = false;
1522 struct dm_thin_device *td, *tmp;
1523
1524 down_read(&pmd->root_lock);
1525 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1526 if (td->changed) {
1527 r = td->changed;
1528 break;
1529 }
1530 }
1531 up_read(&pmd->root_lock);
1532
1533 return r;
1534 }
1535
dm_thin_aborted_changes(struct dm_thin_device * td)1536 bool dm_thin_aborted_changes(struct dm_thin_device *td)
1537 {
1538 bool r;
1539
1540 down_read(&td->pmd->root_lock);
1541 r = td->aborted_with_changes;
1542 up_read(&td->pmd->root_lock);
1543
1544 return r;
1545 }
1546
/*
 * Obtain a new block from the data space map via dm_sm_new_block(),
 * storing it in @result.
 *
 * Holds root_lock for writing.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of dm_sm_new_block().
 */
int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int ret;

	down_write(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL : dm_sm_new_block(pmd->data_sm, result);
	up_write(&pmd->root_lock);

	return ret;
}
1558
dm_pool_commit_metadata(struct dm_pool_metadata * pmd)1559 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1560 {
1561 int r = -EINVAL;
1562
1563 down_write(&pmd->root_lock);
1564 if (pmd->fail_io)
1565 goto out;
1566
1567 r = __commit_transaction(pmd);
1568 if (r <= 0)
1569 goto out;
1570
1571 /*
1572 * Open the next transaction.
1573 */
1574 r = __begin_transaction(pmd);
1575 out:
1576 up_write(&pmd->root_lock);
1577 return r;
1578 }
1579
__set_abort_with_changes_flags(struct dm_pool_metadata * pmd)1580 static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1581 {
1582 struct dm_thin_device *td;
1583
1584 list_for_each_entry(td, &pmd->thin_devices, list)
1585 td->aborted_with_changes = td->changed;
1586 }
1587
dm_pool_abort_metadata(struct dm_pool_metadata * pmd)1588 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1589 {
1590 int r = -EINVAL;
1591
1592 down_write(&pmd->root_lock);
1593 if (pmd->fail_io)
1594 goto out;
1595
1596 __set_abort_with_changes_flags(pmd);
1597 __destroy_persistent_data_objects(pmd);
1598 r = __create_persistent_data_objects(pmd, false);
1599 if (r)
1600 pmd->fail_io = true;
1601
1602 out:
1603 up_write(&pmd->root_lock);
1604
1605 return r;
1606 }
1607
/*
 * Query the data space map with dm_sm_get_nr_free(), storing the answer
 * in @result.
 *
 * Holds root_lock for reading.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of dm_sm_get_nr_free().
 */
int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int ret;

	down_read(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL : dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return ret;
}
1619
/*
 * Query the metadata space map with dm_sm_get_nr_free(), storing the
 * answer in @result.
 *
 * Holds root_lock for reading.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of dm_sm_get_nr_free().
 */
int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int ret = -EINVAL;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		goto out;

	ret = dm_sm_get_nr_free(pmd->metadata_sm, result);
out:
	up_read(&pmd->root_lock);
	return ret;
}
1632
/*
 * Query the metadata space map with dm_sm_get_nr_blocks(), storing the
 * answer in @result.
 *
 * Holds root_lock for reading.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of dm_sm_get_nr_blocks().
 */
int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int ret;

	down_read(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL
			   : dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return ret;
}
1645
/*
 * Copy pmd->data_block_size into @result while holding root_lock for
 * reading.  fail_io is not consulted; always returns 0.
 */
int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
{
	down_read(&pmd->root_lock);
	{
		*result = pmd->data_block_size;
	}
	up_read(&pmd->root_lock);

	return 0;
}
1654
/*
 * Query the data space map with dm_sm_get_nr_blocks(), storing the answer
 * in @result.
 *
 * Holds root_lock for reading.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of dm_sm_get_nr_blocks().
 */
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int ret = -EINVAL;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		goto out;

	ret = dm_sm_get_nr_blocks(pmd->data_sm, result);
out:
	up_read(&pmd->root_lock);
	return ret;
}
1666
/*
 * Copy @td's in-core mapped_blocks count into @result.
 *
 * Holds the owning pool's root_lock for reading.  Returns 0 on success;
 * returns -EINVAL and leaves @result untouched when the pool's fail_io
 * is set.
 */
int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r = 0;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		r = -EINVAL;
	else
		*result = td->mapped_blocks;
	up_read(&pmd->root_lock);

	return r;
}
1681
/*
 * Find the highest key in @td's mapping tree.
 *
 * The device's tree root is looked up in the top-level tree (tl_info)
 * and then handed to dm_btree_find_highest_key() with bl_info.
 *
 * Returns the error from dm_btree_lookup() if the device is not found,
 * otherwise the result of dm_btree_find_highest_key().
 */
static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	__le64 root_le;
	int r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &root_le);

	if (r)
		return r;

	return dm_btree_find_highest_key(&pmd->bl_info, le64_to_cpu(root_le),
					 result);
}
1697
/*
 * Locked wrapper around __highest_block().
 *
 * Holds the owning pool's root_lock for reading.  Returns -EINVAL when
 * the pool's fail_io is set, otherwise the result of __highest_block().
 */
int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int ret;

	down_read(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL : __highest_block(td, result);
	up_read(&pmd->root_lock);

	return ret;
}
1711
/*
 * Grow space map @sm so that it covers @new_count blocks.
 *
 * Returns 0 when the size is already @new_count, -EINVAL (after logging
 * an error) when @new_count is smaller than the current size, the error
 * from dm_sm_get_nr_blocks() if the current size cannot be read, and
 * otherwise the result of dm_sm_extend() for the difference.
 */
static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	dm_block_t current_count;
	int r = dm_sm_get_nr_blocks(sm, &current_count);

	if (r)
		return r;

	if (new_count < current_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	if (new_count > current_count)
		return dm_sm_extend(sm, new_count - current_count);

	return 0;
}
1731
/*
 * Resize the data space map to @new_count blocks via
 * __resize_space_map().
 *
 * Holds root_lock for writing.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of __resize_space_map().
 */
int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int ret;

	down_write(&pmd->root_lock);
	ret = pmd->fail_io ? -EINVAL
			   : __resize_space_map(pmd->data_sm, new_count);
	up_write(&pmd->root_lock);

	return ret;
}
1743
/*
 * Resize the metadata space map to @new_count blocks via
 * __resize_space_map().
 *
 * Holds root_lock for writing.  Returns -EINVAL when pmd->fail_io is set,
 * otherwise the result of __resize_space_map().
 */
int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int ret = -EINVAL;

	down_write(&pmd->root_lock);
	if (pmd->fail_io)
		goto out;

	ret = __resize_space_map(pmd->metadata_sm, new_count);
out:
	up_write(&pmd->root_lock);
	return ret;
}
1755
dm_pool_metadata_read_only(struct dm_pool_metadata * pmd)1756 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1757 {
1758 down_write(&pmd->root_lock);
1759 pmd->read_only = true;
1760 dm_bm_set_read_only(pmd->bm);
1761 up_write(&pmd->root_lock);
1762 }
1763
dm_pool_metadata_read_write(struct dm_pool_metadata * pmd)1764 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
1765 {
1766 down_write(&pmd->root_lock);
1767 pmd->read_only = false;
1768 dm_bm_set_read_write(pmd->bm);
1769 up_write(&pmd->root_lock);
1770 }
1771
/*
 * Register @fn (with @context) as a threshold callback on the metadata
 * space map, using @threshold.
 *
 * Holds root_lock for writing.  Returns -EINVAL without touching the
 * space map when pmd->fail_io is set, matching the other metadata_sm
 * accessors in this file; fail_io is set when dm_pool_abort_metadata()
 * could not recreate the persistent-data objects.  Otherwise returns the
 * result of dm_sm_register_threshold_callback().
 */
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	down_write(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	up_write(&pmd->root_lock);

	return r;
}
1785
dm_pool_metadata_set_needs_check(struct dm_pool_metadata * pmd)1786 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1787 {
1788 int r;
1789 struct dm_block *sblock;
1790 struct thin_disk_superblock *disk_super;
1791
1792 down_write(&pmd->root_lock);
1793 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1794
1795 r = superblock_lock(pmd, &sblock);
1796 if (r) {
1797 DMERR("couldn't read superblock");
1798 goto out;
1799 }
1800
1801 disk_super = dm_block_data(sblock);
1802 disk_super->flags = cpu_to_le32(pmd->flags);
1803
1804 dm_bm_unlock(sblock);
1805 out:
1806 up_write(&pmd->root_lock);
1807 return r;
1808 }
1809
dm_pool_metadata_needs_check(struct dm_pool_metadata * pmd)1810 bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1811 {
1812 bool needs_check;
1813
1814 down_read(&pmd->root_lock);
1815 needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1816 up_read(&pmd->root_lock);
1817
1818 return needs_check;
1819 }
1820