/*
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006-2008 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-exception-store.h"
#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

#define DM_MSG_PREFIX "persistent snapshot"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots, by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It therefore makes sense to store the
 * metadata in chunk-size blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with a different disk version than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

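/*
 * Concretely, if each metadata area can describe N exceptions, the COW
 * device is laid out as:
 *
 *	chunk 0			header
 *	chunk 1			metadata area 0
 *	chunks 2 .. N+1		exception data described by area 0
 *	chunk N+2		metadata area 1
 *	...and so on.
 */
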
/*
 * Magic for persistent snapshots: "SnAp" - Feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * An area of zeros used to clear the next area.
	 */
	void *zero_area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	chunk_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	chunk_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;

	struct workqueue_struct *metadata_wq;
};

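/*
 * Convert a size in 512-byte sectors to the number of pages needed to
 * hold it (PAGE_SIZE >> 9 is the number of sectors per page).
 */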
static unsigned sectors_to_pages(unsigned sectors)
{
	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	ps->zero_area = vmalloc(len);
	if (!ps->zero_area) {
		vfree(ps->area);
		return r;
	}
	memset(ps->zero_area, 0, len);

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
	vfree(ps->zero_area);
	ps->zero_area = NULL;
}

struct mdata_req {
	struct dm_io_region *where;
	struct dm_io_request *io_req;
	struct work_struct work;
	int result;
};

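/*
 * Work function used by chunk_io() below to issue the metadata dm_io()
 * from the metadata workqueue.
 */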
static void do_metadata(struct work_struct *work)
{
	struct mdata_req *req = container_of(work, struct mdata_req, work);

	req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};
	struct mdata_req req;

	if (!metadata)
		return dm_io(&io_req, 1, &where, NULL);

	req.where = &where;
	req.io_req = &io_req;

	/*
	 * Issue the synchronous I/O from a different thread
	 * to avoid generic_make_request recursion.
	 */
	INIT_WORK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_workqueue(ps->metadata_wq);
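	/*
	 * flush_workqueue() returns only after do_metadata() has run,
	 * so req.result now holds the outcome of the I/O.
	 */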

	return req.result;
}

/*
 * Convert a metadata area index to a chunk index.
 */
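/*
 * With the layout described above, area 0 lives in chunk 1, area 1 in
 * chunk exceptions_per_area + 2, area 2 in chunk
 * 2 * (exceptions_per_area + 1) + 1, and so on.
 */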
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
	return 1 + ((ps->exceptions_per_area + 1) * area);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, int rw)
{
	int r;
	chunk_t chunk;

	chunk = area_location(ps, ps->current_area);

	r = chunk_io(ps, chunk, rw, 0);
	if (r)
		return r;

	return 0;
}

static void zero_memory_area(struct pstore *ps)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
}

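/*
 * Write a chunk of zeros over metadata area 'area' on the COW device,
 * using the preallocated zero_area buffer.
 */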
static int zero_disk_area(struct pstore *ps, chunk_t area)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * area_location(ps, area),
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = WRITE,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->zero_area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};

	return dm_io(&io_req, 1, &where, NULL);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use default chunk size (or hardsect_size, if larger) if none supplied
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
							     chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ, 1);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

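	/*
	 * There is nothing more to do unless the table supplied a chunk
	 * size that differs from the one recorded in the header; in that
	 * case the on-disk value wins and everything sized off the chunk
	 * size must be rebuilt below.
	 */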
	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
		return 0;

	DMWARN("chunk size %llu in device metadata overrides "
	       "table chunk size of %llu.",
	       (unsigned long long)chunk_size,
	       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size. Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE, 1);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps,
			     int (*callback)(void *callback_context,
					     chunk_t old, chunk_t new),
			     void *callback_context,
			     int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If new_chunk is pointing at the start of the COW
		 * device, where the first metadata area is, we know
		 * that we've hit the end of the exceptions.  Therefore
		 * the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = callback(callback_context, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps,
			   int (*callback)(void *callback_context, chunk_t old,
					   chunk_t new),
			   void *callback_context)
{
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (ps->current_area = 0; full; ps->current_area++) {
		r = area_io(ps, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, callback, callback_context, &full);
		if (r)
			return r;
	}

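	/*
	 * The loop above leaves current_area pointing one past the last
	 * area read, so step back to the partially full one.
	 */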
	ps->current_area--;

	return 0;
}

static struct pstore *get_info(struct dm_exception_store *store)
{
	return (struct pstore *) store->context;
}

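/*
 * Report how much of the COW device has been consumed: chunks handed
 * out so far versus the total size of the device, both in sectors.
 */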
static void persistent_fraction_full(struct dm_exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct dm_exception_store *store)
{
	struct pstore *ps = get_info(store);

	destroy_workqueue(ps->metadata_wq);
	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct dm_exception_store *store,
				    int (*callback)(void *callback_context,
						    chunk_t old, chunk_t new),
				    void *callback_context)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
			sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		ps->current_area = 0;
		zero_memory_area(ps);
		r = zero_disk_area(ps, 0);
		if (r) {
			DMWARN("zero_disk_area(0) failed");
			return r;
		}
	} else {
		/*
		 * Sanity checks.
		 */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * The metadata is valid, but the snapshot has been
		 * invalidated.
		 */
		if (!ps->valid)
			return 1;

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps, callback, callback_context);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare_exception(struct dm_exception_store *store,
					struct dm_snap_exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	chunk_t next_free;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks.
	 */
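	/*
	 * Chunks whose index is 1 modulo (exceptions_per_area + 1) hold
	 * metadata areas (see area_location()), so if next_free lands on
	 * one of those we step over it.
	 */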
	stride = (ps->exceptions_per_area + 1);
	next_free = ++ps->next_free;
	if (sector_div(next_free, stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

static void persistent_commit_exception(struct dm_exception_store *store,
					struct dm_snap_exception *e,
					void (*callback) (void *, int success),
					void *callback_context)
{
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are exceptions in flight and we have not yet
	 * filled this metadata area, there's nothing more to do.
	 */
	if (!atomic_dec_and_test(&ps->pending_count) &&
	    (ps->current_committed != ps->exceptions_per_area))
		return;

	/*
	 * If we completely filled the current area, then wipe the next one.
	 */
	if ((ps->current_committed == ps->exceptions_per_area) &&
	     zero_disk_area(ps, ps->current_area + 1))
		ps->valid = 0;

	/*
	 * Commit exceptions to disk.
	 */
	if (ps->valid && area_io(ps, WRITE))
		ps->valid = 0;

	/*
	 * Advance to the next area if this one is full.
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		ps->current_area++;
		zero_memory_area(ps);
	}

	for (i = 0; i < ps->callback_count; i++) {
		cb = ps->callbacks + i;
		cb->callback(cb->context, ps->valid);
	}

	ps->callback_count = 0;
}

static void persistent_drop_snapshot(struct dm_exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct dm_exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare_exception;
	store->commit_exception = persistent_commit_exception;
	store->drop_snapshot = persistent_drop_snapshot;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

int dm_persistent_snapshot_init(void)
{
	return 0;
}

void dm_persistent_snapshot_exit(void)
{
}