/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device.  This is helpful for replaying writes to check for fs consistency
 * at all times.  This target provides a mechanism to mark specific events to
 * check data at a later time.  So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache.  So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH.  FUA bypasses the cache, so
 * once it completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes.  If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)

#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][  entry->nr_sectors ]
 * [log_write_entry][    data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
	__le64 magic;
	__le64 version;
	__le64 nr_entries;
	__le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry; this is for private
 * log-entry data, such as the MARK data provided by userspace.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};
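
/*
 * Illustrative sketch only (not part of this target, and not used by it):
 * given the on-disk layout described above, this is roughly how a reader
 * such as a userspace replay tool could compute the size of one record in
 * log-device sectors (i.e. units of log_write_super.sectorsize).  The
 * helper name below is made up for this example.
 */
static inline u64 log_entry_record_sectors(const struct log_write_entry *entry)
{
	u64 data_sectors = le64_to_cpu(entry->nr_sectors);

	/*
	 * Discards log no data, and marks/flushes have nr_sectors == 0, so
	 * those records are just the single metadata sector (which also
	 * holds any inline mark data after the log_write_entry itself).
	 */
	if (le64_to_cpu(entry->flags) & LOG_DISCARD_FLAG)
		data_sectors = 0;

	/* One sector of log_write_entry plus the data that was written. */
	return 1 + data_sectors;
}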

struct log_writes_c {
	struct dm_dev *dev;
	struct dm_dev *logdev;
	u64 logged_entries;
	u32 sectorsize;
	u32 sectorshift;
	atomic_t io_blocks;
	atomic_t pending_blocks;
	sector_t next_sector;
	sector_t end_sector;
	bool logging_enabled;
	bool device_supports_discard;
	spinlock_t blocks_lock;
	struct list_head unflushed_blocks;
	struct list_head logging_blocks;
	wait_queue_head_t wait;
	struct task_struct *log_kthread;
	struct completion super_done;
};

struct pending_block {
	int vec_cnt;
	u64 flags;
	sector_t sector;
	sector_t nr_sectors;
	char *data;
	u32 datalen;
	struct list_head list;
	struct bio_vec vecs[0];
};

struct per_bio_data {
	struct pending_block *block;
};

static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}

static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors << (lc->sectorshift - SECTOR_SHIFT);
}
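
/*
 * For example (illustrative numbers only): with a 4096-byte logical block
 * size on the target device, sectorshift is 12, so the two helpers above
 * shift by 12 - SECTOR_SHIFT = 3, i.e. one device sector corresponds to
 * eight 512-byte bio sectors.
 */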

static void put_pending_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void put_io_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void log_end_io(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	if (bio->bi_status) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", bio->bi_status);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	bio_free_pages(bio);
	put_io_block(lc);
	bio_put(bio);
}

static void log_end_super(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	complete(&lc->super_done);
	log_end_io(bio);
}

/*
 * Meant to be called if there is an error; it will free all the pages
 * associated with the block.
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
}

static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
			  log_end_super : log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	put_io_block(lc);
	return -1;
}

static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   block->datalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	if (!block->vec_cnt)
		goto out;
	sector += dev_to_bio_sectors(lc, 1);

	atomic_inc(&lc->io_blocks);
	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity's sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			atomic_inc(&lc->io_blocks);
			submit_bio(bio);
			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio_set_dev(bio, lc->logdev->bdev);
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}

static int log_super(struct log_writes_c *lc)
{
	struct log_write_super super;

	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
	super.version = cpu_to_le64(WRITE_LOG_VERSION);
	super.nr_entries = cpu_to_le64(lc->logged_entries);
	super.sectorsize = cpu_to_le32(lc->sectorsize);

	if (write_metadata(lc, &super, sizeof(super), NULL, 0,
			   WRITE_LOG_SUPER_SECTOR)) {
		DMERR("Couldn't write super");
		return -1;
	}

	/*
	 * The super sector should be written in order, otherwise the
	 * nr_entries could be rewritten incorrectly by an old bio.
	 */
	wait_for_completion_io(&lc->super_done);

	return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			sector = lc->next_sector;
			if (!(block->flags & LOG_DISCARD_FLAG))
				lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
			lc->next_sector += dev_to_bio_sectors(lc, 1);

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    list_empty(&lc->logging_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
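/*
 * For example (illustrative only; the device paths and size expression are
 * placeholders, not part of this target):
 *
 *   dmsetup create lw --table \
 *     "0 $(blockdev --getsz /dev/sdX) log-writes /dev/sdX /dev/sdY"
 *
 * where /dev/sdX is the device being logged and /dev/sdY receives the log.
 */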
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct log_writes_c *lc;
	struct dm_arg_set as;
	const char *devname, *logdevname;
	int ret;

	as.argc = argc;
	as.argv = argv;

	if (argc < 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	init_completion(&lc->super_done);
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);

	devname = dm_shift_arg(&as);
	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	logdevname = dm_shift_arg(&as);
	ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
			    &lc->logdev);
	if (ret) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
		goto bad;
	}

	lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
	lc->sectorshift = ilog2(lc->sectorsize);
	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (IS_ERR(lc->log_kthread)) {
		ret = PTR_ERR(lc->log_kthread);
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
		goto bad;
	}

	/*
	 * next_sector is in 512b sectors to correspond to what bi_sector expects.
	 * The super starts at sector 0, and the next_sector is the next logical
	 * one based on the sectorsize of the device.
	 */
	lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
	lc->logging_enabled = true;
	lc->end_sector = logdev_last_sector(lc);
	lc->device_supports_discard = true;

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->per_io_data_size = sizeof(struct per_bio_data);
	ti->private = lc;
	return 0;

bad:
	kfree(lc);
	return ret;
}

static int log_mark(struct log_writes_c *lc, char *data)
{
	struct pending_block *block;
	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
		return -ENOMEM;
	}

	block->data = kstrndup(data, maxsize, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
		return -ENOMEM;
	}
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
	return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include the
	 * unflushed blocks; if it fails we don't really care.
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
		   !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio_set_dev(bio, lc->dev->bdev);
}

static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
	bool fua_bio = (bio->bi_opf & REQ_FUA);
	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush?  Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return DM_MAPIO_KILL;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;

	block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
	block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later, so we need to copy
	 * the actual contents into new pages so we know the data will always be
	 * there.
	 *
	 * We do this because this could be a bio from O_DIRECT, in which case we
	 * can't just hold onto the page until some later point; we have to
	 * manually copy the contents.
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
			DMERR("Error allocing page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return DM_MAPIO_KILL;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}

static int normal_end_io(struct dm_target *ti, struct bio *bio,
		blk_status_t *error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return DM_ENDIO_DONE;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
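/*
 * For example (illustrative values only), "dmsetup status" on a log-writes
 * device might report a line such as:
 *
 *   0 8388608 log-writes 12 4095
 *
 * meaning 12 entries have been logged and sector 4095 is the highest
 * allocated sector on the log device.
 */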
static void log_writes_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result,
			      unsigned maxlen)
{
	unsigned sz = 0;
	struct log_writes_c *lc = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%llu %llu", lc->logged_entries,
		       (unsigned long long)lc->next_sector - 1);
		if (!lc->logging_enabled)
			DMEMIT(" logging_disabled");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
		break;
	}
}

static int log_writes_prepare_ioctl(struct dm_target *ti,
		struct block_device **bdev, fmode_t *mode)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;

	*bdev = dev->bdev;
	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int log_writes_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn,
				      void *data)
{
	struct log_writes_c *lc = ti->private;

	return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 *   mark <mark data> - specify the marked data.
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct log_writes_c *lc = ti->private;

	if (argc != 2) {
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
		return r;
	}

	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);

	return r;
}

static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = lc->sectorsize;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
	limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
	limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
	limits->io_min = limits->physical_block_size;
}

static struct target_type log_writes_target = {
	.name   = "log-writes",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr    = log_writes_ctr,
	.dtr    = log_writes_dtr,
	.map    = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.prepare_ioctl = log_writes_prepare_ioctl,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
};

static int __init dm_log_writes_init(void)
{
	int r = dm_register_target(&log_writes_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");
850