// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "user"

#define MAX_OUTSTANDING_MESSAGES 128

static unsigned int daemon_timeout_msec = 4000;
module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
		   0644);
MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
		 "IO Timeout in msec if daemon does not process");
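
/*
 * With the 0644 permissions above, the timeout should also be tunable at
 * runtime through the usual module parameter interface, typically
 * /sys/module/dm_user/parameters/dm_user_daemon_timeout_msec; the exact
 * path depends on how the module is named and built, so treat it as an
 * illustration rather than a guarantee.
 */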

/*
 * dm-user uses four structures:
 *
 *  - "struct target", the outermost structure, corresponds to a single device
 *    mapper target.  This contains the set of outstanding BIOs that have been
 *    provided by DM and are not actively being processed by the user, along
 *    with a misc device that userspace can open to communicate with the
 *    kernel.  Each time userspace opens the misc device a new channel is
 *    created.
 *  - "struct channel", which represents a single active communication channel
 *    with userspace.  Userspace may choose arbitrary read/write sizes to use
 *    when processing messages; channels form these into logical accesses.
 *    When userspace responds to a full message the channel completes the BIO
 *    and obtains a new message to process from the target.
 *  - "struct message", which wraps a BIO with the additional information
 *    required by the kernel to sort out what to do with BIOs when they return
 *    from userspace.
 *  - "struct dm_user_message", which is the exact message format that
 *    userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 *  - user_map(), which is provided a BIO from device mapper that is queued
 *    into the target.  This allocates and enqueues a new message.
 *  - dev_read(), which dequeues a message and copies it to userspace.
 *  - dev_write(), which looks up a message (keyed by sequence number) and
 *    completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock.  This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */
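
/*
 * For orientation, a minimal sketch of the userspace side of this protocol,
 * matching what dev_read()/dev_write() below implement: a first read()
 * returns one struct dm_user_message header (a WRITE payload follows in a
 * subsequent read), and the daemon replies by writing the header back with
 * the type replaced by a DM_USER_RESP_* value, followed by the payload for
 * READs.  This is only an illustration, not part of the driver; it assumes
 * the uapi header is visible to userspace as <linux/dm-user.h>, that the
 * message ends in a flexible payload array (called buf here), and that the
 * misc device shows up as /dev/dm-user/<name>.  Most error handling is
 * elided.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <linux/dm-user.h>
 *
 *	static char raw[sizeof(struct dm_user_message) + (1 << 20)];
 *
 *	static int serve(const char *path)	// e.g. "/dev/dm-user/foo"
 *	{
 *		struct dm_user_message *msg = (struct dm_user_message *)raw;
 *		int fd = open(path, O_RDWR);
 *
 *		for (;;) {
 *			// The first read of each request returns the header.
 *			if (read(fd, msg, sizeof(*msg)) != sizeof(*msg))
 *				return -1;
 *			if (msg->len > sizeof(raw) - sizeof(*msg))
 *				return -1;	// sketch only handles small IOs
 *
 *			// msg->seq must be echoed back unchanged.
 *			switch (msg->type) {
 *			case DM_USER_REQ_MAP_READ:
 *				// Produce msg->len bytes for msg->sector.
 *				memset(msg->buf, 0, msg->len);
 *				msg->type = DM_USER_RESP_SUCCESS;
 *				write(fd, msg, sizeof(*msg) + msg->len);
 *				break;
 *			case DM_USER_REQ_MAP_WRITE:
 *				// The payload arrives via a follow-up read.
 *				read(fd, msg->buf, msg->len);
 *				// ... persist msg->len bytes at msg->sector ...
 *				msg->type = DM_USER_RESP_SUCCESS;
 *				write(fd, msg, sizeof(*msg));
 *				break;
 *			case DM_USER_REQ_MAP_FLUSH:
 *				msg->type = DM_USER_RESP_SUCCESS;
 *				write(fd, msg, sizeof(*msg));
 *				break;
 *			default:
 *				msg->type = DM_USER_RESP_UNSUPPORTED;
 *				write(fd, msg, sizeof(*msg));
 *			}
 *		}
 *	}
 */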

struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;
	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user.  They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers.  These are only valid after write has
	 * adopted the message.
	 */
	u64 return_type;
	u64 return_flags;

	struct delayed_work work;
	bool delayed;
	struct target *t;
};

struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target).  The
	 * corresponding write side doesn't block, instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order.  This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target.  The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/.  It looks like a better way to do this may be to have
	 * a filesystem to manage these, but this was more expedient.  The
	 * current mechanism is functional, but does result in an arbitrary
	 * number of dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed.  Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear.  The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
	bool daemon_terminated;
};

struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel.  This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons.  It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;
	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here.  This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};

static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	mempool_free(m, pool);
}

static inline bool is_user_space_thread_present(struct target *t)
{
	lockdep_assert_held(&t->lock);
	return (kref_read(&t->references) > 1);
}

static void process_delayed_work(struct work_struct *work)
{
	struct delayed_work *del_work = to_delayed_work(work);
	struct message *msg = container_of(del_work, struct message, work);

	struct target *t = msg->t;

	mutex_lock(&t->lock);

	/*
	 * There is at least one thread to process the IO.
	 */
	if (is_user_space_thread_present(t)) {
		mutex_unlock(&t->lock);
		return;
	}

	/*
	 * Terminate the IO with an error
	 */
	list_del(&msg->to_user);
	pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
	       msg->bio->bi_iter.bi_sector,
	       t->miscdev.name);
	message_kill(msg, &t->message_pool);
	mutex_unlock(&t->lock);
}

static void enqueue_delayed_work(struct message *m, bool is_delay)
{
	unsigned long delay = 0;

	m->delayed = true;
	INIT_DELAYED_WORK(&m->work, process_delayed_work);

	/*
	 * The snapuserd daemon is the user-space process
	 * which processes IO requests from dm-user
	 * when an OTA is applied. Per the current design,
	 * when a dm-user target is created, the daemon
	 * attaches to the target and starts processing
	 * the IO's. The daemon is terminated only when
	 * the dm-user target is destroyed.
	 *
	 * If for some reason the daemon crashes or terminates early
	 * without destroying the dm-user target, then
	 * there is no mechanism to restart the daemon
	 * and resume processing the IO's from the same target.
	 * Theoretically it is possible, but that infrastructure
	 * doesn't exist in the Android ecosystem.
	 *
	 * Thus, when the daemon terminates, there is no way the IO's
	 * issued on that target will be processed. Hence,
	 * we set the delay to 0 and fail the IO's immediately.
	 *
	 * On the other hand, when a new dm-user target is created,
	 * we wait for the daemon to get attached for the first time.
	 * This primarily happens when first-stage init spins up
	 * the daemon. At this point, since the snapshot device is mounted
	 * as the root filesystem, the dm-user target may receive IO requests
	 * even though the daemon is not fully launched. We don't want
	 * to fail those IO requests immediately. Thus, we queue these
	 * requests with a timeout so that the daemon is ready to process
	 * those IO requests. Again, if the daemon fails to launch within
	 * the timeout period, then the IO's will be failed.
	 */
	if (is_delay)
		delay = msecs_to_jiffies(daemon_timeout_msec);

	queue_delayed_work(system_wq, &m->work, delay);
}

static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}

static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}

static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO type %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}

/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did.  This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one.  The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * These don't advance the BIO iter but do advance the IOV iter, which is a
 * bit odd here.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* as above */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user, to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}

static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);

	list_del(&m->to_user);

	/*
	 * If the IO was queued to the workqueue because there
	 * was no daemon to service the IO, then we
	 * will have to cancel the delayed work as the
	 * IO will be processed by this user-space thread.
	 *
	 * If the delayed work was already picked up for
	 * processing, then wait for it to complete. Note
	 * that the IO will not be terminated by the work
	 * queue thread.
	 */
	if (unlikely(m->delayed)) {
		mutex_unlock(&t->lock);
		cancel_delayed_work_sync(&m->work);
		mutex_lock(&t->lock);
	}
	return m;
}

static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}

/*
 * Returns 0 when there is no work left to do.  This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
static int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

static void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace.  At this point there's nothing we can do about them, as
	 * there are no channels and never will be.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		struct message *m = list_entry(cur, struct message, to_user);

		if (unlikely(m->delayed)) {
			bool ret;

			mutex_unlock(&t->lock);
			ret = cancel_delayed_work_sync(&m->work);
			mutex_lock(&t->lock);
			if (!ret)
				continue;
		}
		message_kill(m, &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

static void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock.  We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release)) {
		/*
		 * User-space thread is getting terminated.
		 * We need to scan the list for all those
		 * pending IO's which were not processed yet
		 * and put them back to work-queue for delayed
		 * processing.
		 */
		if (!is_user_space_thread_present(t)) {
			struct list_head *cur, *tmp;

			list_for_each_safe(cur, tmp, &t->to_user) {
				struct message *m = list_entry(cur,
							       struct message,
							       to_user);
				if (!m->delayed)
					enqueue_delayed_work(m, false);
			}
			/*
			 * Daemon attached to this target is terminated.
			 */
			t->daemon_terminated = true;
		}
		mutex_unlock(&t->lock);
	}
}

static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

static void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed.  The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs.  Hopefully whatever's on the other end
	 * can handle the errors.  One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel.  I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}

static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened.  The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev.  The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}

static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now.  Code elsewhere in the kernel
				 * handles restarting the syscall when appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock
			 * the user out.  There's really nothing else
			 * we can do here.  Note that we don't actually
			 * tear anything down until userspace has
			 * closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to
			 * return.  My goal was really just to try and
			 * find something that wasn't likely to be
			 * returned by anything else in the miscdev
			 * path.  The message "block device required"
			 * seems like a somewhat reasonable thing to
			 * say when the target has disappeared out from
			 * under us, but "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above. Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL.  If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);
		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for single requests by overwriting the
	 * type field.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it.  The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock.  Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};

static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference.  This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	t->daemon_terminated = false;
	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened and therefore
	 * concurrent references to the channel can happen.  Holding the target
	 * lock during misc_register() could deadlock.  If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks.  IIUC RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier.  I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}

static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev.  This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice.  The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target.  There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target; this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);
	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}

/*
 * Consumes a BIO from device mapper, queueing it up for userspace.
 */
static int user_map(struct dm_target *ti, struct bio *bio)
{
	struct target *t;
	struct message *entry;

	t = target_from_target(ti);
	/*
	 * FIXME
	 *
	 * This seems like a bad idea.  Specifically, here we're
	 * directly on the IO path when we take the target lock, which may also
	 * be taken from a user context.  The user context doesn't actively
	 * trigger anything that may sleep while holding the lock, but this
	 * still seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks.  I had a version that did this, but the head-of-line blocking
	 * from the circular buffer resulted in us needing a fairly large
	 * allocation in order to avoid situations in which the queue fills up
	 * and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here.  Intuitively it
	 * seems better to keep the queues small in here (essentially sized to
	 * the user latency for performance reasons only) and rely on returning
	 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
	 * kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There are two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target.  This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted.  My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both.  The first one is easy
	 * and gives DM the proper info, so it seems better.  The BPF call
	 * seems overly complex for just this, but one could imagine wanting to
	 * sometimes return _MAPPED and a BPF filter would be the way to do
	 * that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups.  While one could imagine just implementing this entirely in
	 * userspace, that would come with an appreciable performance penalty.
	 * Instead one could keep a BPF program that forwards most accesses
	 * directly to the backing block device while informing a userspace
	 * daemon of any discarded space and on writes to blocks that are to be
	 * backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	entry->delayed = false;
	entry->t = t;
	/* Pairs with the barrier in dev_read() */
	smp_wmb();
	list_add_tail(&entry->to_user, &t->to_user);

	/*
	 * If there is no daemon to process the IO's,
	 * queue these messages into a workqueue with
	 * a timeout.
	 */
	if (!is_user_space_thread_present(t))
		enqueue_delayed_work(entry, !t->daemon_terminated);

	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}

static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");