• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*
3    rbd.c -- Export ceph rados objects as a Linux block device
4 
5 
6    based on drivers/block/osdblk.c:
7 
8    Copyright 2009 Red Hat, Inc.
9 
10    This program is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; see the file COPYING.  If not, write to
21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 
24 
25    For usage instructions, please refer to:
26 
27                  Documentation/ABI/testing/sysfs-bus-rbd
28 
29  */
30 
31 #include <linux/ceph/libceph.h>
32 #include <linux/ceph/osd_client.h>
33 #include <linux/ceph/mon_client.h>
34 #include <linux/ceph/cls_lock_client.h>
35 #include <linux/ceph/striper.h>
36 #include <linux/ceph/decode.h>
37 #include <linux/fs_parser.h>
38 #include <linux/bsearch.h>
39 
40 #include <linux/kernel.h>
41 #include <linux/device.h>
42 #include <linux/module.h>
43 #include <linux/blk-mq.h>
44 #include <linux/fs.h>
45 #include <linux/blkdev.h>
46 #include <linux/slab.h>
47 #include <linux/idr.h>
48 #include <linux/workqueue.h>
49 
50 #include "rbd_types.h"
51 
52 #define RBD_DEBUG	/* Activate rbd_assert() calls */
53 
54 /*
55  * Increment the given counter and return its updated value.
56  * If the counter is already 0 it will not be incremented.
57  * If the counter is already at its maximum value returns
58  * -EINVAL without updating it.
59  */
atomic_inc_return_safe(atomic_t * v)60 static int atomic_inc_return_safe(atomic_t *v)
61 {
62 	unsigned int counter;
63 
64 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65 	if (counter <= (unsigned int)INT_MAX)
66 		return (int)counter;
67 
68 	atomic_dec(v);
69 
70 	return -EINVAL;
71 }
72 
73 /* Decrement the counter.  Return the resulting value, or -EINVAL */
atomic_dec_return_safe(atomic_t * v)74 static int atomic_dec_return_safe(atomic_t *v)
75 {
76 	int counter;
77 
78 	counter = atomic_dec_return(v);
79 	if (counter >= 0)
80 		return counter;
81 
82 	atomic_inc(v);
83 
84 	return -EINVAL;
85 }
86 
87 #define RBD_DRV_NAME "rbd"
88 
89 #define RBD_MINORS_PER_MAJOR		256
90 #define RBD_SINGLE_MAJOR_PART_SHIFT	4
91 
92 #define RBD_MAX_PARENT_CHAIN_LEN	16
93 
94 #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
95 #define RBD_MAX_SNAP_NAME_LEN	\
96 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97 
98 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
99 
100 #define RBD_SNAP_HEAD_NAME	"-"
101 
102 #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
103 
104 /* This allows a single page to hold an image name sent by OSD */
105 #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
106 #define RBD_IMAGE_ID_LEN_MAX	64
107 
108 #define RBD_OBJ_PREFIX_LEN_MAX	64
109 
110 #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
111 #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
112 
113 /* Feature bits */
114 
115 #define RBD_FEATURE_LAYERING		(1ULL<<0)
116 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
117 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
118 #define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
119 #define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
120 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
121 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
122 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
123 
124 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
125 				 RBD_FEATURE_STRIPINGV2 |	\
126 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
127 				 RBD_FEATURE_OBJECT_MAP |	\
128 				 RBD_FEATURE_FAST_DIFF |	\
129 				 RBD_FEATURE_DEEP_FLATTEN |	\
130 				 RBD_FEATURE_DATA_POOL |	\
131 				 RBD_FEATURE_OPERATIONS)
132 
133 /* Features supported by this (client software) implementation. */
134 
135 #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136 
137 /*
138  * An RBD device name will be "rbd#", where the "rbd" comes from
139  * RBD_DRV_NAME above, and # is a unique integer identifier.
140  */
141 #define DEV_NAME_LEN		32
142 
143 /*
144  * block device image metadata (in-memory version)
145  */
146 struct rbd_image_header {
147 	/* These six fields never change for a given rbd image */
148 	char *object_prefix;
149 	__u8 obj_order;
150 	u64 stripe_unit;
151 	u64 stripe_count;
152 	s64 data_pool_id;
153 	u64 features;		/* Might be changeable someday? */
154 
155 	/* The remaining fields need to be updated occasionally */
156 	u64 image_size;
157 	struct ceph_snap_context *snapc;
158 	char *snap_names;	/* format 1 only */
159 	u64 *snap_sizes;	/* format 1 only */
160 };
161 
162 /*
163  * An rbd image specification.
164  *
165  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166  * identify an image.  Each rbd_dev structure includes a pointer to
167  * an rbd_spec structure that encapsulates this identity.
168  *
169  * Each of the id's in an rbd_spec has an associated name.  For a
170  * user-mapped image, the names are supplied and the id's associated
171  * with them are looked up.  For a layered image, a parent image is
172  * defined by the tuple, and the names are looked up.
173  *
174  * An rbd_dev structure contains a parent_spec pointer which is
175  * non-null if the image it represents is a child in a layered
176  * image.  This pointer will refer to the rbd_spec structure used
177  * by the parent rbd_dev for its own identity (i.e., the structure
178  * is shared between the parent and child).
179  *
180  * Since these structures are populated once, during the discovery
181  * phase of image construction, they are effectively immutable so
182  * we make no effort to synchronize access to them.
183  *
184  * Note that code herein does not assume the image name is known (it
185  * could be a null pointer).
186  */
187 struct rbd_spec {
188 	u64		pool_id;
189 	const char	*pool_name;
190 	const char	*pool_ns;	/* NULL if default, never "" */
191 
192 	const char	*image_id;
193 	const char	*image_name;
194 
195 	u64		snap_id;
196 	const char	*snap_name;
197 
198 	struct kref	kref;
199 };
200 
201 /*
202  * an instance of the client.  multiple devices may share an rbd client.
203  */
204 struct rbd_client {
205 	struct ceph_client	*client;
206 	struct kref		kref;
207 	struct list_head	node;
208 };
209 
210 struct pending_result {
211 	int			result;		/* first nonzero result */
212 	int			num_pending;
213 };
214 
215 struct rbd_img_request;
216 
217 enum obj_request_type {
218 	OBJ_REQUEST_NODATA = 1,
219 	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
220 	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
221 	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
222 };
223 
224 enum obj_operation_type {
225 	OBJ_OP_READ = 1,
226 	OBJ_OP_WRITE,
227 	OBJ_OP_DISCARD,
228 	OBJ_OP_ZEROOUT,
229 };
230 
231 #define RBD_OBJ_FLAG_DELETION			(1U << 0)
232 #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
233 #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
234 #define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
235 #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
236 
237 enum rbd_obj_read_state {
238 	RBD_OBJ_READ_START = 1,
239 	RBD_OBJ_READ_OBJECT,
240 	RBD_OBJ_READ_PARENT,
241 };
242 
243 /*
244  * Writes go through the following state machine to deal with
245  * layering:
246  *
247  *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
248  *            .                 |                                    .
249  *            .                 v                                    .
250  *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
251  *            .                 |                    .               .
252  *            .                 v                    v (deep-copyup  .
253  *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
254  * flattened) v                 |                    .               .
255  *            .                 v                    .               .
256  *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
257  *                              |                        not needed) v
258  *                              v                                    .
259  *                            done . . . . . . . . . . . . . . . . . .
260  *                              ^
261  *                              |
262  *                     RBD_OBJ_WRITE_FLAT
263  *
264  * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
265  * assert_exists guard is needed or not (in some cases it's not needed
266  * even if there is a parent).
267  */
268 enum rbd_obj_write_state {
269 	RBD_OBJ_WRITE_START = 1,
270 	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
271 	RBD_OBJ_WRITE_OBJECT,
272 	__RBD_OBJ_WRITE_COPYUP,
273 	RBD_OBJ_WRITE_COPYUP,
274 	RBD_OBJ_WRITE_POST_OBJECT_MAP,
275 };
276 
277 enum rbd_obj_copyup_state {
278 	RBD_OBJ_COPYUP_START = 1,
279 	RBD_OBJ_COPYUP_READ_PARENT,
280 	__RBD_OBJ_COPYUP_OBJECT_MAPS,
281 	RBD_OBJ_COPYUP_OBJECT_MAPS,
282 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
283 	RBD_OBJ_COPYUP_WRITE_OBJECT,
284 };
285 
286 struct rbd_obj_request {
287 	struct ceph_object_extent ex;
288 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
289 	union {
290 		enum rbd_obj_read_state	 read_state;	/* for reads */
291 		enum rbd_obj_write_state write_state;	/* for writes */
292 	};
293 
294 	struct rbd_img_request	*img_request;
295 	struct ceph_file_extent	*img_extents;
296 	u32			num_img_extents;
297 
298 	union {
299 		struct ceph_bio_iter	bio_pos;
300 		struct {
301 			struct ceph_bvec_iter	bvec_pos;
302 			u32			bvec_count;
303 			u32			bvec_idx;
304 		};
305 	};
306 
307 	enum rbd_obj_copyup_state copyup_state;
308 	struct bio_vec		*copyup_bvecs;
309 	u32			copyup_bvec_count;
310 
311 	struct list_head	osd_reqs;	/* w/ r_private_item */
312 
313 	struct mutex		state_mutex;
314 	struct pending_result	pending;
315 	struct kref		kref;
316 };
317 
318 enum img_req_flags {
319 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
320 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
321 };
322 
323 enum rbd_img_state {
324 	RBD_IMG_START = 1,
325 	RBD_IMG_EXCLUSIVE_LOCK,
326 	__RBD_IMG_OBJECT_REQUESTS,
327 	RBD_IMG_OBJECT_REQUESTS,
328 };
329 
330 struct rbd_img_request {
331 	struct rbd_device	*rbd_dev;
332 	enum obj_operation_type	op_type;
333 	enum obj_request_type	data_type;
334 	unsigned long		flags;
335 	enum rbd_img_state	state;
336 	union {
337 		u64			snap_id;	/* for reads */
338 		struct ceph_snap_context *snapc;	/* for writes */
339 	};
340 	struct rbd_obj_request	*obj_request;	/* obj req initiator */
341 
342 	struct list_head	lock_item;
343 	struct list_head	object_extents;	/* obj_req.ex structs */
344 
345 	struct mutex		state_mutex;
346 	struct pending_result	pending;
347 	struct work_struct	work;
348 	int			work_result;
349 };
350 
351 #define for_each_obj_request(ireq, oreq) \
352 	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
353 #define for_each_obj_request_safe(ireq, oreq, n) \
354 	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
355 
356 enum rbd_watch_state {
357 	RBD_WATCH_STATE_UNREGISTERED,
358 	RBD_WATCH_STATE_REGISTERED,
359 	RBD_WATCH_STATE_ERROR,
360 };
361 
362 enum rbd_lock_state {
363 	RBD_LOCK_STATE_UNLOCKED,
364 	RBD_LOCK_STATE_LOCKED,
365 	RBD_LOCK_STATE_RELEASING,
366 };
367 
368 /* WatchNotify::ClientId */
369 struct rbd_client_id {
370 	u64 gid;
371 	u64 handle;
372 };
373 
374 struct rbd_mapping {
375 	u64                     size;
376 };
377 
378 /*
379  * a single device
380  */
381 struct rbd_device {
382 	int			dev_id;		/* blkdev unique id */
383 
384 	int			major;		/* blkdev assigned major */
385 	int			minor;
386 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
387 
388 	u32			image_format;	/* Either 1 or 2 */
389 	struct rbd_client	*rbd_client;
390 
391 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
392 
393 	spinlock_t		lock;		/* queue, flags, open_count */
394 
395 	struct rbd_image_header	header;
396 	unsigned long		flags;		/* possibly lock protected */
397 	struct rbd_spec		*spec;
398 	struct rbd_options	*opts;
399 	char			*config_info;	/* add{,_single_major} string */
400 
401 	struct ceph_object_id	header_oid;
402 	struct ceph_object_locator header_oloc;
403 
404 	struct ceph_file_layout	layout;		/* used for all rbd requests */
405 
406 	struct mutex		watch_mutex;
407 	enum rbd_watch_state	watch_state;
408 	struct ceph_osd_linger_request *watch_handle;
409 	u64			watch_cookie;
410 	struct delayed_work	watch_dwork;
411 
412 	struct rw_semaphore	lock_rwsem;
413 	enum rbd_lock_state	lock_state;
414 	char			lock_cookie[32];
415 	struct rbd_client_id	owner_cid;
416 	struct work_struct	acquired_lock_work;
417 	struct work_struct	released_lock_work;
418 	struct delayed_work	lock_dwork;
419 	struct work_struct	unlock_work;
420 	spinlock_t		lock_lists_lock;
421 	struct list_head	acquiring_list;
422 	struct list_head	running_list;
423 	struct completion	acquire_wait;
424 	int			acquire_err;
425 	struct completion	releasing_wait;
426 
427 	spinlock_t		object_map_lock;
428 	u8			*object_map;
429 	u64			object_map_size;	/* in objects */
430 	u64			object_map_flags;
431 
432 	struct workqueue_struct	*task_wq;
433 
434 	struct rbd_spec		*parent_spec;
435 	u64			parent_overlap;
436 	atomic_t		parent_ref;
437 	struct rbd_device	*parent;
438 
439 	/* Block layer tags. */
440 	struct blk_mq_tag_set	tag_set;
441 
442 	/* protects updating the header */
443 	struct rw_semaphore     header_rwsem;
444 
445 	struct rbd_mapping	mapping;
446 
447 	struct list_head	node;
448 
449 	/* sysfs related */
450 	struct device		dev;
451 	unsigned long		open_count;	/* protected by lock */
452 };
453 
454 /*
455  * Flag bits for rbd_dev->flags:
456  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
457  *   by rbd_dev->lock
458  */
459 enum rbd_dev_flags {
460 	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
461 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
462 	RBD_DEV_FLAG_READONLY,  /* -o ro or snapshot */
463 };
464 
465 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
466 
467 static LIST_HEAD(rbd_dev_list);    /* devices */
468 static DEFINE_SPINLOCK(rbd_dev_list_lock);
469 
470 static LIST_HEAD(rbd_client_list);		/* clients */
471 static DEFINE_SPINLOCK(rbd_client_list_lock);
472 
473 /* Slab caches for frequently-allocated structures */
474 
475 static struct kmem_cache	*rbd_img_request_cache;
476 static struct kmem_cache	*rbd_obj_request_cache;
477 
478 static int rbd_major;
479 static DEFINE_IDA(rbd_dev_id_ida);
480 
481 static struct workqueue_struct *rbd_wq;
482 
483 static struct ceph_snap_context rbd_empty_snapc = {
484 	.nref = REFCOUNT_INIT(1),
485 };
486 
487 /*
488  * single-major requires >= 0.75 version of userspace rbd utility.
489  */
490 static bool single_major = true;
491 module_param(single_major, bool, 0444);
492 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
493 
494 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
495 static ssize_t remove_store(struct bus_type *bus, const char *buf,
496 			    size_t count);
497 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
498 				      size_t count);
499 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
500 					 size_t count);
501 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
502 
rbd_dev_id_to_minor(int dev_id)503 static int rbd_dev_id_to_minor(int dev_id)
504 {
505 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
506 }
507 
minor_to_rbd_dev_id(int minor)508 static int minor_to_rbd_dev_id(int minor)
509 {
510 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
511 }
512 
rbd_is_ro(struct rbd_device * rbd_dev)513 static bool rbd_is_ro(struct rbd_device *rbd_dev)
514 {
515 	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
516 }
517 
rbd_is_snap(struct rbd_device * rbd_dev)518 static bool rbd_is_snap(struct rbd_device *rbd_dev)
519 {
520 	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
521 }
522 
__rbd_is_lock_owner(struct rbd_device * rbd_dev)523 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
524 {
525 	lockdep_assert_held(&rbd_dev->lock_rwsem);
526 
527 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
528 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
529 }
530 
rbd_is_lock_owner(struct rbd_device * rbd_dev)531 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
532 {
533 	bool is_lock_owner;
534 
535 	down_read(&rbd_dev->lock_rwsem);
536 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
537 	up_read(&rbd_dev->lock_rwsem);
538 	return is_lock_owner;
539 }
540 
supported_features_show(struct bus_type * bus,char * buf)541 static ssize_t supported_features_show(struct bus_type *bus, char *buf)
542 {
543 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
544 }
545 
546 static BUS_ATTR_WO(add);
547 static BUS_ATTR_WO(remove);
548 static BUS_ATTR_WO(add_single_major);
549 static BUS_ATTR_WO(remove_single_major);
550 static BUS_ATTR_RO(supported_features);
551 
552 static struct attribute *rbd_bus_attrs[] = {
553 	&bus_attr_add.attr,
554 	&bus_attr_remove.attr,
555 	&bus_attr_add_single_major.attr,
556 	&bus_attr_remove_single_major.attr,
557 	&bus_attr_supported_features.attr,
558 	NULL,
559 };
560 
rbd_bus_is_visible(struct kobject * kobj,struct attribute * attr,int index)561 static umode_t rbd_bus_is_visible(struct kobject *kobj,
562 				  struct attribute *attr, int index)
563 {
564 	if (!single_major &&
565 	    (attr == &bus_attr_add_single_major.attr ||
566 	     attr == &bus_attr_remove_single_major.attr))
567 		return 0;
568 
569 	return attr->mode;
570 }
571 
572 static const struct attribute_group rbd_bus_group = {
573 	.attrs = rbd_bus_attrs,
574 	.is_visible = rbd_bus_is_visible,
575 };
576 __ATTRIBUTE_GROUPS(rbd_bus);
577 
578 static struct bus_type rbd_bus_type = {
579 	.name		= "rbd",
580 	.bus_groups	= rbd_bus_groups,
581 };
582 
rbd_root_dev_release(struct device * dev)583 static void rbd_root_dev_release(struct device *dev)
584 {
585 }
586 
587 static struct device rbd_root_dev = {
588 	.init_name =    "rbd",
589 	.release =      rbd_root_dev_release,
590 };
591 
592 static __printf(2, 3)
rbd_warn(struct rbd_device * rbd_dev,const char * fmt,...)593 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
594 {
595 	struct va_format vaf;
596 	va_list args;
597 
598 	va_start(args, fmt);
599 	vaf.fmt = fmt;
600 	vaf.va = &args;
601 
602 	if (!rbd_dev)
603 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
604 	else if (rbd_dev->disk)
605 		printk(KERN_WARNING "%s: %s: %pV\n",
606 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
607 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
608 		printk(KERN_WARNING "%s: image %s: %pV\n",
609 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
610 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
611 		printk(KERN_WARNING "%s: id %s: %pV\n",
612 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
613 	else	/* punt */
614 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
615 			RBD_DRV_NAME, rbd_dev, &vaf);
616 	va_end(args);
617 }
618 
619 #ifdef RBD_DEBUG
620 #define rbd_assert(expr)						\
621 		if (unlikely(!(expr))) {				\
622 			printk(KERN_ERR "\nAssertion failure in %s() "	\
623 						"at line %d:\n\n"	\
624 					"\trbd_assert(%s);\n\n",	\
625 					__func__, __LINE__, #expr);	\
626 			BUG();						\
627 		}
628 #else /* !RBD_DEBUG */
629 #  define rbd_assert(expr)	((void) 0)
630 #endif /* !RBD_DEBUG */
631 
632 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
633 
634 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
635 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
636 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
637 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
638 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
639 					u64 snap_id);
640 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
641 				u8 *order, u64 *snap_size);
642 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
643 
644 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
645 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
646 
647 /*
648  * Return true if nothing else is pending.
649  */
pending_result_dec(struct pending_result * pending,int * result)650 static bool pending_result_dec(struct pending_result *pending, int *result)
651 {
652 	rbd_assert(pending->num_pending > 0);
653 
654 	if (*result && !pending->result)
655 		pending->result = *result;
656 	if (--pending->num_pending)
657 		return false;
658 
659 	*result = pending->result;
660 	return true;
661 }
662 
rbd_open(struct block_device * bdev,fmode_t mode)663 static int rbd_open(struct block_device *bdev, fmode_t mode)
664 {
665 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
666 	bool removing = false;
667 
668 	spin_lock_irq(&rbd_dev->lock);
669 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
670 		removing = true;
671 	else
672 		rbd_dev->open_count++;
673 	spin_unlock_irq(&rbd_dev->lock);
674 	if (removing)
675 		return -ENOENT;
676 
677 	(void) get_device(&rbd_dev->dev);
678 
679 	return 0;
680 }
681 
rbd_release(struct gendisk * disk,fmode_t mode)682 static void rbd_release(struct gendisk *disk, fmode_t mode)
683 {
684 	struct rbd_device *rbd_dev = disk->private_data;
685 	unsigned long open_count_before;
686 
687 	spin_lock_irq(&rbd_dev->lock);
688 	open_count_before = rbd_dev->open_count--;
689 	spin_unlock_irq(&rbd_dev->lock);
690 	rbd_assert(open_count_before > 0);
691 
692 	put_device(&rbd_dev->dev);
693 }
694 
rbd_ioctl_set_ro(struct rbd_device * rbd_dev,unsigned long arg)695 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
696 {
697 	int ro;
698 
699 	if (get_user(ro, (int __user *)arg))
700 		return -EFAULT;
701 
702 	/*
703 	 * Both images mapped read-only and snapshots can't be marked
704 	 * read-write.
705 	 */
706 	if (!ro) {
707 		if (rbd_is_ro(rbd_dev))
708 			return -EROFS;
709 
710 		rbd_assert(!rbd_is_snap(rbd_dev));
711 	}
712 
713 	/* Let blkdev_roset() handle it */
714 	return -ENOTTY;
715 }
716 
rbd_ioctl(struct block_device * bdev,fmode_t mode,unsigned int cmd,unsigned long arg)717 static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
718 			unsigned int cmd, unsigned long arg)
719 {
720 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
721 	int ret;
722 
723 	switch (cmd) {
724 	case BLKROSET:
725 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
726 		break;
727 	default:
728 		ret = -ENOTTY;
729 	}
730 
731 	return ret;
732 }
733 
734 #ifdef CONFIG_COMPAT
rbd_compat_ioctl(struct block_device * bdev,fmode_t mode,unsigned int cmd,unsigned long arg)735 static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
736 				unsigned int cmd, unsigned long arg)
737 {
738 	return rbd_ioctl(bdev, mode, cmd, arg);
739 }
740 #endif /* CONFIG_COMPAT */
741 
742 static const struct block_device_operations rbd_bd_ops = {
743 	.owner			= THIS_MODULE,
744 	.open			= rbd_open,
745 	.release		= rbd_release,
746 	.ioctl			= rbd_ioctl,
747 #ifdef CONFIG_COMPAT
748 	.compat_ioctl		= rbd_compat_ioctl,
749 #endif
750 };
751 
752 /*
753  * Initialize an rbd client instance.  Success or not, this function
754  * consumes ceph_opts.  Caller holds client_mutex.
755  */
rbd_client_create(struct ceph_options * ceph_opts)756 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
757 {
758 	struct rbd_client *rbdc;
759 	int ret = -ENOMEM;
760 
761 	dout("%s:\n", __func__);
762 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
763 	if (!rbdc)
764 		goto out_opt;
765 
766 	kref_init(&rbdc->kref);
767 	INIT_LIST_HEAD(&rbdc->node);
768 
769 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
770 	if (IS_ERR(rbdc->client))
771 		goto out_rbdc;
772 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
773 
774 	ret = ceph_open_session(rbdc->client);
775 	if (ret < 0)
776 		goto out_client;
777 
778 	spin_lock(&rbd_client_list_lock);
779 	list_add_tail(&rbdc->node, &rbd_client_list);
780 	spin_unlock(&rbd_client_list_lock);
781 
782 	dout("%s: rbdc %p\n", __func__, rbdc);
783 
784 	return rbdc;
785 out_client:
786 	ceph_destroy_client(rbdc->client);
787 out_rbdc:
788 	kfree(rbdc);
789 out_opt:
790 	if (ceph_opts)
791 		ceph_destroy_options(ceph_opts);
792 	dout("%s: error %d\n", __func__, ret);
793 
794 	return ERR_PTR(ret);
795 }
796 
__rbd_get_client(struct rbd_client * rbdc)797 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
798 {
799 	kref_get(&rbdc->kref);
800 
801 	return rbdc;
802 }
803 
804 /*
805  * Find a ceph client with specific addr and configuration.  If
806  * found, bump its reference count.
807  */
rbd_client_find(struct ceph_options * ceph_opts)808 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
809 {
810 	struct rbd_client *client_node;
811 	bool found = false;
812 
813 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
814 		return NULL;
815 
816 	spin_lock(&rbd_client_list_lock);
817 	list_for_each_entry(client_node, &rbd_client_list, node) {
818 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
819 			__rbd_get_client(client_node);
820 
821 			found = true;
822 			break;
823 		}
824 	}
825 	spin_unlock(&rbd_client_list_lock);
826 
827 	return found ? client_node : NULL;
828 }
829 
830 /*
831  * (Per device) rbd map options
832  */
833 enum {
834 	Opt_queue_depth,
835 	Opt_alloc_size,
836 	Opt_lock_timeout,
837 	/* int args above */
838 	Opt_pool_ns,
839 	Opt_compression_hint,
840 	/* string args above */
841 	Opt_read_only,
842 	Opt_read_write,
843 	Opt_lock_on_read,
844 	Opt_exclusive,
845 	Opt_notrim,
846 };
847 
848 enum {
849 	Opt_compression_hint_none,
850 	Opt_compression_hint_compressible,
851 	Opt_compression_hint_incompressible,
852 };
853 
854 static const struct constant_table rbd_param_compression_hint[] = {
855 	{"none",		Opt_compression_hint_none},
856 	{"compressible",	Opt_compression_hint_compressible},
857 	{"incompressible",	Opt_compression_hint_incompressible},
858 	{}
859 };
860 
861 static const struct fs_parameter_spec rbd_parameters[] = {
862 	fsparam_u32	("alloc_size",			Opt_alloc_size),
863 	fsparam_enum	("compression_hint",		Opt_compression_hint,
864 			 rbd_param_compression_hint),
865 	fsparam_flag	("exclusive",			Opt_exclusive),
866 	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
867 	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
868 	fsparam_flag	("notrim",			Opt_notrim),
869 	fsparam_string	("_pool_ns",			Opt_pool_ns),
870 	fsparam_u32	("queue_depth",			Opt_queue_depth),
871 	fsparam_flag	("read_only",			Opt_read_only),
872 	fsparam_flag	("read_write",			Opt_read_write),
873 	fsparam_flag	("ro",				Opt_read_only),
874 	fsparam_flag	("rw",				Opt_read_write),
875 	{}
876 };
877 
878 struct rbd_options {
879 	int	queue_depth;
880 	int	alloc_size;
881 	unsigned long	lock_timeout;
882 	bool	read_only;
883 	bool	lock_on_read;
884 	bool	exclusive;
885 	bool	trim;
886 
887 	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
888 };
889 
890 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
891 #define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
892 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
893 #define RBD_READ_ONLY_DEFAULT	false
894 #define RBD_LOCK_ON_READ_DEFAULT false
895 #define RBD_EXCLUSIVE_DEFAULT	false
896 #define RBD_TRIM_DEFAULT	true
897 
898 struct rbd_parse_opts_ctx {
899 	struct rbd_spec		*spec;
900 	struct ceph_options	*copts;
901 	struct rbd_options	*opts;
902 };
903 
obj_op_name(enum obj_operation_type op_type)904 static char* obj_op_name(enum obj_operation_type op_type)
905 {
906 	switch (op_type) {
907 	case OBJ_OP_READ:
908 		return "read";
909 	case OBJ_OP_WRITE:
910 		return "write";
911 	case OBJ_OP_DISCARD:
912 		return "discard";
913 	case OBJ_OP_ZEROOUT:
914 		return "zeroout";
915 	default:
916 		return "???";
917 	}
918 }
919 
920 /*
921  * Destroy ceph client
922  *
923  * Caller must hold rbd_client_list_lock.
924  */
rbd_client_release(struct kref * kref)925 static void rbd_client_release(struct kref *kref)
926 {
927 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
928 
929 	dout("%s: rbdc %p\n", __func__, rbdc);
930 	spin_lock(&rbd_client_list_lock);
931 	list_del(&rbdc->node);
932 	spin_unlock(&rbd_client_list_lock);
933 
934 	ceph_destroy_client(rbdc->client);
935 	kfree(rbdc);
936 }
937 
938 /*
939  * Drop reference to ceph client node. If it's not referenced anymore, release
940  * it.
941  */
rbd_put_client(struct rbd_client * rbdc)942 static void rbd_put_client(struct rbd_client *rbdc)
943 {
944 	if (rbdc)
945 		kref_put(&rbdc->kref, rbd_client_release);
946 }
947 
948 /*
949  * Get a ceph client with specific addr and configuration, if one does
950  * not exist create it.  Either way, ceph_opts is consumed by this
951  * function.
952  */
rbd_get_client(struct ceph_options * ceph_opts)953 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
954 {
955 	struct rbd_client *rbdc;
956 	int ret;
957 
958 	mutex_lock(&client_mutex);
959 	rbdc = rbd_client_find(ceph_opts);
960 	if (rbdc) {
961 		ceph_destroy_options(ceph_opts);
962 
963 		/*
964 		 * Using an existing client.  Make sure ->pg_pools is up to
965 		 * date before we look up the pool id in do_rbd_add().
966 		 */
967 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
968 					rbdc->client->options->mount_timeout);
969 		if (ret) {
970 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
971 			rbd_put_client(rbdc);
972 			rbdc = ERR_PTR(ret);
973 		}
974 	} else {
975 		rbdc = rbd_client_create(ceph_opts);
976 	}
977 	mutex_unlock(&client_mutex);
978 
979 	return rbdc;
980 }
981 
rbd_image_format_valid(u32 image_format)982 static bool rbd_image_format_valid(u32 image_format)
983 {
984 	return image_format == 1 || image_format == 2;
985 }
986 
rbd_dev_ondisk_valid(struct rbd_image_header_ondisk * ondisk)987 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
988 {
989 	size_t size;
990 	u32 snap_count;
991 
992 	/* The header has to start with the magic rbd header text */
993 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
994 		return false;
995 
996 	/* The bio layer requires at least sector-sized I/O */
997 
998 	if (ondisk->options.order < SECTOR_SHIFT)
999 		return false;
1000 
1001 	/* If we use u64 in a few spots we may be able to loosen this */
1002 
1003 	if (ondisk->options.order > 8 * sizeof (int) - 1)
1004 		return false;
1005 
1006 	/*
1007 	 * The size of a snapshot header has to fit in a size_t, and
1008 	 * that limits the number of snapshots.
1009 	 */
1010 	snap_count = le32_to_cpu(ondisk->snap_count);
1011 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1012 	if (snap_count > size / sizeof (__le64))
1013 		return false;
1014 
1015 	/*
1016 	 * Not only that, but the size of the entire the snapshot
1017 	 * header must also be representable in a size_t.
1018 	 */
1019 	size -= snap_count * sizeof (__le64);
1020 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1021 		return false;
1022 
1023 	return true;
1024 }
1025 
1026 /*
1027  * returns the size of an object in the image
1028  */
rbd_obj_bytes(struct rbd_image_header * header)1029 static u32 rbd_obj_bytes(struct rbd_image_header *header)
1030 {
1031 	return 1U << header->obj_order;
1032 }
1033 
rbd_init_layout(struct rbd_device * rbd_dev)1034 static void rbd_init_layout(struct rbd_device *rbd_dev)
1035 {
1036 	if (rbd_dev->header.stripe_unit == 0 ||
1037 	    rbd_dev->header.stripe_count == 0) {
1038 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1039 		rbd_dev->header.stripe_count = 1;
1040 	}
1041 
1042 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1043 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1044 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
1045 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1046 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1047 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1048 }
1049 
1050 /*
1051  * Fill an rbd image header with information from the given format 1
1052  * on-disk header.
1053  */
rbd_header_from_disk(struct rbd_device * rbd_dev,struct rbd_image_header_ondisk * ondisk)1054 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
1055 				 struct rbd_image_header_ondisk *ondisk)
1056 {
1057 	struct rbd_image_header *header = &rbd_dev->header;
1058 	bool first_time = header->object_prefix == NULL;
1059 	struct ceph_snap_context *snapc;
1060 	char *object_prefix = NULL;
1061 	char *snap_names = NULL;
1062 	u64 *snap_sizes = NULL;
1063 	u32 snap_count;
1064 	int ret = -ENOMEM;
1065 	u32 i;
1066 
1067 	/* Allocate this now to avoid having to handle failure below */
1068 
1069 	if (first_time) {
1070 		object_prefix = kstrndup(ondisk->object_prefix,
1071 					 sizeof(ondisk->object_prefix),
1072 					 GFP_KERNEL);
1073 		if (!object_prefix)
1074 			return -ENOMEM;
1075 	}
1076 
1077 	/* Allocate the snapshot context and fill it in */
1078 
1079 	snap_count = le32_to_cpu(ondisk->snap_count);
1080 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1081 	if (!snapc)
1082 		goto out_err;
1083 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1084 	if (snap_count) {
1085 		struct rbd_image_snap_ondisk *snaps;
1086 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1087 
1088 		/* We'll keep a copy of the snapshot names... */
1089 
1090 		if (snap_names_len > (u64)SIZE_MAX)
1091 			goto out_2big;
1092 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1093 		if (!snap_names)
1094 			goto out_err;
1095 
1096 		/* ...as well as the array of their sizes. */
1097 		snap_sizes = kmalloc_array(snap_count,
1098 					   sizeof(*header->snap_sizes),
1099 					   GFP_KERNEL);
1100 		if (!snap_sizes)
1101 			goto out_err;
1102 
1103 		/*
1104 		 * Copy the names, and fill in each snapshot's id
1105 		 * and size.
1106 		 *
1107 		 * Note that rbd_dev_v1_header_info() guarantees the
1108 		 * ondisk buffer we're working with has
1109 		 * snap_names_len bytes beyond the end of the
1110 		 * snapshot id array, this memcpy() is safe.
1111 		 */
1112 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1113 		snaps = ondisk->snaps;
1114 		for (i = 0; i < snap_count; i++) {
1115 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1116 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1117 		}
1118 	}
1119 
1120 	/* We won't fail any more, fill in the header */
1121 
1122 	if (first_time) {
1123 		header->object_prefix = object_prefix;
1124 		header->obj_order = ondisk->options.order;
1125 		rbd_init_layout(rbd_dev);
1126 	} else {
1127 		ceph_put_snap_context(header->snapc);
1128 		kfree(header->snap_names);
1129 		kfree(header->snap_sizes);
1130 	}
1131 
1132 	/* The remaining fields always get updated (when we refresh) */
1133 
1134 	header->image_size = le64_to_cpu(ondisk->image_size);
1135 	header->snapc = snapc;
1136 	header->snap_names = snap_names;
1137 	header->snap_sizes = snap_sizes;
1138 
1139 	return 0;
1140 out_2big:
1141 	ret = -EIO;
1142 out_err:
1143 	kfree(snap_sizes);
1144 	kfree(snap_names);
1145 	ceph_put_snap_context(snapc);
1146 	kfree(object_prefix);
1147 
1148 	return ret;
1149 }
1150 
_rbd_dev_v1_snap_name(struct rbd_device * rbd_dev,u32 which)1151 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1152 {
1153 	const char *snap_name;
1154 
1155 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1156 
1157 	/* Skip over names until we find the one we are looking for */
1158 
1159 	snap_name = rbd_dev->header.snap_names;
1160 	while (which--)
1161 		snap_name += strlen(snap_name) + 1;
1162 
1163 	return kstrdup(snap_name, GFP_KERNEL);
1164 }
1165 
1166 /*
1167  * Snapshot id comparison function for use with qsort()/bsearch().
1168  * Note that result is for snapshots in *descending* order.
1169  */
snapid_compare_reverse(const void * s1,const void * s2)1170 static int snapid_compare_reverse(const void *s1, const void *s2)
1171 {
1172 	u64 snap_id1 = *(u64 *)s1;
1173 	u64 snap_id2 = *(u64 *)s2;
1174 
1175 	if (snap_id1 < snap_id2)
1176 		return 1;
1177 	return snap_id1 == snap_id2 ? 0 : -1;
1178 }
1179 
1180 /*
1181  * Search a snapshot context to see if the given snapshot id is
1182  * present.
1183  *
1184  * Returns the position of the snapshot id in the array if it's found,
1185  * or BAD_SNAP_INDEX otherwise.
1186  *
1187  * Note: The snapshot array is in kept sorted (by the osd) in
1188  * reverse order, highest snapshot id first.
1189  */
rbd_dev_snap_index(struct rbd_device * rbd_dev,u64 snap_id)1190 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1191 {
1192 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1193 	u64 *found;
1194 
1195 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1196 				sizeof (snap_id), snapid_compare_reverse);
1197 
1198 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1199 }
1200 
rbd_dev_v1_snap_name(struct rbd_device * rbd_dev,u64 snap_id)1201 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1202 					u64 snap_id)
1203 {
1204 	u32 which;
1205 	const char *snap_name;
1206 
1207 	which = rbd_dev_snap_index(rbd_dev, snap_id);
1208 	if (which == BAD_SNAP_INDEX)
1209 		return ERR_PTR(-ENOENT);
1210 
1211 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1212 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1213 }
1214 
rbd_snap_name(struct rbd_device * rbd_dev,u64 snap_id)1215 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1216 {
1217 	if (snap_id == CEPH_NOSNAP)
1218 		return RBD_SNAP_HEAD_NAME;
1219 
1220 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1221 	if (rbd_dev->image_format == 1)
1222 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1223 
1224 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1225 }
1226 
rbd_snap_size(struct rbd_device * rbd_dev,u64 snap_id,u64 * snap_size)1227 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1228 				u64 *snap_size)
1229 {
1230 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1231 	if (snap_id == CEPH_NOSNAP) {
1232 		*snap_size = rbd_dev->header.image_size;
1233 	} else if (rbd_dev->image_format == 1) {
1234 		u32 which;
1235 
1236 		which = rbd_dev_snap_index(rbd_dev, snap_id);
1237 		if (which == BAD_SNAP_INDEX)
1238 			return -ENOENT;
1239 
1240 		*snap_size = rbd_dev->header.snap_sizes[which];
1241 	} else {
1242 		u64 size = 0;
1243 		int ret;
1244 
1245 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1246 		if (ret)
1247 			return ret;
1248 
1249 		*snap_size = size;
1250 	}
1251 	return 0;
1252 }
1253 
rbd_dev_mapping_set(struct rbd_device * rbd_dev)1254 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1255 {
1256 	u64 snap_id = rbd_dev->spec->snap_id;
1257 	u64 size = 0;
1258 	int ret;
1259 
1260 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1261 	if (ret)
1262 		return ret;
1263 
1264 	rbd_dev->mapping.size = size;
1265 	return 0;
1266 }
1267 
rbd_dev_mapping_clear(struct rbd_device * rbd_dev)1268 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1269 {
1270 	rbd_dev->mapping.size = 0;
1271 }
1272 
zero_bvec(struct bio_vec * bv)1273 static void zero_bvec(struct bio_vec *bv)
1274 {
1275 	void *buf;
1276 	unsigned long flags;
1277 
1278 	buf = bvec_kmap_irq(bv, &flags);
1279 	memset(buf, 0, bv->bv_len);
1280 	flush_dcache_page(bv->bv_page);
1281 	bvec_kunmap_irq(buf, &flags);
1282 }
1283 
zero_bios(struct ceph_bio_iter * bio_pos,u32 off,u32 bytes)1284 static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1285 {
1286 	struct ceph_bio_iter it = *bio_pos;
1287 
1288 	ceph_bio_iter_advance(&it, off);
1289 	ceph_bio_iter_advance_step(&it, bytes, ({
1290 		zero_bvec(&bv);
1291 	}));
1292 }
1293 
zero_bvecs(struct ceph_bvec_iter * bvec_pos,u32 off,u32 bytes)1294 static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1295 {
1296 	struct ceph_bvec_iter it = *bvec_pos;
1297 
1298 	ceph_bvec_iter_advance(&it, off);
1299 	ceph_bvec_iter_advance_step(&it, bytes, ({
1300 		zero_bvec(&bv);
1301 	}));
1302 }
1303 
1304 /*
1305  * Zero a range in @obj_req data buffer defined by a bio (list) or
1306  * (private) bio_vec array.
1307  *
1308  * @off is relative to the start of the data buffer.
1309  */
rbd_obj_zero_range(struct rbd_obj_request * obj_req,u32 off,u32 bytes)1310 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1311 			       u32 bytes)
1312 {
1313 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1314 
1315 	switch (obj_req->img_request->data_type) {
1316 	case OBJ_REQUEST_BIO:
1317 		zero_bios(&obj_req->bio_pos, off, bytes);
1318 		break;
1319 	case OBJ_REQUEST_BVECS:
1320 	case OBJ_REQUEST_OWN_BVECS:
1321 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
1322 		break;
1323 	default:
1324 		BUG();
1325 	}
1326 }
1327 
1328 static void rbd_obj_request_destroy(struct kref *kref);
rbd_obj_request_put(struct rbd_obj_request * obj_request)1329 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1330 {
1331 	rbd_assert(obj_request != NULL);
1332 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1333 		kref_read(&obj_request->kref));
1334 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1335 }
1336 
rbd_img_obj_request_add(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1337 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1338 					struct rbd_obj_request *obj_request)
1339 {
1340 	rbd_assert(obj_request->img_request == NULL);
1341 
1342 	/* Image request now owns object's original reference */
1343 	obj_request->img_request = img_request;
1344 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1345 }
1346 
rbd_img_obj_request_del(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1347 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1348 					struct rbd_obj_request *obj_request)
1349 {
1350 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1351 	list_del(&obj_request->ex.oe_item);
1352 	rbd_assert(obj_request->img_request == img_request);
1353 	rbd_obj_request_put(obj_request);
1354 }
1355 
rbd_osd_submit(struct ceph_osd_request * osd_req)1356 static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1357 {
1358 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1359 
1360 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1361 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1362 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1363 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1364 }
1365 
1366 /*
1367  * The default/initial value for all image request flags is 0.  Each
1368  * is conditionally set to 1 at image request initialization time
1369  * and currently never change thereafter.
1370  */
img_request_layered_set(struct rbd_img_request * img_request)1371 static void img_request_layered_set(struct rbd_img_request *img_request)
1372 {
1373 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1374 }
1375 
img_request_layered_test(struct rbd_img_request * img_request)1376 static bool img_request_layered_test(struct rbd_img_request *img_request)
1377 {
1378 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1379 }
1380 
rbd_obj_is_entire(struct rbd_obj_request * obj_req)1381 static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1382 {
1383 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1384 
1385 	return !obj_req->ex.oe_off &&
1386 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
1387 }
1388 
rbd_obj_is_tail(struct rbd_obj_request * obj_req)1389 static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1390 {
1391 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1392 
1393 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1394 					rbd_dev->layout.object_size;
1395 }
1396 
1397 /*
1398  * Must be called after rbd_obj_calc_img_extents().
1399  */
rbd_obj_set_copyup_enabled(struct rbd_obj_request * obj_req)1400 static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
1401 {
1402 	rbd_assert(obj_req->img_request->snapc);
1403 
1404 	if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
1405 		dout("%s %p objno %llu discard\n", __func__, obj_req,
1406 		     obj_req->ex.oe_objno);
1407 		return;
1408 	}
1409 
1410 	if (!obj_req->num_img_extents) {
1411 		dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
1412 		     obj_req->ex.oe_objno);
1413 		return;
1414 	}
1415 
1416 	if (rbd_obj_is_entire(obj_req) &&
1417 	    !obj_req->img_request->snapc->num_snaps) {
1418 		dout("%s %p objno %llu entire\n", __func__, obj_req,
1419 		     obj_req->ex.oe_objno);
1420 		return;
1421 	}
1422 
1423 	obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
1424 }
1425 
rbd_obj_img_extents_bytes(struct rbd_obj_request * obj_req)1426 static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1427 {
1428 	return ceph_file_extents_bytes(obj_req->img_extents,
1429 				       obj_req->num_img_extents);
1430 }
1431 
rbd_img_is_write(struct rbd_img_request * img_req)1432 static bool rbd_img_is_write(struct rbd_img_request *img_req)
1433 {
1434 	switch (img_req->op_type) {
1435 	case OBJ_OP_READ:
1436 		return false;
1437 	case OBJ_OP_WRITE:
1438 	case OBJ_OP_DISCARD:
1439 	case OBJ_OP_ZEROOUT:
1440 		return true;
1441 	default:
1442 		BUG();
1443 	}
1444 }
1445 
rbd_osd_req_callback(struct ceph_osd_request * osd_req)1446 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1447 {
1448 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1449 	int result;
1450 
1451 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1452 	     osd_req->r_result, obj_req);
1453 
1454 	/*
1455 	 * Writes aren't allowed to return a data payload.  In some
1456 	 * guarded write cases (e.g. stat + zero on an empty object)
1457 	 * a stat response makes it through, but we don't care.
1458 	 */
1459 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1460 		result = 0;
1461 	else
1462 		result = osd_req->r_result;
1463 
1464 	rbd_obj_handle_request(obj_req, result);
1465 }
1466 
rbd_osd_format_read(struct ceph_osd_request * osd_req)1467 static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1468 {
1469 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1470 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1471 	struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1472 
1473 	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
1474 	osd_req->r_snapid = obj_request->img_request->snap_id;
1475 }
1476 
rbd_osd_format_write(struct ceph_osd_request * osd_req)1477 static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1478 {
1479 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1480 
1481 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1482 	ktime_get_real_ts64(&osd_req->r_mtime);
1483 	osd_req->r_data_offset = obj_request->ex.oe_off;
1484 }
1485 
1486 static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,struct ceph_snap_context * snapc,int num_ops)1487 __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1488 			  struct ceph_snap_context *snapc, int num_ops)
1489 {
1490 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1491 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1492 	struct ceph_osd_request *req;
1493 	const char *name_format = rbd_dev->image_format == 1 ?
1494 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1495 	int ret;
1496 
1497 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1498 	if (!req)
1499 		return ERR_PTR(-ENOMEM);
1500 
1501 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1502 	req->r_callback = rbd_osd_req_callback;
1503 	req->r_priv = obj_req;
1504 
1505 	/*
1506 	 * Data objects may be stored in a separate pool, but always in
1507 	 * the same namespace in that pool as the header in its pool.
1508 	 */
1509 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1510 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1511 
1512 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1513 			       rbd_dev->header.object_prefix,
1514 			       obj_req->ex.oe_objno);
1515 	if (ret)
1516 		return ERR_PTR(ret);
1517 
1518 	return req;
1519 }
1520 
1521 static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,int num_ops)1522 rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1523 {
1524 	rbd_assert(obj_req->img_request->snapc);
1525 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1526 					 num_ops);
1527 }
1528 
rbd_obj_request_create(void)1529 static struct rbd_obj_request *rbd_obj_request_create(void)
1530 {
1531 	struct rbd_obj_request *obj_request;
1532 
1533 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1534 	if (!obj_request)
1535 		return NULL;
1536 
1537 	ceph_object_extent_init(&obj_request->ex);
1538 	INIT_LIST_HEAD(&obj_request->osd_reqs);
1539 	mutex_init(&obj_request->state_mutex);
1540 	kref_init(&obj_request->kref);
1541 
1542 	dout("%s %p\n", __func__, obj_request);
1543 	return obj_request;
1544 }
1545 
rbd_obj_request_destroy(struct kref * kref)1546 static void rbd_obj_request_destroy(struct kref *kref)
1547 {
1548 	struct rbd_obj_request *obj_request;
1549 	struct ceph_osd_request *osd_req;
1550 	u32 i;
1551 
1552 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1553 
1554 	dout("%s: obj %p\n", __func__, obj_request);
1555 
1556 	while (!list_empty(&obj_request->osd_reqs)) {
1557 		osd_req = list_first_entry(&obj_request->osd_reqs,
1558 				    struct ceph_osd_request, r_private_item);
1559 		list_del_init(&osd_req->r_private_item);
1560 		ceph_osdc_put_request(osd_req);
1561 	}
1562 
1563 	switch (obj_request->img_request->data_type) {
1564 	case OBJ_REQUEST_NODATA:
1565 	case OBJ_REQUEST_BIO:
1566 	case OBJ_REQUEST_BVECS:
1567 		break;		/* Nothing to do */
1568 	case OBJ_REQUEST_OWN_BVECS:
1569 		kfree(obj_request->bvec_pos.bvecs);
1570 		break;
1571 	default:
1572 		BUG();
1573 	}
1574 
1575 	kfree(obj_request->img_extents);
1576 	if (obj_request->copyup_bvecs) {
1577 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1578 			if (obj_request->copyup_bvecs[i].bv_page)
1579 				__free_page(obj_request->copyup_bvecs[i].bv_page);
1580 		}
1581 		kfree(obj_request->copyup_bvecs);
1582 	}
1583 
1584 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1585 }
1586 
1587 /* It's OK to call this for a device with no parent */
1588 
1589 static void rbd_spec_put(struct rbd_spec *spec);
rbd_dev_unparent(struct rbd_device * rbd_dev)1590 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1591 {
1592 	rbd_dev_remove_parent(rbd_dev);
1593 	rbd_spec_put(rbd_dev->parent_spec);
1594 	rbd_dev->parent_spec = NULL;
1595 	rbd_dev->parent_overlap = 0;
1596 }
1597 
1598 /*
1599  * Parent image reference counting is used to determine when an
1600  * image's parent fields can be safely torn down--after there are no
1601  * more in-flight requests to the parent image.  When the last
1602  * reference is dropped, cleaning them up is safe.
1603  */
rbd_dev_parent_put(struct rbd_device * rbd_dev)1604 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1605 {
1606 	int counter;
1607 
1608 	if (!rbd_dev->parent_spec)
1609 		return;
1610 
1611 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1612 	if (counter > 0)
1613 		return;
1614 
1615 	/* Last reference; clean up parent data structures */
1616 
1617 	if (!counter)
1618 		rbd_dev_unparent(rbd_dev);
1619 	else
1620 		rbd_warn(rbd_dev, "parent reference underflow");
1621 }
1622 
1623 /*
1624  * If an image has a non-zero parent overlap, get a reference to its
1625  * parent.
1626  *
1627  * Returns true if the rbd device has a parent with a non-zero
1628  * overlap and a reference for it was successfully taken, or
1629  * false otherwise.
1630  */
rbd_dev_parent_get(struct rbd_device * rbd_dev)1631 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1632 {
1633 	int counter = 0;
1634 
1635 	if (!rbd_dev->parent_spec)
1636 		return false;
1637 
1638 	if (rbd_dev->parent_overlap)
1639 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1640 
1641 	if (counter < 0)
1642 		rbd_warn(rbd_dev, "parent reference overflow");
1643 
1644 	return counter > 0;
1645 }
1646 
rbd_img_request_init(struct rbd_img_request * img_request,struct rbd_device * rbd_dev,enum obj_operation_type op_type)1647 static void rbd_img_request_init(struct rbd_img_request *img_request,
1648 				 struct rbd_device *rbd_dev,
1649 				 enum obj_operation_type op_type)
1650 {
1651 	memset(img_request, 0, sizeof(*img_request));
1652 
1653 	img_request->rbd_dev = rbd_dev;
1654 	img_request->op_type = op_type;
1655 
1656 	INIT_LIST_HEAD(&img_request->lock_item);
1657 	INIT_LIST_HEAD(&img_request->object_extents);
1658 	mutex_init(&img_request->state_mutex);
1659 }
1660 
1661 /*
1662  * Only snap_id is captured here, for reads.  For writes, snapshot
1663  * context is captured in rbd_img_object_requests() after exclusive
1664  * lock is ensured to be held.
1665  */
rbd_img_capture_header(struct rbd_img_request * img_req)1666 static void rbd_img_capture_header(struct rbd_img_request *img_req)
1667 {
1668 	struct rbd_device *rbd_dev = img_req->rbd_dev;
1669 
1670 	lockdep_assert_held(&rbd_dev->header_rwsem);
1671 
1672 	if (!rbd_img_is_write(img_req))
1673 		img_req->snap_id = rbd_dev->spec->snap_id;
1674 
1675 	if (rbd_dev_parent_get(rbd_dev))
1676 		img_request_layered_set(img_req);
1677 }
1678 
rbd_img_request_destroy(struct rbd_img_request * img_request)1679 static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1680 {
1681 	struct rbd_obj_request *obj_request;
1682 	struct rbd_obj_request *next_obj_request;
1683 
1684 	dout("%s: img %p\n", __func__, img_request);
1685 
1686 	WARN_ON(!list_empty(&img_request->lock_item));
1687 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1688 		rbd_img_obj_request_del(img_request, obj_request);
1689 
1690 	if (img_request_layered_test(img_request))
1691 		rbd_dev_parent_put(img_request->rbd_dev);
1692 
1693 	if (rbd_img_is_write(img_request))
1694 		ceph_put_snap_context(img_request->snapc);
1695 
1696 	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
1697 		kmem_cache_free(rbd_img_request_cache, img_request);
1698 }
1699 
1700 #define BITS_PER_OBJ	2
1701 #define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
1702 #define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
1703 
__rbd_object_map_index(struct rbd_device * rbd_dev,u64 objno,u64 * index,u8 * shift)1704 static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1705 				   u64 *index, u8 *shift)
1706 {
1707 	u32 off;
1708 
1709 	rbd_assert(objno < rbd_dev->object_map_size);
1710 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1711 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1712 }
1713 
__rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)1714 static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1715 {
1716 	u64 index;
1717 	u8 shift;
1718 
1719 	lockdep_assert_held(&rbd_dev->object_map_lock);
1720 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1721 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1722 }
1723 
__rbd_object_map_set(struct rbd_device * rbd_dev,u64 objno,u8 val)1724 static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1725 {
1726 	u64 index;
1727 	u8 shift;
1728 	u8 *p;
1729 
1730 	lockdep_assert_held(&rbd_dev->object_map_lock);
1731 	rbd_assert(!(val & ~OBJ_MASK));
1732 
1733 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1734 	p = &rbd_dev->object_map[index];
1735 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1736 }
1737 
rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)1738 static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1739 {
1740 	u8 state;
1741 
1742 	spin_lock(&rbd_dev->object_map_lock);
1743 	state = __rbd_object_map_get(rbd_dev, objno);
1744 	spin_unlock(&rbd_dev->object_map_lock);
1745 	return state;
1746 }
1747 
use_object_map(struct rbd_device * rbd_dev)1748 static bool use_object_map(struct rbd_device *rbd_dev)
1749 {
1750 	/*
1751 	 * An image mapped read-only can't use the object map -- it isn't
1752 	 * loaded because the header lock isn't acquired.  Someone else can
1753 	 * write to the image and update the object map behind our back.
1754 	 *
1755 	 * A snapshot can't be written to, so using the object map is always
1756 	 * safe.
1757 	 */
1758 	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1759 		return false;
1760 
1761 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1762 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1763 }
1764 
rbd_object_map_may_exist(struct rbd_device * rbd_dev,u64 objno)1765 static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1766 {
1767 	u8 state;
1768 
1769 	/* fall back to default logic if object map is disabled or invalid */
1770 	if (!use_object_map(rbd_dev))
1771 		return true;
1772 
1773 	state = rbd_object_map_get(rbd_dev, objno);
1774 	return state != OBJECT_NONEXISTENT;
1775 }
1776 
rbd_object_map_name(struct rbd_device * rbd_dev,u64 snap_id,struct ceph_object_id * oid)1777 static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1778 				struct ceph_object_id *oid)
1779 {
1780 	if (snap_id == CEPH_NOSNAP)
1781 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1782 				rbd_dev->spec->image_id);
1783 	else
1784 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1785 				rbd_dev->spec->image_id, snap_id);
1786 }
1787 
rbd_object_map_lock(struct rbd_device * rbd_dev)1788 static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1789 {
1790 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1791 	CEPH_DEFINE_OID_ONSTACK(oid);
1792 	u8 lock_type;
1793 	char *lock_tag;
1794 	struct ceph_locker *lockers;
1795 	u32 num_lockers;
1796 	bool broke_lock = false;
1797 	int ret;
1798 
1799 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1800 
1801 again:
1802 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1803 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1804 	if (ret != -EBUSY || broke_lock) {
1805 		if (ret == -EEXIST)
1806 			ret = 0; /* already locked by myself */
1807 		if (ret)
1808 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1809 		return ret;
1810 	}
1811 
1812 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1813 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
1814 				 &lockers, &num_lockers);
1815 	if (ret) {
1816 		if (ret == -ENOENT)
1817 			goto again;
1818 
1819 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1820 		return ret;
1821 	}
1822 
1823 	kfree(lock_tag);
1824 	if (num_lockers == 0)
1825 		goto again;
1826 
1827 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1828 		 ENTITY_NAME(lockers[0].id.name));
1829 
1830 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1831 				  RBD_LOCK_NAME, lockers[0].id.cookie,
1832 				  &lockers[0].id.name);
1833 	ceph_free_lockers(lockers, num_lockers);
1834 	if (ret) {
1835 		if (ret == -ENOENT)
1836 			goto again;
1837 
1838 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1839 		return ret;
1840 	}
1841 
1842 	broke_lock = true;
1843 	goto again;
1844 }
1845 
rbd_object_map_unlock(struct rbd_device * rbd_dev)1846 static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1847 {
1848 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1849 	CEPH_DEFINE_OID_ONSTACK(oid);
1850 	int ret;
1851 
1852 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1853 
1854 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1855 			      "");
1856 	if (ret && ret != -ENOENT)
1857 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1858 }
1859 
decode_object_map_header(void ** p,void * end,u64 * object_map_size)1860 static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1861 {
1862 	u8 struct_v;
1863 	u32 struct_len;
1864 	u32 header_len;
1865 	void *header_end;
1866 	int ret;
1867 
1868 	ceph_decode_32_safe(p, end, header_len, e_inval);
1869 	header_end = *p + header_len;
1870 
1871 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1872 				  &struct_len);
1873 	if (ret)
1874 		return ret;
1875 
1876 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1877 
1878 	*p = header_end;
1879 	return 0;
1880 
1881 e_inval:
1882 	return -EINVAL;
1883 }
1884 
__rbd_object_map_load(struct rbd_device * rbd_dev)1885 static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1886 {
1887 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1888 	CEPH_DEFINE_OID_ONSTACK(oid);
1889 	struct page **pages;
1890 	void *p, *end;
1891 	size_t reply_len;
1892 	u64 num_objects;
1893 	u64 object_map_bytes;
1894 	u64 object_map_size;
1895 	int num_pages;
1896 	int ret;
1897 
1898 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1899 
1900 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
1901 					   rbd_dev->mapping.size);
1902 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1903 					    BITS_PER_BYTE);
1904 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
1905 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1906 	if (IS_ERR(pages))
1907 		return PTR_ERR(pages);
1908 
1909 	reply_len = num_pages * PAGE_SIZE;
1910 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1911 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1912 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1913 			     NULL, 0, pages, &reply_len);
1914 	if (ret)
1915 		goto out;
1916 
1917 	p = page_address(pages[0]);
1918 	end = p + min(reply_len, (size_t)PAGE_SIZE);
1919 	ret = decode_object_map_header(&p, end, &object_map_size);
1920 	if (ret)
1921 		goto out;
1922 
1923 	if (object_map_size != num_objects) {
1924 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1925 			 object_map_size, num_objects);
1926 		ret = -EINVAL;
1927 		goto out;
1928 	}
1929 
1930 	if (offset_in_page(p) + object_map_bytes > reply_len) {
1931 		ret = -EINVAL;
1932 		goto out;
1933 	}
1934 
1935 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1936 	if (!rbd_dev->object_map) {
1937 		ret = -ENOMEM;
1938 		goto out;
1939 	}
1940 
1941 	rbd_dev->object_map_size = object_map_size;
1942 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1943 				   offset_in_page(p), object_map_bytes);
1944 
1945 out:
1946 	ceph_release_page_vector(pages, num_pages);
1947 	return ret;
1948 }
1949 
rbd_object_map_free(struct rbd_device * rbd_dev)1950 static void rbd_object_map_free(struct rbd_device *rbd_dev)
1951 {
1952 	kvfree(rbd_dev->object_map);
1953 	rbd_dev->object_map = NULL;
1954 	rbd_dev->object_map_size = 0;
1955 }
1956 
rbd_object_map_load(struct rbd_device * rbd_dev)1957 static int rbd_object_map_load(struct rbd_device *rbd_dev)
1958 {
1959 	int ret;
1960 
1961 	ret = __rbd_object_map_load(rbd_dev);
1962 	if (ret)
1963 		return ret;
1964 
1965 	ret = rbd_dev_v2_get_flags(rbd_dev);
1966 	if (ret) {
1967 		rbd_object_map_free(rbd_dev);
1968 		return ret;
1969 	}
1970 
1971 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1972 		rbd_warn(rbd_dev, "object map is invalid");
1973 
1974 	return 0;
1975 }
1976 
rbd_object_map_open(struct rbd_device * rbd_dev)1977 static int rbd_object_map_open(struct rbd_device *rbd_dev)
1978 {
1979 	int ret;
1980 
1981 	ret = rbd_object_map_lock(rbd_dev);
1982 	if (ret)
1983 		return ret;
1984 
1985 	ret = rbd_object_map_load(rbd_dev);
1986 	if (ret) {
1987 		rbd_object_map_unlock(rbd_dev);
1988 		return ret;
1989 	}
1990 
1991 	return 0;
1992 }
1993 
rbd_object_map_close(struct rbd_device * rbd_dev)1994 static void rbd_object_map_close(struct rbd_device *rbd_dev)
1995 {
1996 	rbd_object_map_free(rbd_dev);
1997 	rbd_object_map_unlock(rbd_dev);
1998 }
1999 
2000 /*
2001  * This function needs snap_id (or more precisely just something to
2002  * distinguish between HEAD and snapshot object maps), new_state and
2003  * current_state that were passed to rbd_object_map_update().
2004  *
2005  * To avoid allocating and stashing a context we piggyback on the OSD
2006  * request.  A HEAD update has two ops (assert_locked).  For new_state
2007  * and current_state we decode our own object_map_update op, encoded in
2008  * rbd_cls_object_map_update().
2009  */
rbd_object_map_update_finish(struct rbd_obj_request * obj_req,struct ceph_osd_request * osd_req)2010 static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
2011 					struct ceph_osd_request *osd_req)
2012 {
2013 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2014 	struct ceph_osd_data *osd_data;
2015 	u64 objno;
2016 	u8 state, new_state, current_state;
2017 	bool has_current_state;
2018 	void *p;
2019 
2020 	if (osd_req->r_result)
2021 		return osd_req->r_result;
2022 
2023 	/*
2024 	 * Nothing to do for a snapshot object map.
2025 	 */
2026 	if (osd_req->r_num_ops == 1)
2027 		return 0;
2028 
2029 	/*
2030 	 * Update in-memory HEAD object map.
2031 	 */
2032 	rbd_assert(osd_req->r_num_ops == 2);
2033 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2034 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2035 
2036 	p = page_address(osd_data->pages[0]);
2037 	objno = ceph_decode_64(&p);
2038 	rbd_assert(objno == obj_req->ex.oe_objno);
2039 	rbd_assert(ceph_decode_64(&p) == objno + 1);
2040 	new_state = ceph_decode_8(&p);
2041 	has_current_state = ceph_decode_8(&p);
2042 	if (has_current_state)
2043 		current_state = ceph_decode_8(&p);
2044 
2045 	spin_lock(&rbd_dev->object_map_lock);
2046 	state = __rbd_object_map_get(rbd_dev, objno);
2047 	if (!has_current_state || current_state == state ||
2048 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2049 		__rbd_object_map_set(rbd_dev, objno, new_state);
2050 	spin_unlock(&rbd_dev->object_map_lock);
2051 
2052 	return 0;
2053 }
2054 
rbd_object_map_callback(struct ceph_osd_request * osd_req)2055 static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2056 {
2057 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2058 	int result;
2059 
2060 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2061 	     osd_req->r_result, obj_req);
2062 
2063 	result = rbd_object_map_update_finish(obj_req, osd_req);
2064 	rbd_obj_handle_request(obj_req, result);
2065 }
2066 
update_needed(struct rbd_device * rbd_dev,u64 objno,u8 new_state)2067 static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2068 {
2069 	u8 state = rbd_object_map_get(rbd_dev, objno);
2070 
2071 	if (state == new_state ||
2072 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2073 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2074 		return false;
2075 
2076 	return true;
2077 }
2078 
rbd_cls_object_map_update(struct ceph_osd_request * req,int which,u64 objno,u8 new_state,const u8 * current_state)2079 static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2080 				     int which, u64 objno, u8 new_state,
2081 				     const u8 *current_state)
2082 {
2083 	struct page **pages;
2084 	void *p, *start;
2085 	int ret;
2086 
2087 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2088 	if (ret)
2089 		return ret;
2090 
2091 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2092 	if (IS_ERR(pages))
2093 		return PTR_ERR(pages);
2094 
2095 	p = start = page_address(pages[0]);
2096 	ceph_encode_64(&p, objno);
2097 	ceph_encode_64(&p, objno + 1);
2098 	ceph_encode_8(&p, new_state);
2099 	if (current_state) {
2100 		ceph_encode_8(&p, 1);
2101 		ceph_encode_8(&p, *current_state);
2102 	} else {
2103 		ceph_encode_8(&p, 0);
2104 	}
2105 
2106 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2107 					  false, true);
2108 	return 0;
2109 }
2110 
2111 /*
2112  * Return:
2113  *   0 - object map update sent
2114  *   1 - object map update isn't needed
2115  *  <0 - error
2116  */
rbd_object_map_update(struct rbd_obj_request * obj_req,u64 snap_id,u8 new_state,const u8 * current_state)2117 static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2118 				 u8 new_state, const u8 *current_state)
2119 {
2120 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2121 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2122 	struct ceph_osd_request *req;
2123 	int num_ops = 1;
2124 	int which = 0;
2125 	int ret;
2126 
2127 	if (snap_id == CEPH_NOSNAP) {
2128 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2129 			return 1;
2130 
2131 		num_ops++; /* assert_locked */
2132 	}
2133 
2134 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2135 	if (!req)
2136 		return -ENOMEM;
2137 
2138 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2139 	req->r_callback = rbd_object_map_callback;
2140 	req->r_priv = obj_req;
2141 
2142 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2143 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2144 	req->r_flags = CEPH_OSD_FLAG_WRITE;
2145 	ktime_get_real_ts64(&req->r_mtime);
2146 
2147 	if (snap_id == CEPH_NOSNAP) {
2148 		/*
2149 		 * Protect against possible race conditions during lock
2150 		 * ownership transitions.
2151 		 */
2152 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2153 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2154 		if (ret)
2155 			return ret;
2156 	}
2157 
2158 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2159 					new_state, current_state);
2160 	if (ret)
2161 		return ret;
2162 
2163 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2164 	if (ret)
2165 		return ret;
2166 
2167 	ceph_osdc_start_request(osdc, req, false);
2168 	return 0;
2169 }
2170 
prune_extents(struct ceph_file_extent * img_extents,u32 * num_img_extents,u64 overlap)2171 static void prune_extents(struct ceph_file_extent *img_extents,
2172 			  u32 *num_img_extents, u64 overlap)
2173 {
2174 	u32 cnt = *num_img_extents;
2175 
2176 	/* drop extents completely beyond the overlap */
2177 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2178 		cnt--;
2179 
2180 	if (cnt) {
2181 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2182 
2183 		/* trim final overlapping extent */
2184 		if (ex->fe_off + ex->fe_len > overlap)
2185 			ex->fe_len = overlap - ex->fe_off;
2186 	}
2187 
2188 	*num_img_extents = cnt;
2189 }
2190 
2191 /*
2192  * Determine the byte range(s) covered by either just the object extent
2193  * or the entire object in the parent image.
2194  */
rbd_obj_calc_img_extents(struct rbd_obj_request * obj_req,bool entire)2195 static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2196 				    bool entire)
2197 {
2198 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2199 	int ret;
2200 
2201 	if (!rbd_dev->parent_overlap)
2202 		return 0;
2203 
2204 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2205 				  entire ? 0 : obj_req->ex.oe_off,
2206 				  entire ? rbd_dev->layout.object_size :
2207 							obj_req->ex.oe_len,
2208 				  &obj_req->img_extents,
2209 				  &obj_req->num_img_extents);
2210 	if (ret)
2211 		return ret;
2212 
2213 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2214 		      rbd_dev->parent_overlap);
2215 	return 0;
2216 }
2217 
rbd_osd_setup_data(struct ceph_osd_request * osd_req,int which)2218 static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2219 {
2220 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2221 
2222 	switch (obj_req->img_request->data_type) {
2223 	case OBJ_REQUEST_BIO:
2224 		osd_req_op_extent_osd_data_bio(osd_req, which,
2225 					       &obj_req->bio_pos,
2226 					       obj_req->ex.oe_len);
2227 		break;
2228 	case OBJ_REQUEST_BVECS:
2229 	case OBJ_REQUEST_OWN_BVECS:
2230 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2231 							obj_req->ex.oe_len);
2232 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2233 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2234 						    &obj_req->bvec_pos);
2235 		break;
2236 	default:
2237 		BUG();
2238 	}
2239 }
2240 
rbd_osd_setup_stat(struct ceph_osd_request * osd_req,int which)2241 static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2242 {
2243 	struct page **pages;
2244 
2245 	/*
2246 	 * The response data for a STAT call consists of:
2247 	 *     le64 length;
2248 	 *     struct {
2249 	 *         le32 tv_sec;
2250 	 *         le32 tv_nsec;
2251 	 *     } mtime;
2252 	 */
2253 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2254 	if (IS_ERR(pages))
2255 		return PTR_ERR(pages);
2256 
2257 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2258 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
2259 				     8 + sizeof(struct ceph_timespec),
2260 				     0, false, true);
2261 	return 0;
2262 }
2263 
rbd_osd_setup_copyup(struct ceph_osd_request * osd_req,int which,u32 bytes)2264 static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2265 				u32 bytes)
2266 {
2267 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2268 	int ret;
2269 
2270 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2271 	if (ret)
2272 		return ret;
2273 
2274 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2275 					  obj_req->copyup_bvec_count, bytes);
2276 	return 0;
2277 }
2278 
rbd_obj_init_read(struct rbd_obj_request * obj_req)2279 static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2280 {
2281 	obj_req->read_state = RBD_OBJ_READ_START;
2282 	return 0;
2283 }
2284 
__rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2285 static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2286 				      int which)
2287 {
2288 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2289 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2290 	u16 opcode;
2291 
2292 	if (!use_object_map(rbd_dev) ||
2293 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2294 		osd_req_op_alloc_hint_init(osd_req, which++,
2295 					   rbd_dev->layout.object_size,
2296 					   rbd_dev->layout.object_size,
2297 					   rbd_dev->opts->alloc_hint_flags);
2298 	}
2299 
2300 	if (rbd_obj_is_entire(obj_req))
2301 		opcode = CEPH_OSD_OP_WRITEFULL;
2302 	else
2303 		opcode = CEPH_OSD_OP_WRITE;
2304 
2305 	osd_req_op_extent_init(osd_req, which, opcode,
2306 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2307 	rbd_osd_setup_data(osd_req, which);
2308 }
2309 
rbd_obj_init_write(struct rbd_obj_request * obj_req)2310 static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2311 {
2312 	int ret;
2313 
2314 	/* reverse map the entire object onto the parent */
2315 	ret = rbd_obj_calc_img_extents(obj_req, true);
2316 	if (ret)
2317 		return ret;
2318 
2319 	obj_req->write_state = RBD_OBJ_WRITE_START;
2320 	return 0;
2321 }
2322 
truncate_or_zero_opcode(struct rbd_obj_request * obj_req)2323 static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2324 {
2325 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2326 					  CEPH_OSD_OP_ZERO;
2327 }
2328 
__rbd_osd_setup_discard_ops(struct ceph_osd_request * osd_req,int which)2329 static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2330 					int which)
2331 {
2332 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2333 
2334 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2335 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2336 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2337 	} else {
2338 		osd_req_op_extent_init(osd_req, which,
2339 				       truncate_or_zero_opcode(obj_req),
2340 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2341 				       0, 0);
2342 	}
2343 }
2344 
rbd_obj_init_discard(struct rbd_obj_request * obj_req)2345 static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2346 {
2347 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2348 	u64 off, next_off;
2349 	int ret;
2350 
2351 	/*
2352 	 * Align the range to alloc_size boundary and punt on discards
2353 	 * that are too small to free up any space.
2354 	 *
2355 	 * alloc_size == object_size && is_tail() is a special case for
2356 	 * filestore with filestore_punch_hole = false, needed to allow
2357 	 * truncate (in addition to delete).
2358 	 */
2359 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2360 	    !rbd_obj_is_tail(obj_req)) {
2361 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2362 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2363 				      rbd_dev->opts->alloc_size);
2364 		if (off >= next_off)
2365 			return 1;
2366 
2367 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2368 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2369 		     off, next_off - off);
2370 		obj_req->ex.oe_off = off;
2371 		obj_req->ex.oe_len = next_off - off;
2372 	}
2373 
2374 	/* reverse map the entire object onto the parent */
2375 	ret = rbd_obj_calc_img_extents(obj_req, true);
2376 	if (ret)
2377 		return ret;
2378 
2379 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2380 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2381 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2382 
2383 	obj_req->write_state = RBD_OBJ_WRITE_START;
2384 	return 0;
2385 }
2386 
__rbd_osd_setup_zeroout_ops(struct ceph_osd_request * osd_req,int which)2387 static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2388 					int which)
2389 {
2390 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2391 	u16 opcode;
2392 
2393 	if (rbd_obj_is_entire(obj_req)) {
2394 		if (obj_req->num_img_extents) {
2395 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2396 				osd_req_op_init(osd_req, which++,
2397 						CEPH_OSD_OP_CREATE, 0);
2398 			opcode = CEPH_OSD_OP_TRUNCATE;
2399 		} else {
2400 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2401 			osd_req_op_init(osd_req, which++,
2402 					CEPH_OSD_OP_DELETE, 0);
2403 			opcode = 0;
2404 		}
2405 	} else {
2406 		opcode = truncate_or_zero_opcode(obj_req);
2407 	}
2408 
2409 	if (opcode)
2410 		osd_req_op_extent_init(osd_req, which, opcode,
2411 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2412 				       0, 0);
2413 }
2414 
rbd_obj_init_zeroout(struct rbd_obj_request * obj_req)2415 static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2416 {
2417 	int ret;
2418 
2419 	/* reverse map the entire object onto the parent */
2420 	ret = rbd_obj_calc_img_extents(obj_req, true);
2421 	if (ret)
2422 		return ret;
2423 
2424 	if (!obj_req->num_img_extents) {
2425 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2426 		if (rbd_obj_is_entire(obj_req))
2427 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2428 	}
2429 
2430 	obj_req->write_state = RBD_OBJ_WRITE_START;
2431 	return 0;
2432 }
2433 
count_write_ops(struct rbd_obj_request * obj_req)2434 static int count_write_ops(struct rbd_obj_request *obj_req)
2435 {
2436 	struct rbd_img_request *img_req = obj_req->img_request;
2437 
2438 	switch (img_req->op_type) {
2439 	case OBJ_OP_WRITE:
2440 		if (!use_object_map(img_req->rbd_dev) ||
2441 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2442 			return 2; /* setallochint + write/writefull */
2443 
2444 		return 1; /* write/writefull */
2445 	case OBJ_OP_DISCARD:
2446 		return 1; /* delete/truncate/zero */
2447 	case OBJ_OP_ZEROOUT:
2448 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2449 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2450 			return 2; /* create + truncate */
2451 
2452 		return 1; /* delete/truncate/zero */
2453 	default:
2454 		BUG();
2455 	}
2456 }
2457 
rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2458 static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2459 				    int which)
2460 {
2461 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2462 
2463 	switch (obj_req->img_request->op_type) {
2464 	case OBJ_OP_WRITE:
2465 		__rbd_osd_setup_write_ops(osd_req, which);
2466 		break;
2467 	case OBJ_OP_DISCARD:
2468 		__rbd_osd_setup_discard_ops(osd_req, which);
2469 		break;
2470 	case OBJ_OP_ZEROOUT:
2471 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2472 		break;
2473 	default:
2474 		BUG();
2475 	}
2476 }
2477 
2478 /*
2479  * Prune the list of object requests (adjust offset and/or length, drop
2480  * redundant requests).  Prepare object request state machines and image
2481  * request state machine for execution.
2482  */
__rbd_img_fill_request(struct rbd_img_request * img_req)2483 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2484 {
2485 	struct rbd_obj_request *obj_req, *next_obj_req;
2486 	int ret;
2487 
2488 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2489 		switch (img_req->op_type) {
2490 		case OBJ_OP_READ:
2491 			ret = rbd_obj_init_read(obj_req);
2492 			break;
2493 		case OBJ_OP_WRITE:
2494 			ret = rbd_obj_init_write(obj_req);
2495 			break;
2496 		case OBJ_OP_DISCARD:
2497 			ret = rbd_obj_init_discard(obj_req);
2498 			break;
2499 		case OBJ_OP_ZEROOUT:
2500 			ret = rbd_obj_init_zeroout(obj_req);
2501 			break;
2502 		default:
2503 			BUG();
2504 		}
2505 		if (ret < 0)
2506 			return ret;
2507 		if (ret > 0) {
2508 			rbd_img_obj_request_del(img_req, obj_req);
2509 			continue;
2510 		}
2511 	}
2512 
2513 	img_req->state = RBD_IMG_START;
2514 	return 0;
2515 }
2516 
2517 union rbd_img_fill_iter {
2518 	struct ceph_bio_iter	bio_iter;
2519 	struct ceph_bvec_iter	bvec_iter;
2520 };
2521 
2522 struct rbd_img_fill_ctx {
2523 	enum obj_request_type	pos_type;
2524 	union rbd_img_fill_iter	*pos;
2525 	union rbd_img_fill_iter	iter;
2526 	ceph_object_extent_fn_t	set_pos_fn;
2527 	ceph_object_extent_fn_t	count_fn;
2528 	ceph_object_extent_fn_t	copy_fn;
2529 };
2530 
alloc_object_extent(void * arg)2531 static struct ceph_object_extent *alloc_object_extent(void *arg)
2532 {
2533 	struct rbd_img_request *img_req = arg;
2534 	struct rbd_obj_request *obj_req;
2535 
2536 	obj_req = rbd_obj_request_create();
2537 	if (!obj_req)
2538 		return NULL;
2539 
2540 	rbd_img_obj_request_add(img_req, obj_req);
2541 	return &obj_req->ex;
2542 }
2543 
2544 /*
2545  * While su != os && sc == 1 is technically not fancy (it's the same
2546  * layout as su == os && sc == 1), we can't use the nocopy path for it
2547  * because ->set_pos_fn() should be called only once per object.
2548  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2549  * treat su != os && sc == 1 as fancy.
2550  */
rbd_layout_is_fancy(struct ceph_file_layout * l)2551 static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2552 {
2553 	return l->stripe_unit != l->object_size;
2554 }
2555 
rbd_img_fill_request_nocopy(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct rbd_img_fill_ctx * fctx)2556 static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2557 				       struct ceph_file_extent *img_extents,
2558 				       u32 num_img_extents,
2559 				       struct rbd_img_fill_ctx *fctx)
2560 {
2561 	u32 i;
2562 	int ret;
2563 
2564 	img_req->data_type = fctx->pos_type;
2565 
2566 	/*
2567 	 * Create object requests and set each object request's starting
2568 	 * position in the provided bio (list) or bio_vec array.
2569 	 */
2570 	fctx->iter = *fctx->pos;
2571 	for (i = 0; i < num_img_extents; i++) {
2572 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2573 					   img_extents[i].fe_off,
2574 					   img_extents[i].fe_len,
2575 					   &img_req->object_extents,
2576 					   alloc_object_extent, img_req,
2577 					   fctx->set_pos_fn, &fctx->iter);
2578 		if (ret)
2579 			return ret;
2580 	}
2581 
2582 	return __rbd_img_fill_request(img_req);
2583 }
2584 
2585 /*
2586  * Map a list of image extents to a list of object extents, create the
2587  * corresponding object requests (normally each to a different object,
2588  * but not always) and add them to @img_req.  For each object request,
2589  * set up its data descriptor to point to the corresponding chunk(s) of
2590  * @fctx->pos data buffer.
2591  *
2592  * Because ceph_file_to_extents() will merge adjacent object extents
2593  * together, each object request's data descriptor may point to multiple
2594  * different chunks of @fctx->pos data buffer.
2595  *
2596  * @fctx->pos data buffer is assumed to be large enough.
2597  */
rbd_img_fill_request(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct rbd_img_fill_ctx * fctx)2598 static int rbd_img_fill_request(struct rbd_img_request *img_req,
2599 				struct ceph_file_extent *img_extents,
2600 				u32 num_img_extents,
2601 				struct rbd_img_fill_ctx *fctx)
2602 {
2603 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2604 	struct rbd_obj_request *obj_req;
2605 	u32 i;
2606 	int ret;
2607 
2608 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2609 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2610 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2611 						   num_img_extents, fctx);
2612 
2613 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2614 
2615 	/*
2616 	 * Create object requests and determine ->bvec_count for each object
2617 	 * request.  Note that ->bvec_count sum over all object requests may
2618 	 * be greater than the number of bio_vecs in the provided bio (list)
2619 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2620 	 * stripe unit boundaries.
2621 	 */
2622 	fctx->iter = *fctx->pos;
2623 	for (i = 0; i < num_img_extents; i++) {
2624 		ret = ceph_file_to_extents(&rbd_dev->layout,
2625 					   img_extents[i].fe_off,
2626 					   img_extents[i].fe_len,
2627 					   &img_req->object_extents,
2628 					   alloc_object_extent, img_req,
2629 					   fctx->count_fn, &fctx->iter);
2630 		if (ret)
2631 			return ret;
2632 	}
2633 
2634 	for_each_obj_request(img_req, obj_req) {
2635 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2636 					      sizeof(*obj_req->bvec_pos.bvecs),
2637 					      GFP_NOIO);
2638 		if (!obj_req->bvec_pos.bvecs)
2639 			return -ENOMEM;
2640 	}
2641 
2642 	/*
2643 	 * Fill in each object request's private bio_vec array, splitting and
2644 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2645 	 */
2646 	fctx->iter = *fctx->pos;
2647 	for (i = 0; i < num_img_extents; i++) {
2648 		ret = ceph_iterate_extents(&rbd_dev->layout,
2649 					   img_extents[i].fe_off,
2650 					   img_extents[i].fe_len,
2651 					   &img_req->object_extents,
2652 					   fctx->copy_fn, &fctx->iter);
2653 		if (ret)
2654 			return ret;
2655 	}
2656 
2657 	return __rbd_img_fill_request(img_req);
2658 }
2659 
rbd_img_fill_nodata(struct rbd_img_request * img_req,u64 off,u64 len)2660 static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2661 			       u64 off, u64 len)
2662 {
2663 	struct ceph_file_extent ex = { off, len };
2664 	union rbd_img_fill_iter dummy = {};
2665 	struct rbd_img_fill_ctx fctx = {
2666 		.pos_type = OBJ_REQUEST_NODATA,
2667 		.pos = &dummy,
2668 	};
2669 
2670 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2671 }
2672 
set_bio_pos(struct ceph_object_extent * ex,u32 bytes,void * arg)2673 static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2674 {
2675 	struct rbd_obj_request *obj_req =
2676 	    container_of(ex, struct rbd_obj_request, ex);
2677 	struct ceph_bio_iter *it = arg;
2678 
2679 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2680 	obj_req->bio_pos = *it;
2681 	ceph_bio_iter_advance(it, bytes);
2682 }
2683 
count_bio_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2684 static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2685 {
2686 	struct rbd_obj_request *obj_req =
2687 	    container_of(ex, struct rbd_obj_request, ex);
2688 	struct ceph_bio_iter *it = arg;
2689 
2690 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2691 	ceph_bio_iter_advance_step(it, bytes, ({
2692 		obj_req->bvec_count++;
2693 	}));
2694 
2695 }
2696 
copy_bio_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2697 static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2698 {
2699 	struct rbd_obj_request *obj_req =
2700 	    container_of(ex, struct rbd_obj_request, ex);
2701 	struct ceph_bio_iter *it = arg;
2702 
2703 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2704 	ceph_bio_iter_advance_step(it, bytes, ({
2705 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2706 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2707 	}));
2708 }
2709 
__rbd_img_fill_from_bio(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct ceph_bio_iter * bio_pos)2710 static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2711 				   struct ceph_file_extent *img_extents,
2712 				   u32 num_img_extents,
2713 				   struct ceph_bio_iter *bio_pos)
2714 {
2715 	struct rbd_img_fill_ctx fctx = {
2716 		.pos_type = OBJ_REQUEST_BIO,
2717 		.pos = (union rbd_img_fill_iter *)bio_pos,
2718 		.set_pos_fn = set_bio_pos,
2719 		.count_fn = count_bio_bvecs,
2720 		.copy_fn = copy_bio_bvecs,
2721 	};
2722 
2723 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2724 				    &fctx);
2725 }
2726 
rbd_img_fill_from_bio(struct rbd_img_request * img_req,u64 off,u64 len,struct bio * bio)2727 static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2728 				 u64 off, u64 len, struct bio *bio)
2729 {
2730 	struct ceph_file_extent ex = { off, len };
2731 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2732 
2733 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2734 }
2735 
set_bvec_pos(struct ceph_object_extent * ex,u32 bytes,void * arg)2736 static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2737 {
2738 	struct rbd_obj_request *obj_req =
2739 	    container_of(ex, struct rbd_obj_request, ex);
2740 	struct ceph_bvec_iter *it = arg;
2741 
2742 	obj_req->bvec_pos = *it;
2743 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2744 	ceph_bvec_iter_advance(it, bytes);
2745 }
2746 
count_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2747 static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2748 {
2749 	struct rbd_obj_request *obj_req =
2750 	    container_of(ex, struct rbd_obj_request, ex);
2751 	struct ceph_bvec_iter *it = arg;
2752 
2753 	ceph_bvec_iter_advance_step(it, bytes, ({
2754 		obj_req->bvec_count++;
2755 	}));
2756 }
2757 
copy_bvecs(struct ceph_object_extent * ex,u32 bytes,void * arg)2758 static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2759 {
2760 	struct rbd_obj_request *obj_req =
2761 	    container_of(ex, struct rbd_obj_request, ex);
2762 	struct ceph_bvec_iter *it = arg;
2763 
2764 	ceph_bvec_iter_advance_step(it, bytes, ({
2765 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2766 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2767 	}));
2768 }
2769 
__rbd_img_fill_from_bvecs(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct ceph_bvec_iter * bvec_pos)2770 static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2771 				     struct ceph_file_extent *img_extents,
2772 				     u32 num_img_extents,
2773 				     struct ceph_bvec_iter *bvec_pos)
2774 {
2775 	struct rbd_img_fill_ctx fctx = {
2776 		.pos_type = OBJ_REQUEST_BVECS,
2777 		.pos = (union rbd_img_fill_iter *)bvec_pos,
2778 		.set_pos_fn = set_bvec_pos,
2779 		.count_fn = count_bvecs,
2780 		.copy_fn = copy_bvecs,
2781 	};
2782 
2783 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2784 				    &fctx);
2785 }
2786 
rbd_img_fill_from_bvecs(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct bio_vec * bvecs)2787 static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2788 				   struct ceph_file_extent *img_extents,
2789 				   u32 num_img_extents,
2790 				   struct bio_vec *bvecs)
2791 {
2792 	struct ceph_bvec_iter it = {
2793 		.bvecs = bvecs,
2794 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2795 							     num_img_extents) },
2796 	};
2797 
2798 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2799 					 &it);
2800 }
2801 
rbd_img_handle_request_work(struct work_struct * work)2802 static void rbd_img_handle_request_work(struct work_struct *work)
2803 {
2804 	struct rbd_img_request *img_req =
2805 	    container_of(work, struct rbd_img_request, work);
2806 
2807 	rbd_img_handle_request(img_req, img_req->work_result);
2808 }
2809 
rbd_img_schedule(struct rbd_img_request * img_req,int result)2810 static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2811 {
2812 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2813 	img_req->work_result = result;
2814 	queue_work(rbd_wq, &img_req->work);
2815 }
2816 
rbd_obj_may_exist(struct rbd_obj_request * obj_req)2817 static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2818 {
2819 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2820 
2821 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2822 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2823 		return true;
2824 	}
2825 
2826 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2827 	     obj_req->ex.oe_objno);
2828 	return false;
2829 }
2830 
rbd_obj_read_object(struct rbd_obj_request * obj_req)2831 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2832 {
2833 	struct ceph_osd_request *osd_req;
2834 	int ret;
2835 
2836 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2837 	if (IS_ERR(osd_req))
2838 		return PTR_ERR(osd_req);
2839 
2840 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2841 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2842 	rbd_osd_setup_data(osd_req, 0);
2843 	rbd_osd_format_read(osd_req);
2844 
2845 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2846 	if (ret)
2847 		return ret;
2848 
2849 	rbd_osd_submit(osd_req);
2850 	return 0;
2851 }
2852 
rbd_obj_read_from_parent(struct rbd_obj_request * obj_req)2853 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2854 {
2855 	struct rbd_img_request *img_req = obj_req->img_request;
2856 	struct rbd_device *parent = img_req->rbd_dev->parent;
2857 	struct rbd_img_request *child_img_req;
2858 	int ret;
2859 
2860 	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2861 	if (!child_img_req)
2862 		return -ENOMEM;
2863 
2864 	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2865 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2866 	child_img_req->obj_request = obj_req;
2867 
2868 	down_read(&parent->header_rwsem);
2869 	rbd_img_capture_header(child_img_req);
2870 	up_read(&parent->header_rwsem);
2871 
2872 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2873 	     obj_req);
2874 
2875 	if (!rbd_img_is_write(img_req)) {
2876 		switch (img_req->data_type) {
2877 		case OBJ_REQUEST_BIO:
2878 			ret = __rbd_img_fill_from_bio(child_img_req,
2879 						      obj_req->img_extents,
2880 						      obj_req->num_img_extents,
2881 						      &obj_req->bio_pos);
2882 			break;
2883 		case OBJ_REQUEST_BVECS:
2884 		case OBJ_REQUEST_OWN_BVECS:
2885 			ret = __rbd_img_fill_from_bvecs(child_img_req,
2886 						      obj_req->img_extents,
2887 						      obj_req->num_img_extents,
2888 						      &obj_req->bvec_pos);
2889 			break;
2890 		default:
2891 			BUG();
2892 		}
2893 	} else {
2894 		ret = rbd_img_fill_from_bvecs(child_img_req,
2895 					      obj_req->img_extents,
2896 					      obj_req->num_img_extents,
2897 					      obj_req->copyup_bvecs);
2898 	}
2899 	if (ret) {
2900 		rbd_img_request_destroy(child_img_req);
2901 		return ret;
2902 	}
2903 
2904 	/* avoid parent chain recursion */
2905 	rbd_img_schedule(child_img_req, 0);
2906 	return 0;
2907 }
2908 
rbd_obj_advance_read(struct rbd_obj_request * obj_req,int * result)2909 static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2910 {
2911 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2912 	int ret;
2913 
2914 again:
2915 	switch (obj_req->read_state) {
2916 	case RBD_OBJ_READ_START:
2917 		rbd_assert(!*result);
2918 
2919 		if (!rbd_obj_may_exist(obj_req)) {
2920 			*result = -ENOENT;
2921 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
2922 			goto again;
2923 		}
2924 
2925 		ret = rbd_obj_read_object(obj_req);
2926 		if (ret) {
2927 			*result = ret;
2928 			return true;
2929 		}
2930 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
2931 		return false;
2932 	case RBD_OBJ_READ_OBJECT:
2933 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
2934 			/* reverse map this object extent onto the parent */
2935 			ret = rbd_obj_calc_img_extents(obj_req, false);
2936 			if (ret) {
2937 				*result = ret;
2938 				return true;
2939 			}
2940 			if (obj_req->num_img_extents) {
2941 				ret = rbd_obj_read_from_parent(obj_req);
2942 				if (ret) {
2943 					*result = ret;
2944 					return true;
2945 				}
2946 				obj_req->read_state = RBD_OBJ_READ_PARENT;
2947 				return false;
2948 			}
2949 		}
2950 
2951 		/*
2952 		 * -ENOENT means a hole in the image -- zero-fill the entire
2953 		 * length of the request.  A short read also implies zero-fill
2954 		 * to the end of the request.
2955 		 */
2956 		if (*result == -ENOENT) {
2957 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2958 			*result = 0;
2959 		} else if (*result >= 0) {
2960 			if (*result < obj_req->ex.oe_len)
2961 				rbd_obj_zero_range(obj_req, *result,
2962 						obj_req->ex.oe_len - *result);
2963 			else
2964 				rbd_assert(*result == obj_req->ex.oe_len);
2965 			*result = 0;
2966 		}
2967 		return true;
2968 	case RBD_OBJ_READ_PARENT:
2969 		/*
2970 		 * The parent image is read only up to the overlap -- zero-fill
2971 		 * from the overlap to the end of the request.
2972 		 */
2973 		if (!*result) {
2974 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2975 
2976 			if (obj_overlap < obj_req->ex.oe_len)
2977 				rbd_obj_zero_range(obj_req, obj_overlap,
2978 					    obj_req->ex.oe_len - obj_overlap);
2979 		}
2980 		return true;
2981 	default:
2982 		BUG();
2983 	}
2984 }
2985 
rbd_obj_write_is_noop(struct rbd_obj_request * obj_req)2986 static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2987 {
2988 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2989 
2990 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2991 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2992 
2993 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2994 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2995 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
2996 		return true;
2997 	}
2998 
2999 	return false;
3000 }
3001 
3002 /*
3003  * Return:
3004  *   0 - object map update sent
3005  *   1 - object map update isn't needed
3006  *  <0 - error
3007  */
rbd_obj_write_pre_object_map(struct rbd_obj_request * obj_req)3008 static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3009 {
3010 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3011 	u8 new_state;
3012 
3013 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3014 		return 1;
3015 
3016 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3017 		new_state = OBJECT_PENDING;
3018 	else
3019 		new_state = OBJECT_EXISTS;
3020 
3021 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3022 }
3023 
rbd_obj_write_object(struct rbd_obj_request * obj_req)3024 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3025 {
3026 	struct ceph_osd_request *osd_req;
3027 	int num_ops = count_write_ops(obj_req);
3028 	int which = 0;
3029 	int ret;
3030 
3031 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3032 		num_ops++; /* stat */
3033 
3034 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3035 	if (IS_ERR(osd_req))
3036 		return PTR_ERR(osd_req);
3037 
3038 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3039 		ret = rbd_osd_setup_stat(osd_req, which++);
3040 		if (ret)
3041 			return ret;
3042 	}
3043 
3044 	rbd_osd_setup_write_ops(osd_req, which);
3045 	rbd_osd_format_write(osd_req);
3046 
3047 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3048 	if (ret)
3049 		return ret;
3050 
3051 	rbd_osd_submit(osd_req);
3052 	return 0;
3053 }
3054 
3055 /*
3056  * copyup_bvecs pages are never highmem pages
3057  */
is_zero_bvecs(struct bio_vec * bvecs,u32 bytes)3058 static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3059 {
3060 	struct ceph_bvec_iter it = {
3061 		.bvecs = bvecs,
3062 		.iter = { .bi_size = bytes },
3063 	};
3064 
3065 	ceph_bvec_iter_advance_step(&it, bytes, ({
3066 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3067 			       bv.bv_len))
3068 			return false;
3069 	}));
3070 	return true;
3071 }
3072 
3073 #define MODS_ONLY	U32_MAX
3074 
rbd_obj_copyup_empty_snapc(struct rbd_obj_request * obj_req,u32 bytes)3075 static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3076 				      u32 bytes)
3077 {
3078 	struct ceph_osd_request *osd_req;
3079 	int ret;
3080 
3081 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3082 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3083 
3084 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3085 	if (IS_ERR(osd_req))
3086 		return PTR_ERR(osd_req);
3087 
3088 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3089 	if (ret)
3090 		return ret;
3091 
3092 	rbd_osd_format_write(osd_req);
3093 
3094 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3095 	if (ret)
3096 		return ret;
3097 
3098 	rbd_osd_submit(osd_req);
3099 	return 0;
3100 }
3101 
rbd_obj_copyup_current_snapc(struct rbd_obj_request * obj_req,u32 bytes)3102 static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3103 					u32 bytes)
3104 {
3105 	struct ceph_osd_request *osd_req;
3106 	int num_ops = count_write_ops(obj_req);
3107 	int which = 0;
3108 	int ret;
3109 
3110 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3111 
3112 	if (bytes != MODS_ONLY)
3113 		num_ops++; /* copyup */
3114 
3115 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3116 	if (IS_ERR(osd_req))
3117 		return PTR_ERR(osd_req);
3118 
3119 	if (bytes != MODS_ONLY) {
3120 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3121 		if (ret)
3122 			return ret;
3123 	}
3124 
3125 	rbd_osd_setup_write_ops(osd_req, which);
3126 	rbd_osd_format_write(osd_req);
3127 
3128 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3129 	if (ret)
3130 		return ret;
3131 
3132 	rbd_osd_submit(osd_req);
3133 	return 0;
3134 }
3135 
setup_copyup_bvecs(struct rbd_obj_request * obj_req,u64 obj_overlap)3136 static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3137 {
3138 	u32 i;
3139 
3140 	rbd_assert(!obj_req->copyup_bvecs);
3141 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3142 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3143 					sizeof(*obj_req->copyup_bvecs),
3144 					GFP_NOIO);
3145 	if (!obj_req->copyup_bvecs)
3146 		return -ENOMEM;
3147 
3148 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3149 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3150 
3151 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3152 		if (!obj_req->copyup_bvecs[i].bv_page)
3153 			return -ENOMEM;
3154 
3155 		obj_req->copyup_bvecs[i].bv_offset = 0;
3156 		obj_req->copyup_bvecs[i].bv_len = len;
3157 		obj_overlap -= len;
3158 	}
3159 
3160 	rbd_assert(!obj_overlap);
3161 	return 0;
3162 }
3163 
3164 /*
3165  * The target object doesn't exist.  Read the data for the entire
3166  * target object up to the overlap point (if any) from the parent,
3167  * so we can use it for a copyup.
3168  */
rbd_obj_copyup_read_parent(struct rbd_obj_request * obj_req)3169 static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3170 {
3171 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3172 	int ret;
3173 
3174 	rbd_assert(obj_req->num_img_extents);
3175 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3176 		      rbd_dev->parent_overlap);
3177 	if (!obj_req->num_img_extents) {
3178 		/*
3179 		 * The overlap has become 0 (most likely because the
3180 		 * image has been flattened).  Re-submit the original write
3181 		 * request -- pass MODS_ONLY since the copyup isn't needed
3182 		 * anymore.
3183 		 */
3184 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3185 	}
3186 
3187 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3188 	if (ret)
3189 		return ret;
3190 
3191 	return rbd_obj_read_from_parent(obj_req);
3192 }
3193 
rbd_obj_copyup_object_maps(struct rbd_obj_request * obj_req)3194 static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3195 {
3196 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3197 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3198 	u8 new_state;
3199 	u32 i;
3200 	int ret;
3201 
3202 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3203 
3204 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3205 		return;
3206 
3207 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3208 		return;
3209 
3210 	for (i = 0; i < snapc->num_snaps; i++) {
3211 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3212 		    i + 1 < snapc->num_snaps)
3213 			new_state = OBJECT_EXISTS_CLEAN;
3214 		else
3215 			new_state = OBJECT_EXISTS;
3216 
3217 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3218 					    new_state, NULL);
3219 		if (ret < 0) {
3220 			obj_req->pending.result = ret;
3221 			return;
3222 		}
3223 
3224 		rbd_assert(!ret);
3225 		obj_req->pending.num_pending++;
3226 	}
3227 }
3228 
rbd_obj_copyup_write_object(struct rbd_obj_request * obj_req)3229 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3230 {
3231 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3232 	int ret;
3233 
3234 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3235 
3236 	/*
3237 	 * Only send non-zero copyup data to save some I/O and network
3238 	 * bandwidth -- zero copyup data is equivalent to the object not
3239 	 * existing.
3240 	 */
3241 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3242 		bytes = 0;
3243 
3244 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3245 		/*
3246 		 * Send a copyup request with an empty snapshot context to
3247 		 * deep-copyup the object through all existing snapshots.
3248 		 * A second request with the current snapshot context will be
3249 		 * sent for the actual modification.
3250 		 */
3251 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3252 		if (ret) {
3253 			obj_req->pending.result = ret;
3254 			return;
3255 		}
3256 
3257 		obj_req->pending.num_pending++;
3258 		bytes = MODS_ONLY;
3259 	}
3260 
3261 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3262 	if (ret) {
3263 		obj_req->pending.result = ret;
3264 		return;
3265 	}
3266 
3267 	obj_req->pending.num_pending++;
3268 }
3269 
rbd_obj_advance_copyup(struct rbd_obj_request * obj_req,int * result)3270 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3271 {
3272 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3273 	int ret;
3274 
3275 again:
3276 	switch (obj_req->copyup_state) {
3277 	case RBD_OBJ_COPYUP_START:
3278 		rbd_assert(!*result);
3279 
3280 		ret = rbd_obj_copyup_read_parent(obj_req);
3281 		if (ret) {
3282 			*result = ret;
3283 			return true;
3284 		}
3285 		if (obj_req->num_img_extents)
3286 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3287 		else
3288 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3289 		return false;
3290 	case RBD_OBJ_COPYUP_READ_PARENT:
3291 		if (*result)
3292 			return true;
3293 
3294 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3295 				  rbd_obj_img_extents_bytes(obj_req))) {
3296 			dout("%s %p detected zeros\n", __func__, obj_req);
3297 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3298 		}
3299 
3300 		rbd_obj_copyup_object_maps(obj_req);
3301 		if (!obj_req->pending.num_pending) {
3302 			*result = obj_req->pending.result;
3303 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3304 			goto again;
3305 		}
3306 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3307 		return false;
3308 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3309 		if (!pending_result_dec(&obj_req->pending, result))
3310 			return false;
3311 		fallthrough;
3312 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
3313 		if (*result) {
3314 			rbd_warn(rbd_dev, "snap object map update failed: %d",
3315 				 *result);
3316 			return true;
3317 		}
3318 
3319 		rbd_obj_copyup_write_object(obj_req);
3320 		if (!obj_req->pending.num_pending) {
3321 			*result = obj_req->pending.result;
3322 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3323 			goto again;
3324 		}
3325 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3326 		return false;
3327 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3328 		if (!pending_result_dec(&obj_req->pending, result))
3329 			return false;
3330 		fallthrough;
3331 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3332 		return true;
3333 	default:
3334 		BUG();
3335 	}
3336 }
3337 
3338 /*
3339  * Return:
3340  *   0 - object map update sent
3341  *   1 - object map update isn't needed
3342  *  <0 - error
3343  */
rbd_obj_write_post_object_map(struct rbd_obj_request * obj_req)3344 static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3345 {
3346 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3347 	u8 current_state = OBJECT_PENDING;
3348 
3349 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3350 		return 1;
3351 
3352 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3353 		return 1;
3354 
3355 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3356 				     &current_state);
3357 }
3358 
rbd_obj_advance_write(struct rbd_obj_request * obj_req,int * result)3359 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3360 {
3361 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3362 	int ret;
3363 
3364 again:
3365 	switch (obj_req->write_state) {
3366 	case RBD_OBJ_WRITE_START:
3367 		rbd_assert(!*result);
3368 
3369 		rbd_obj_set_copyup_enabled(obj_req);
3370 		if (rbd_obj_write_is_noop(obj_req))
3371 			return true;
3372 
3373 		ret = rbd_obj_write_pre_object_map(obj_req);
3374 		if (ret < 0) {
3375 			*result = ret;
3376 			return true;
3377 		}
3378 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3379 		if (ret > 0)
3380 			goto again;
3381 		return false;
3382 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3383 		if (*result) {
3384 			rbd_warn(rbd_dev, "pre object map update failed: %d",
3385 				 *result);
3386 			return true;
3387 		}
3388 		ret = rbd_obj_write_object(obj_req);
3389 		if (ret) {
3390 			*result = ret;
3391 			return true;
3392 		}
3393 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3394 		return false;
3395 	case RBD_OBJ_WRITE_OBJECT:
3396 		if (*result == -ENOENT) {
3397 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3398 				*result = 0;
3399 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3400 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3401 				goto again;
3402 			}
3403 			/*
3404 			 * On a non-existent object:
3405 			 *   delete - -ENOENT, truncate/zero - 0
3406 			 */
3407 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3408 				*result = 0;
3409 		}
3410 		if (*result)
3411 			return true;
3412 
3413 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3414 		goto again;
3415 	case __RBD_OBJ_WRITE_COPYUP:
3416 		if (!rbd_obj_advance_copyup(obj_req, result))
3417 			return false;
3418 		fallthrough;
3419 	case RBD_OBJ_WRITE_COPYUP:
3420 		if (*result) {
3421 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3422 			return true;
3423 		}
3424 		ret = rbd_obj_write_post_object_map(obj_req);
3425 		if (ret < 0) {
3426 			*result = ret;
3427 			return true;
3428 		}
3429 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3430 		if (ret > 0)
3431 			goto again;
3432 		return false;
3433 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3434 		if (*result)
3435 			rbd_warn(rbd_dev, "post object map update failed: %d",
3436 				 *result);
3437 		return true;
3438 	default:
3439 		BUG();
3440 	}
3441 }
3442 
3443 /*
3444  * Return true if @obj_req is completed.
3445  */
__rbd_obj_handle_request(struct rbd_obj_request * obj_req,int * result)3446 static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3447 				     int *result)
3448 {
3449 	struct rbd_img_request *img_req = obj_req->img_request;
3450 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3451 	bool done;
3452 
3453 	mutex_lock(&obj_req->state_mutex);
3454 	if (!rbd_img_is_write(img_req))
3455 		done = rbd_obj_advance_read(obj_req, result);
3456 	else
3457 		done = rbd_obj_advance_write(obj_req, result);
3458 	mutex_unlock(&obj_req->state_mutex);
3459 
3460 	if (done && *result) {
3461 		rbd_assert(*result < 0);
3462 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3463 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3464 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3465 	}
3466 	return done;
3467 }
3468 
3469 /*
3470  * This is open-coded in rbd_img_handle_request() to avoid parent chain
3471  * recursion.
3472  */
rbd_obj_handle_request(struct rbd_obj_request * obj_req,int result)3473 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3474 {
3475 	if (__rbd_obj_handle_request(obj_req, &result))
3476 		rbd_img_handle_request(obj_req->img_request, result);
3477 }
3478 
need_exclusive_lock(struct rbd_img_request * img_req)3479 static bool need_exclusive_lock(struct rbd_img_request *img_req)
3480 {
3481 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3482 
3483 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3484 		return false;
3485 
3486 	if (rbd_is_ro(rbd_dev))
3487 		return false;
3488 
3489 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3490 	if (rbd_dev->opts->lock_on_read ||
3491 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3492 		return true;
3493 
3494 	return rbd_img_is_write(img_req);
3495 }
3496 
rbd_lock_add_request(struct rbd_img_request * img_req)3497 static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3498 {
3499 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3500 	bool locked;
3501 
3502 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3503 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3504 	spin_lock(&rbd_dev->lock_lists_lock);
3505 	rbd_assert(list_empty(&img_req->lock_item));
3506 	if (!locked)
3507 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3508 	else
3509 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3510 	spin_unlock(&rbd_dev->lock_lists_lock);
3511 	return locked;
3512 }
3513 
rbd_lock_del_request(struct rbd_img_request * img_req)3514 static void rbd_lock_del_request(struct rbd_img_request *img_req)
3515 {
3516 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3517 	bool need_wakeup;
3518 
3519 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3520 	spin_lock(&rbd_dev->lock_lists_lock);
3521 	rbd_assert(!list_empty(&img_req->lock_item));
3522 	list_del_init(&img_req->lock_item);
3523 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3524 		       list_empty(&rbd_dev->running_list));
3525 	spin_unlock(&rbd_dev->lock_lists_lock);
3526 	if (need_wakeup)
3527 		complete(&rbd_dev->releasing_wait);
3528 }
3529 
rbd_img_exclusive_lock(struct rbd_img_request * img_req)3530 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3531 {
3532 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3533 
3534 	if (!need_exclusive_lock(img_req))
3535 		return 1;
3536 
3537 	if (rbd_lock_add_request(img_req))
3538 		return 1;
3539 
3540 	if (rbd_dev->opts->exclusive) {
3541 		WARN_ON(1); /* lock got released? */
3542 		return -EROFS;
3543 	}
3544 
3545 	/*
3546 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3547 	 * and cancel_delayed_work() in wake_lock_waiters().
3548 	 */
3549 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3550 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3551 	return 0;
3552 }
3553 
rbd_img_object_requests(struct rbd_img_request * img_req)3554 static void rbd_img_object_requests(struct rbd_img_request *img_req)
3555 {
3556 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3557 	struct rbd_obj_request *obj_req;
3558 
3559 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3560 	rbd_assert(!need_exclusive_lock(img_req) ||
3561 		   __rbd_is_lock_owner(rbd_dev));
3562 
3563 	if (rbd_img_is_write(img_req)) {
3564 		rbd_assert(!img_req->snapc);
3565 		down_read(&rbd_dev->header_rwsem);
3566 		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
3567 		up_read(&rbd_dev->header_rwsem);
3568 	}
3569 
3570 	for_each_obj_request(img_req, obj_req) {
3571 		int result = 0;
3572 
3573 		if (__rbd_obj_handle_request(obj_req, &result)) {
3574 			if (result) {
3575 				img_req->pending.result = result;
3576 				return;
3577 			}
3578 		} else {
3579 			img_req->pending.num_pending++;
3580 		}
3581 	}
3582 }
3583 
rbd_img_advance(struct rbd_img_request * img_req,int * result)3584 static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3585 {
3586 	int ret;
3587 
3588 again:
3589 	switch (img_req->state) {
3590 	case RBD_IMG_START:
3591 		rbd_assert(!*result);
3592 
3593 		ret = rbd_img_exclusive_lock(img_req);
3594 		if (ret < 0) {
3595 			*result = ret;
3596 			return true;
3597 		}
3598 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3599 		if (ret > 0)
3600 			goto again;
3601 		return false;
3602 	case RBD_IMG_EXCLUSIVE_LOCK:
3603 		if (*result)
3604 			return true;
3605 
3606 		rbd_img_object_requests(img_req);
3607 		if (!img_req->pending.num_pending) {
3608 			*result = img_req->pending.result;
3609 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
3610 			goto again;
3611 		}
3612 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3613 		return false;
3614 	case __RBD_IMG_OBJECT_REQUESTS:
3615 		if (!pending_result_dec(&img_req->pending, result))
3616 			return false;
3617 		fallthrough;
3618 	case RBD_IMG_OBJECT_REQUESTS:
3619 		return true;
3620 	default:
3621 		BUG();
3622 	}
3623 }
3624 
3625 /*
3626  * Return true if @img_req is completed.
3627  */
__rbd_img_handle_request(struct rbd_img_request * img_req,int * result)3628 static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3629 				     int *result)
3630 {
3631 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3632 	bool done;
3633 
3634 	if (need_exclusive_lock(img_req)) {
3635 		down_read(&rbd_dev->lock_rwsem);
3636 		mutex_lock(&img_req->state_mutex);
3637 		done = rbd_img_advance(img_req, result);
3638 		if (done)
3639 			rbd_lock_del_request(img_req);
3640 		mutex_unlock(&img_req->state_mutex);
3641 		up_read(&rbd_dev->lock_rwsem);
3642 	} else {
3643 		mutex_lock(&img_req->state_mutex);
3644 		done = rbd_img_advance(img_req, result);
3645 		mutex_unlock(&img_req->state_mutex);
3646 	}
3647 
3648 	if (done && *result) {
3649 		rbd_assert(*result < 0);
3650 		rbd_warn(rbd_dev, "%s%s result %d",
3651 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3652 		      obj_op_name(img_req->op_type), *result);
3653 	}
3654 	return done;
3655 }
3656 
rbd_img_handle_request(struct rbd_img_request * img_req,int result)3657 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3658 {
3659 again:
3660 	if (!__rbd_img_handle_request(img_req, &result))
3661 		return;
3662 
3663 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3664 		struct rbd_obj_request *obj_req = img_req->obj_request;
3665 
3666 		rbd_img_request_destroy(img_req);
3667 		if (__rbd_obj_handle_request(obj_req, &result)) {
3668 			img_req = obj_req->img_request;
3669 			goto again;
3670 		}
3671 	} else {
3672 		struct request *rq = blk_mq_rq_from_pdu(img_req);
3673 
3674 		rbd_img_request_destroy(img_req);
3675 		blk_mq_end_request(rq, errno_to_blk_status(result));
3676 	}
3677 }
3678 
3679 static const struct rbd_client_id rbd_empty_cid;
3680 
rbd_cid_equal(const struct rbd_client_id * lhs,const struct rbd_client_id * rhs)3681 static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3682 			  const struct rbd_client_id *rhs)
3683 {
3684 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3685 }
3686 
rbd_get_cid(struct rbd_device * rbd_dev)3687 static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3688 {
3689 	struct rbd_client_id cid;
3690 
3691 	mutex_lock(&rbd_dev->watch_mutex);
3692 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3693 	cid.handle = rbd_dev->watch_cookie;
3694 	mutex_unlock(&rbd_dev->watch_mutex);
3695 	return cid;
3696 }
3697 
3698 /*
3699  * lock_rwsem must be held for write
3700  */
rbd_set_owner_cid(struct rbd_device * rbd_dev,const struct rbd_client_id * cid)3701 static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3702 			      const struct rbd_client_id *cid)
3703 {
3704 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3705 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3706 	     cid->gid, cid->handle);
3707 	rbd_dev->owner_cid = *cid; /* struct */
3708 }
3709 
format_lock_cookie(struct rbd_device * rbd_dev,char * buf)3710 static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3711 {
3712 	mutex_lock(&rbd_dev->watch_mutex);
3713 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3714 	mutex_unlock(&rbd_dev->watch_mutex);
3715 }
3716 
__rbd_lock(struct rbd_device * rbd_dev,const char * cookie)3717 static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3718 {
3719 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3720 
3721 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3722 	strcpy(rbd_dev->lock_cookie, cookie);
3723 	rbd_set_owner_cid(rbd_dev, &cid);
3724 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3725 }
3726 
3727 /*
3728  * lock_rwsem must be held for write
3729  */
rbd_lock(struct rbd_device * rbd_dev)3730 static int rbd_lock(struct rbd_device *rbd_dev)
3731 {
3732 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3733 	char cookie[32];
3734 	int ret;
3735 
3736 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3737 		rbd_dev->lock_cookie[0] != '\0');
3738 
3739 	format_lock_cookie(rbd_dev, cookie);
3740 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3741 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3742 			    RBD_LOCK_TAG, "", 0);
3743 	if (ret)
3744 		return ret;
3745 
3746 	__rbd_lock(rbd_dev, cookie);
3747 	return 0;
3748 }
3749 
3750 /*
3751  * lock_rwsem must be held for write
3752  */
rbd_unlock(struct rbd_device * rbd_dev)3753 static void rbd_unlock(struct rbd_device *rbd_dev)
3754 {
3755 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3756 	int ret;
3757 
3758 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3759 		rbd_dev->lock_cookie[0] == '\0');
3760 
3761 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3762 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3763 	if (ret && ret != -ENOENT)
3764 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3765 
3766 	/* treat errors as the image is unlocked */
3767 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3768 	rbd_dev->lock_cookie[0] = '\0';
3769 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3770 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3771 }
3772 
__rbd_notify_op_lock(struct rbd_device * rbd_dev,enum rbd_notify_op notify_op,struct page *** preply_pages,size_t * preply_len)3773 static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3774 				enum rbd_notify_op notify_op,
3775 				struct page ***preply_pages,
3776 				size_t *preply_len)
3777 {
3778 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3779 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3780 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3781 	int buf_size = sizeof(buf);
3782 	void *p = buf;
3783 
3784 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3785 
3786 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3787 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3788 	ceph_encode_32(&p, notify_op);
3789 	ceph_encode_64(&p, cid.gid);
3790 	ceph_encode_64(&p, cid.handle);
3791 
3792 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3793 				&rbd_dev->header_oloc, buf, buf_size,
3794 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3795 }
3796 
rbd_notify_op_lock(struct rbd_device * rbd_dev,enum rbd_notify_op notify_op)3797 static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3798 			       enum rbd_notify_op notify_op)
3799 {
3800 	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3801 }
3802 
rbd_notify_acquired_lock(struct work_struct * work)3803 static void rbd_notify_acquired_lock(struct work_struct *work)
3804 {
3805 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3806 						  acquired_lock_work);
3807 
3808 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3809 }
3810 
rbd_notify_released_lock(struct work_struct * work)3811 static void rbd_notify_released_lock(struct work_struct *work)
3812 {
3813 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3814 						  released_lock_work);
3815 
3816 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3817 }
3818 
rbd_request_lock(struct rbd_device * rbd_dev)3819 static int rbd_request_lock(struct rbd_device *rbd_dev)
3820 {
3821 	struct page **reply_pages;
3822 	size_t reply_len;
3823 	bool lock_owner_responded = false;
3824 	int ret;
3825 
3826 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3827 
3828 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3829 				   &reply_pages, &reply_len);
3830 	if (ret && ret != -ETIMEDOUT) {
3831 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3832 		goto out;
3833 	}
3834 
3835 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3836 		void *p = page_address(reply_pages[0]);
3837 		void *const end = p + reply_len;
3838 		u32 n;
3839 
3840 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3841 		while (n--) {
3842 			u8 struct_v;
3843 			u32 len;
3844 
3845 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3846 			p += 8 + 8; /* skip gid and cookie */
3847 
3848 			ceph_decode_32_safe(&p, end, len, e_inval);
3849 			if (!len)
3850 				continue;
3851 
3852 			if (lock_owner_responded) {
3853 				rbd_warn(rbd_dev,
3854 					 "duplicate lock owners detected");
3855 				ret = -EIO;
3856 				goto out;
3857 			}
3858 
3859 			lock_owner_responded = true;
3860 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3861 						  &struct_v, &len);
3862 			if (ret) {
3863 				rbd_warn(rbd_dev,
3864 					 "failed to decode ResponseMessage: %d",
3865 					 ret);
3866 				goto e_inval;
3867 			}
3868 
3869 			ret = ceph_decode_32(&p);
3870 		}
3871 	}
3872 
3873 	if (!lock_owner_responded) {
3874 		rbd_warn(rbd_dev, "no lock owners detected");
3875 		ret = -ETIMEDOUT;
3876 	}
3877 
3878 out:
3879 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3880 	return ret;
3881 
3882 e_inval:
3883 	ret = -EINVAL;
3884 	goto out;
3885 }
3886 
3887 /*
3888  * Either image request state machine(s) or rbd_add_acquire_lock()
3889  * (i.e. "rbd map").
3890  */
wake_lock_waiters(struct rbd_device * rbd_dev,int result)3891 static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3892 {
3893 	struct rbd_img_request *img_req;
3894 
3895 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3896 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3897 
3898 	cancel_delayed_work(&rbd_dev->lock_dwork);
3899 	if (!completion_done(&rbd_dev->acquire_wait)) {
3900 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3901 			   list_empty(&rbd_dev->running_list));
3902 		rbd_dev->acquire_err = result;
3903 		complete_all(&rbd_dev->acquire_wait);
3904 		return;
3905 	}
3906 
3907 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3908 		mutex_lock(&img_req->state_mutex);
3909 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3910 		rbd_img_schedule(img_req, result);
3911 		mutex_unlock(&img_req->state_mutex);
3912 	}
3913 
3914 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3915 }
3916 
get_lock_owner_info(struct rbd_device * rbd_dev,struct ceph_locker ** lockers,u32 * num_lockers)3917 static int get_lock_owner_info(struct rbd_device *rbd_dev,
3918 			       struct ceph_locker **lockers, u32 *num_lockers)
3919 {
3920 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3921 	u8 lock_type;
3922 	char *lock_tag;
3923 	int ret;
3924 
3925 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3926 
3927 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3928 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3929 				 &lock_type, &lock_tag, lockers, num_lockers);
3930 	if (ret)
3931 		return ret;
3932 
3933 	if (*num_lockers == 0) {
3934 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3935 		goto out;
3936 	}
3937 
3938 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3939 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3940 			 lock_tag);
3941 		ret = -EBUSY;
3942 		goto out;
3943 	}
3944 
3945 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3946 		rbd_warn(rbd_dev, "shared lock type detected");
3947 		ret = -EBUSY;
3948 		goto out;
3949 	}
3950 
3951 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3952 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3953 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3954 			 (*lockers)[0].id.cookie);
3955 		ret = -EBUSY;
3956 		goto out;
3957 	}
3958 
3959 out:
3960 	kfree(lock_tag);
3961 	return ret;
3962 }
3963 
find_watcher(struct rbd_device * rbd_dev,const struct ceph_locker * locker)3964 static int find_watcher(struct rbd_device *rbd_dev,
3965 			const struct ceph_locker *locker)
3966 {
3967 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3968 	struct ceph_watch_item *watchers;
3969 	u32 num_watchers;
3970 	u64 cookie;
3971 	int i;
3972 	int ret;
3973 
3974 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3975 				      &rbd_dev->header_oloc, &watchers,
3976 				      &num_watchers);
3977 	if (ret)
3978 		return ret;
3979 
3980 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3981 	for (i = 0; i < num_watchers; i++) {
3982 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3983 			    sizeof(locker->info.addr)) &&
3984 		    watchers[i].cookie == cookie) {
3985 			struct rbd_client_id cid = {
3986 				.gid = le64_to_cpu(watchers[i].name.num),
3987 				.handle = cookie,
3988 			};
3989 
3990 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3991 			     rbd_dev, cid.gid, cid.handle);
3992 			rbd_set_owner_cid(rbd_dev, &cid);
3993 			ret = 1;
3994 			goto out;
3995 		}
3996 	}
3997 
3998 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3999 	ret = 0;
4000 out:
4001 	kfree(watchers);
4002 	return ret;
4003 }
4004 
4005 /*
4006  * lock_rwsem must be held for write
4007  */
rbd_try_lock(struct rbd_device * rbd_dev)4008 static int rbd_try_lock(struct rbd_device *rbd_dev)
4009 {
4010 	struct ceph_client *client = rbd_dev->rbd_client->client;
4011 	struct ceph_locker *lockers;
4012 	u32 num_lockers;
4013 	int ret;
4014 
4015 	for (;;) {
4016 		ret = rbd_lock(rbd_dev);
4017 		if (ret != -EBUSY)
4018 			return ret;
4019 
4020 		/* determine if the current lock holder is still alive */
4021 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4022 		if (ret)
4023 			return ret;
4024 
4025 		if (num_lockers == 0)
4026 			goto again;
4027 
4028 		ret = find_watcher(rbd_dev, lockers);
4029 		if (ret)
4030 			goto out; /* request lock or error */
4031 
4032 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4033 			 ENTITY_NAME(lockers[0].id.name));
4034 
4035 		ret = ceph_monc_blocklist_add(&client->monc,
4036 					      &lockers[0].info.addr);
4037 		if (ret) {
4038 			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
4039 				 ENTITY_NAME(lockers[0].id.name), ret);
4040 			goto out;
4041 		}
4042 
4043 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4044 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4045 					  lockers[0].id.cookie,
4046 					  &lockers[0].id.name);
4047 		if (ret && ret != -ENOENT)
4048 			goto out;
4049 
4050 again:
4051 		ceph_free_lockers(lockers, num_lockers);
4052 	}
4053 
4054 out:
4055 	ceph_free_lockers(lockers, num_lockers);
4056 	return ret;
4057 }
4058 
rbd_post_acquire_action(struct rbd_device * rbd_dev)4059 static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4060 {
4061 	int ret;
4062 
4063 	ret = rbd_dev_refresh(rbd_dev);
4064 	if (ret)
4065 		return ret;
4066 
4067 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4068 		ret = rbd_object_map_open(rbd_dev);
4069 		if (ret)
4070 			return ret;
4071 	}
4072 
4073 	return 0;
4074 }
4075 
4076 /*
4077  * Return:
4078  *   0 - lock acquired
4079  *   1 - caller should call rbd_request_lock()
4080  *  <0 - error
4081  */
rbd_try_acquire_lock(struct rbd_device * rbd_dev)4082 static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4083 {
4084 	int ret;
4085 
4086 	down_read(&rbd_dev->lock_rwsem);
4087 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4088 	     rbd_dev->lock_state);
4089 	if (__rbd_is_lock_owner(rbd_dev)) {
4090 		up_read(&rbd_dev->lock_rwsem);
4091 		return 0;
4092 	}
4093 
4094 	up_read(&rbd_dev->lock_rwsem);
4095 	down_write(&rbd_dev->lock_rwsem);
4096 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4097 	     rbd_dev->lock_state);
4098 	if (__rbd_is_lock_owner(rbd_dev)) {
4099 		up_write(&rbd_dev->lock_rwsem);
4100 		return 0;
4101 	}
4102 
4103 	ret = rbd_try_lock(rbd_dev);
4104 	if (ret < 0) {
4105 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4106 		if (ret == -EBLOCKLISTED)
4107 			goto out;
4108 
4109 		ret = 1; /* request lock anyway */
4110 	}
4111 	if (ret > 0) {
4112 		up_write(&rbd_dev->lock_rwsem);
4113 		return ret;
4114 	}
4115 
4116 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4117 	rbd_assert(list_empty(&rbd_dev->running_list));
4118 
4119 	ret = rbd_post_acquire_action(rbd_dev);
4120 	if (ret) {
4121 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4122 		/*
4123 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
4124 		 * rbd_lock_add_request() would let the request through,
4125 		 * assuming that e.g. object map is locked and loaded.
4126 		 */
4127 		rbd_unlock(rbd_dev);
4128 	}
4129 
4130 out:
4131 	wake_lock_waiters(rbd_dev, ret);
4132 	up_write(&rbd_dev->lock_rwsem);
4133 	return ret;
4134 }
4135 
rbd_acquire_lock(struct work_struct * work)4136 static void rbd_acquire_lock(struct work_struct *work)
4137 {
4138 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4139 					    struct rbd_device, lock_dwork);
4140 	int ret;
4141 
4142 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4143 again:
4144 	ret = rbd_try_acquire_lock(rbd_dev);
4145 	if (ret <= 0) {
4146 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4147 		return;
4148 	}
4149 
4150 	ret = rbd_request_lock(rbd_dev);
4151 	if (ret == -ETIMEDOUT) {
4152 		goto again; /* treat this as a dead client */
4153 	} else if (ret == -EROFS) {
4154 		rbd_warn(rbd_dev, "peer will not release lock");
4155 		down_write(&rbd_dev->lock_rwsem);
4156 		wake_lock_waiters(rbd_dev, ret);
4157 		up_write(&rbd_dev->lock_rwsem);
4158 	} else if (ret < 0) {
4159 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4160 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4161 				 RBD_RETRY_DELAY);
4162 	} else {
4163 		/*
4164 		 * lock owner acked, but resend if we don't see them
4165 		 * release the lock
4166 		 */
4167 		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4168 		     rbd_dev);
4169 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4170 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4171 	}
4172 }
4173 
rbd_quiesce_lock(struct rbd_device * rbd_dev)4174 static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4175 {
4176 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4177 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4178 
4179 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4180 		return false;
4181 
4182 	/*
4183 	 * Ensure that all in-flight IO is flushed.
4184 	 */
4185 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4186 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4187 	if (list_empty(&rbd_dev->running_list))
4188 		return true;
4189 
4190 	up_write(&rbd_dev->lock_rwsem);
4191 	wait_for_completion(&rbd_dev->releasing_wait);
4192 
4193 	down_write(&rbd_dev->lock_rwsem);
4194 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4195 		return false;
4196 
4197 	rbd_assert(list_empty(&rbd_dev->running_list));
4198 	return true;
4199 }
4200 
rbd_pre_release_action(struct rbd_device * rbd_dev)4201 static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4202 {
4203 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4204 		rbd_object_map_close(rbd_dev);
4205 }
4206 
__rbd_release_lock(struct rbd_device * rbd_dev)4207 static void __rbd_release_lock(struct rbd_device *rbd_dev)
4208 {
4209 	rbd_assert(list_empty(&rbd_dev->running_list));
4210 
4211 	rbd_pre_release_action(rbd_dev);
4212 	rbd_unlock(rbd_dev);
4213 }
4214 
4215 /*
4216  * lock_rwsem must be held for write
4217  */
rbd_release_lock(struct rbd_device * rbd_dev)4218 static void rbd_release_lock(struct rbd_device *rbd_dev)
4219 {
4220 	if (!rbd_quiesce_lock(rbd_dev))
4221 		return;
4222 
4223 	__rbd_release_lock(rbd_dev);
4224 
4225 	/*
4226 	 * Give others a chance to grab the lock - we would re-acquire
4227 	 * almost immediately if we got new IO while draining the running
4228 	 * list otherwise.  We need to ack our own notifications, so this
4229 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4230 	 * way of maybe_kick_acquire().
4231 	 */
4232 	cancel_delayed_work(&rbd_dev->lock_dwork);
4233 }
4234 
rbd_release_lock_work(struct work_struct * work)4235 static void rbd_release_lock_work(struct work_struct *work)
4236 {
4237 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4238 						  unlock_work);
4239 
4240 	down_write(&rbd_dev->lock_rwsem);
4241 	rbd_release_lock(rbd_dev);
4242 	up_write(&rbd_dev->lock_rwsem);
4243 }
4244 
maybe_kick_acquire(struct rbd_device * rbd_dev)4245 static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4246 {
4247 	bool have_requests;
4248 
4249 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4250 	if (__rbd_is_lock_owner(rbd_dev))
4251 		return;
4252 
4253 	spin_lock(&rbd_dev->lock_lists_lock);
4254 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4255 	spin_unlock(&rbd_dev->lock_lists_lock);
4256 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4257 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4258 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4259 	}
4260 }
4261 
rbd_handle_acquired_lock(struct rbd_device * rbd_dev,u8 struct_v,void ** p)4262 static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4263 				     void **p)
4264 {
4265 	struct rbd_client_id cid = { 0 };
4266 
4267 	if (struct_v >= 2) {
4268 		cid.gid = ceph_decode_64(p);
4269 		cid.handle = ceph_decode_64(p);
4270 	}
4271 
4272 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4273 	     cid.handle);
4274 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4275 		down_write(&rbd_dev->lock_rwsem);
4276 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4277 			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
4278 			     __func__, rbd_dev, cid.gid, cid.handle);
4279 		} else {
4280 			rbd_set_owner_cid(rbd_dev, &cid);
4281 		}
4282 		downgrade_write(&rbd_dev->lock_rwsem);
4283 	} else {
4284 		down_read(&rbd_dev->lock_rwsem);
4285 	}
4286 
4287 	maybe_kick_acquire(rbd_dev);
4288 	up_read(&rbd_dev->lock_rwsem);
4289 }
4290 
rbd_handle_released_lock(struct rbd_device * rbd_dev,u8 struct_v,void ** p)4291 static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4292 				     void **p)
4293 {
4294 	struct rbd_client_id cid = { 0 };
4295 
4296 	if (struct_v >= 2) {
4297 		cid.gid = ceph_decode_64(p);
4298 		cid.handle = ceph_decode_64(p);
4299 	}
4300 
4301 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4302 	     cid.handle);
4303 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4304 		down_write(&rbd_dev->lock_rwsem);
4305 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4306 			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
4307 			     __func__, rbd_dev, cid.gid, cid.handle,
4308 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4309 		} else {
4310 			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4311 		}
4312 		downgrade_write(&rbd_dev->lock_rwsem);
4313 	} else {
4314 		down_read(&rbd_dev->lock_rwsem);
4315 	}
4316 
4317 	maybe_kick_acquire(rbd_dev);
4318 	up_read(&rbd_dev->lock_rwsem);
4319 }
4320 
4321 /*
4322  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4323  * ResponseMessage is needed.
4324  */
rbd_handle_request_lock(struct rbd_device * rbd_dev,u8 struct_v,void ** p)4325 static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4326 				   void **p)
4327 {
4328 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4329 	struct rbd_client_id cid = { 0 };
4330 	int result = 1;
4331 
4332 	if (struct_v >= 2) {
4333 		cid.gid = ceph_decode_64(p);
4334 		cid.handle = ceph_decode_64(p);
4335 	}
4336 
4337 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4338 	     cid.handle);
4339 	if (rbd_cid_equal(&cid, &my_cid))
4340 		return result;
4341 
4342 	down_read(&rbd_dev->lock_rwsem);
4343 	if (__rbd_is_lock_owner(rbd_dev)) {
4344 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4345 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4346 			goto out_unlock;
4347 
4348 		/*
4349 		 * encode ResponseMessage(0) so the peer can detect
4350 		 * a missing owner
4351 		 */
4352 		result = 0;
4353 
4354 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4355 			if (!rbd_dev->opts->exclusive) {
4356 				dout("%s rbd_dev %p queueing unlock_work\n",
4357 				     __func__, rbd_dev);
4358 				queue_work(rbd_dev->task_wq,
4359 					   &rbd_dev->unlock_work);
4360 			} else {
4361 				/* refuse to release the lock */
4362 				result = -EROFS;
4363 			}
4364 		}
4365 	}
4366 
4367 out_unlock:
4368 	up_read(&rbd_dev->lock_rwsem);
4369 	return result;
4370 }
4371 
__rbd_acknowledge_notify(struct rbd_device * rbd_dev,u64 notify_id,u64 cookie,s32 * result)4372 static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4373 				     u64 notify_id, u64 cookie, s32 *result)
4374 {
4375 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4376 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4377 	int buf_size = sizeof(buf);
4378 	int ret;
4379 
4380 	if (result) {
4381 		void *p = buf;
4382 
4383 		/* encode ResponseMessage */
4384 		ceph_start_encoding(&p, 1, 1,
4385 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4386 		ceph_encode_32(&p, *result);
4387 	} else {
4388 		buf_size = 0;
4389 	}
4390 
4391 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4392 				   &rbd_dev->header_oloc, notify_id, cookie,
4393 				   buf, buf_size);
4394 	if (ret)
4395 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4396 }
4397 
rbd_acknowledge_notify(struct rbd_device * rbd_dev,u64 notify_id,u64 cookie)4398 static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4399 				   u64 cookie)
4400 {
4401 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4402 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4403 }
4404 
rbd_acknowledge_notify_result(struct rbd_device * rbd_dev,u64 notify_id,u64 cookie,s32 result)4405 static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4406 					  u64 notify_id, u64 cookie, s32 result)
4407 {
4408 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4409 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4410 }
4411 
rbd_watch_cb(void * arg,u64 notify_id,u64 cookie,u64 notifier_id,void * data,size_t data_len)4412 static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4413 			 u64 notifier_id, void *data, size_t data_len)
4414 {
4415 	struct rbd_device *rbd_dev = arg;
4416 	void *p = data;
4417 	void *const end = p + data_len;
4418 	u8 struct_v = 0;
4419 	u32 len;
4420 	u32 notify_op;
4421 	int ret;
4422 
4423 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4424 	     __func__, rbd_dev, cookie, notify_id, data_len);
4425 	if (data_len) {
4426 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4427 					  &struct_v, &len);
4428 		if (ret) {
4429 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4430 				 ret);
4431 			return;
4432 		}
4433 
4434 		notify_op = ceph_decode_32(&p);
4435 	} else {
4436 		/* legacy notification for header updates */
4437 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4438 		len = 0;
4439 	}
4440 
4441 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4442 	switch (notify_op) {
4443 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4444 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4445 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4446 		break;
4447 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4448 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4449 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4450 		break;
4451 	case RBD_NOTIFY_OP_REQUEST_LOCK:
4452 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4453 		if (ret <= 0)
4454 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4455 						      cookie, ret);
4456 		else
4457 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4458 		break;
4459 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4460 		ret = rbd_dev_refresh(rbd_dev);
4461 		if (ret)
4462 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4463 
4464 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4465 		break;
4466 	default:
4467 		if (rbd_is_lock_owner(rbd_dev))
4468 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4469 						      cookie, -EOPNOTSUPP);
4470 		else
4471 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4472 		break;
4473 	}
4474 }
4475 
4476 static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4477 
rbd_watch_errcb(void * arg,u64 cookie,int err)4478 static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4479 {
4480 	struct rbd_device *rbd_dev = arg;
4481 
4482 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4483 
4484 	down_write(&rbd_dev->lock_rwsem);
4485 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4486 	up_write(&rbd_dev->lock_rwsem);
4487 
4488 	mutex_lock(&rbd_dev->watch_mutex);
4489 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4490 		__rbd_unregister_watch(rbd_dev);
4491 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4492 
4493 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4494 	}
4495 	mutex_unlock(&rbd_dev->watch_mutex);
4496 }
4497 
4498 /*
4499  * watch_mutex must be locked
4500  */
__rbd_register_watch(struct rbd_device * rbd_dev)4501 static int __rbd_register_watch(struct rbd_device *rbd_dev)
4502 {
4503 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4504 	struct ceph_osd_linger_request *handle;
4505 
4506 	rbd_assert(!rbd_dev->watch_handle);
4507 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4508 
4509 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4510 				 &rbd_dev->header_oloc, rbd_watch_cb,
4511 				 rbd_watch_errcb, rbd_dev);
4512 	if (IS_ERR(handle))
4513 		return PTR_ERR(handle);
4514 
4515 	rbd_dev->watch_handle = handle;
4516 	return 0;
4517 }
4518 
4519 /*
4520  * watch_mutex must be locked
4521  */
__rbd_unregister_watch(struct rbd_device * rbd_dev)4522 static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4523 {
4524 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4525 	int ret;
4526 
4527 	rbd_assert(rbd_dev->watch_handle);
4528 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4529 
4530 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4531 	if (ret)
4532 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4533 
4534 	rbd_dev->watch_handle = NULL;
4535 }
4536 
rbd_register_watch(struct rbd_device * rbd_dev)4537 static int rbd_register_watch(struct rbd_device *rbd_dev)
4538 {
4539 	int ret;
4540 
4541 	mutex_lock(&rbd_dev->watch_mutex);
4542 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4543 	ret = __rbd_register_watch(rbd_dev);
4544 	if (ret)
4545 		goto out;
4546 
4547 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4548 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4549 
4550 out:
4551 	mutex_unlock(&rbd_dev->watch_mutex);
4552 	return ret;
4553 }
4554 
cancel_tasks_sync(struct rbd_device * rbd_dev)4555 static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4556 {
4557 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4558 
4559 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4560 	cancel_work_sync(&rbd_dev->released_lock_work);
4561 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4562 	cancel_work_sync(&rbd_dev->unlock_work);
4563 }
4564 
4565 /*
4566  * header_rwsem must not be held to avoid a deadlock with
4567  * rbd_dev_refresh() when flushing notifies.
4568  */
rbd_unregister_watch(struct rbd_device * rbd_dev)4569 static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4570 {
4571 	cancel_tasks_sync(rbd_dev);
4572 
4573 	mutex_lock(&rbd_dev->watch_mutex);
4574 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4575 		__rbd_unregister_watch(rbd_dev);
4576 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4577 	mutex_unlock(&rbd_dev->watch_mutex);
4578 
4579 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4580 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4581 }
4582 
4583 /*
4584  * lock_rwsem must be held for write
4585  */
rbd_reacquire_lock(struct rbd_device * rbd_dev)4586 static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4587 {
4588 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4589 	char cookie[32];
4590 	int ret;
4591 
4592 	if (!rbd_quiesce_lock(rbd_dev))
4593 		return;
4594 
4595 	format_lock_cookie(rbd_dev, cookie);
4596 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4597 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4598 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4599 				  RBD_LOCK_TAG, cookie);
4600 	if (ret) {
4601 		if (ret != -EOPNOTSUPP)
4602 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4603 				 ret);
4604 
4605 		/*
4606 		 * Lock cookie cannot be updated on older OSDs, so do
4607 		 * a manual release and queue an acquire.
4608 		 */
4609 		__rbd_release_lock(rbd_dev);
4610 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4611 	} else {
4612 		__rbd_lock(rbd_dev, cookie);
4613 		wake_lock_waiters(rbd_dev, 0);
4614 	}
4615 }
4616 
rbd_reregister_watch(struct work_struct * work)4617 static void rbd_reregister_watch(struct work_struct *work)
4618 {
4619 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4620 					    struct rbd_device, watch_dwork);
4621 	int ret;
4622 
4623 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4624 
4625 	mutex_lock(&rbd_dev->watch_mutex);
4626 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4627 		mutex_unlock(&rbd_dev->watch_mutex);
4628 		return;
4629 	}
4630 
4631 	ret = __rbd_register_watch(rbd_dev);
4632 	if (ret) {
4633 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4634 		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
4635 			queue_delayed_work(rbd_dev->task_wq,
4636 					   &rbd_dev->watch_dwork,
4637 					   RBD_RETRY_DELAY);
4638 			mutex_unlock(&rbd_dev->watch_mutex);
4639 			return;
4640 		}
4641 
4642 		mutex_unlock(&rbd_dev->watch_mutex);
4643 		down_write(&rbd_dev->lock_rwsem);
4644 		wake_lock_waiters(rbd_dev, ret);
4645 		up_write(&rbd_dev->lock_rwsem);
4646 		return;
4647 	}
4648 
4649 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4650 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4651 	mutex_unlock(&rbd_dev->watch_mutex);
4652 
4653 	down_write(&rbd_dev->lock_rwsem);
4654 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4655 		rbd_reacquire_lock(rbd_dev);
4656 	up_write(&rbd_dev->lock_rwsem);
4657 
4658 	ret = rbd_dev_refresh(rbd_dev);
4659 	if (ret)
4660 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4661 }
4662 
4663 /*
4664  * Synchronous osd object method call.  Returns the number of bytes
4665  * returned in the outbound buffer, or a negative error code.
4666  */
rbd_obj_method_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,const char * method_name,const void * outbound,size_t outbound_size,void * inbound,size_t inbound_size)4667 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4668 			     struct ceph_object_id *oid,
4669 			     struct ceph_object_locator *oloc,
4670 			     const char *method_name,
4671 			     const void *outbound,
4672 			     size_t outbound_size,
4673 			     void *inbound,
4674 			     size_t inbound_size)
4675 {
4676 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4677 	struct page *req_page = NULL;
4678 	struct page *reply_page;
4679 	int ret;
4680 
4681 	/*
4682 	 * Method calls are ultimately read operations.  The result
4683 	 * should placed into the inbound buffer provided.  They
4684 	 * also supply outbound data--parameters for the object
4685 	 * method.  Currently if this is present it will be a
4686 	 * snapshot id.
4687 	 */
4688 	if (outbound) {
4689 		if (outbound_size > PAGE_SIZE)
4690 			return -E2BIG;
4691 
4692 		req_page = alloc_page(GFP_KERNEL);
4693 		if (!req_page)
4694 			return -ENOMEM;
4695 
4696 		memcpy(page_address(req_page), outbound, outbound_size);
4697 	}
4698 
4699 	reply_page = alloc_page(GFP_KERNEL);
4700 	if (!reply_page) {
4701 		if (req_page)
4702 			__free_page(req_page);
4703 		return -ENOMEM;
4704 	}
4705 
4706 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4707 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
4708 			     &reply_page, &inbound_size);
4709 	if (!ret) {
4710 		memcpy(inbound, page_address(reply_page), inbound_size);
4711 		ret = inbound_size;
4712 	}
4713 
4714 	if (req_page)
4715 		__free_page(req_page);
4716 	__free_page(reply_page);
4717 	return ret;
4718 }
4719 
rbd_queue_workfn(struct work_struct * work)4720 static void rbd_queue_workfn(struct work_struct *work)
4721 {
4722 	struct rbd_img_request *img_request =
4723 	    container_of(work, struct rbd_img_request, work);
4724 	struct rbd_device *rbd_dev = img_request->rbd_dev;
4725 	enum obj_operation_type op_type = img_request->op_type;
4726 	struct request *rq = blk_mq_rq_from_pdu(img_request);
4727 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4728 	u64 length = blk_rq_bytes(rq);
4729 	u64 mapping_size;
4730 	int result;
4731 
4732 	/* Ignore/skip any zero-length requests */
4733 	if (!length) {
4734 		dout("%s: zero-length request\n", __func__);
4735 		result = 0;
4736 		goto err_img_request;
4737 	}
4738 
4739 	blk_mq_start_request(rq);
4740 
4741 	down_read(&rbd_dev->header_rwsem);
4742 	mapping_size = rbd_dev->mapping.size;
4743 	rbd_img_capture_header(img_request);
4744 	up_read(&rbd_dev->header_rwsem);
4745 
4746 	if (offset + length > mapping_size) {
4747 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4748 			 length, mapping_size);
4749 		result = -EIO;
4750 		goto err_img_request;
4751 	}
4752 
4753 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4754 	     img_request, obj_op_name(op_type), offset, length);
4755 
4756 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4757 		result = rbd_img_fill_nodata(img_request, offset, length);
4758 	else
4759 		result = rbd_img_fill_from_bio(img_request, offset, length,
4760 					       rq->bio);
4761 	if (result)
4762 		goto err_img_request;
4763 
4764 	rbd_img_handle_request(img_request, 0);
4765 	return;
4766 
4767 err_img_request:
4768 	rbd_img_request_destroy(img_request);
4769 	if (result)
4770 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4771 			 obj_op_name(op_type), length, offset, result);
4772 	blk_mq_end_request(rq, errno_to_blk_status(result));
4773 }
4774 
rbd_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)4775 static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4776 		const struct blk_mq_queue_data *bd)
4777 {
4778 	struct rbd_device *rbd_dev = hctx->queue->queuedata;
4779 	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4780 	enum obj_operation_type op_type;
4781 
4782 	switch (req_op(bd->rq)) {
4783 	case REQ_OP_DISCARD:
4784 		op_type = OBJ_OP_DISCARD;
4785 		break;
4786 	case REQ_OP_WRITE_ZEROES:
4787 		op_type = OBJ_OP_ZEROOUT;
4788 		break;
4789 	case REQ_OP_WRITE:
4790 		op_type = OBJ_OP_WRITE;
4791 		break;
4792 	case REQ_OP_READ:
4793 		op_type = OBJ_OP_READ;
4794 		break;
4795 	default:
4796 		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4797 		return BLK_STS_IOERR;
4798 	}
4799 
4800 	rbd_img_request_init(img_req, rbd_dev, op_type);
4801 
4802 	if (rbd_img_is_write(img_req)) {
4803 		if (rbd_is_ro(rbd_dev)) {
4804 			rbd_warn(rbd_dev, "%s on read-only mapping",
4805 				 obj_op_name(img_req->op_type));
4806 			return BLK_STS_IOERR;
4807 		}
4808 		rbd_assert(!rbd_is_snap(rbd_dev));
4809 	}
4810 
4811 	INIT_WORK(&img_req->work, rbd_queue_workfn);
4812 	queue_work(rbd_wq, &img_req->work);
4813 	return BLK_STS_OK;
4814 }
4815 
rbd_free_disk(struct rbd_device * rbd_dev)4816 static void rbd_free_disk(struct rbd_device *rbd_dev)
4817 {
4818 	blk_cleanup_queue(rbd_dev->disk->queue);
4819 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4820 	put_disk(rbd_dev->disk);
4821 	rbd_dev->disk = NULL;
4822 }
4823 
rbd_obj_read_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,void * buf,int buf_len)4824 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4825 			     struct ceph_object_id *oid,
4826 			     struct ceph_object_locator *oloc,
4827 			     void *buf, int buf_len)
4828 
4829 {
4830 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4831 	struct ceph_osd_request *req;
4832 	struct page **pages;
4833 	int num_pages = calc_pages_for(0, buf_len);
4834 	int ret;
4835 
4836 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4837 	if (!req)
4838 		return -ENOMEM;
4839 
4840 	ceph_oid_copy(&req->r_base_oid, oid);
4841 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4842 	req->r_flags = CEPH_OSD_FLAG_READ;
4843 
4844 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4845 	if (IS_ERR(pages)) {
4846 		ret = PTR_ERR(pages);
4847 		goto out_req;
4848 	}
4849 
4850 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4851 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4852 					 true);
4853 
4854 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4855 	if (ret)
4856 		goto out_req;
4857 
4858 	ceph_osdc_start_request(osdc, req, false);
4859 	ret = ceph_osdc_wait_request(osdc, req);
4860 	if (ret >= 0)
4861 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4862 
4863 out_req:
4864 	ceph_osdc_put_request(req);
4865 	return ret;
4866 }
4867 
4868 /*
4869  * Read the complete header for the given rbd device.  On successful
4870  * return, the rbd_dev->header field will contain up-to-date
4871  * information about the image.
4872  */
rbd_dev_v1_header_info(struct rbd_device * rbd_dev)4873 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4874 {
4875 	struct rbd_image_header_ondisk *ondisk = NULL;
4876 	u32 snap_count = 0;
4877 	u64 names_size = 0;
4878 	u32 want_count;
4879 	int ret;
4880 
4881 	/*
4882 	 * The complete header will include an array of its 64-bit
4883 	 * snapshot ids, followed by the names of those snapshots as
4884 	 * a contiguous block of NUL-terminated strings.  Note that
4885 	 * the number of snapshots could change by the time we read
4886 	 * it in, in which case we re-read it.
4887 	 */
4888 	do {
4889 		size_t size;
4890 
4891 		kfree(ondisk);
4892 
4893 		size = sizeof (*ondisk);
4894 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4895 		size += names_size;
4896 		ondisk = kmalloc(size, GFP_KERNEL);
4897 		if (!ondisk)
4898 			return -ENOMEM;
4899 
4900 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4901 					&rbd_dev->header_oloc, ondisk, size);
4902 		if (ret < 0)
4903 			goto out;
4904 		if ((size_t)ret < size) {
4905 			ret = -ENXIO;
4906 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4907 				size, ret);
4908 			goto out;
4909 		}
4910 		if (!rbd_dev_ondisk_valid(ondisk)) {
4911 			ret = -ENXIO;
4912 			rbd_warn(rbd_dev, "invalid header");
4913 			goto out;
4914 		}
4915 
4916 		names_size = le64_to_cpu(ondisk->snap_names_len);
4917 		want_count = snap_count;
4918 		snap_count = le32_to_cpu(ondisk->snap_count);
4919 	} while (snap_count != want_count);
4920 
4921 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4922 out:
4923 	kfree(ondisk);
4924 
4925 	return ret;
4926 }
4927 
rbd_dev_update_size(struct rbd_device * rbd_dev)4928 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4929 {
4930 	sector_t size;
4931 
4932 	/*
4933 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4934 	 * try to update its size.  If REMOVING is set, updating size
4935 	 * is just useless work since the device can't be opened.
4936 	 */
4937 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4938 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
4939 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4940 		dout("setting size to %llu sectors", (unsigned long long)size);
4941 		set_capacity(rbd_dev->disk, size);
4942 		revalidate_disk_size(rbd_dev->disk, true);
4943 	}
4944 }
4945 
rbd_dev_refresh(struct rbd_device * rbd_dev)4946 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
4947 {
4948 	u64 mapping_size;
4949 	int ret;
4950 
4951 	down_write(&rbd_dev->header_rwsem);
4952 	mapping_size = rbd_dev->mapping.size;
4953 
4954 	ret = rbd_dev_header_info(rbd_dev);
4955 	if (ret)
4956 		goto out;
4957 
4958 	/*
4959 	 * If there is a parent, see if it has disappeared due to the
4960 	 * mapped image getting flattened.
4961 	 */
4962 	if (rbd_dev->parent) {
4963 		ret = rbd_dev_v2_parent_info(rbd_dev);
4964 		if (ret)
4965 			goto out;
4966 	}
4967 
4968 	rbd_assert(!rbd_is_snap(rbd_dev));
4969 	rbd_dev->mapping.size = rbd_dev->header.image_size;
4970 
4971 out:
4972 	up_write(&rbd_dev->header_rwsem);
4973 	if (!ret && mapping_size != rbd_dev->mapping.size)
4974 		rbd_dev_update_size(rbd_dev);
4975 
4976 	return ret;
4977 }
4978 
4979 static const struct blk_mq_ops rbd_mq_ops = {
4980 	.queue_rq	= rbd_queue_rq,
4981 };
4982 
rbd_init_disk(struct rbd_device * rbd_dev)4983 static int rbd_init_disk(struct rbd_device *rbd_dev)
4984 {
4985 	struct gendisk *disk;
4986 	struct request_queue *q;
4987 	unsigned int objset_bytes =
4988 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
4989 	int err;
4990 
4991 	/* create gendisk info */
4992 	disk = alloc_disk(single_major ?
4993 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4994 			  RBD_MINORS_PER_MAJOR);
4995 	if (!disk)
4996 		return -ENOMEM;
4997 
4998 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4999 		 rbd_dev->dev_id);
5000 	disk->major = rbd_dev->major;
5001 	disk->first_minor = rbd_dev->minor;
5002 	if (single_major)
5003 		disk->flags |= GENHD_FL_EXT_DEVT;
5004 	disk->fops = &rbd_bd_ops;
5005 	disk->private_data = rbd_dev;
5006 
5007 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5008 	rbd_dev->tag_set.ops = &rbd_mq_ops;
5009 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
5010 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
5011 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
5012 	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
5013 	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
5014 
5015 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5016 	if (err)
5017 		goto out_disk;
5018 
5019 	q = blk_mq_init_queue(&rbd_dev->tag_set);
5020 	if (IS_ERR(q)) {
5021 		err = PTR_ERR(q);
5022 		goto out_tag_set;
5023 	}
5024 
5025 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5026 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5027 
5028 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5029 	q->limits.max_sectors = queue_max_hw_sectors(q);
5030 	blk_queue_max_segments(q, USHRT_MAX);
5031 	blk_queue_max_segment_size(q, UINT_MAX);
5032 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5033 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5034 
5035 	if (rbd_dev->opts->trim) {
5036 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5037 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5038 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5039 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5040 	}
5041 
5042 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5043 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
5044 
5045 	/*
5046 	 * disk_release() expects a queue ref from add_disk() and will
5047 	 * put it.  Hold an extra ref until add_disk() is called.
5048 	 */
5049 	WARN_ON(!blk_get_queue(q));
5050 	disk->queue = q;
5051 	q->queuedata = rbd_dev;
5052 
5053 	rbd_dev->disk = disk;
5054 
5055 	return 0;
5056 out_tag_set:
5057 	blk_mq_free_tag_set(&rbd_dev->tag_set);
5058 out_disk:
5059 	put_disk(disk);
5060 	return err;
5061 }
5062 
5063 /*
5064   sysfs
5065 */
5066 
dev_to_rbd_dev(struct device * dev)5067 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5068 {
5069 	return container_of(dev, struct rbd_device, dev);
5070 }
5071 
rbd_size_show(struct device * dev,struct device_attribute * attr,char * buf)5072 static ssize_t rbd_size_show(struct device *dev,
5073 			     struct device_attribute *attr, char *buf)
5074 {
5075 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5076 
5077 	return sprintf(buf, "%llu\n",
5078 		(unsigned long long)rbd_dev->mapping.size);
5079 }
5080 
rbd_features_show(struct device * dev,struct device_attribute * attr,char * buf)5081 static ssize_t rbd_features_show(struct device *dev,
5082 			     struct device_attribute *attr, char *buf)
5083 {
5084 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5085 
5086 	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
5087 }
5088 
rbd_major_show(struct device * dev,struct device_attribute * attr,char * buf)5089 static ssize_t rbd_major_show(struct device *dev,
5090 			      struct device_attribute *attr, char *buf)
5091 {
5092 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5093 
5094 	if (rbd_dev->major)
5095 		return sprintf(buf, "%d\n", rbd_dev->major);
5096 
5097 	return sprintf(buf, "(none)\n");
5098 }
5099 
rbd_minor_show(struct device * dev,struct device_attribute * attr,char * buf)5100 static ssize_t rbd_minor_show(struct device *dev,
5101 			      struct device_attribute *attr, char *buf)
5102 {
5103 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5104 
5105 	return sprintf(buf, "%d\n", rbd_dev->minor);
5106 }
5107 
rbd_client_addr_show(struct device * dev,struct device_attribute * attr,char * buf)5108 static ssize_t rbd_client_addr_show(struct device *dev,
5109 				    struct device_attribute *attr, char *buf)
5110 {
5111 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5112 	struct ceph_entity_addr *client_addr =
5113 	    ceph_client_addr(rbd_dev->rbd_client->client);
5114 
5115 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5116 		       le32_to_cpu(client_addr->nonce));
5117 }
5118 
rbd_client_id_show(struct device * dev,struct device_attribute * attr,char * buf)5119 static ssize_t rbd_client_id_show(struct device *dev,
5120 				  struct device_attribute *attr, char *buf)
5121 {
5122 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5123 
5124 	return sprintf(buf, "client%lld\n",
5125 		       ceph_client_gid(rbd_dev->rbd_client->client));
5126 }
5127 
rbd_cluster_fsid_show(struct device * dev,struct device_attribute * attr,char * buf)5128 static ssize_t rbd_cluster_fsid_show(struct device *dev,
5129 				     struct device_attribute *attr, char *buf)
5130 {
5131 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5132 
5133 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5134 }
5135 
rbd_config_info_show(struct device * dev,struct device_attribute * attr,char * buf)5136 static ssize_t rbd_config_info_show(struct device *dev,
5137 				    struct device_attribute *attr, char *buf)
5138 {
5139 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5140 
5141 	if (!capable(CAP_SYS_ADMIN))
5142 		return -EPERM;
5143 
5144 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5145 }
5146 
rbd_pool_show(struct device * dev,struct device_attribute * attr,char * buf)5147 static ssize_t rbd_pool_show(struct device *dev,
5148 			     struct device_attribute *attr, char *buf)
5149 {
5150 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5151 
5152 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5153 }
5154 
rbd_pool_id_show(struct device * dev,struct device_attribute * attr,char * buf)5155 static ssize_t rbd_pool_id_show(struct device *dev,
5156 			     struct device_attribute *attr, char *buf)
5157 {
5158 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5159 
5160 	return sprintf(buf, "%llu\n",
5161 			(unsigned long long) rbd_dev->spec->pool_id);
5162 }
5163 
rbd_pool_ns_show(struct device * dev,struct device_attribute * attr,char * buf)5164 static ssize_t rbd_pool_ns_show(struct device *dev,
5165 				struct device_attribute *attr, char *buf)
5166 {
5167 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5168 
5169 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5170 }
5171 
rbd_name_show(struct device * dev,struct device_attribute * attr,char * buf)5172 static ssize_t rbd_name_show(struct device *dev,
5173 			     struct device_attribute *attr, char *buf)
5174 {
5175 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5176 
5177 	if (rbd_dev->spec->image_name)
5178 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5179 
5180 	return sprintf(buf, "(unknown)\n");
5181 }
5182 
rbd_image_id_show(struct device * dev,struct device_attribute * attr,char * buf)5183 static ssize_t rbd_image_id_show(struct device *dev,
5184 			     struct device_attribute *attr, char *buf)
5185 {
5186 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5187 
5188 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5189 }
5190 
5191 /*
5192  * Shows the name of the currently-mapped snapshot (or
5193  * RBD_SNAP_HEAD_NAME for the base image).
5194  */
rbd_snap_show(struct device * dev,struct device_attribute * attr,char * buf)5195 static ssize_t rbd_snap_show(struct device *dev,
5196 			     struct device_attribute *attr,
5197 			     char *buf)
5198 {
5199 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5200 
5201 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5202 }
5203 
rbd_snap_id_show(struct device * dev,struct device_attribute * attr,char * buf)5204 static ssize_t rbd_snap_id_show(struct device *dev,
5205 				struct device_attribute *attr, char *buf)
5206 {
5207 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5208 
5209 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5210 }
5211 
5212 /*
5213  * For a v2 image, shows the chain of parent images, separated by empty
5214  * lines.  For v1 images or if there is no parent, shows "(no parent
5215  * image)".
5216  */
rbd_parent_show(struct device * dev,struct device_attribute * attr,char * buf)5217 static ssize_t rbd_parent_show(struct device *dev,
5218 			       struct device_attribute *attr,
5219 			       char *buf)
5220 {
5221 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5222 	ssize_t count = 0;
5223 
5224 	if (!rbd_dev->parent)
5225 		return sprintf(buf, "(no parent image)\n");
5226 
5227 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5228 		struct rbd_spec *spec = rbd_dev->parent_spec;
5229 
5230 		count += sprintf(&buf[count], "%s"
5231 			    "pool_id %llu\npool_name %s\n"
5232 			    "pool_ns %s\n"
5233 			    "image_id %s\nimage_name %s\n"
5234 			    "snap_id %llu\nsnap_name %s\n"
5235 			    "overlap %llu\n",
5236 			    !count ? "" : "\n", /* first? */
5237 			    spec->pool_id, spec->pool_name,
5238 			    spec->pool_ns ?: "",
5239 			    spec->image_id, spec->image_name ?: "(unknown)",
5240 			    spec->snap_id, spec->snap_name,
5241 			    rbd_dev->parent_overlap);
5242 	}
5243 
5244 	return count;
5245 }
5246 
rbd_image_refresh(struct device * dev,struct device_attribute * attr,const char * buf,size_t size)5247 static ssize_t rbd_image_refresh(struct device *dev,
5248 				 struct device_attribute *attr,
5249 				 const char *buf,
5250 				 size_t size)
5251 {
5252 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5253 	int ret;
5254 
5255 	if (!capable(CAP_SYS_ADMIN))
5256 		return -EPERM;
5257 
5258 	ret = rbd_dev_refresh(rbd_dev);
5259 	if (ret)
5260 		return ret;
5261 
5262 	return size;
5263 }
5264 
5265 static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5266 static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5267 static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5268 static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5269 static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5270 static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5271 static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5272 static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5273 static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5274 static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5275 static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5276 static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5277 static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5278 static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5279 static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5280 static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5281 static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5282 
5283 static struct attribute *rbd_attrs[] = {
5284 	&dev_attr_size.attr,
5285 	&dev_attr_features.attr,
5286 	&dev_attr_major.attr,
5287 	&dev_attr_minor.attr,
5288 	&dev_attr_client_addr.attr,
5289 	&dev_attr_client_id.attr,
5290 	&dev_attr_cluster_fsid.attr,
5291 	&dev_attr_config_info.attr,
5292 	&dev_attr_pool.attr,
5293 	&dev_attr_pool_id.attr,
5294 	&dev_attr_pool_ns.attr,
5295 	&dev_attr_name.attr,
5296 	&dev_attr_image_id.attr,
5297 	&dev_attr_current_snap.attr,
5298 	&dev_attr_snap_id.attr,
5299 	&dev_attr_parent.attr,
5300 	&dev_attr_refresh.attr,
5301 	NULL
5302 };
5303 
5304 static struct attribute_group rbd_attr_group = {
5305 	.attrs = rbd_attrs,
5306 };
5307 
5308 static const struct attribute_group *rbd_attr_groups[] = {
5309 	&rbd_attr_group,
5310 	NULL
5311 };
5312 
5313 static void rbd_dev_release(struct device *dev);
5314 
5315 static const struct device_type rbd_device_type = {
5316 	.name		= "rbd",
5317 	.groups		= rbd_attr_groups,
5318 	.release	= rbd_dev_release,
5319 };
5320 
rbd_spec_get(struct rbd_spec * spec)5321 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5322 {
5323 	kref_get(&spec->kref);
5324 
5325 	return spec;
5326 }
5327 
5328 static void rbd_spec_free(struct kref *kref);
rbd_spec_put(struct rbd_spec * spec)5329 static void rbd_spec_put(struct rbd_spec *spec)
5330 {
5331 	if (spec)
5332 		kref_put(&spec->kref, rbd_spec_free);
5333 }
5334 
rbd_spec_alloc(void)5335 static struct rbd_spec *rbd_spec_alloc(void)
5336 {
5337 	struct rbd_spec *spec;
5338 
5339 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5340 	if (!spec)
5341 		return NULL;
5342 
5343 	spec->pool_id = CEPH_NOPOOL;
5344 	spec->snap_id = CEPH_NOSNAP;
5345 	kref_init(&spec->kref);
5346 
5347 	return spec;
5348 }
5349 
rbd_spec_free(struct kref * kref)5350 static void rbd_spec_free(struct kref *kref)
5351 {
5352 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5353 
5354 	kfree(spec->pool_name);
5355 	kfree(spec->pool_ns);
5356 	kfree(spec->image_id);
5357 	kfree(spec->image_name);
5358 	kfree(spec->snap_name);
5359 	kfree(spec);
5360 }
5361 
rbd_dev_free(struct rbd_device * rbd_dev)5362 static void rbd_dev_free(struct rbd_device *rbd_dev)
5363 {
5364 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5365 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5366 
5367 	ceph_oid_destroy(&rbd_dev->header_oid);
5368 	ceph_oloc_destroy(&rbd_dev->header_oloc);
5369 	kfree(rbd_dev->config_info);
5370 
5371 	rbd_put_client(rbd_dev->rbd_client);
5372 	rbd_spec_put(rbd_dev->spec);
5373 	kfree(rbd_dev->opts);
5374 	kfree(rbd_dev);
5375 }
5376 
rbd_dev_release(struct device * dev)5377 static void rbd_dev_release(struct device *dev)
5378 {
5379 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5380 	bool need_put = !!rbd_dev->opts;
5381 
5382 	if (need_put) {
5383 		destroy_workqueue(rbd_dev->task_wq);
5384 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5385 	}
5386 
5387 	rbd_dev_free(rbd_dev);
5388 
5389 	/*
5390 	 * This is racy, but way better than putting module outside of
5391 	 * the release callback.  The race window is pretty small, so
5392 	 * doing something similar to dm (dm-builtin.c) is overkill.
5393 	 */
5394 	if (need_put)
5395 		module_put(THIS_MODULE);
5396 }
5397 
__rbd_dev_create(struct rbd_spec * spec)5398 static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
5399 {
5400 	struct rbd_device *rbd_dev;
5401 
5402 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5403 	if (!rbd_dev)
5404 		return NULL;
5405 
5406 	spin_lock_init(&rbd_dev->lock);
5407 	INIT_LIST_HEAD(&rbd_dev->node);
5408 	init_rwsem(&rbd_dev->header_rwsem);
5409 
5410 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5411 	ceph_oid_init(&rbd_dev->header_oid);
5412 	rbd_dev->header_oloc.pool = spec->pool_id;
5413 	if (spec->pool_ns) {
5414 		WARN_ON(!*spec->pool_ns);
5415 		rbd_dev->header_oloc.pool_ns =
5416 		    ceph_find_or_create_string(spec->pool_ns,
5417 					       strlen(spec->pool_ns));
5418 	}
5419 
5420 	mutex_init(&rbd_dev->watch_mutex);
5421 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5422 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5423 
5424 	init_rwsem(&rbd_dev->lock_rwsem);
5425 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5426 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5427 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5428 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5429 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5430 	spin_lock_init(&rbd_dev->lock_lists_lock);
5431 	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5432 	INIT_LIST_HEAD(&rbd_dev->running_list);
5433 	init_completion(&rbd_dev->acquire_wait);
5434 	init_completion(&rbd_dev->releasing_wait);
5435 
5436 	spin_lock_init(&rbd_dev->object_map_lock);
5437 
5438 	rbd_dev->dev.bus = &rbd_bus_type;
5439 	rbd_dev->dev.type = &rbd_device_type;
5440 	rbd_dev->dev.parent = &rbd_root_dev;
5441 	device_initialize(&rbd_dev->dev);
5442 
5443 	return rbd_dev;
5444 }
5445 
5446 /*
5447  * Create a mapping rbd_dev.
5448  */
rbd_dev_create(struct rbd_client * rbdc,struct rbd_spec * spec,struct rbd_options * opts)5449 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5450 					 struct rbd_spec *spec,
5451 					 struct rbd_options *opts)
5452 {
5453 	struct rbd_device *rbd_dev;
5454 
5455 	rbd_dev = __rbd_dev_create(spec);
5456 	if (!rbd_dev)
5457 		return NULL;
5458 
5459 	/* get an id and fill in device name */
5460 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5461 					 minor_to_rbd_dev_id(1 << MINORBITS),
5462 					 GFP_KERNEL);
5463 	if (rbd_dev->dev_id < 0)
5464 		goto fail_rbd_dev;
5465 
5466 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5467 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5468 						   rbd_dev->name);
5469 	if (!rbd_dev->task_wq)
5470 		goto fail_dev_id;
5471 
5472 	/* we have a ref from do_rbd_add() */
5473 	__module_get(THIS_MODULE);
5474 
5475 	rbd_dev->rbd_client = rbdc;
5476 	rbd_dev->spec = spec;
5477 	rbd_dev->opts = opts;
5478 
5479 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5480 	return rbd_dev;
5481 
5482 fail_dev_id:
5483 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5484 fail_rbd_dev:
5485 	rbd_dev_free(rbd_dev);
5486 	return NULL;
5487 }
5488 
rbd_dev_destroy(struct rbd_device * rbd_dev)5489 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5490 {
5491 	if (rbd_dev)
5492 		put_device(&rbd_dev->dev);
5493 }
5494 
5495 /*
5496  * Get the size and object order for an image snapshot, or if
5497  * snap_id is CEPH_NOSNAP, gets this information for the base
5498  * image.
5499  */
_rbd_dev_v2_snap_size(struct rbd_device * rbd_dev,u64 snap_id,u8 * order,u64 * snap_size)5500 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5501 				u8 *order, u64 *snap_size)
5502 {
5503 	__le64 snapid = cpu_to_le64(snap_id);
5504 	int ret;
5505 	struct {
5506 		u8 order;
5507 		__le64 size;
5508 	} __attribute__ ((packed)) size_buf = { 0 };
5509 
5510 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5511 				  &rbd_dev->header_oloc, "get_size",
5512 				  &snapid, sizeof(snapid),
5513 				  &size_buf, sizeof(size_buf));
5514 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5515 	if (ret < 0)
5516 		return ret;
5517 	if (ret < sizeof (size_buf))
5518 		return -ERANGE;
5519 
5520 	if (order) {
5521 		*order = size_buf.order;
5522 		dout("  order %u", (unsigned int)*order);
5523 	}
5524 	*snap_size = le64_to_cpu(size_buf.size);
5525 
5526 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5527 		(unsigned long long)snap_id,
5528 		(unsigned long long)*snap_size);
5529 
5530 	return 0;
5531 }
5532 
rbd_dev_v2_image_size(struct rbd_device * rbd_dev)5533 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5534 {
5535 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5536 					&rbd_dev->header.obj_order,
5537 					&rbd_dev->header.image_size);
5538 }
5539 
rbd_dev_v2_object_prefix(struct rbd_device * rbd_dev)5540 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5541 {
5542 	size_t size;
5543 	void *reply_buf;
5544 	int ret;
5545 	void *p;
5546 
5547 	/* Response will be an encoded string, which includes a length */
5548 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5549 	reply_buf = kzalloc(size, GFP_KERNEL);
5550 	if (!reply_buf)
5551 		return -ENOMEM;
5552 
5553 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5554 				  &rbd_dev->header_oloc, "get_object_prefix",
5555 				  NULL, 0, reply_buf, size);
5556 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5557 	if (ret < 0)
5558 		goto out;
5559 
5560 	p = reply_buf;
5561 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5562 						p + ret, NULL, GFP_NOIO);
5563 	ret = 0;
5564 
5565 	if (IS_ERR(rbd_dev->header.object_prefix)) {
5566 		ret = PTR_ERR(rbd_dev->header.object_prefix);
5567 		rbd_dev->header.object_prefix = NULL;
5568 	} else {
5569 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
5570 	}
5571 out:
5572 	kfree(reply_buf);
5573 
5574 	return ret;
5575 }
5576 
_rbd_dev_v2_snap_features(struct rbd_device * rbd_dev,u64 snap_id,bool read_only,u64 * snap_features)5577 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5578 				     bool read_only, u64 *snap_features)
5579 {
5580 	struct {
5581 		__le64 snap_id;
5582 		u8 read_only;
5583 	} features_in;
5584 	struct {
5585 		__le64 features;
5586 		__le64 incompat;
5587 	} __attribute__ ((packed)) features_buf = { 0 };
5588 	u64 unsup;
5589 	int ret;
5590 
5591 	features_in.snap_id = cpu_to_le64(snap_id);
5592 	features_in.read_only = read_only;
5593 
5594 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5595 				  &rbd_dev->header_oloc, "get_features",
5596 				  &features_in, sizeof(features_in),
5597 				  &features_buf, sizeof(features_buf));
5598 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5599 	if (ret < 0)
5600 		return ret;
5601 	if (ret < sizeof (features_buf))
5602 		return -ERANGE;
5603 
5604 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5605 	if (unsup) {
5606 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5607 			 unsup);
5608 		return -ENXIO;
5609 	}
5610 
5611 	*snap_features = le64_to_cpu(features_buf.features);
5612 
5613 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5614 		(unsigned long long)snap_id,
5615 		(unsigned long long)*snap_features,
5616 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5617 
5618 	return 0;
5619 }
5620 
rbd_dev_v2_features(struct rbd_device * rbd_dev)5621 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5622 {
5623 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5624 					 rbd_is_ro(rbd_dev),
5625 					 &rbd_dev->header.features);
5626 }
5627 
5628 /*
5629  * These are generic image flags, but since they are used only for
5630  * object map, store them in rbd_dev->object_map_flags.
5631  *
5632  * For the same reason, this function is called only on object map
5633  * (re)load and not on header refresh.
5634  */
rbd_dev_v2_get_flags(struct rbd_device * rbd_dev)5635 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5636 {
5637 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5638 	__le64 flags;
5639 	int ret;
5640 
5641 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5642 				  &rbd_dev->header_oloc, "get_flags",
5643 				  &snapid, sizeof(snapid),
5644 				  &flags, sizeof(flags));
5645 	if (ret < 0)
5646 		return ret;
5647 	if (ret < sizeof(flags))
5648 		return -EBADMSG;
5649 
5650 	rbd_dev->object_map_flags = le64_to_cpu(flags);
5651 	return 0;
5652 }
5653 
5654 struct parent_image_info {
5655 	u64		pool_id;
5656 	const char	*pool_ns;
5657 	const char	*image_id;
5658 	u64		snap_id;
5659 
5660 	bool		has_overlap;
5661 	u64		overlap;
5662 };
5663 
5664 /*
5665  * The caller is responsible for @pii.
5666  */
decode_parent_image_spec(void ** p,void * end,struct parent_image_info * pii)5667 static int decode_parent_image_spec(void **p, void *end,
5668 				    struct parent_image_info *pii)
5669 {
5670 	u8 struct_v;
5671 	u32 struct_len;
5672 	int ret;
5673 
5674 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5675 				  &struct_v, &struct_len);
5676 	if (ret)
5677 		return ret;
5678 
5679 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5680 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5681 	if (IS_ERR(pii->pool_ns)) {
5682 		ret = PTR_ERR(pii->pool_ns);
5683 		pii->pool_ns = NULL;
5684 		return ret;
5685 	}
5686 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5687 	if (IS_ERR(pii->image_id)) {
5688 		ret = PTR_ERR(pii->image_id);
5689 		pii->image_id = NULL;
5690 		return ret;
5691 	}
5692 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5693 	return 0;
5694 
5695 e_inval:
5696 	return -EINVAL;
5697 }
5698 
__get_parent_info(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5699 static int __get_parent_info(struct rbd_device *rbd_dev,
5700 			     struct page *req_page,
5701 			     struct page *reply_page,
5702 			     struct parent_image_info *pii)
5703 {
5704 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5705 	size_t reply_len = PAGE_SIZE;
5706 	void *p, *end;
5707 	int ret;
5708 
5709 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5710 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5711 			     req_page, sizeof(u64), &reply_page, &reply_len);
5712 	if (ret)
5713 		return ret == -EOPNOTSUPP ? 1 : ret;
5714 
5715 	p = page_address(reply_page);
5716 	end = p + reply_len;
5717 	ret = decode_parent_image_spec(&p, end, pii);
5718 	if (ret)
5719 		return ret;
5720 
5721 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5722 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5723 			     req_page, sizeof(u64), &reply_page, &reply_len);
5724 	if (ret)
5725 		return ret;
5726 
5727 	p = page_address(reply_page);
5728 	end = p + reply_len;
5729 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5730 	if (pii->has_overlap)
5731 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5732 
5733 	return 0;
5734 
5735 e_inval:
5736 	return -EINVAL;
5737 }
5738 
5739 /*
5740  * The caller is responsible for @pii.
5741  */
__get_parent_info_legacy(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5742 static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5743 				    struct page *req_page,
5744 				    struct page *reply_page,
5745 				    struct parent_image_info *pii)
5746 {
5747 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5748 	size_t reply_len = PAGE_SIZE;
5749 	void *p, *end;
5750 	int ret;
5751 
5752 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5753 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5754 			     req_page, sizeof(u64), &reply_page, &reply_len);
5755 	if (ret)
5756 		return ret;
5757 
5758 	p = page_address(reply_page);
5759 	end = p + reply_len;
5760 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5761 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5762 	if (IS_ERR(pii->image_id)) {
5763 		ret = PTR_ERR(pii->image_id);
5764 		pii->image_id = NULL;
5765 		return ret;
5766 	}
5767 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5768 	pii->has_overlap = true;
5769 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5770 
5771 	return 0;
5772 
5773 e_inval:
5774 	return -EINVAL;
5775 }
5776 
get_parent_info(struct rbd_device * rbd_dev,struct parent_image_info * pii)5777 static int get_parent_info(struct rbd_device *rbd_dev,
5778 			   struct parent_image_info *pii)
5779 {
5780 	struct page *req_page, *reply_page;
5781 	void *p;
5782 	int ret;
5783 
5784 	req_page = alloc_page(GFP_KERNEL);
5785 	if (!req_page)
5786 		return -ENOMEM;
5787 
5788 	reply_page = alloc_page(GFP_KERNEL);
5789 	if (!reply_page) {
5790 		__free_page(req_page);
5791 		return -ENOMEM;
5792 	}
5793 
5794 	p = page_address(req_page);
5795 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5796 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5797 	if (ret > 0)
5798 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5799 					       pii);
5800 
5801 	__free_page(req_page);
5802 	__free_page(reply_page);
5803 	return ret;
5804 }
5805 
rbd_dev_v2_parent_info(struct rbd_device * rbd_dev)5806 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5807 {
5808 	struct rbd_spec *parent_spec;
5809 	struct parent_image_info pii = { 0 };
5810 	int ret;
5811 
5812 	parent_spec = rbd_spec_alloc();
5813 	if (!parent_spec)
5814 		return -ENOMEM;
5815 
5816 	ret = get_parent_info(rbd_dev, &pii);
5817 	if (ret)
5818 		goto out_err;
5819 
5820 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5821 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5822 	     pii.has_overlap, pii.overlap);
5823 
5824 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5825 		/*
5826 		 * Either the parent never existed, or we have
5827 		 * record of it but the image got flattened so it no
5828 		 * longer has a parent.  When the parent of a
5829 		 * layered image disappears we immediately set the
5830 		 * overlap to 0.  The effect of this is that all new
5831 		 * requests will be treated as if the image had no
5832 		 * parent.
5833 		 *
5834 		 * If !pii.has_overlap, the parent image spec is not
5835 		 * applicable.  It's there to avoid duplication in each
5836 		 * snapshot record.
5837 		 */
5838 		if (rbd_dev->parent_overlap) {
5839 			rbd_dev->parent_overlap = 0;
5840 			rbd_dev_parent_put(rbd_dev);
5841 			pr_info("%s: clone image has been flattened\n",
5842 				rbd_dev->disk->disk_name);
5843 		}
5844 
5845 		goto out;	/* No parent?  No problem. */
5846 	}
5847 
5848 	/* The ceph file layout needs to fit pool id in 32 bits */
5849 
5850 	ret = -EIO;
5851 	if (pii.pool_id > (u64)U32_MAX) {
5852 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5853 			(unsigned long long)pii.pool_id, U32_MAX);
5854 		goto out_err;
5855 	}
5856 
5857 	/*
5858 	 * The parent won't change (except when the clone is
5859 	 * flattened, already handled that).  So we only need to
5860 	 * record the parent spec we have not already done so.
5861 	 */
5862 	if (!rbd_dev->parent_spec) {
5863 		parent_spec->pool_id = pii.pool_id;
5864 		if (pii.pool_ns && *pii.pool_ns) {
5865 			parent_spec->pool_ns = pii.pool_ns;
5866 			pii.pool_ns = NULL;
5867 		}
5868 		parent_spec->image_id = pii.image_id;
5869 		pii.image_id = NULL;
5870 		parent_spec->snap_id = pii.snap_id;
5871 
5872 		rbd_dev->parent_spec = parent_spec;
5873 		parent_spec = NULL;	/* rbd_dev now owns this */
5874 	}
5875 
5876 	/*
5877 	 * We always update the parent overlap.  If it's zero we issue
5878 	 * a warning, as we will proceed as if there was no parent.
5879 	 */
5880 	if (!pii.overlap) {
5881 		if (parent_spec) {
5882 			/* refresh, careful to warn just once */
5883 			if (rbd_dev->parent_overlap)
5884 				rbd_warn(rbd_dev,
5885 				    "clone now standalone (overlap became 0)");
5886 		} else {
5887 			/* initial probe */
5888 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
5889 		}
5890 	}
5891 	rbd_dev->parent_overlap = pii.overlap;
5892 
5893 out:
5894 	ret = 0;
5895 out_err:
5896 	kfree(pii.pool_ns);
5897 	kfree(pii.image_id);
5898 	rbd_spec_put(parent_spec);
5899 	return ret;
5900 }
5901 
rbd_dev_v2_striping_info(struct rbd_device * rbd_dev)5902 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5903 {
5904 	struct {
5905 		__le64 stripe_unit;
5906 		__le64 stripe_count;
5907 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5908 	size_t size = sizeof (striping_info_buf);
5909 	void *p;
5910 	int ret;
5911 
5912 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5913 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5914 				NULL, 0, &striping_info_buf, size);
5915 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5916 	if (ret < 0)
5917 		return ret;
5918 	if (ret < size)
5919 		return -ERANGE;
5920 
5921 	p = &striping_info_buf;
5922 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5923 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
5924 	return 0;
5925 }
5926 
rbd_dev_v2_data_pool(struct rbd_device * rbd_dev)5927 static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5928 {
5929 	__le64 data_pool_id;
5930 	int ret;
5931 
5932 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5933 				  &rbd_dev->header_oloc, "get_data_pool",
5934 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
5935 	if (ret < 0)
5936 		return ret;
5937 	if (ret < sizeof(data_pool_id))
5938 		return -EBADMSG;
5939 
5940 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5941 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5942 	return 0;
5943 }
5944 
rbd_dev_image_name(struct rbd_device * rbd_dev)5945 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5946 {
5947 	CEPH_DEFINE_OID_ONSTACK(oid);
5948 	size_t image_id_size;
5949 	char *image_id;
5950 	void *p;
5951 	void *end;
5952 	size_t size;
5953 	void *reply_buf = NULL;
5954 	size_t len = 0;
5955 	char *image_name = NULL;
5956 	int ret;
5957 
5958 	rbd_assert(!rbd_dev->spec->image_name);
5959 
5960 	len = strlen(rbd_dev->spec->image_id);
5961 	image_id_size = sizeof (__le32) + len;
5962 	image_id = kmalloc(image_id_size, GFP_KERNEL);
5963 	if (!image_id)
5964 		return NULL;
5965 
5966 	p = image_id;
5967 	end = image_id + image_id_size;
5968 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
5969 
5970 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5971 	reply_buf = kmalloc(size, GFP_KERNEL);
5972 	if (!reply_buf)
5973 		goto out;
5974 
5975 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5976 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5977 				  "dir_get_name", image_id, image_id_size,
5978 				  reply_buf, size);
5979 	if (ret < 0)
5980 		goto out;
5981 	p = reply_buf;
5982 	end = reply_buf + ret;
5983 
5984 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5985 	if (IS_ERR(image_name))
5986 		image_name = NULL;
5987 	else
5988 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5989 out:
5990 	kfree(reply_buf);
5991 	kfree(image_id);
5992 
5993 	return image_name;
5994 }
5995 
rbd_v1_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)5996 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5997 {
5998 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5999 	const char *snap_name;
6000 	u32 which = 0;
6001 
6002 	/* Skip over names until we find the one we are looking for */
6003 
6004 	snap_name = rbd_dev->header.snap_names;
6005 	while (which < snapc->num_snaps) {
6006 		if (!strcmp(name, snap_name))
6007 			return snapc->snaps[which];
6008 		snap_name += strlen(snap_name) + 1;
6009 		which++;
6010 	}
6011 	return CEPH_NOSNAP;
6012 }
6013 
rbd_v2_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)6014 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6015 {
6016 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6017 	u32 which;
6018 	bool found = false;
6019 	u64 snap_id;
6020 
6021 	for (which = 0; !found && which < snapc->num_snaps; which++) {
6022 		const char *snap_name;
6023 
6024 		snap_id = snapc->snaps[which];
6025 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6026 		if (IS_ERR(snap_name)) {
6027 			/* ignore no-longer existing snapshots */
6028 			if (PTR_ERR(snap_name) == -ENOENT)
6029 				continue;
6030 			else
6031 				break;
6032 		}
6033 		found = !strcmp(name, snap_name);
6034 		kfree(snap_name);
6035 	}
6036 	return found ? snap_id : CEPH_NOSNAP;
6037 }
6038 
6039 /*
6040  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6041  * no snapshot by that name is found, or if an error occurs.
6042  */
rbd_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)6043 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6044 {
6045 	if (rbd_dev->image_format == 1)
6046 		return rbd_v1_snap_id_by_name(rbd_dev, name);
6047 
6048 	return rbd_v2_snap_id_by_name(rbd_dev, name);
6049 }
6050 
6051 /*
6052  * An image being mapped will have everything but the snap id.
6053  */
rbd_spec_fill_snap_id(struct rbd_device * rbd_dev)6054 static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6055 {
6056 	struct rbd_spec *spec = rbd_dev->spec;
6057 
6058 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6059 	rbd_assert(spec->image_id && spec->image_name);
6060 	rbd_assert(spec->snap_name);
6061 
6062 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6063 		u64 snap_id;
6064 
6065 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6066 		if (snap_id == CEPH_NOSNAP)
6067 			return -ENOENT;
6068 
6069 		spec->snap_id = snap_id;
6070 	} else {
6071 		spec->snap_id = CEPH_NOSNAP;
6072 	}
6073 
6074 	return 0;
6075 }
6076 
6077 /*
6078  * A parent image will have all ids but none of the names.
6079  *
6080  * All names in an rbd spec are dynamically allocated.  It's OK if we
6081  * can't figure out the name for an image id.
6082  */
rbd_spec_fill_names(struct rbd_device * rbd_dev)6083 static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6084 {
6085 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6086 	struct rbd_spec *spec = rbd_dev->spec;
6087 	const char *pool_name;
6088 	const char *image_name;
6089 	const char *snap_name;
6090 	int ret;
6091 
6092 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
6093 	rbd_assert(spec->image_id);
6094 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
6095 
6096 	/* Get the pool name; we have to make our own copy of this */
6097 
6098 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6099 	if (!pool_name) {
6100 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6101 		return -EIO;
6102 	}
6103 	pool_name = kstrdup(pool_name, GFP_KERNEL);
6104 	if (!pool_name)
6105 		return -ENOMEM;
6106 
6107 	/* Fetch the image name; tolerate failure here */
6108 
6109 	image_name = rbd_dev_image_name(rbd_dev);
6110 	if (!image_name)
6111 		rbd_warn(rbd_dev, "unable to get image name");
6112 
6113 	/* Fetch the snapshot name */
6114 
6115 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6116 	if (IS_ERR(snap_name)) {
6117 		ret = PTR_ERR(snap_name);
6118 		goto out_err;
6119 	}
6120 
6121 	spec->pool_name = pool_name;
6122 	spec->image_name = image_name;
6123 	spec->snap_name = snap_name;
6124 
6125 	return 0;
6126 
6127 out_err:
6128 	kfree(image_name);
6129 	kfree(pool_name);
6130 	return ret;
6131 }
6132 
rbd_dev_v2_snap_context(struct rbd_device * rbd_dev)6133 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6134 {
6135 	size_t size;
6136 	int ret;
6137 	void *reply_buf;
6138 	void *p;
6139 	void *end;
6140 	u64 seq;
6141 	u32 snap_count;
6142 	struct ceph_snap_context *snapc;
6143 	u32 i;
6144 
6145 	/*
6146 	 * We'll need room for the seq value (maximum snapshot id),
6147 	 * snapshot count, and array of that many snapshot ids.
6148 	 * For now we have a fixed upper limit on the number we're
6149 	 * prepared to receive.
6150 	 */
6151 	size = sizeof (__le64) + sizeof (__le32) +
6152 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
6153 	reply_buf = kzalloc(size, GFP_KERNEL);
6154 	if (!reply_buf)
6155 		return -ENOMEM;
6156 
6157 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6158 				  &rbd_dev->header_oloc, "get_snapcontext",
6159 				  NULL, 0, reply_buf, size);
6160 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6161 	if (ret < 0)
6162 		goto out;
6163 
6164 	p = reply_buf;
6165 	end = reply_buf + ret;
6166 	ret = -ERANGE;
6167 	ceph_decode_64_safe(&p, end, seq, out);
6168 	ceph_decode_32_safe(&p, end, snap_count, out);
6169 
6170 	/*
6171 	 * Make sure the reported number of snapshot ids wouldn't go
6172 	 * beyond the end of our buffer.  But before checking that,
6173 	 * make sure the computed size of the snapshot context we
6174 	 * allocate is representable in a size_t.
6175 	 */
6176 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6177 				 / sizeof (u64)) {
6178 		ret = -EINVAL;
6179 		goto out;
6180 	}
6181 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6182 		goto out;
6183 	ret = 0;
6184 
6185 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6186 	if (!snapc) {
6187 		ret = -ENOMEM;
6188 		goto out;
6189 	}
6190 	snapc->seq = seq;
6191 	for (i = 0; i < snap_count; i++)
6192 		snapc->snaps[i] = ceph_decode_64(&p);
6193 
6194 	ceph_put_snap_context(rbd_dev->header.snapc);
6195 	rbd_dev->header.snapc = snapc;
6196 
6197 	dout("  snap context seq = %llu, snap_count = %u\n",
6198 		(unsigned long long)seq, (unsigned int)snap_count);
6199 out:
6200 	kfree(reply_buf);
6201 
6202 	return ret;
6203 }
6204 
rbd_dev_v2_snap_name(struct rbd_device * rbd_dev,u64 snap_id)6205 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6206 					u64 snap_id)
6207 {
6208 	size_t size;
6209 	void *reply_buf;
6210 	__le64 snapid;
6211 	int ret;
6212 	void *p;
6213 	void *end;
6214 	char *snap_name;
6215 
6216 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6217 	reply_buf = kmalloc(size, GFP_KERNEL);
6218 	if (!reply_buf)
6219 		return ERR_PTR(-ENOMEM);
6220 
6221 	snapid = cpu_to_le64(snap_id);
6222 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6223 				  &rbd_dev->header_oloc, "get_snapshot_name",
6224 				  &snapid, sizeof(snapid), reply_buf, size);
6225 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6226 	if (ret < 0) {
6227 		snap_name = ERR_PTR(ret);
6228 		goto out;
6229 	}
6230 
6231 	p = reply_buf;
6232 	end = reply_buf + ret;
6233 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6234 	if (IS_ERR(snap_name))
6235 		goto out;
6236 
6237 	dout("  snap_id 0x%016llx snap_name = %s\n",
6238 		(unsigned long long)snap_id, snap_name);
6239 out:
6240 	kfree(reply_buf);
6241 
6242 	return snap_name;
6243 }
6244 
rbd_dev_v2_header_info(struct rbd_device * rbd_dev)6245 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6246 {
6247 	bool first_time = rbd_dev->header.object_prefix == NULL;
6248 	int ret;
6249 
6250 	ret = rbd_dev_v2_image_size(rbd_dev);
6251 	if (ret)
6252 		return ret;
6253 
6254 	if (first_time) {
6255 		ret = rbd_dev_v2_header_onetime(rbd_dev);
6256 		if (ret)
6257 			return ret;
6258 	}
6259 
6260 	ret = rbd_dev_v2_snap_context(rbd_dev);
6261 	if (ret && first_time) {
6262 		kfree(rbd_dev->header.object_prefix);
6263 		rbd_dev->header.object_prefix = NULL;
6264 	}
6265 
6266 	return ret;
6267 }
6268 
rbd_dev_header_info(struct rbd_device * rbd_dev)6269 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6270 {
6271 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6272 
6273 	if (rbd_dev->image_format == 1)
6274 		return rbd_dev_v1_header_info(rbd_dev);
6275 
6276 	return rbd_dev_v2_header_info(rbd_dev);
6277 }
6278 
6279 /*
6280  * Skips over white space at *buf, and updates *buf to point to the
6281  * first found non-space character (if any). Returns the length of
6282  * the token (string of non-white space characters) found.  Note
6283  * that *buf must be terminated with '\0'.
6284  */
next_token(const char ** buf)6285 static inline size_t next_token(const char **buf)
6286 {
6287         /*
6288         * These are the characters that produce nonzero for
6289         * isspace() in the "C" and "POSIX" locales.
6290         */
6291         const char *spaces = " \f\n\r\t\v";
6292 
6293         *buf += strspn(*buf, spaces);	/* Find start of token */
6294 
6295 	return strcspn(*buf, spaces);   /* Return token length */
6296 }
6297 
6298 /*
6299  * Finds the next token in *buf, dynamically allocates a buffer big
6300  * enough to hold a copy of it, and copies the token into the new
6301  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6302  * that a duplicate buffer is created even for a zero-length token.
6303  *
6304  * Returns a pointer to the newly-allocated duplicate, or a null
6305  * pointer if memory for the duplicate was not available.  If
6306  * the lenp argument is a non-null pointer, the length of the token
6307  * (not including the '\0') is returned in *lenp.
6308  *
6309  * If successful, the *buf pointer will be updated to point beyond
6310  * the end of the found token.
6311  *
6312  * Note: uses GFP_KERNEL for allocation.
6313  */
dup_token(const char ** buf,size_t * lenp)6314 static inline char *dup_token(const char **buf, size_t *lenp)
6315 {
6316 	char *dup;
6317 	size_t len;
6318 
6319 	len = next_token(buf);
6320 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6321 	if (!dup)
6322 		return NULL;
6323 	*(dup + len) = '\0';
6324 	*buf += len;
6325 
6326 	if (lenp)
6327 		*lenp = len;
6328 
6329 	return dup;
6330 }
6331 
rbd_parse_param(struct fs_parameter * param,struct rbd_parse_opts_ctx * pctx)6332 static int rbd_parse_param(struct fs_parameter *param,
6333 			    struct rbd_parse_opts_ctx *pctx)
6334 {
6335 	struct rbd_options *opt = pctx->opts;
6336 	struct fs_parse_result result;
6337 	struct p_log log = {.prefix = "rbd"};
6338 	int token, ret;
6339 
6340 	ret = ceph_parse_param(param, pctx->copts, NULL);
6341 	if (ret != -ENOPARAM)
6342 		return ret;
6343 
6344 	token = __fs_parse(&log, rbd_parameters, param, &result);
6345 	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6346 	if (token < 0) {
6347 		if (token == -ENOPARAM)
6348 			return inval_plog(&log, "Unknown parameter '%s'",
6349 					  param->key);
6350 		return token;
6351 	}
6352 
6353 	switch (token) {
6354 	case Opt_queue_depth:
6355 		if (result.uint_32 < 1)
6356 			goto out_of_range;
6357 		opt->queue_depth = result.uint_32;
6358 		break;
6359 	case Opt_alloc_size:
6360 		if (result.uint_32 < SECTOR_SIZE)
6361 			goto out_of_range;
6362 		if (!is_power_of_2(result.uint_32))
6363 			return inval_plog(&log, "alloc_size must be a power of 2");
6364 		opt->alloc_size = result.uint_32;
6365 		break;
6366 	case Opt_lock_timeout:
6367 		/* 0 is "wait forever" (i.e. infinite timeout) */
6368 		if (result.uint_32 > INT_MAX / 1000)
6369 			goto out_of_range;
6370 		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6371 		break;
6372 	case Opt_pool_ns:
6373 		kfree(pctx->spec->pool_ns);
6374 		pctx->spec->pool_ns = param->string;
6375 		param->string = NULL;
6376 		break;
6377 	case Opt_compression_hint:
6378 		switch (result.uint_32) {
6379 		case Opt_compression_hint_none:
6380 			opt->alloc_hint_flags &=
6381 			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6382 			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6383 			break;
6384 		case Opt_compression_hint_compressible:
6385 			opt->alloc_hint_flags |=
6386 			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6387 			opt->alloc_hint_flags &=
6388 			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6389 			break;
6390 		case Opt_compression_hint_incompressible:
6391 			opt->alloc_hint_flags |=
6392 			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6393 			opt->alloc_hint_flags &=
6394 			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6395 			break;
6396 		default:
6397 			BUG();
6398 		}
6399 		break;
6400 	case Opt_read_only:
6401 		opt->read_only = true;
6402 		break;
6403 	case Opt_read_write:
6404 		opt->read_only = false;
6405 		break;
6406 	case Opt_lock_on_read:
6407 		opt->lock_on_read = true;
6408 		break;
6409 	case Opt_exclusive:
6410 		opt->exclusive = true;
6411 		break;
6412 	case Opt_notrim:
6413 		opt->trim = false;
6414 		break;
6415 	default:
6416 		BUG();
6417 	}
6418 
6419 	return 0;
6420 
6421 out_of_range:
6422 	return inval_plog(&log, "%s out of range", param->key);
6423 }
6424 
6425 /*
6426  * This duplicates most of generic_parse_monolithic(), untying it from
6427  * fs_context and skipping standard superblock and security options.
6428  */
rbd_parse_options(char * options,struct rbd_parse_opts_ctx * pctx)6429 static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6430 {
6431 	char *key;
6432 	int ret = 0;
6433 
6434 	dout("%s '%s'\n", __func__, options);
6435 	while ((key = strsep(&options, ",")) != NULL) {
6436 		if (*key) {
6437 			struct fs_parameter param = {
6438 				.key	= key,
6439 				.type	= fs_value_is_flag,
6440 			};
6441 			char *value = strchr(key, '=');
6442 			size_t v_len = 0;
6443 
6444 			if (value) {
6445 				if (value == key)
6446 					continue;
6447 				*value++ = 0;
6448 				v_len = strlen(value);
6449 				param.string = kmemdup_nul(value, v_len,
6450 							   GFP_KERNEL);
6451 				if (!param.string)
6452 					return -ENOMEM;
6453 				param.type = fs_value_is_string;
6454 			}
6455 			param.size = v_len;
6456 
6457 			ret = rbd_parse_param(&param, pctx);
6458 			kfree(param.string);
6459 			if (ret)
6460 				break;
6461 		}
6462 	}
6463 
6464 	return ret;
6465 }
6466 
6467 /*
6468  * Parse the options provided for an "rbd add" (i.e., rbd image
6469  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6470  * and the data written is passed here via a NUL-terminated buffer.
6471  * Returns 0 if successful or an error code otherwise.
6472  *
6473  * The information extracted from these options is recorded in
6474  * the other parameters which return dynamically-allocated
6475  * structures:
6476  *  ceph_opts
6477  *      The address of a pointer that will refer to a ceph options
6478  *      structure.  Caller must release the returned pointer using
6479  *      ceph_destroy_options() when it is no longer needed.
6480  *  rbd_opts
6481  *	Address of an rbd options pointer.  Fully initialized by
6482  *	this function; caller must release with kfree().
6483  *  spec
6484  *	Address of an rbd image specification pointer.  Fully
6485  *	initialized by this function based on parsed options.
6486  *	Caller must release with rbd_spec_put().
6487  *
6488  * The options passed take this form:
6489  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6490  * where:
6491  *  <mon_addrs>
6492  *      A comma-separated list of one or more monitor addresses.
6493  *      A monitor address is an ip address, optionally followed
6494  *      by a port number (separated by a colon).
6495  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6496  *  <options>
6497  *      A comma-separated list of ceph and/or rbd options.
6498  *  <pool_name>
6499  *      The name of the rados pool containing the rbd image.
6500  *  <image_name>
6501  *      The name of the image in that pool to map.
6502  *  <snap_id>
6503  *      An optional snapshot id.  If provided, the mapping will
6504  *      present data from the image at the time that snapshot was
6505  *      created.  The image head is used if no snapshot id is
6506  *      provided.  Snapshot mappings are always read-only.
6507  */
rbd_add_parse_args(const char * buf,struct ceph_options ** ceph_opts,struct rbd_options ** opts,struct rbd_spec ** rbd_spec)6508 static int rbd_add_parse_args(const char *buf,
6509 				struct ceph_options **ceph_opts,
6510 				struct rbd_options **opts,
6511 				struct rbd_spec **rbd_spec)
6512 {
6513 	size_t len;
6514 	char *options;
6515 	const char *mon_addrs;
6516 	char *snap_name;
6517 	size_t mon_addrs_size;
6518 	struct rbd_parse_opts_ctx pctx = { 0 };
6519 	int ret;
6520 
6521 	/* The first four tokens are required */
6522 
6523 	len = next_token(&buf);
6524 	if (!len) {
6525 		rbd_warn(NULL, "no monitor address(es) provided");
6526 		return -EINVAL;
6527 	}
6528 	mon_addrs = buf;
6529 	mon_addrs_size = len;
6530 	buf += len;
6531 
6532 	ret = -EINVAL;
6533 	options = dup_token(&buf, NULL);
6534 	if (!options)
6535 		return -ENOMEM;
6536 	if (!*options) {
6537 		rbd_warn(NULL, "no options provided");
6538 		goto out_err;
6539 	}
6540 
6541 	pctx.spec = rbd_spec_alloc();
6542 	if (!pctx.spec)
6543 		goto out_mem;
6544 
6545 	pctx.spec->pool_name = dup_token(&buf, NULL);
6546 	if (!pctx.spec->pool_name)
6547 		goto out_mem;
6548 	if (!*pctx.spec->pool_name) {
6549 		rbd_warn(NULL, "no pool name provided");
6550 		goto out_err;
6551 	}
6552 
6553 	pctx.spec->image_name = dup_token(&buf, NULL);
6554 	if (!pctx.spec->image_name)
6555 		goto out_mem;
6556 	if (!*pctx.spec->image_name) {
6557 		rbd_warn(NULL, "no image name provided");
6558 		goto out_err;
6559 	}
6560 
6561 	/*
6562 	 * Snapshot name is optional; default is to use "-"
6563 	 * (indicating the head/no snapshot).
6564 	 */
6565 	len = next_token(&buf);
6566 	if (!len) {
6567 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6568 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6569 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6570 		ret = -ENAMETOOLONG;
6571 		goto out_err;
6572 	}
6573 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6574 	if (!snap_name)
6575 		goto out_mem;
6576 	*(snap_name + len) = '\0';
6577 	pctx.spec->snap_name = snap_name;
6578 
6579 	pctx.copts = ceph_alloc_options();
6580 	if (!pctx.copts)
6581 		goto out_mem;
6582 
6583 	/* Initialize all rbd options to the defaults */
6584 
6585 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6586 	if (!pctx.opts)
6587 		goto out_mem;
6588 
6589 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6590 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
6591 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6592 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6593 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6594 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6595 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6596 
6597 	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6598 	if (ret)
6599 		goto out_err;
6600 
6601 	ret = rbd_parse_options(options, &pctx);
6602 	if (ret)
6603 		goto out_err;
6604 
6605 	*ceph_opts = pctx.copts;
6606 	*opts = pctx.opts;
6607 	*rbd_spec = pctx.spec;
6608 	kfree(options);
6609 	return 0;
6610 
6611 out_mem:
6612 	ret = -ENOMEM;
6613 out_err:
6614 	kfree(pctx.opts);
6615 	ceph_destroy_options(pctx.copts);
6616 	rbd_spec_put(pctx.spec);
6617 	kfree(options);
6618 	return ret;
6619 }
6620 
rbd_dev_image_unlock(struct rbd_device * rbd_dev)6621 static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6622 {
6623 	down_write(&rbd_dev->lock_rwsem);
6624 	if (__rbd_is_lock_owner(rbd_dev))
6625 		__rbd_release_lock(rbd_dev);
6626 	up_write(&rbd_dev->lock_rwsem);
6627 }
6628 
6629 /*
6630  * If the wait is interrupted, an error is returned even if the lock
6631  * was successfully acquired.  rbd_dev_image_unlock() will release it
6632  * if needed.
6633  */
rbd_add_acquire_lock(struct rbd_device * rbd_dev)6634 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6635 {
6636 	long ret;
6637 
6638 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6639 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6640 			return 0;
6641 
6642 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6643 		return -EINVAL;
6644 	}
6645 
6646 	if (rbd_is_ro(rbd_dev))
6647 		return 0;
6648 
6649 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6650 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6651 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6652 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6653 	if (ret > 0) {
6654 		ret = rbd_dev->acquire_err;
6655 	} else {
6656 		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6657 		if (!ret)
6658 			ret = -ETIMEDOUT;
6659 	}
6660 
6661 	if (ret) {
6662 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6663 		return ret;
6664 	}
6665 
6666 	/*
6667 	 * The lock may have been released by now, unless automatic lock
6668 	 * transitions are disabled.
6669 	 */
6670 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6671 	return 0;
6672 }
6673 
6674 /*
6675  * An rbd format 2 image has a unique identifier, distinct from the
6676  * name given to it by the user.  Internally, that identifier is
6677  * what's used to specify the names of objects related to the image.
6678  *
6679  * A special "rbd id" object is used to map an rbd image name to its
6680  * id.  If that object doesn't exist, then there is no v2 rbd image
6681  * with the supplied name.
6682  *
6683  * This function will record the given rbd_dev's image_id field if
6684  * it can be determined, and in that case will return 0.  If any
6685  * errors occur a negative errno will be returned and the rbd_dev's
6686  * image_id field will be unchanged (and should be NULL).
6687  */
rbd_dev_image_id(struct rbd_device * rbd_dev)6688 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6689 {
6690 	int ret;
6691 	size_t size;
6692 	CEPH_DEFINE_OID_ONSTACK(oid);
6693 	void *response;
6694 	char *image_id;
6695 
6696 	/*
6697 	 * When probing a parent image, the image id is already
6698 	 * known (and the image name likely is not).  There's no
6699 	 * need to fetch the image id again in this case.  We
6700 	 * do still need to set the image format though.
6701 	 */
6702 	if (rbd_dev->spec->image_id) {
6703 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6704 
6705 		return 0;
6706 	}
6707 
6708 	/*
6709 	 * First, see if the format 2 image id file exists, and if
6710 	 * so, get the image's persistent id from it.
6711 	 */
6712 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6713 			       rbd_dev->spec->image_name);
6714 	if (ret)
6715 		return ret;
6716 
6717 	dout("rbd id object name is %s\n", oid.name);
6718 
6719 	/* Response will be an encoded string, which includes a length */
6720 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6721 	response = kzalloc(size, GFP_NOIO);
6722 	if (!response) {
6723 		ret = -ENOMEM;
6724 		goto out;
6725 	}
6726 
6727 	/* If it doesn't exist we'll assume it's a format 1 image */
6728 
6729 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6730 				  "get_id", NULL, 0,
6731 				  response, size);
6732 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6733 	if (ret == -ENOENT) {
6734 		image_id = kstrdup("", GFP_KERNEL);
6735 		ret = image_id ? 0 : -ENOMEM;
6736 		if (!ret)
6737 			rbd_dev->image_format = 1;
6738 	} else if (ret >= 0) {
6739 		void *p = response;
6740 
6741 		image_id = ceph_extract_encoded_string(&p, p + ret,
6742 						NULL, GFP_NOIO);
6743 		ret = PTR_ERR_OR_ZERO(image_id);
6744 		if (!ret)
6745 			rbd_dev->image_format = 2;
6746 	}
6747 
6748 	if (!ret) {
6749 		rbd_dev->spec->image_id = image_id;
6750 		dout("image_id is %s\n", image_id);
6751 	}
6752 out:
6753 	kfree(response);
6754 	ceph_oid_destroy(&oid);
6755 	return ret;
6756 }
6757 
6758 /*
6759  * Undo whatever state changes are made by v1 or v2 header info
6760  * call.
6761  */
rbd_dev_unprobe(struct rbd_device * rbd_dev)6762 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6763 {
6764 	struct rbd_image_header	*header;
6765 
6766 	rbd_dev_parent_put(rbd_dev);
6767 	rbd_object_map_free(rbd_dev);
6768 	rbd_dev_mapping_clear(rbd_dev);
6769 
6770 	/* Free dynamic fields from the header, then zero it out */
6771 
6772 	header = &rbd_dev->header;
6773 	ceph_put_snap_context(header->snapc);
6774 	kfree(header->snap_sizes);
6775 	kfree(header->snap_names);
6776 	kfree(header->object_prefix);
6777 	memset(header, 0, sizeof (*header));
6778 }
6779 
rbd_dev_v2_header_onetime(struct rbd_device * rbd_dev)6780 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6781 {
6782 	int ret;
6783 
6784 	ret = rbd_dev_v2_object_prefix(rbd_dev);
6785 	if (ret)
6786 		goto out_err;
6787 
6788 	/*
6789 	 * Get the and check features for the image.  Currently the
6790 	 * features are assumed to never change.
6791 	 */
6792 	ret = rbd_dev_v2_features(rbd_dev);
6793 	if (ret)
6794 		goto out_err;
6795 
6796 	/* If the image supports fancy striping, get its parameters */
6797 
6798 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6799 		ret = rbd_dev_v2_striping_info(rbd_dev);
6800 		if (ret < 0)
6801 			goto out_err;
6802 	}
6803 
6804 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6805 		ret = rbd_dev_v2_data_pool(rbd_dev);
6806 		if (ret)
6807 			goto out_err;
6808 	}
6809 
6810 	rbd_init_layout(rbd_dev);
6811 	return 0;
6812 
6813 out_err:
6814 	rbd_dev->header.features = 0;
6815 	kfree(rbd_dev->header.object_prefix);
6816 	rbd_dev->header.object_prefix = NULL;
6817 	return ret;
6818 }
6819 
6820 /*
6821  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6822  * rbd_dev_image_probe() recursion depth, which means it's also the
6823  * length of the already discovered part of the parent chain.
6824  */
rbd_dev_probe_parent(struct rbd_device * rbd_dev,int depth)6825 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6826 {
6827 	struct rbd_device *parent = NULL;
6828 	int ret;
6829 
6830 	if (!rbd_dev->parent_spec)
6831 		return 0;
6832 
6833 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6834 		pr_info("parent chain is too long (%d)\n", depth);
6835 		ret = -EINVAL;
6836 		goto out_err;
6837 	}
6838 
6839 	parent = __rbd_dev_create(rbd_dev->parent_spec);
6840 	if (!parent) {
6841 		ret = -ENOMEM;
6842 		goto out_err;
6843 	}
6844 
6845 	/*
6846 	 * Images related by parent/child relationships always share
6847 	 * rbd_client and spec/parent_spec, so bump their refcounts.
6848 	 */
6849 	parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
6850 	parent->spec = rbd_spec_get(rbd_dev->parent_spec);
6851 
6852 	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6853 
6854 	ret = rbd_dev_image_probe(parent, depth);
6855 	if (ret < 0)
6856 		goto out_err;
6857 
6858 	rbd_dev->parent = parent;
6859 	atomic_set(&rbd_dev->parent_ref, 1);
6860 	return 0;
6861 
6862 out_err:
6863 	rbd_dev_unparent(rbd_dev);
6864 	rbd_dev_destroy(parent);
6865 	return ret;
6866 }
6867 
rbd_dev_device_release(struct rbd_device * rbd_dev)6868 static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6869 {
6870 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6871 	rbd_free_disk(rbd_dev);
6872 	if (!single_major)
6873 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6874 }
6875 
6876 /*
6877  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6878  * upon return.
6879  */
rbd_dev_device_setup(struct rbd_device * rbd_dev)6880 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6881 {
6882 	int ret;
6883 
6884 	/* Record our major and minor device numbers. */
6885 
6886 	if (!single_major) {
6887 		ret = register_blkdev(0, rbd_dev->name);
6888 		if (ret < 0)
6889 			goto err_out_unlock;
6890 
6891 		rbd_dev->major = ret;
6892 		rbd_dev->minor = 0;
6893 	} else {
6894 		rbd_dev->major = rbd_major;
6895 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6896 	}
6897 
6898 	/* Set up the blkdev mapping. */
6899 
6900 	ret = rbd_init_disk(rbd_dev);
6901 	if (ret)
6902 		goto err_out_blkdev;
6903 
6904 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6905 	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6906 
6907 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6908 	if (ret)
6909 		goto err_out_disk;
6910 
6911 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6912 	up_write(&rbd_dev->header_rwsem);
6913 	return 0;
6914 
6915 err_out_disk:
6916 	rbd_free_disk(rbd_dev);
6917 err_out_blkdev:
6918 	if (!single_major)
6919 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6920 err_out_unlock:
6921 	up_write(&rbd_dev->header_rwsem);
6922 	return ret;
6923 }
6924 
rbd_dev_header_name(struct rbd_device * rbd_dev)6925 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6926 {
6927 	struct rbd_spec *spec = rbd_dev->spec;
6928 	int ret;
6929 
6930 	/* Record the header object name for this rbd image. */
6931 
6932 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6933 	if (rbd_dev->image_format == 1)
6934 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6935 				       spec->image_name, RBD_SUFFIX);
6936 	else
6937 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6938 				       RBD_HEADER_PREFIX, spec->image_id);
6939 
6940 	return ret;
6941 }
6942 
rbd_print_dne(struct rbd_device * rbd_dev,bool is_snap)6943 static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6944 {
6945 	if (!is_snap) {
6946 		pr_info("image %s/%s%s%s does not exist\n",
6947 			rbd_dev->spec->pool_name,
6948 			rbd_dev->spec->pool_ns ?: "",
6949 			rbd_dev->spec->pool_ns ? "/" : "",
6950 			rbd_dev->spec->image_name);
6951 	} else {
6952 		pr_info("snap %s/%s%s%s@%s does not exist\n",
6953 			rbd_dev->spec->pool_name,
6954 			rbd_dev->spec->pool_ns ?: "",
6955 			rbd_dev->spec->pool_ns ? "/" : "",
6956 			rbd_dev->spec->image_name,
6957 			rbd_dev->spec->snap_name);
6958 	}
6959 }
6960 
rbd_dev_image_release(struct rbd_device * rbd_dev)6961 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6962 {
6963 	if (!rbd_is_ro(rbd_dev))
6964 		rbd_unregister_watch(rbd_dev);
6965 
6966 	rbd_dev_unprobe(rbd_dev);
6967 	rbd_dev->image_format = 0;
6968 	kfree(rbd_dev->spec->image_id);
6969 	rbd_dev->spec->image_id = NULL;
6970 }
6971 
6972 /*
6973  * Probe for the existence of the header object for the given rbd
6974  * device.  If this image is the one being mapped (i.e., not a
6975  * parent), initiate a watch on its header object before using that
6976  * object to get detailed information about the rbd image.
6977  *
6978  * On success, returns with header_rwsem held for write if called
6979  * with @depth == 0.
6980  */
rbd_dev_image_probe(struct rbd_device * rbd_dev,int depth)6981 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6982 {
6983 	bool need_watch = !rbd_is_ro(rbd_dev);
6984 	int ret;
6985 
6986 	/*
6987 	 * Get the id from the image id object.  Unless there's an
6988 	 * error, rbd_dev->spec->image_id will be filled in with
6989 	 * a dynamically-allocated string, and rbd_dev->image_format
6990 	 * will be set to either 1 or 2.
6991 	 */
6992 	ret = rbd_dev_image_id(rbd_dev);
6993 	if (ret)
6994 		return ret;
6995 
6996 	ret = rbd_dev_header_name(rbd_dev);
6997 	if (ret)
6998 		goto err_out_format;
6999 
7000 	if (need_watch) {
7001 		ret = rbd_register_watch(rbd_dev);
7002 		if (ret) {
7003 			if (ret == -ENOENT)
7004 				rbd_print_dne(rbd_dev, false);
7005 			goto err_out_format;
7006 		}
7007 	}
7008 
7009 	if (!depth)
7010 		down_write(&rbd_dev->header_rwsem);
7011 
7012 	ret = rbd_dev_header_info(rbd_dev);
7013 	if (ret) {
7014 		if (ret == -ENOENT && !need_watch)
7015 			rbd_print_dne(rbd_dev, false);
7016 		goto err_out_probe;
7017 	}
7018 
7019 	/*
7020 	 * If this image is the one being mapped, we have pool name and
7021 	 * id, image name and id, and snap name - need to fill snap id.
7022 	 * Otherwise this is a parent image, identified by pool, image
7023 	 * and snap ids - need to fill in names for those ids.
7024 	 */
7025 	if (!depth)
7026 		ret = rbd_spec_fill_snap_id(rbd_dev);
7027 	else
7028 		ret = rbd_spec_fill_names(rbd_dev);
7029 	if (ret) {
7030 		if (ret == -ENOENT)
7031 			rbd_print_dne(rbd_dev, true);
7032 		goto err_out_probe;
7033 	}
7034 
7035 	ret = rbd_dev_mapping_set(rbd_dev);
7036 	if (ret)
7037 		goto err_out_probe;
7038 
7039 	if (rbd_is_snap(rbd_dev) &&
7040 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
7041 		ret = rbd_object_map_load(rbd_dev);
7042 		if (ret)
7043 			goto err_out_probe;
7044 	}
7045 
7046 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7047 		ret = rbd_dev_v2_parent_info(rbd_dev);
7048 		if (ret)
7049 			goto err_out_probe;
7050 	}
7051 
7052 	ret = rbd_dev_probe_parent(rbd_dev, depth);
7053 	if (ret)
7054 		goto err_out_probe;
7055 
7056 	dout("discovered format %u image, header name is %s\n",
7057 		rbd_dev->image_format, rbd_dev->header_oid.name);
7058 	return 0;
7059 
7060 err_out_probe:
7061 	if (!depth)
7062 		up_write(&rbd_dev->header_rwsem);
7063 	if (need_watch)
7064 		rbd_unregister_watch(rbd_dev);
7065 	rbd_dev_unprobe(rbd_dev);
7066 err_out_format:
7067 	rbd_dev->image_format = 0;
7068 	kfree(rbd_dev->spec->image_id);
7069 	rbd_dev->spec->image_id = NULL;
7070 	return ret;
7071 }
7072 
do_rbd_add(struct bus_type * bus,const char * buf,size_t count)7073 static ssize_t do_rbd_add(struct bus_type *bus,
7074 			  const char *buf,
7075 			  size_t count)
7076 {
7077 	struct rbd_device *rbd_dev = NULL;
7078 	struct ceph_options *ceph_opts = NULL;
7079 	struct rbd_options *rbd_opts = NULL;
7080 	struct rbd_spec *spec = NULL;
7081 	struct rbd_client *rbdc;
7082 	int rc;
7083 
7084 	if (!capable(CAP_SYS_ADMIN))
7085 		return -EPERM;
7086 
7087 	if (!try_module_get(THIS_MODULE))
7088 		return -ENODEV;
7089 
7090 	/* parse add command */
7091 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7092 	if (rc < 0)
7093 		goto out;
7094 
7095 	rbdc = rbd_get_client(ceph_opts);
7096 	if (IS_ERR(rbdc)) {
7097 		rc = PTR_ERR(rbdc);
7098 		goto err_out_args;
7099 	}
7100 
7101 	/* pick the pool */
7102 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
7103 	if (rc < 0) {
7104 		if (rc == -ENOENT)
7105 			pr_info("pool %s does not exist\n", spec->pool_name);
7106 		goto err_out_client;
7107 	}
7108 	spec->pool_id = (u64)rc;
7109 
7110 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7111 	if (!rbd_dev) {
7112 		rc = -ENOMEM;
7113 		goto err_out_client;
7114 	}
7115 	rbdc = NULL;		/* rbd_dev now owns this */
7116 	spec = NULL;		/* rbd_dev now owns this */
7117 	rbd_opts = NULL;	/* rbd_dev now owns this */
7118 
7119 	/* if we are mapping a snapshot it will be a read-only mapping */
7120 	if (rbd_dev->opts->read_only ||
7121 	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
7122 		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
7123 
7124 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7125 	if (!rbd_dev->config_info) {
7126 		rc = -ENOMEM;
7127 		goto err_out_rbd_dev;
7128 	}
7129 
7130 	rc = rbd_dev_image_probe(rbd_dev, 0);
7131 	if (rc < 0)
7132 		goto err_out_rbd_dev;
7133 
7134 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7135 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7136 			 rbd_dev->layout.object_size);
7137 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7138 	}
7139 
7140 	rc = rbd_dev_device_setup(rbd_dev);
7141 	if (rc)
7142 		goto err_out_image_probe;
7143 
7144 	rc = rbd_add_acquire_lock(rbd_dev);
7145 	if (rc)
7146 		goto err_out_image_lock;
7147 
7148 	/* Everything's ready.  Announce the disk to the world. */
7149 
7150 	rc = device_add(&rbd_dev->dev);
7151 	if (rc)
7152 		goto err_out_image_lock;
7153 
7154 	device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
7155 	/* see rbd_init_disk() */
7156 	blk_put_queue(rbd_dev->disk->queue);
7157 
7158 	spin_lock(&rbd_dev_list_lock);
7159 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
7160 	spin_unlock(&rbd_dev_list_lock);
7161 
7162 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7163 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7164 		rbd_dev->header.features);
7165 	rc = count;
7166 out:
7167 	module_put(THIS_MODULE);
7168 	return rc;
7169 
7170 err_out_image_lock:
7171 	rbd_dev_image_unlock(rbd_dev);
7172 	rbd_dev_device_release(rbd_dev);
7173 err_out_image_probe:
7174 	rbd_dev_image_release(rbd_dev);
7175 err_out_rbd_dev:
7176 	rbd_dev_destroy(rbd_dev);
7177 err_out_client:
7178 	rbd_put_client(rbdc);
7179 err_out_args:
7180 	rbd_spec_put(spec);
7181 	kfree(rbd_opts);
7182 	goto out;
7183 }
7184 
add_store(struct bus_type * bus,const char * buf,size_t count)7185 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7186 {
7187 	if (single_major)
7188 		return -EINVAL;
7189 
7190 	return do_rbd_add(bus, buf, count);
7191 }
7192 
add_single_major_store(struct bus_type * bus,const char * buf,size_t count)7193 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7194 				      size_t count)
7195 {
7196 	return do_rbd_add(bus, buf, count);
7197 }
7198 
rbd_dev_remove_parent(struct rbd_device * rbd_dev)7199 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7200 {
7201 	while (rbd_dev->parent) {
7202 		struct rbd_device *first = rbd_dev;
7203 		struct rbd_device *second = first->parent;
7204 		struct rbd_device *third;
7205 
7206 		/*
7207 		 * Follow to the parent with no grandparent and
7208 		 * remove it.
7209 		 */
7210 		while (second && (third = second->parent)) {
7211 			first = second;
7212 			second = third;
7213 		}
7214 		rbd_assert(second);
7215 		rbd_dev_image_release(second);
7216 		rbd_dev_destroy(second);
7217 		first->parent = NULL;
7218 		first->parent_overlap = 0;
7219 
7220 		rbd_assert(first->parent_spec);
7221 		rbd_spec_put(first->parent_spec);
7222 		first->parent_spec = NULL;
7223 	}
7224 }
7225 
do_rbd_remove(struct bus_type * bus,const char * buf,size_t count)7226 static ssize_t do_rbd_remove(struct bus_type *bus,
7227 			     const char *buf,
7228 			     size_t count)
7229 {
7230 	struct rbd_device *rbd_dev = NULL;
7231 	struct list_head *tmp;
7232 	int dev_id;
7233 	char opt_buf[6];
7234 	bool force = false;
7235 	int ret;
7236 
7237 	if (!capable(CAP_SYS_ADMIN))
7238 		return -EPERM;
7239 
7240 	dev_id = -1;
7241 	opt_buf[0] = '\0';
7242 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
7243 	if (dev_id < 0) {
7244 		pr_err("dev_id out of range\n");
7245 		return -EINVAL;
7246 	}
7247 	if (opt_buf[0] != '\0') {
7248 		if (!strcmp(opt_buf, "force")) {
7249 			force = true;
7250 		} else {
7251 			pr_err("bad remove option at '%s'\n", opt_buf);
7252 			return -EINVAL;
7253 		}
7254 	}
7255 
7256 	ret = -ENOENT;
7257 	spin_lock(&rbd_dev_list_lock);
7258 	list_for_each(tmp, &rbd_dev_list) {
7259 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7260 		if (rbd_dev->dev_id == dev_id) {
7261 			ret = 0;
7262 			break;
7263 		}
7264 	}
7265 	if (!ret) {
7266 		spin_lock_irq(&rbd_dev->lock);
7267 		if (rbd_dev->open_count && !force)
7268 			ret = -EBUSY;
7269 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7270 					  &rbd_dev->flags))
7271 			ret = -EINPROGRESS;
7272 		spin_unlock_irq(&rbd_dev->lock);
7273 	}
7274 	spin_unlock(&rbd_dev_list_lock);
7275 	if (ret)
7276 		return ret;
7277 
7278 	if (force) {
7279 		/*
7280 		 * Prevent new IO from being queued and wait for existing
7281 		 * IO to complete/fail.
7282 		 */
7283 		blk_mq_freeze_queue(rbd_dev->disk->queue);
7284 		blk_set_queue_dying(rbd_dev->disk->queue);
7285 	}
7286 
7287 	del_gendisk(rbd_dev->disk);
7288 	spin_lock(&rbd_dev_list_lock);
7289 	list_del_init(&rbd_dev->node);
7290 	spin_unlock(&rbd_dev_list_lock);
7291 	device_del(&rbd_dev->dev);
7292 
7293 	rbd_dev_image_unlock(rbd_dev);
7294 	rbd_dev_device_release(rbd_dev);
7295 	rbd_dev_image_release(rbd_dev);
7296 	rbd_dev_destroy(rbd_dev);
7297 	return count;
7298 }
7299 
remove_store(struct bus_type * bus,const char * buf,size_t count)7300 static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7301 {
7302 	if (single_major)
7303 		return -EINVAL;
7304 
7305 	return do_rbd_remove(bus, buf, count);
7306 }
7307 
remove_single_major_store(struct bus_type * bus,const char * buf,size_t count)7308 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7309 					 size_t count)
7310 {
7311 	return do_rbd_remove(bus, buf, count);
7312 }
7313 
7314 /*
7315  * create control files in sysfs
7316  * /sys/bus/rbd/...
7317  */
rbd_sysfs_init(void)7318 static int __init rbd_sysfs_init(void)
7319 {
7320 	int ret;
7321 
7322 	ret = device_register(&rbd_root_dev);
7323 	if (ret < 0)
7324 		return ret;
7325 
7326 	ret = bus_register(&rbd_bus_type);
7327 	if (ret < 0)
7328 		device_unregister(&rbd_root_dev);
7329 
7330 	return ret;
7331 }
7332 
rbd_sysfs_cleanup(void)7333 static void __exit rbd_sysfs_cleanup(void)
7334 {
7335 	bus_unregister(&rbd_bus_type);
7336 	device_unregister(&rbd_root_dev);
7337 }
7338 
rbd_slab_init(void)7339 static int __init rbd_slab_init(void)
7340 {
7341 	rbd_assert(!rbd_img_request_cache);
7342 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7343 	if (!rbd_img_request_cache)
7344 		return -ENOMEM;
7345 
7346 	rbd_assert(!rbd_obj_request_cache);
7347 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7348 	if (!rbd_obj_request_cache)
7349 		goto out_err;
7350 
7351 	return 0;
7352 
7353 out_err:
7354 	kmem_cache_destroy(rbd_img_request_cache);
7355 	rbd_img_request_cache = NULL;
7356 	return -ENOMEM;
7357 }
7358 
rbd_slab_exit(void)7359 static void rbd_slab_exit(void)
7360 {
7361 	rbd_assert(rbd_obj_request_cache);
7362 	kmem_cache_destroy(rbd_obj_request_cache);
7363 	rbd_obj_request_cache = NULL;
7364 
7365 	rbd_assert(rbd_img_request_cache);
7366 	kmem_cache_destroy(rbd_img_request_cache);
7367 	rbd_img_request_cache = NULL;
7368 }
7369 
rbd_init(void)7370 static int __init rbd_init(void)
7371 {
7372 	int rc;
7373 
7374 	if (!libceph_compatible(NULL)) {
7375 		rbd_warn(NULL, "libceph incompatibility (quitting)");
7376 		return -EINVAL;
7377 	}
7378 
7379 	rc = rbd_slab_init();
7380 	if (rc)
7381 		return rc;
7382 
7383 	/*
7384 	 * The number of active work items is limited by the number of
7385 	 * rbd devices * queue depth, so leave @max_active at default.
7386 	 */
7387 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7388 	if (!rbd_wq) {
7389 		rc = -ENOMEM;
7390 		goto err_out_slab;
7391 	}
7392 
7393 	if (single_major) {
7394 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
7395 		if (rbd_major < 0) {
7396 			rc = rbd_major;
7397 			goto err_out_wq;
7398 		}
7399 	}
7400 
7401 	rc = rbd_sysfs_init();
7402 	if (rc)
7403 		goto err_out_blkdev;
7404 
7405 	if (single_major)
7406 		pr_info("loaded (major %d)\n", rbd_major);
7407 	else
7408 		pr_info("loaded\n");
7409 
7410 	return 0;
7411 
7412 err_out_blkdev:
7413 	if (single_major)
7414 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7415 err_out_wq:
7416 	destroy_workqueue(rbd_wq);
7417 err_out_slab:
7418 	rbd_slab_exit();
7419 	return rc;
7420 }
7421 
rbd_exit(void)7422 static void __exit rbd_exit(void)
7423 {
7424 	ida_destroy(&rbd_dev_id_ida);
7425 	rbd_sysfs_cleanup();
7426 	if (single_major)
7427 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7428 	destroy_workqueue(rbd_wq);
7429 	rbd_slab_exit();
7430 }
7431 
7432 module_init(rbd_init);
7433 module_exit(rbd_exit);
7434 
7435 MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7436 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7437 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7438 /* following authorship retained from original osdblk.c */
7439 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7440 
7441 MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7442 MODULE_LICENSE("GPL");
7443