1
2 /*
3 rbd.c -- Export ceph rados objects as a Linux block device
4
5
6 based on drivers/block/osdblk.c:
7
8 Copyright 2009 Red Hat, Inc.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING. If not, write to
21 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
25 For usage instructions, please refer to:
26
27 Documentation/ABI/testing/sysfs-bus-rbd
28
29 */
30
31 #include <linux/ceph/libceph.h>
32 #include <linux/ceph/osd_client.h>
33 #include <linux/ceph/mon_client.h>
34 #include <linux/ceph/cls_lock_client.h>
35 #include <linux/ceph/striper.h>
36 #include <linux/ceph/decode.h>
37 #include <linux/fs_parser.h>
38 #include <linux/bsearch.h>
39
40 #include <linux/kernel.h>
41 #include <linux/device.h>
42 #include <linux/module.h>
43 #include <linux/blk-mq.h>
44 #include <linux/fs.h>
45 #include <linux/blkdev.h>
46 #include <linux/slab.h>
47 #include <linux/idr.h>
48 #include <linux/workqueue.h>
49
50 #include "rbd_types.h"
51
52 #define RBD_DEBUG /* Activate rbd_assert() calls */
53
54 /*
55 * Increment the given counter and return its value prior to the
56 * increment.  If the counter is already 0 it will not be incremented.
57 * If the counter is already at its maximum value, -EINVAL is returned
58 * without updating it.
59 */
60 static int atomic_inc_return_safe(atomic_t *v)
61 {
62 unsigned int counter;
63
64 counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65 if (counter <= (unsigned int)INT_MAX)
66 return (int)counter;
67
68 atomic_dec(v);
69
70 return -EINVAL;
71 }
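/*
 * Note: a return of 0 means the counter was already 0 and was left
 * untouched, so callers (e.g. rbd_dev_parent_get()) can distinguish
 * "already dropped to zero" from a successful increment, which always
 * returns a positive value.
 */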
72
73 /* Decrement the counter. Return the resulting value, or -EINVAL */
74 static int atomic_dec_return_safe(atomic_t *v)
75 {
76 int counter;
77
78 counter = atomic_dec_return(v);
79 if (counter >= 0)
80 return counter;
81
82 atomic_inc(v);
83
84 return -EINVAL;
85 }
86
87 #define RBD_DRV_NAME "rbd"
88
89 #define RBD_MINORS_PER_MAJOR 256
90 #define RBD_SINGLE_MAJOR_PART_SHIFT 4
91
92 #define RBD_MAX_PARENT_CHAIN_LEN 16
93
94 #define RBD_SNAP_DEV_NAME_PREFIX "snap_"
95 #define RBD_MAX_SNAP_NAME_LEN \
96 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97
98 #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
99
100 #define RBD_SNAP_HEAD_NAME "-"
101
102 #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
103
104 /* This allows a single page to hold an image name sent by OSD */
105 #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
106 #define RBD_IMAGE_ID_LEN_MAX 64
107
108 #define RBD_OBJ_PREFIX_LEN_MAX 64
109
110 #define RBD_NOTIFY_TIMEOUT 5 /* seconds */
111 #define RBD_RETRY_DELAY msecs_to_jiffies(1000)
112
113 /* Feature bits */
114
115 #define RBD_FEATURE_LAYERING (1ULL<<0)
116 #define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
117 #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
118 #define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
119 #define RBD_FEATURE_FAST_DIFF (1ULL<<4)
120 #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
121 #define RBD_FEATURE_DATA_POOL (1ULL<<7)
122 #define RBD_FEATURE_OPERATIONS (1ULL<<8)
123
124 #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
125 RBD_FEATURE_STRIPINGV2 | \
126 RBD_FEATURE_EXCLUSIVE_LOCK | \
127 RBD_FEATURE_OBJECT_MAP | \
128 RBD_FEATURE_FAST_DIFF | \
129 RBD_FEATURE_DEEP_FLATTEN | \
130 RBD_FEATURE_DATA_POOL | \
131 RBD_FEATURE_OPERATIONS)
132
133 /* Features supported by this (client software) implementation. */
134
135 #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
136
137 /*
138 * An RBD device name will be "rbd#", where the "rbd" comes from
139 * RBD_DRV_NAME above, and # is a unique integer identifier.
140 */
141 #define DEV_NAME_LEN 32
142
143 /*
144 * block device image metadata (in-memory version)
145 */
146 struct rbd_image_header {
147 /* These six fields never change for a given rbd image */
148 char *object_prefix;
149 __u8 obj_order;
150 u64 stripe_unit;
151 u64 stripe_count;
152 s64 data_pool_id;
153 u64 features; /* Might be changeable someday? */
154
155 /* The remaining fields need to be updated occasionally */
156 u64 image_size;
157 struct ceph_snap_context *snapc;
158 char *snap_names; /* format 1 only */
159 u64 *snap_sizes; /* format 1 only */
160 };
161
162 /*
163 * An rbd image specification.
164 *
165 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166 * identify an image. Each rbd_dev structure includes a pointer to
167 * an rbd_spec structure that encapsulates this identity.
168 *
169 * Each of the ids in an rbd_spec has an associated name. For a
170 * user-mapped image, the names are supplied and the ids associated
171 * with them are looked up. For a layered image, a parent image is
172 * defined by the tuple, and the names are looked up.
173 *
174 * An rbd_dev structure contains a parent_spec pointer which is
175 * non-null if the image it represents is a child in a layered
176 * image. This pointer will refer to the rbd_spec structure used
177 * by the parent rbd_dev for its own identity (i.e., the structure
178 * is shared between the parent and child).
179 *
180 * Since these structures are populated once, during the discovery
181 * phase of image construction, they are effectively immutable so
182 * we make no effort to synchronize access to them.
183 *
184 * Note that code herein does not assume the image name is known (it
185 * could be a null pointer).
186 */
187 struct rbd_spec {
188 u64 pool_id;
189 const char *pool_name;
190 const char *pool_ns; /* NULL if default, never "" */
191
192 const char *image_id;
193 const char *image_name;
194
195 u64 snap_id;
196 const char *snap_name;
197
198 struct kref kref;
199 };
200
201 /*
202 * An instance of the client. Multiple devices may share an rbd client.
203 */
204 struct rbd_client {
205 struct ceph_client *client;
206 struct kref kref;
207 struct list_head node;
208 };
209
210 struct pending_result {
211 int result; /* first nonzero result */
212 int num_pending;
213 };
214
215 struct rbd_img_request;
216
217 enum obj_request_type {
218 OBJ_REQUEST_NODATA = 1,
219 OBJ_REQUEST_BIO, /* pointer into provided bio (list) */
220 OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */
221 OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */
222 };
223
224 enum obj_operation_type {
225 OBJ_OP_READ = 1,
226 OBJ_OP_WRITE,
227 OBJ_OP_DISCARD,
228 OBJ_OP_ZEROOUT,
229 };
230
231 #define RBD_OBJ_FLAG_DELETION (1U << 0)
232 #define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1)
233 #define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2)
234 #define RBD_OBJ_FLAG_MAY_EXIST (1U << 3)
235 #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4)
236
237 enum rbd_obj_read_state {
238 RBD_OBJ_READ_START = 1,
239 RBD_OBJ_READ_OBJECT,
240 RBD_OBJ_READ_PARENT,
241 };
242
243 /*
244 * Writes go through the following state machine to deal with
245 * layering:
246 *
247 * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
248 * . | .
249 * . v .
250 * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . .
251 * . | . .
252 * . v v (deep-copyup .
253 * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) .
254 * flattened) v | . .
255 * . v . .
256 * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup .
257 * | not needed) v
258 * v .
259 * done . . . . . . . . . . . . . . . . . .
260 * ^
261 * |
262 * RBD_OBJ_WRITE_FLAT
263 *
264 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
265 * assert_exists guard is needed or not (in some cases it's not needed
266 * even if there is a parent).
267 */
268 enum rbd_obj_write_state {
269 RBD_OBJ_WRITE_START = 1,
270 RBD_OBJ_WRITE_PRE_OBJECT_MAP,
271 RBD_OBJ_WRITE_OBJECT,
272 __RBD_OBJ_WRITE_COPYUP,
273 RBD_OBJ_WRITE_COPYUP,
274 RBD_OBJ_WRITE_POST_OBJECT_MAP,
275 };
276
277 enum rbd_obj_copyup_state {
278 RBD_OBJ_COPYUP_START = 1,
279 RBD_OBJ_COPYUP_READ_PARENT,
280 __RBD_OBJ_COPYUP_OBJECT_MAPS,
281 RBD_OBJ_COPYUP_OBJECT_MAPS,
282 __RBD_OBJ_COPYUP_WRITE_OBJECT,
283 RBD_OBJ_COPYUP_WRITE_OBJECT,
284 };
285
286 struct rbd_obj_request {
287 struct ceph_object_extent ex;
288 unsigned int flags; /* RBD_OBJ_FLAG_* */
289 union {
290 enum rbd_obj_read_state read_state; /* for reads */
291 enum rbd_obj_write_state write_state; /* for writes */
292 };
293
294 struct rbd_img_request *img_request;
295 struct ceph_file_extent *img_extents;
296 u32 num_img_extents;
297
298 union {
299 struct ceph_bio_iter bio_pos;
300 struct {
301 struct ceph_bvec_iter bvec_pos;
302 u32 bvec_count;
303 u32 bvec_idx;
304 };
305 };
306
307 enum rbd_obj_copyup_state copyup_state;
308 struct bio_vec *copyup_bvecs;
309 u32 copyup_bvec_count;
310
311 struct list_head osd_reqs; /* w/ r_private_item */
312
313 struct mutex state_mutex;
314 struct pending_result pending;
315 struct kref kref;
316 };
317
318 enum img_req_flags {
319 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
320 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
321 };
322
323 enum rbd_img_state {
324 RBD_IMG_START = 1,
325 RBD_IMG_EXCLUSIVE_LOCK,
326 __RBD_IMG_OBJECT_REQUESTS,
327 RBD_IMG_OBJECT_REQUESTS,
328 };
329
330 struct rbd_img_request {
331 struct rbd_device *rbd_dev;
332 enum obj_operation_type op_type;
333 enum obj_request_type data_type;
334 unsigned long flags;
335 enum rbd_img_state state;
336 union {
337 u64 snap_id; /* for reads */
338 struct ceph_snap_context *snapc; /* for writes */
339 };
340 struct rbd_obj_request *obj_request; /* obj req initiator */
341
342 struct list_head lock_item;
343 struct list_head object_extents; /* obj_req.ex structs */
344
345 struct mutex state_mutex;
346 struct pending_result pending;
347 struct work_struct work;
348 int work_result;
349 };
350
351 #define for_each_obj_request(ireq, oreq) \
352 list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
353 #define for_each_obj_request_safe(ireq, oreq, n) \
354 list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
355
356 enum rbd_watch_state {
357 RBD_WATCH_STATE_UNREGISTERED,
358 RBD_WATCH_STATE_REGISTERED,
359 RBD_WATCH_STATE_ERROR,
360 };
361
362 enum rbd_lock_state {
363 RBD_LOCK_STATE_UNLOCKED,
364 RBD_LOCK_STATE_LOCKED,
365 RBD_LOCK_STATE_RELEASING,
366 };
367
368 /* WatchNotify::ClientId */
369 struct rbd_client_id {
370 u64 gid;
371 u64 handle;
372 };
373
374 struct rbd_mapping {
375 u64 size;
376 };
377
378 /*
379 * a single device
380 */
381 struct rbd_device {
382 int dev_id; /* blkdev unique id */
383
384 int major; /* blkdev assigned major */
385 int minor;
386 struct gendisk *disk; /* blkdev's gendisk and rq */
387
388 u32 image_format; /* Either 1 or 2 */
389 struct rbd_client *rbd_client;
390
391 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
392
393 spinlock_t lock; /* queue, flags, open_count */
394
395 struct rbd_image_header header;
396 unsigned long flags; /* possibly lock protected */
397 struct rbd_spec *spec;
398 struct rbd_options *opts;
399 char *config_info; /* add{,_single_major} string */
400
401 struct ceph_object_id header_oid;
402 struct ceph_object_locator header_oloc;
403
404 struct ceph_file_layout layout; /* used for all rbd requests */
405
406 struct mutex watch_mutex;
407 enum rbd_watch_state watch_state;
408 struct ceph_osd_linger_request *watch_handle;
409 u64 watch_cookie;
410 struct delayed_work watch_dwork;
411
412 struct rw_semaphore lock_rwsem;
413 enum rbd_lock_state lock_state;
414 char lock_cookie[32];
415 struct rbd_client_id owner_cid;
416 struct work_struct acquired_lock_work;
417 struct work_struct released_lock_work;
418 struct delayed_work lock_dwork;
419 struct work_struct unlock_work;
420 spinlock_t lock_lists_lock;
421 struct list_head acquiring_list;
422 struct list_head running_list;
423 struct completion acquire_wait;
424 int acquire_err;
425 struct completion releasing_wait;
426
427 spinlock_t object_map_lock;
428 u8 *object_map;
429 u64 object_map_size; /* in objects */
430 u64 object_map_flags;
431
432 struct workqueue_struct *task_wq;
433
434 struct rbd_spec *parent_spec;
435 u64 parent_overlap;
436 atomic_t parent_ref;
437 struct rbd_device *parent;
438
439 /* Block layer tags. */
440 struct blk_mq_tag_set tag_set;
441
442 /* protects updating the header */
443 struct rw_semaphore header_rwsem;
444
445 struct rbd_mapping mapping;
446
447 struct list_head node;
448
449 /* sysfs related */
450 struct device dev;
451 unsigned long open_count; /* protected by lock */
452 };
453
454 /*
455 * Flag bits for rbd_dev->flags:
456 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
457 * by rbd_dev->lock
458 */
459 enum rbd_dev_flags {
460 RBD_DEV_FLAG_EXISTS, /* rbd_dev_device_setup() ran */
461 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
462 RBD_DEV_FLAG_READONLY, /* -o ro or snapshot */
463 };
464
465 static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
466
467 static LIST_HEAD(rbd_dev_list); /* devices */
468 static DEFINE_SPINLOCK(rbd_dev_list_lock);
469
470 static LIST_HEAD(rbd_client_list); /* clients */
471 static DEFINE_SPINLOCK(rbd_client_list_lock);
472
473 /* Slab caches for frequently-allocated structures */
474
475 static struct kmem_cache *rbd_img_request_cache;
476 static struct kmem_cache *rbd_obj_request_cache;
477
478 static int rbd_major;
479 static DEFINE_IDA(rbd_dev_id_ida);
480
481 static struct workqueue_struct *rbd_wq;
482
483 static struct ceph_snap_context rbd_empty_snapc = {
484 .nref = REFCOUNT_INIT(1),
485 };
486
487 /*
488 * single-major requires version >= 0.75 of the userspace rbd utility.
489 */
490 static bool single_major = true;
491 module_param(single_major, bool, 0444);
492 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
493
494 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
495 static ssize_t remove_store(struct bus_type *bus, const char *buf,
496 size_t count);
497 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
498 size_t count);
499 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
500 size_t count);
501 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
502
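/*
 * In single-major mode each mapping owns a block of
 * 1 << RBD_SINGLE_MAJOR_PART_SHIFT (i.e. 16) minors for the whole
 * device and its partitions; e.g. dev_id 3 covers minors 48..63.
 */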
503 static int rbd_dev_id_to_minor(int dev_id)
504 {
505 return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
506 }
507
508 static int minor_to_rbd_dev_id(int minor)
509 {
510 return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
511 }
512
513 static bool rbd_is_ro(struct rbd_device *rbd_dev)
514 {
515 return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
516 }
517
518 static bool rbd_is_snap(struct rbd_device *rbd_dev)
519 {
520 return rbd_dev->spec->snap_id != CEPH_NOSNAP;
521 }
522
523 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
524 {
525 lockdep_assert_held(&rbd_dev->lock_rwsem);
526
527 return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
528 rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
529 }
530
531 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
532 {
533 bool is_lock_owner;
534
535 down_read(&rbd_dev->lock_rwsem);
536 is_lock_owner = __rbd_is_lock_owner(rbd_dev);
537 up_read(&rbd_dev->lock_rwsem);
538 return is_lock_owner;
539 }
540
541 static ssize_t supported_features_show(struct bus_type *bus, char *buf)
542 {
543 return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
544 }
545
546 static BUS_ATTR_WO(add);
547 static BUS_ATTR_WO(remove);
548 static BUS_ATTR_WO(add_single_major);
549 static BUS_ATTR_WO(remove_single_major);
550 static BUS_ATTR_RO(supported_features);
551
552 static struct attribute *rbd_bus_attrs[] = {
553 &bus_attr_add.attr,
554 &bus_attr_remove.attr,
555 &bus_attr_add_single_major.attr,
556 &bus_attr_remove_single_major.attr,
557 &bus_attr_supported_features.attr,
558 NULL,
559 };
560
561 static umode_t rbd_bus_is_visible(struct kobject *kobj,
562 struct attribute *attr, int index)
563 {
564 if (!single_major &&
565 (attr == &bus_attr_add_single_major.attr ||
566 attr == &bus_attr_remove_single_major.attr))
567 return 0;
568
569 return attr->mode;
570 }
571
572 static const struct attribute_group rbd_bus_group = {
573 .attrs = rbd_bus_attrs,
574 .is_visible = rbd_bus_is_visible,
575 };
576 __ATTRIBUTE_GROUPS(rbd_bus);
577
578 static struct bus_type rbd_bus_type = {
579 .name = "rbd",
580 .bus_groups = rbd_bus_groups,
581 };
582
583 static void rbd_root_dev_release(struct device *dev)
584 {
585 }
586
587 static struct device rbd_root_dev = {
588 .init_name = "rbd",
589 .release = rbd_root_dev_release,
590 };
591
592 static __printf(2, 3)
593 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
594 {
595 struct va_format vaf;
596 va_list args;
597
598 va_start(args, fmt);
599 vaf.fmt = fmt;
600 vaf.va = &args;
601
602 if (!rbd_dev)
603 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
604 else if (rbd_dev->disk)
605 printk(KERN_WARNING "%s: %s: %pV\n",
606 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
607 else if (rbd_dev->spec && rbd_dev->spec->image_name)
608 printk(KERN_WARNING "%s: image %s: %pV\n",
609 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
610 else if (rbd_dev->spec && rbd_dev->spec->image_id)
611 printk(KERN_WARNING "%s: id %s: %pV\n",
612 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
613 else /* punt */
614 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
615 RBD_DRV_NAME, rbd_dev, &vaf);
616 va_end(args);
617 }
618
619 #ifdef RBD_DEBUG
620 #define rbd_assert(expr) \
621 if (unlikely(!(expr))) { \
622 printk(KERN_ERR "\nAssertion failure in %s() " \
623 "at line %d:\n\n" \
624 "\trbd_assert(%s);\n\n", \
625 __func__, __LINE__, #expr); \
626 BUG(); \
627 }
628 #else /* !RBD_DEBUG */
629 # define rbd_assert(expr) ((void) 0)
630 #endif /* !RBD_DEBUG */
631
632 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
633
634 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
635 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
636 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
637 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
638 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
639 u64 snap_id);
640 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
641 u8 *order, u64 *snap_size);
642 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
643
644 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
645 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
646
647 /*
648 * Return true if nothing else is pending.
649 */
650 static bool pending_result_dec(struct pending_result *pending, int *result)
651 {
652 rbd_assert(pending->num_pending > 0);
653
654 if (*result && !pending->result)
655 pending->result = *result;
656 if (--pending->num_pending)
657 return false;
658
659 *result = pending->result;
660 return true;
661 }
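/*
 * The submitter is expected to prime pending->num_pending with the
 * number of sub-requests it issues; each completion feeds its result
 * in here, and only the caller that sees true finishes the parent
 * request, using the first nonzero result that was recorded.
 */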
662
663 static int rbd_open(struct block_device *bdev, fmode_t mode)
664 {
665 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
666 bool removing = false;
667
668 spin_lock_irq(&rbd_dev->lock);
669 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
670 removing = true;
671 else
672 rbd_dev->open_count++;
673 spin_unlock_irq(&rbd_dev->lock);
674 if (removing)
675 return -ENOENT;
676
677 (void) get_device(&rbd_dev->dev);
678
679 return 0;
680 }
681
682 static void rbd_release(struct gendisk *disk, fmode_t mode)
683 {
684 struct rbd_device *rbd_dev = disk->private_data;
685 unsigned long open_count_before;
686
687 spin_lock_irq(&rbd_dev->lock);
688 open_count_before = rbd_dev->open_count--;
689 spin_unlock_irq(&rbd_dev->lock);
690 rbd_assert(open_count_before > 0);
691
692 put_device(&rbd_dev->dev);
693 }
694
695 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
696 {
697 int ro;
698
699 if (get_user(ro, (int __user *)arg))
700 return -EFAULT;
701
702 /*
703 * Neither images mapped read-only nor snapshots can be marked
704 * read-write.
705 */
706 if (!ro) {
707 if (rbd_is_ro(rbd_dev))
708 return -EROFS;
709
710 rbd_assert(!rbd_is_snap(rbd_dev));
711 }
712
713 /* Let blkdev_roset() handle it */
714 return -ENOTTY;
715 }
716
717 static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
718 unsigned int cmd, unsigned long arg)
719 {
720 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
721 int ret;
722
723 switch (cmd) {
724 case BLKROSET:
725 ret = rbd_ioctl_set_ro(rbd_dev, arg);
726 break;
727 default:
728 ret = -ENOTTY;
729 }
730
731 return ret;
732 }
733
734 #ifdef CONFIG_COMPAT
735 static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
736 unsigned int cmd, unsigned long arg)
737 {
738 return rbd_ioctl(bdev, mode, cmd, arg);
739 }
740 #endif /* CONFIG_COMPAT */
741
742 static const struct block_device_operations rbd_bd_ops = {
743 .owner = THIS_MODULE,
744 .open = rbd_open,
745 .release = rbd_release,
746 .ioctl = rbd_ioctl,
747 #ifdef CONFIG_COMPAT
748 .compat_ioctl = rbd_compat_ioctl,
749 #endif
750 };
751
752 /*
753 * Initialize an rbd client instance. Success or not, this function
754 * consumes ceph_opts. Caller holds client_mutex.
755 */
756 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
757 {
758 struct rbd_client *rbdc;
759 int ret = -ENOMEM;
760
761 dout("%s:\n", __func__);
762 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
763 if (!rbdc)
764 goto out_opt;
765
766 kref_init(&rbdc->kref);
767 INIT_LIST_HEAD(&rbdc->node);
768
769 rbdc->client = ceph_create_client(ceph_opts, rbdc);
770 if (IS_ERR(rbdc->client))
771 goto out_rbdc;
772 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
773
774 ret = ceph_open_session(rbdc->client);
775 if (ret < 0)
776 goto out_client;
777
778 spin_lock(&rbd_client_list_lock);
779 list_add_tail(&rbdc->node, &rbd_client_list);
780 spin_unlock(&rbd_client_list_lock);
781
782 dout("%s: rbdc %p\n", __func__, rbdc);
783
784 return rbdc;
785 out_client:
786 ceph_destroy_client(rbdc->client);
787 out_rbdc:
788 kfree(rbdc);
789 out_opt:
790 if (ceph_opts)
791 ceph_destroy_options(ceph_opts);
792 dout("%s: error %d\n", __func__, ret);
793
794 return ERR_PTR(ret);
795 }
796
797 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
798 {
799 kref_get(&rbdc->kref);
800
801 return rbdc;
802 }
803
804 /*
805 * Find a ceph client with specific addr and configuration. If
806 * found, bump its reference count.
807 */
808 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
809 {
810 struct rbd_client *client_node;
811 bool found = false;
812
813 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
814 return NULL;
815
816 spin_lock(&rbd_client_list_lock);
817 list_for_each_entry(client_node, &rbd_client_list, node) {
818 if (!ceph_compare_options(ceph_opts, client_node->client)) {
819 __rbd_get_client(client_node);
820
821 found = true;
822 break;
823 }
824 }
825 spin_unlock(&rbd_client_list_lock);
826
827 return found ? client_node : NULL;
828 }
829
830 /*
831 * (Per device) rbd map options
832 */
833 enum {
834 Opt_queue_depth,
835 Opt_alloc_size,
836 Opt_lock_timeout,
837 /* int args above */
838 Opt_pool_ns,
839 Opt_compression_hint,
840 /* string args above */
841 Opt_read_only,
842 Opt_read_write,
843 Opt_lock_on_read,
844 Opt_exclusive,
845 Opt_notrim,
846 };
847
848 enum {
849 Opt_compression_hint_none,
850 Opt_compression_hint_compressible,
851 Opt_compression_hint_incompressible,
852 };
853
854 static const struct constant_table rbd_param_compression_hint[] = {
855 {"none", Opt_compression_hint_none},
856 {"compressible", Opt_compression_hint_compressible},
857 {"incompressible", Opt_compression_hint_incompressible},
858 {}
859 };
860
861 static const struct fs_parameter_spec rbd_parameters[] = {
862 fsparam_u32 ("alloc_size", Opt_alloc_size),
863 fsparam_enum ("compression_hint", Opt_compression_hint,
864 rbd_param_compression_hint),
865 fsparam_flag ("exclusive", Opt_exclusive),
866 fsparam_flag ("lock_on_read", Opt_lock_on_read),
867 fsparam_u32 ("lock_timeout", Opt_lock_timeout),
868 fsparam_flag ("notrim", Opt_notrim),
869 fsparam_string ("_pool_ns", Opt_pool_ns),
870 fsparam_u32 ("queue_depth", Opt_queue_depth),
871 fsparam_flag ("read_only", Opt_read_only),
872 fsparam_flag ("read_write", Opt_read_write),
873 fsparam_flag ("ro", Opt_read_only),
874 fsparam_flag ("rw", Opt_read_write),
875 {}
876 };
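/*
 * Illustrative option list (values made up): "queue_depth=128,
 * alloc_size=65536,lock_on_read,compression_hint=incompressible".
 * Flag parameters take no value, u32 parameters take key=value and the
 * compression hint takes one of the constants listed above.
 */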
877
878 struct rbd_options {
879 int queue_depth;
880 int alloc_size;
881 unsigned long lock_timeout;
882 bool read_only;
883 bool lock_on_read;
884 bool exclusive;
885 bool trim;
886
887 u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
888 };
889
890 #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
891 #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
892 #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
893 #define RBD_READ_ONLY_DEFAULT false
894 #define RBD_LOCK_ON_READ_DEFAULT false
895 #define RBD_EXCLUSIVE_DEFAULT false
896 #define RBD_TRIM_DEFAULT true
897
898 struct rbd_parse_opts_ctx {
899 struct rbd_spec *spec;
900 struct ceph_options *copts;
901 struct rbd_options *opts;
902 };
903
904 static char* obj_op_name(enum obj_operation_type op_type)
905 {
906 switch (op_type) {
907 case OBJ_OP_READ:
908 return "read";
909 case OBJ_OP_WRITE:
910 return "write";
911 case OBJ_OP_DISCARD:
912 return "discard";
913 case OBJ_OP_ZEROOUT:
914 return "zeroout";
915 default:
916 return "???";
917 }
918 }
919
920 /*
921 * Destroy ceph client
922 *
923 * rbd_client_list_lock is taken here, so the caller must not hold it.
924 */
925 static void rbd_client_release(struct kref *kref)
926 {
927 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
928
929 dout("%s: rbdc %p\n", __func__, rbdc);
930 spin_lock(&rbd_client_list_lock);
931 list_del(&rbdc->node);
932 spin_unlock(&rbd_client_list_lock);
933
934 ceph_destroy_client(rbdc->client);
935 kfree(rbdc);
936 }
937
938 /*
939 * Drop reference to ceph client node. If it's not referenced anymore, release
940 * it.
941 */
942 static void rbd_put_client(struct rbd_client *rbdc)
943 {
944 if (rbdc)
945 kref_put(&rbdc->kref, rbd_client_release);
946 }
947
948 /*
949 * Get a ceph client with specific addr and configuration; if one does
950 * not exist, create it. Either way, ceph_opts is consumed by this
951 * function.
952 */
953 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
954 {
955 struct rbd_client *rbdc;
956 int ret;
957
958 mutex_lock(&client_mutex);
959 rbdc = rbd_client_find(ceph_opts);
960 if (rbdc) {
961 ceph_destroy_options(ceph_opts);
962
963 /*
964 * Using an existing client. Make sure ->pg_pools is up to
965 * date before we look up the pool id in do_rbd_add().
966 */
967 ret = ceph_wait_for_latest_osdmap(rbdc->client,
968 rbdc->client->options->mount_timeout);
969 if (ret) {
970 rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
971 rbd_put_client(rbdc);
972 rbdc = ERR_PTR(ret);
973 }
974 } else {
975 rbdc = rbd_client_create(ceph_opts);
976 }
977 mutex_unlock(&client_mutex);
978
979 return rbdc;
980 }
981
982 static bool rbd_image_format_valid(u32 image_format)
983 {
984 return image_format == 1 || image_format == 2;
985 }
986
987 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
988 {
989 size_t size;
990 u32 snap_count;
991
992 /* The header has to start with the magic rbd header text */
993 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
994 return false;
995
996 /* The bio layer requires at least sector-sized I/O */
997
998 if (ondisk->options.order < SECTOR_SHIFT)
999 return false;
1000
1001 /* If we use u64 in a few spots we may be able to loosen this */
1002
1003 if (ondisk->options.order > 8 * sizeof (int) - 1)
1004 return false;
1005
1006 /*
1007 * The size of a snapshot header has to fit in a size_t, and
1008 * that limits the number of snapshots.
1009 */
1010 snap_count = le32_to_cpu(ondisk->snap_count);
1011 size = SIZE_MAX - sizeof (struct ceph_snap_context);
1012 if (snap_count > size / sizeof (__le64))
1013 return false;
1014
1015 /*
1016 * Not only that, but the size of the entire snapshot
1017 * header must also be representable in a size_t.
1018 */
1019 size -= snap_count * sizeof (__le64);
1020 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1021 return false;
1022
1023 return true;
1024 }
1025
1026 /*
1027 * returns the size of an object in the image
1028 */
1029 static u32 rbd_obj_bytes(struct rbd_image_header *header)
1030 {
1031 return 1U << header->obj_order;
1032 }
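/* For example, the usual default obj_order of 22 gives 4 MiB objects. */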
1033
1034 static void rbd_init_layout(struct rbd_device *rbd_dev)
1035 {
1036 if (rbd_dev->header.stripe_unit == 0 ||
1037 rbd_dev->header.stripe_count == 0) {
1038 rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1039 rbd_dev->header.stripe_count = 1;
1040 }
1041
1042 rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1043 rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1044 rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
1045 rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1046 rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1047 RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1048 }
1049
1050 /*
1051 * Fill an rbd image header with information from the given format 1
1052 * on-disk header.
1053 */
1054 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
1055 struct rbd_image_header_ondisk *ondisk)
1056 {
1057 struct rbd_image_header *header = &rbd_dev->header;
1058 bool first_time = header->object_prefix == NULL;
1059 struct ceph_snap_context *snapc;
1060 char *object_prefix = NULL;
1061 char *snap_names = NULL;
1062 u64 *snap_sizes = NULL;
1063 u32 snap_count;
1064 int ret = -ENOMEM;
1065 u32 i;
1066
1067 /* Allocate this now to avoid having to handle failure below */
1068
1069 if (first_time) {
1070 object_prefix = kstrndup(ondisk->object_prefix,
1071 sizeof(ondisk->object_prefix),
1072 GFP_KERNEL);
1073 if (!object_prefix)
1074 return -ENOMEM;
1075 }
1076
1077 /* Allocate the snapshot context and fill it in */
1078
1079 snap_count = le32_to_cpu(ondisk->snap_count);
1080 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1081 if (!snapc)
1082 goto out_err;
1083 snapc->seq = le64_to_cpu(ondisk->snap_seq);
1084 if (snap_count) {
1085 struct rbd_image_snap_ondisk *snaps;
1086 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1087
1088 /* We'll keep a copy of the snapshot names... */
1089
1090 if (snap_names_len > (u64)SIZE_MAX)
1091 goto out_2big;
1092 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1093 if (!snap_names)
1094 goto out_err;
1095
1096 /* ...as well as the array of their sizes. */
1097 snap_sizes = kmalloc_array(snap_count,
1098 sizeof(*header->snap_sizes),
1099 GFP_KERNEL);
1100 if (!snap_sizes)
1101 goto out_err;
1102
1103 /*
1104 * Copy the names, and fill in each snapshot's id
1105 * and size.
1106 *
1107 * Note that rbd_dev_v1_header_info() guarantees the
1108 * ondisk buffer we're working with has
1109 * snap_names_len bytes beyond the end of the
1110 * snapshot id array, so this memcpy() is safe.
1111 */
1112 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1113 snaps = ondisk->snaps;
1114 for (i = 0; i < snap_count; i++) {
1115 snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1116 snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1117 }
1118 }
1119
1120 /* We won't fail any more, fill in the header */
1121
1122 if (first_time) {
1123 header->object_prefix = object_prefix;
1124 header->obj_order = ondisk->options.order;
1125 rbd_init_layout(rbd_dev);
1126 } else {
1127 ceph_put_snap_context(header->snapc);
1128 kfree(header->snap_names);
1129 kfree(header->snap_sizes);
1130 }
1131
1132 /* The remaining fields always get updated (when we refresh) */
1133
1134 header->image_size = le64_to_cpu(ondisk->image_size);
1135 header->snapc = snapc;
1136 header->snap_names = snap_names;
1137 header->snap_sizes = snap_sizes;
1138
1139 return 0;
1140 out_2big:
1141 ret = -EIO;
1142 out_err:
1143 kfree(snap_sizes);
1144 kfree(snap_names);
1145 ceph_put_snap_context(snapc);
1146 kfree(object_prefix);
1147
1148 return ret;
1149 }
1150
1151 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1152 {
1153 const char *snap_name;
1154
1155 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1156
1157 /* Skip over names until we find the one we are looking for */
1158
1159 snap_name = rbd_dev->header.snap_names;
1160 while (which--)
1161 snap_name += strlen(snap_name) + 1;
1162
1163 return kstrdup(snap_name, GFP_KERNEL);
1164 }
1165
1166 /*
1167 * Snapshot id comparison function for use with qsort()/bsearch().
1168 * Note that result is for snapshots in *descending* order.
1169 */
1170 static int snapid_compare_reverse(const void *s1, const void *s2)
1171 {
1172 u64 snap_id1 = *(u64 *)s1;
1173 u64 snap_id2 = *(u64 *)s2;
1174
1175 if (snap_id1 < snap_id2)
1176 return 1;
1177 return snap_id1 == snap_id2 ? 0 : -1;
1178 }
1179
1180 /*
1181 * Search a snapshot context to see if the given snapshot id is
1182 * present.
1183 *
1184 * Returns the position of the snapshot id in the array if it's found,
1185 * or BAD_SNAP_INDEX otherwise.
1186 *
1187 * Note: The snapshot array is kept sorted (by the osd) in
1188 * reverse order, highest snapshot id first.
1189 */
1190 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1191 {
1192 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1193 u64 *found;
1194
1195 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1196 sizeof (snap_id), snapid_compare_reverse);
1197
1198 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1199 }
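/*
 * Worked example: with snapc->snaps[] = { 12, 7, 3 } (descending, as
 * kept by the osd), looking up snap_id 7 yields index 1, while a
 * missing id such as 5 yields BAD_SNAP_INDEX.
 */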
1200
1201 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1202 u64 snap_id)
1203 {
1204 u32 which;
1205 const char *snap_name;
1206
1207 which = rbd_dev_snap_index(rbd_dev, snap_id);
1208 if (which == BAD_SNAP_INDEX)
1209 return ERR_PTR(-ENOENT);
1210
1211 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1212 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1213 }
1214
1215 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1216 {
1217 if (snap_id == CEPH_NOSNAP)
1218 return RBD_SNAP_HEAD_NAME;
1219
1220 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1221 if (rbd_dev->image_format == 1)
1222 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1223
1224 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1225 }
1226
1227 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1228 u64 *snap_size)
1229 {
1230 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1231 if (snap_id == CEPH_NOSNAP) {
1232 *snap_size = rbd_dev->header.image_size;
1233 } else if (rbd_dev->image_format == 1) {
1234 u32 which;
1235
1236 which = rbd_dev_snap_index(rbd_dev, snap_id);
1237 if (which == BAD_SNAP_INDEX)
1238 return -ENOENT;
1239
1240 *snap_size = rbd_dev->header.snap_sizes[which];
1241 } else {
1242 u64 size = 0;
1243 int ret;
1244
1245 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1246 if (ret)
1247 return ret;
1248
1249 *snap_size = size;
1250 }
1251 return 0;
1252 }
1253
1254 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1255 {
1256 u64 snap_id = rbd_dev->spec->snap_id;
1257 u64 size = 0;
1258 int ret;
1259
1260 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1261 if (ret)
1262 return ret;
1263
1264 rbd_dev->mapping.size = size;
1265 return 0;
1266 }
1267
1268 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1269 {
1270 rbd_dev->mapping.size = 0;
1271 }
1272
1273 static void zero_bvec(struct bio_vec *bv)
1274 {
1275 void *buf;
1276 unsigned long flags;
1277
1278 buf = bvec_kmap_irq(bv, &flags);
1279 memset(buf, 0, bv->bv_len);
1280 flush_dcache_page(bv->bv_page);
1281 bvec_kunmap_irq(buf, &flags);
1282 }
1283
1284 static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1285 {
1286 struct ceph_bio_iter it = *bio_pos;
1287
1288 ceph_bio_iter_advance(&it, off);
1289 ceph_bio_iter_advance_step(&it, bytes, ({
1290 zero_bvec(&bv);
1291 }));
1292 }
1293
1294 static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1295 {
1296 struct ceph_bvec_iter it = *bvec_pos;
1297
1298 ceph_bvec_iter_advance(&it, off);
1299 ceph_bvec_iter_advance_step(&it, bytes, ({
1300 zero_bvec(&bv);
1301 }));
1302 }
1303
1304 /*
1305 * Zero a range in @obj_req data buffer defined by a bio (list) or
1306 * (private) bio_vec array.
1307 *
1308 * @off is relative to the start of the data buffer.
1309 */
1310 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1311 u32 bytes)
1312 {
1313 dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1314
1315 switch (obj_req->img_request->data_type) {
1316 case OBJ_REQUEST_BIO:
1317 zero_bios(&obj_req->bio_pos, off, bytes);
1318 break;
1319 case OBJ_REQUEST_BVECS:
1320 case OBJ_REQUEST_OWN_BVECS:
1321 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1322 break;
1323 default:
1324 BUG();
1325 }
1326 }
1327
1328 static void rbd_obj_request_destroy(struct kref *kref);
1329 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1330 {
1331 rbd_assert(obj_request != NULL);
1332 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1333 kref_read(&obj_request->kref));
1334 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1335 }
1336
1337 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1338 struct rbd_obj_request *obj_request)
1339 {
1340 rbd_assert(obj_request->img_request == NULL);
1341
1342 /* Image request now owns object's original reference */
1343 obj_request->img_request = img_request;
1344 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1345 }
1346
1347 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1348 struct rbd_obj_request *obj_request)
1349 {
1350 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1351 list_del(&obj_request->ex.oe_item);
1352 rbd_assert(obj_request->img_request == img_request);
1353 rbd_obj_request_put(obj_request);
1354 }
1355
1356 static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1357 {
1358 struct rbd_obj_request *obj_req = osd_req->r_priv;
1359
1360 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1361 __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1362 obj_req->ex.oe_off, obj_req->ex.oe_len);
1363 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1364 }
1365
1366 /*
1367 * The default/initial value for all image request flags is 0. Each
1368 * is conditionally set to 1 at image request initialization time
1369 * and currently never changes thereafter.
1370 */
1371 static void img_request_layered_set(struct rbd_img_request *img_request)
1372 {
1373 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1374 }
1375
1376 static bool img_request_layered_test(struct rbd_img_request *img_request)
1377 {
1378 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1379 }
1380
1381 static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1382 {
1383 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1384
1385 return !obj_req->ex.oe_off &&
1386 obj_req->ex.oe_len == rbd_dev->layout.object_size;
1387 }
1388
1389 static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1390 {
1391 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1392
1393 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1394 rbd_dev->layout.object_size;
1395 }
1396
1397 /*
1398 * Must be called after rbd_obj_calc_img_extents().
1399 */
1400 static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
1401 {
1402 rbd_assert(obj_req->img_request->snapc);
1403
1404 if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
1405 dout("%s %p objno %llu discard\n", __func__, obj_req,
1406 obj_req->ex.oe_objno);
1407 return;
1408 }
1409
1410 if (!obj_req->num_img_extents) {
1411 dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
1412 obj_req->ex.oe_objno);
1413 return;
1414 }
1415
1416 if (rbd_obj_is_entire(obj_req) &&
1417 !obj_req->img_request->snapc->num_snaps) {
1418 dout("%s %p objno %llu entire\n", __func__, obj_req,
1419 obj_req->ex.oe_objno);
1420 return;
1421 }
1422
1423 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
1424 }
1425
1426 static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1427 {
1428 return ceph_file_extents_bytes(obj_req->img_extents,
1429 obj_req->num_img_extents);
1430 }
1431
1432 static bool rbd_img_is_write(struct rbd_img_request *img_req)
1433 {
1434 switch (img_req->op_type) {
1435 case OBJ_OP_READ:
1436 return false;
1437 case OBJ_OP_WRITE:
1438 case OBJ_OP_DISCARD:
1439 case OBJ_OP_ZEROOUT:
1440 return true;
1441 default:
1442 BUG();
1443 }
1444 }
1445
1446 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1447 {
1448 struct rbd_obj_request *obj_req = osd_req->r_priv;
1449 int result;
1450
1451 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1452 osd_req->r_result, obj_req);
1453
1454 /*
1455 * Writes aren't allowed to return a data payload. In some
1456 * guarded write cases (e.g. stat + zero on an empty object)
1457 * a stat response makes it through, but we don't care.
1458 */
1459 if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1460 result = 0;
1461 else
1462 result = osd_req->r_result;
1463
1464 rbd_obj_handle_request(obj_req, result);
1465 }
1466
1467 static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1468 {
1469 struct rbd_obj_request *obj_request = osd_req->r_priv;
1470 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1471 struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1472
1473 osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
1474 osd_req->r_snapid = obj_request->img_request->snap_id;
1475 }
1476
1477 static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1478 {
1479 struct rbd_obj_request *obj_request = osd_req->r_priv;
1480
1481 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1482 ktime_get_real_ts64(&osd_req->r_mtime);
1483 osd_req->r_data_offset = obj_request->ex.oe_off;
1484 }
1485
1486 static struct ceph_osd_request *
1487 __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1488 struct ceph_snap_context *snapc, int num_ops)
1489 {
1490 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1491 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1492 struct ceph_osd_request *req;
1493 const char *name_format = rbd_dev->image_format == 1 ?
1494 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1495 int ret;
1496
1497 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1498 if (!req)
1499 return ERR_PTR(-ENOMEM);
1500
1501 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1502 req->r_callback = rbd_osd_req_callback;
1503 req->r_priv = obj_req;
1504
1505 /*
1506 * Data objects may be stored in a separate pool, but always in
1507 * the same namespace in that pool as the header in its pool.
1508 */
1509 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1510 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1511
1512 ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1513 rbd_dev->header.object_prefix,
1514 obj_req->ex.oe_objno);
1515 if (ret)
1516 return ERR_PTR(ret);
1517
1518 return req;
1519 }
1520
1521 static struct ceph_osd_request *
1522 rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1523 {
1524 rbd_assert(obj_req->img_request->snapc);
1525 return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1526 num_ops);
1527 }
1528
1529 static struct rbd_obj_request *rbd_obj_request_create(void)
1530 {
1531 struct rbd_obj_request *obj_request;
1532
1533 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1534 if (!obj_request)
1535 return NULL;
1536
1537 ceph_object_extent_init(&obj_request->ex);
1538 INIT_LIST_HEAD(&obj_request->osd_reqs);
1539 mutex_init(&obj_request->state_mutex);
1540 kref_init(&obj_request->kref);
1541
1542 dout("%s %p\n", __func__, obj_request);
1543 return obj_request;
1544 }
1545
1546 static void rbd_obj_request_destroy(struct kref *kref)
1547 {
1548 struct rbd_obj_request *obj_request;
1549 struct ceph_osd_request *osd_req;
1550 u32 i;
1551
1552 obj_request = container_of(kref, struct rbd_obj_request, kref);
1553
1554 dout("%s: obj %p\n", __func__, obj_request);
1555
1556 while (!list_empty(&obj_request->osd_reqs)) {
1557 osd_req = list_first_entry(&obj_request->osd_reqs,
1558 struct ceph_osd_request, r_private_item);
1559 list_del_init(&osd_req->r_private_item);
1560 ceph_osdc_put_request(osd_req);
1561 }
1562
1563 switch (obj_request->img_request->data_type) {
1564 case OBJ_REQUEST_NODATA:
1565 case OBJ_REQUEST_BIO:
1566 case OBJ_REQUEST_BVECS:
1567 break; /* Nothing to do */
1568 case OBJ_REQUEST_OWN_BVECS:
1569 kfree(obj_request->bvec_pos.bvecs);
1570 break;
1571 default:
1572 BUG();
1573 }
1574
1575 kfree(obj_request->img_extents);
1576 if (obj_request->copyup_bvecs) {
1577 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1578 if (obj_request->copyup_bvecs[i].bv_page)
1579 __free_page(obj_request->copyup_bvecs[i].bv_page);
1580 }
1581 kfree(obj_request->copyup_bvecs);
1582 }
1583
1584 kmem_cache_free(rbd_obj_request_cache, obj_request);
1585 }
1586
1587 /* It's OK to call this for a device with no parent */
1588
1589 static void rbd_spec_put(struct rbd_spec *spec);
1590 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1591 {
1592 rbd_dev_remove_parent(rbd_dev);
1593 rbd_spec_put(rbd_dev->parent_spec);
1594 rbd_dev->parent_spec = NULL;
1595 rbd_dev->parent_overlap = 0;
1596 }
1597
1598 /*
1599 * Parent image reference counting is used to determine when an
1600 * image's parent fields can be safely torn down--after there are no
1601 * more in-flight requests to the parent image. When the last
1602 * reference is dropped, cleaning them up is safe.
1603 */
1604 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1605 {
1606 int counter;
1607
1608 if (!rbd_dev->parent_spec)
1609 return;
1610
1611 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1612 if (counter > 0)
1613 return;
1614
1615 /* Last reference; clean up parent data structures */
1616
1617 if (!counter)
1618 rbd_dev_unparent(rbd_dev);
1619 else
1620 rbd_warn(rbd_dev, "parent reference underflow");
1621 }
1622
1623 /*
1624 * If an image has a non-zero parent overlap, get a reference to its
1625 * parent.
1626 *
1627 * Returns true if the rbd device has a parent with a non-zero
1628 * overlap and a reference for it was successfully taken, or
1629 * false otherwise.
1630 */
1631 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1632 {
1633 int counter = 0;
1634
1635 if (!rbd_dev->parent_spec)
1636 return false;
1637
1638 if (rbd_dev->parent_overlap)
1639 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1640
1641 if (counter < 0)
1642 rbd_warn(rbd_dev, "parent reference overflow");
1643
1644 return counter > 0;
1645 }
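/*
 * Note that parent_ref is pinned at 0 once it drops there --
 * atomic_inc_return_safe() refuses to increment a zero counter -- so
 * after the parent has been torn down (e.g. the image was flattened)
 * later requests simply stop taking parent references.
 */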
1646
1647 static void rbd_img_request_init(struct rbd_img_request *img_request,
1648 struct rbd_device *rbd_dev,
1649 enum obj_operation_type op_type)
1650 {
1651 memset(img_request, 0, sizeof(*img_request));
1652
1653 img_request->rbd_dev = rbd_dev;
1654 img_request->op_type = op_type;
1655
1656 INIT_LIST_HEAD(&img_request->lock_item);
1657 INIT_LIST_HEAD(&img_request->object_extents);
1658 mutex_init(&img_request->state_mutex);
1659 }
1660
1661 /*
1662 * Only snap_id is captured here, for reads. For writes, snapshot
1663 * context is captured in rbd_img_object_requests() after exclusive
1664 * lock is ensured to be held.
1665 */
1666 static void rbd_img_capture_header(struct rbd_img_request *img_req)
1667 {
1668 struct rbd_device *rbd_dev = img_req->rbd_dev;
1669
1670 lockdep_assert_held(&rbd_dev->header_rwsem);
1671
1672 if (!rbd_img_is_write(img_req))
1673 img_req->snap_id = rbd_dev->spec->snap_id;
1674
1675 if (rbd_dev_parent_get(rbd_dev))
1676 img_request_layered_set(img_req);
1677 }
1678
1679 static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1680 {
1681 struct rbd_obj_request *obj_request;
1682 struct rbd_obj_request *next_obj_request;
1683
1684 dout("%s: img %p\n", __func__, img_request);
1685
1686 WARN_ON(!list_empty(&img_request->lock_item));
1687 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1688 rbd_img_obj_request_del(img_request, obj_request);
1689
1690 if (img_request_layered_test(img_request))
1691 rbd_dev_parent_put(img_request->rbd_dev);
1692
1693 if (rbd_img_is_write(img_request))
1694 ceph_put_snap_context(img_request->snapc);
1695
1696 if (test_bit(IMG_REQ_CHILD, &img_request->flags))
1697 kmem_cache_free(rbd_img_request_cache, img_request);
1698 }
1699
1700 #define BITS_PER_OBJ 2
1701 #define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ)
1702 #define OBJ_MASK ((1 << BITS_PER_OBJ) - 1)
1703
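/*
 * Worked example of the packing above: with 2 bits per object and 4
 * objects per byte, objno 5 lands in object_map[1] at off 1, so
 * shift = (4 - 1 - 1) * BITS_PER_OBJ = 4 and its state occupies bits
 * 5:4 of that byte.  Higher offsets within a byte map to lower bits.
 */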
1704 static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1705 u64 *index, u8 *shift)
1706 {
1707 u32 off;
1708
1709 rbd_assert(objno < rbd_dev->object_map_size);
1710 *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1711 *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1712 }
1713
1714 static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1715 {
1716 u64 index;
1717 u8 shift;
1718
1719 lockdep_assert_held(&rbd_dev->object_map_lock);
1720 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1721 return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1722 }
1723
1724 static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1725 {
1726 u64 index;
1727 u8 shift;
1728 u8 *p;
1729
1730 lockdep_assert_held(&rbd_dev->object_map_lock);
1731 rbd_assert(!(val & ~OBJ_MASK));
1732
1733 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1734 p = &rbd_dev->object_map[index];
1735 *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1736 }
1737
1738 static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1739 {
1740 u8 state;
1741
1742 spin_lock(&rbd_dev->object_map_lock);
1743 state = __rbd_object_map_get(rbd_dev, objno);
1744 spin_unlock(&rbd_dev->object_map_lock);
1745 return state;
1746 }
1747
1748 static bool use_object_map(struct rbd_device *rbd_dev)
1749 {
1750 /*
1751 * An image mapped read-only can't use the object map -- it isn't
1752 * loaded because the header lock isn't acquired. Someone else can
1753 * write to the image and update the object map behind our back.
1754 *
1755 * A snapshot can't be written to, so using the object map is always
1756 * safe.
1757 */
1758 if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1759 return false;
1760
1761 return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1762 !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1763 }
1764
1765 static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1766 {
1767 u8 state;
1768
1769 /* fall back to default logic if object map is disabled or invalid */
1770 if (!use_object_map(rbd_dev))
1771 return true;
1772
1773 state = rbd_object_map_get(rbd_dev, objno);
1774 return state != OBJECT_NONEXISTENT;
1775 }
1776
1777 static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1778 struct ceph_object_id *oid)
1779 {
1780 if (snap_id == CEPH_NOSNAP)
1781 ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1782 rbd_dev->spec->image_id);
1783 else
1784 ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1785 rbd_dev->spec->image_id, snap_id);
1786 }
1787
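/*
 * Acquire the exclusive lock on the HEAD object map object.  If it is
 * already held (-EBUSY), the current locker is looked up and its lock
 * is broken, then the acquisition is retried; broke_lock ensures a
 * competing lock is broken at most once before giving up.
 */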
1788 static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1789 {
1790 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1791 CEPH_DEFINE_OID_ONSTACK(oid);
1792 u8 lock_type;
1793 char *lock_tag;
1794 struct ceph_locker *lockers;
1795 u32 num_lockers;
1796 bool broke_lock = false;
1797 int ret;
1798
1799 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1800
1801 again:
1802 ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1803 CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1804 if (ret != -EBUSY || broke_lock) {
1805 if (ret == -EEXIST)
1806 ret = 0; /* already locked by myself */
1807 if (ret)
1808 rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1809 return ret;
1810 }
1811
1812 ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1813 RBD_LOCK_NAME, &lock_type, &lock_tag,
1814 &lockers, &num_lockers);
1815 if (ret) {
1816 if (ret == -ENOENT)
1817 goto again;
1818
1819 rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1820 return ret;
1821 }
1822
1823 kfree(lock_tag);
1824 if (num_lockers == 0)
1825 goto again;
1826
1827 rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1828 ENTITY_NAME(lockers[0].id.name));
1829
1830 ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1831 RBD_LOCK_NAME, lockers[0].id.cookie,
1832 &lockers[0].id.name);
1833 ceph_free_lockers(lockers, num_lockers);
1834 if (ret) {
1835 if (ret == -ENOENT)
1836 goto again;
1837
1838 rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1839 return ret;
1840 }
1841
1842 broke_lock = true;
1843 goto again;
1844 }
1845
1846 static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1847 {
1848 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1849 CEPH_DEFINE_OID_ONSTACK(oid);
1850 int ret;
1851
1852 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1853
1854 ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1855 "");
1856 if (ret && ret != -ENOENT)
1857 rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1858 }
1859
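/*
 * The object_map_load reply begins with a "BitVector header": a le32
 * total header length, a versioned envelope (struct_v, struct_len)
 * and a le64 object count.  Only the object count is needed here; the
 * rest of the header is skipped and the caller copies out the raw bit
 * data that follows.
 */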
1860 static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1861 {
1862 u8 struct_v;
1863 u32 struct_len;
1864 u32 header_len;
1865 void *header_end;
1866 int ret;
1867
1868 ceph_decode_32_safe(p, end, header_len, e_inval);
1869 header_end = *p + header_len;
1870
1871 ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1872 &struct_len);
1873 if (ret)
1874 return ret;
1875
1876 ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1877
1878 *p = header_end;
1879 return 0;
1880
1881 e_inval:
1882 return -EINVAL;
1883 }
1884
1885 static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1886 {
1887 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1888 CEPH_DEFINE_OID_ONSTACK(oid);
1889 struct page **pages;
1890 void *p, *end;
1891 size_t reply_len;
1892 u64 num_objects;
1893 u64 object_map_bytes;
1894 u64 object_map_size;
1895 int num_pages;
1896 int ret;
1897
1898 rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1899
1900 num_objects = ceph_get_num_objects(&rbd_dev->layout,
1901 rbd_dev->mapping.size);
1902 object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1903 BITS_PER_BYTE);
1904 num_pages = calc_pages_for(0, object_map_bytes) + 1;
1905 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1906 if (IS_ERR(pages))
1907 return PTR_ERR(pages);
1908
1909 reply_len = num_pages * PAGE_SIZE;
1910 rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1911 ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1912 "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1913 NULL, 0, pages, &reply_len);
1914 if (ret)
1915 goto out;
1916
1917 p = page_address(pages[0]);
1918 end = p + min(reply_len, (size_t)PAGE_SIZE);
1919 ret = decode_object_map_header(&p, end, &object_map_size);
1920 if (ret)
1921 goto out;
1922
1923 if (object_map_size != num_objects) {
1924 rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1925 object_map_size, num_objects);
1926 ret = -EINVAL;
1927 goto out;
1928 }
1929
1930 if (offset_in_page(p) + object_map_bytes > reply_len) {
1931 ret = -EINVAL;
1932 goto out;
1933 }
1934
1935 rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1936 if (!rbd_dev->object_map) {
1937 ret = -ENOMEM;
1938 goto out;
1939 }
1940
1941 rbd_dev->object_map_size = object_map_size;
1942 ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1943 offset_in_page(p), object_map_bytes);
1944
1945 out:
1946 ceph_release_page_vector(pages, num_pages);
1947 return ret;
1948 }
1949
1950 static void rbd_object_map_free(struct rbd_device *rbd_dev)
1951 {
1952 kvfree(rbd_dev->object_map);
1953 rbd_dev->object_map = NULL;
1954 rbd_dev->object_map_size = 0;
1955 }
1956
1957 static int rbd_object_map_load(struct rbd_device *rbd_dev)
1958 {
1959 int ret;
1960
1961 ret = __rbd_object_map_load(rbd_dev);
1962 if (ret)
1963 return ret;
1964
1965 ret = rbd_dev_v2_get_flags(rbd_dev);
1966 if (ret) {
1967 rbd_object_map_free(rbd_dev);
1968 return ret;
1969 }
1970
1971 if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1972 rbd_warn(rbd_dev, "object map is invalid");
1973
1974 return 0;
1975 }
1976
1977 static int rbd_object_map_open(struct rbd_device *rbd_dev)
1978 {
1979 int ret;
1980
1981 ret = rbd_object_map_lock(rbd_dev);
1982 if (ret)
1983 return ret;
1984
1985 ret = rbd_object_map_load(rbd_dev);
1986 if (ret) {
1987 rbd_object_map_unlock(rbd_dev);
1988 return ret;
1989 }
1990
1991 return 0;
1992 }
1993
1994 static void rbd_object_map_close(struct rbd_device *rbd_dev)
1995 {
1996 rbd_object_map_free(rbd_dev);
1997 rbd_object_map_unlock(rbd_dev);
1998 }
1999
2000 /*
2001 * This function needs snap_id (or more precisely just something to
2002 * distinguish between HEAD and snapshot object maps), new_state and
2003 * current_state that were passed to rbd_object_map_update().
2004 *
2005 * To avoid allocating and stashing a context we piggyback on the OSD
2006 * request. A HEAD update has two ops (assert_locked). For new_state
2007 * and current_state we decode our own object_map_update op, encoded in
2008 * rbd_cls_object_map_update().
2009 */
2010 static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
2011 struct ceph_osd_request *osd_req)
2012 {
2013 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2014 struct ceph_osd_data *osd_data;
2015 u64 objno;
2016 u8 state, new_state, current_state;
2017 bool has_current_state;
2018 void *p;
2019
2020 if (osd_req->r_result)
2021 return osd_req->r_result;
2022
2023 /*
2024 * Nothing to do for a snapshot object map.
2025 */
2026 if (osd_req->r_num_ops == 1)
2027 return 0;
2028
2029 /*
2030 * Update in-memory HEAD object map.
2031 */
2032 rbd_assert(osd_req->r_num_ops == 2);
2033 osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2034 rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2035
2036 p = page_address(osd_data->pages[0]);
2037 objno = ceph_decode_64(&p);
2038 rbd_assert(objno == obj_req->ex.oe_objno);
2039 rbd_assert(ceph_decode_64(&p) == objno + 1);
2040 new_state = ceph_decode_8(&p);
2041 has_current_state = ceph_decode_8(&p);
2042 if (has_current_state)
2043 current_state = ceph_decode_8(&p);
2044
2045 spin_lock(&rbd_dev->object_map_lock);
2046 state = __rbd_object_map_get(rbd_dev, objno);
2047 if (!has_current_state || current_state == state ||
2048 (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2049 __rbd_object_map_set(rbd_dev, objno, new_state);
2050 spin_unlock(&rbd_dev->object_map_lock);
2051
2052 return 0;
2053 }
2054
2055 static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2056 {
2057 struct rbd_obj_request *obj_req = osd_req->r_priv;
2058 int result;
2059
2060 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2061 osd_req->r_result, obj_req);
2062
2063 result = rbd_object_map_update_finish(obj_req, osd_req);
2064 rbd_obj_handle_request(obj_req, result);
2065 }
2066
2067 static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2068 {
2069 u8 state = rbd_object_map_get(rbd_dev, objno);
2070
2071 if (state == new_state ||
2072 (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2073 (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2074 return false;
2075
2076 return true;
2077 }
2078
2079 static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2080 int which, u64 objno, u8 new_state,
2081 const u8 *current_state)
2082 {
2083 struct page **pages;
2084 void *p, *start;
2085 int ret;
2086
2087 ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2088 if (ret)
2089 return ret;
2090
2091 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2092 if (IS_ERR(pages))
2093 return PTR_ERR(pages);
2094
2095 p = start = page_address(pages[0]);
2096 ceph_encode_64(&p, objno);
2097 ceph_encode_64(&p, objno + 1);
2098 ceph_encode_8(&p, new_state);
2099 if (current_state) {
2100 ceph_encode_8(&p, 1);
2101 ceph_encode_8(&p, *current_state);
2102 } else {
2103 ceph_encode_8(&p, 0);
2104 }
2105
2106 osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2107 false, true);
2108 return 0;
2109 }
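/*
 * The payload encoded above is what rbd_object_map_update_finish()
 * decodes on completion: le64 start objno, le64 end objno (start + 1),
 * u8 new_state, u8 has_current_state and, if set, u8 current_state.
 */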
2110
2111 /*
2112 * Return:
2113 * 0 - object map update sent
2114 * 1 - object map update isn't needed
2115 * <0 - error
2116 */
2117 static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2118 u8 new_state, const u8 *current_state)
2119 {
2120 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2121 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2122 struct ceph_osd_request *req;
2123 int num_ops = 1;
2124 int which = 0;
2125 int ret;
2126
2127 if (snap_id == CEPH_NOSNAP) {
2128 if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2129 return 1;
2130
2131 num_ops++; /* assert_locked */
2132 }
2133
2134 req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2135 if (!req)
2136 return -ENOMEM;
2137
2138 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2139 req->r_callback = rbd_object_map_callback;
2140 req->r_priv = obj_req;
2141
2142 rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2143 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2144 req->r_flags = CEPH_OSD_FLAG_WRITE;
2145 ktime_get_real_ts64(&req->r_mtime);
2146
2147 if (snap_id == CEPH_NOSNAP) {
2148 /*
2149 * Protect against possible race conditions during lock
2150 * ownership transitions.
2151 */
2152 ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2153 CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2154 if (ret)
2155 return ret;
2156 }
2157
2158 ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2159 new_state, current_state);
2160 if (ret)
2161 return ret;
2162
2163 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2164 if (ret)
2165 return ret;
2166
2167 ceph_osdc_start_request(osdc, req, false);
2168 return 0;
2169 }
2170
2171 static void prune_extents(struct ceph_file_extent *img_extents,
2172 u32 *num_img_extents, u64 overlap)
2173 {
2174 u32 cnt = *num_img_extents;
2175
2176 /* drop extents completely beyond the overlap */
2177 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2178 cnt--;
2179
2180 if (cnt) {
2181 struct ceph_file_extent *ex = &img_extents[cnt - 1];
2182
2183 /* trim final overlapping extent */
2184 if (ex->fe_off + ex->fe_len > overlap)
2185 ex->fe_len = overlap - ex->fe_off;
2186 }
2187
2188 *num_img_extents = cnt;
2189 }
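/*
 * Example: with an overlap of 4M, the extent list { 1M~1M, 3M~2M,
 * 6M~1M } is pruned to { 1M~1M, 3M~1M } -- the extent entirely beyond
 * the overlap is dropped and the straddling one is trimmed to it.
 */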
2190
2191 /*
2192 * Determine the byte range(s) covered by either just the object extent
2193 * or the entire object in the parent image.
2194 */
2195 static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2196 bool entire)
2197 {
2198 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2199 int ret;
2200
2201 if (!rbd_dev->parent_overlap)
2202 return 0;
2203
2204 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2205 entire ? 0 : obj_req->ex.oe_off,
2206 entire ? rbd_dev->layout.object_size :
2207 obj_req->ex.oe_len,
2208 &obj_req->img_extents,
2209 &obj_req->num_img_extents);
2210 if (ret)
2211 return ret;
2212
2213 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2214 rbd_dev->parent_overlap);
2215 return 0;
2216 }
2217
2218 static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2219 {
2220 struct rbd_obj_request *obj_req = osd_req->r_priv;
2221
2222 switch (obj_req->img_request->data_type) {
2223 case OBJ_REQUEST_BIO:
2224 osd_req_op_extent_osd_data_bio(osd_req, which,
2225 &obj_req->bio_pos,
2226 obj_req->ex.oe_len);
2227 break;
2228 case OBJ_REQUEST_BVECS:
2229 case OBJ_REQUEST_OWN_BVECS:
2230 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2231 obj_req->ex.oe_len);
2232 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2233 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2234 &obj_req->bvec_pos);
2235 break;
2236 default:
2237 BUG();
2238 }
2239 }
2240
2241 static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2242 {
2243 struct page **pages;
2244
2245 /*
2246 * The response data for a STAT call consists of:
2247 * le64 length;
2248 * struct {
2249 * le32 tv_sec;
2250 * le32 tv_nsec;
2251 * } mtime;
2252 */
2253 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2254 if (IS_ERR(pages))
2255 return PTR_ERR(pages);
2256
2257 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2258 osd_req_op_raw_data_in_pages(osd_req, which, pages,
2259 8 + sizeof(struct ceph_timespec),
2260 0, false, true);
2261 return 0;
2262 }
2263
2264 static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2265 u32 bytes)
2266 {
2267 struct rbd_obj_request *obj_req = osd_req->r_priv;
2268 int ret;
2269
2270 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2271 if (ret)
2272 return ret;
2273
2274 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2275 obj_req->copyup_bvec_count, bytes);
2276 return 0;
2277 }
2278
2279 static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2280 {
2281 obj_req->read_state = RBD_OBJ_READ_START;
2282 return 0;
2283 }
2284
2285 static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2286 int which)
2287 {
2288 struct rbd_obj_request *obj_req = osd_req->r_priv;
2289 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2290 u16 opcode;
2291
2292 if (!use_object_map(rbd_dev) ||
2293 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2294 osd_req_op_alloc_hint_init(osd_req, which++,
2295 rbd_dev->layout.object_size,
2296 rbd_dev->layout.object_size,
2297 rbd_dev->opts->alloc_hint_flags);
2298 }
2299
2300 if (rbd_obj_is_entire(obj_req))
2301 opcode = CEPH_OSD_OP_WRITEFULL;
2302 else
2303 opcode = CEPH_OSD_OP_WRITE;
2304
2305 osd_req_op_extent_init(osd_req, which, opcode,
2306 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2307 rbd_osd_setup_data(osd_req, which);
2308 }
2309
2310 static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2311 {
2312 int ret;
2313
2314 /* reverse map the entire object onto the parent */
2315 ret = rbd_obj_calc_img_extents(obj_req, true);
2316 if (ret)
2317 return ret;
2318
2319 obj_req->write_state = RBD_OBJ_WRITE_START;
2320 return 0;
2321 }
2322
2323 static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2324 {
2325 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2326 CEPH_OSD_OP_ZERO;
2327 }
2328
2329 static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2330 int which)
2331 {
2332 struct rbd_obj_request *obj_req = osd_req->r_priv;
2333
2334 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2335 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2336 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2337 } else {
2338 osd_req_op_extent_init(osd_req, which,
2339 truncate_or_zero_opcode(obj_req),
2340 obj_req->ex.oe_off, obj_req->ex.oe_len,
2341 0, 0);
2342 }
2343 }
2344
2345 static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2346 {
2347 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2348 u64 off, next_off;
2349 int ret;
2350
2351 /*
2352 * Align the range to alloc_size boundary and punt on discards
2353 * that are too small to free up any space.
2354 *
2355 * alloc_size == object_size && is_tail() is a special case for
2356 * filestore with filestore_punch_hole = false, needed to allow
2357 * truncate (in addition to delete).
2358 */
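/*
 * An illustrative sketch, assuming alloc_size is 64K: a 10K~200K
 * discard is trimmed to 64K~128K, while a 10K~50K discard is dropped
 * (returns 1) because it cannot free a whole allocation unit.
 */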
2359 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2360 !rbd_obj_is_tail(obj_req)) {
2361 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2362 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2363 rbd_dev->opts->alloc_size);
2364 if (off >= next_off)
2365 return 1;
2366
2367 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2368 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2369 off, next_off - off);
2370 obj_req->ex.oe_off = off;
2371 obj_req->ex.oe_len = next_off - off;
2372 }
2373
2374 /* reverse map the entire object onto the parent */
2375 ret = rbd_obj_calc_img_extents(obj_req, true);
2376 if (ret)
2377 return ret;
2378
2379 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2380 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2381 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2382
2383 obj_req->write_state = RBD_OBJ_WRITE_START;
2384 return 0;
2385 }
2386
2387 static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2388 int which)
2389 {
2390 struct rbd_obj_request *obj_req = osd_req->r_priv;
2391 u16 opcode;
2392
2393 if (rbd_obj_is_entire(obj_req)) {
2394 if (obj_req->num_img_extents) {
2395 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2396 osd_req_op_init(osd_req, which++,
2397 CEPH_OSD_OP_CREATE, 0);
2398 opcode = CEPH_OSD_OP_TRUNCATE;
2399 } else {
2400 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2401 osd_req_op_init(osd_req, which++,
2402 CEPH_OSD_OP_DELETE, 0);
2403 opcode = 0;
2404 }
2405 } else {
2406 opcode = truncate_or_zero_opcode(obj_req);
2407 }
2408
2409 if (opcode)
2410 osd_req_op_extent_init(osd_req, which, opcode,
2411 obj_req->ex.oe_off, obj_req->ex.oe_len,
2412 0, 0);
2413 }
2414
2415 static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2416 {
2417 int ret;
2418
2419 /* reverse map the entire object onto the parent */
2420 ret = rbd_obj_calc_img_extents(obj_req, true);
2421 if (ret)
2422 return ret;
2423
2424 if (!obj_req->num_img_extents) {
2425 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2426 if (rbd_obj_is_entire(obj_req))
2427 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2428 }
2429
2430 obj_req->write_state = RBD_OBJ_WRITE_START;
2431 return 0;
2432 }
2433
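/*
 * Number of OSD ops this object request will need.  Must stay in sync
 * with rbd_osd_setup_write_ops() below and the __rbd_osd_setup_*_ops()
 * helpers above, which emit exactly that many ops.
 */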
2434 static int count_write_ops(struct rbd_obj_request *obj_req)
2435 {
2436 struct rbd_img_request *img_req = obj_req->img_request;
2437
2438 switch (img_req->op_type) {
2439 case OBJ_OP_WRITE:
2440 if (!use_object_map(img_req->rbd_dev) ||
2441 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2442 return 2; /* setallochint + write/writefull */
2443
2444 return 1; /* write/writefull */
2445 case OBJ_OP_DISCARD:
2446 return 1; /* delete/truncate/zero */
2447 case OBJ_OP_ZEROOUT:
2448 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2449 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2450 return 2; /* create + truncate */
2451
2452 return 1; /* delete/truncate/zero */
2453 default:
2454 BUG();
2455 }
2456 }
2457
2458 static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2459 int which)
2460 {
2461 struct rbd_obj_request *obj_req = osd_req->r_priv;
2462
2463 switch (obj_req->img_request->op_type) {
2464 case OBJ_OP_WRITE:
2465 __rbd_osd_setup_write_ops(osd_req, which);
2466 break;
2467 case OBJ_OP_DISCARD:
2468 __rbd_osd_setup_discard_ops(osd_req, which);
2469 break;
2470 case OBJ_OP_ZEROOUT:
2471 __rbd_osd_setup_zeroout_ops(osd_req, which);
2472 break;
2473 default:
2474 BUG();
2475 }
2476 }
2477
2478 /*
2479 * Prune the list of object requests (adjust offset and/or length, drop
2480 * redundant requests). Prepare object request state machines and image
2481 * request state machine for execution.
2482 */
2483 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2484 {
2485 struct rbd_obj_request *obj_req, *next_obj_req;
2486 int ret;
2487
2488 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2489 switch (img_req->op_type) {
2490 case OBJ_OP_READ:
2491 ret = rbd_obj_init_read(obj_req);
2492 break;
2493 case OBJ_OP_WRITE:
2494 ret = rbd_obj_init_write(obj_req);
2495 break;
2496 case OBJ_OP_DISCARD:
2497 ret = rbd_obj_init_discard(obj_req);
2498 break;
2499 case OBJ_OP_ZEROOUT:
2500 ret = rbd_obj_init_zeroout(obj_req);
2501 break;
2502 default:
2503 BUG();
2504 }
2505 if (ret < 0)
2506 return ret;
2507 if (ret > 0) {
2508 rbd_img_obj_request_del(img_req, obj_req);
2509 continue;
2510 }
2511 }
2512
2513 img_req->state = RBD_IMG_START;
2514 return 0;
2515 }
2516
2517 union rbd_img_fill_iter {
2518 struct ceph_bio_iter bio_iter;
2519 struct ceph_bvec_iter bvec_iter;
2520 };
2521
2522 struct rbd_img_fill_ctx {
2523 enum obj_request_type pos_type;
2524 union rbd_img_fill_iter *pos;
2525 union rbd_img_fill_iter iter;
2526 ceph_object_extent_fn_t set_pos_fn;
2527 ceph_object_extent_fn_t count_fn;
2528 ceph_object_extent_fn_t copy_fn;
2529 };
2530
2531 static struct ceph_object_extent *alloc_object_extent(void *arg)
2532 {
2533 struct rbd_img_request *img_req = arg;
2534 struct rbd_obj_request *obj_req;
2535
2536 obj_req = rbd_obj_request_create();
2537 if (!obj_req)
2538 return NULL;
2539
2540 rbd_img_obj_request_add(img_req, obj_req);
2541 return &obj_req->ex;
2542 }
2543
2544 /*
2545 * While su != os && sc == 1 is technically not fancy (it's the same
2546 * layout as su == os && sc == 1), we can't use the nocopy path for it
2547 * because ->set_pos_fn() should be called only once per object.
2548 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2549 * treat su != os && sc == 1 as fancy.
2550 */
2551 static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2552 {
2553 return l->stripe_unit != l->object_size;
2554 }
2555
2556 static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2557 struct ceph_file_extent *img_extents,
2558 u32 num_img_extents,
2559 struct rbd_img_fill_ctx *fctx)
2560 {
2561 u32 i;
2562 int ret;
2563
2564 img_req->data_type = fctx->pos_type;
2565
2566 /*
2567 * Create object requests and set each object request's starting
2568 * position in the provided bio (list) or bio_vec array.
2569 */
2570 fctx->iter = *fctx->pos;
2571 for (i = 0; i < num_img_extents; i++) {
2572 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2573 img_extents[i].fe_off,
2574 img_extents[i].fe_len,
2575 &img_req->object_extents,
2576 alloc_object_extent, img_req,
2577 fctx->set_pos_fn, &fctx->iter);
2578 if (ret)
2579 return ret;
2580 }
2581
2582 return __rbd_img_fill_request(img_req);
2583 }
2584
2585 /*
2586 * Map a list of image extents to a list of object extents, create the
2587 * corresponding object requests (normally each to a different object,
2588 * but not always) and add them to @img_req. For each object request,
2589 * set up its data descriptor to point to the corresponding chunk(s) of
2590 * @fctx->pos data buffer.
2591 *
2592 * Because ceph_file_to_extents() will merge adjacent object extents
2593 * together, each object request's data descriptor may point to multiple
2594 * different chunks of @fctx->pos data buffer.
2595 *
2596 * @fctx->pos data buffer is assumed to be large enough.
2597 */
2598 static int rbd_img_fill_request(struct rbd_img_request *img_req,
2599 struct ceph_file_extent *img_extents,
2600 u32 num_img_extents,
2601 struct rbd_img_fill_ctx *fctx)
2602 {
2603 struct rbd_device *rbd_dev = img_req->rbd_dev;
2604 struct rbd_obj_request *obj_req;
2605 u32 i;
2606 int ret;
2607
2608 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2609 !rbd_layout_is_fancy(&rbd_dev->layout))
2610 return rbd_img_fill_request_nocopy(img_req, img_extents,
2611 num_img_extents, fctx);
2612
2613 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2614
2615 /*
2616 * Create object requests and determine ->bvec_count for each object
2617 * request. Note that ->bvec_count sum over all object requests may
2618 * be greater than the number of bio_vecs in the provided bio (list)
2619 * or bio_vec array because when mapped, those bio_vecs can straddle
2620 * stripe unit boundaries.
2621 */
2622 fctx->iter = *fctx->pos;
2623 for (i = 0; i < num_img_extents; i++) {
2624 ret = ceph_file_to_extents(&rbd_dev->layout,
2625 img_extents[i].fe_off,
2626 img_extents[i].fe_len,
2627 &img_req->object_extents,
2628 alloc_object_extent, img_req,
2629 fctx->count_fn, &fctx->iter);
2630 if (ret)
2631 return ret;
2632 }
2633
2634 for_each_obj_request(img_req, obj_req) {
2635 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2636 sizeof(*obj_req->bvec_pos.bvecs),
2637 GFP_NOIO);
2638 if (!obj_req->bvec_pos.bvecs)
2639 return -ENOMEM;
2640 }
2641
2642 /*
2643 * Fill in each object request's private bio_vec array, splitting and
2644 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2645 */
2646 fctx->iter = *fctx->pos;
2647 for (i = 0; i < num_img_extents; i++) {
2648 ret = ceph_iterate_extents(&rbd_dev->layout,
2649 img_extents[i].fe_off,
2650 img_extents[i].fe_len,
2651 &img_req->object_extents,
2652 fctx->copy_fn, &fctx->iter);
2653 if (ret)
2654 return ret;
2655 }
2656
2657 return __rbd_img_fill_request(img_req);
2658 }
2659
2660 static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2661 u64 off, u64 len)
2662 {
2663 struct ceph_file_extent ex = { off, len };
2664 union rbd_img_fill_iter dummy = {};
2665 struct rbd_img_fill_ctx fctx = {
2666 .pos_type = OBJ_REQUEST_NODATA,
2667 .pos = &dummy,
2668 };
2669
2670 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2671 }
2672
2673 static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2674 {
2675 struct rbd_obj_request *obj_req =
2676 container_of(ex, struct rbd_obj_request, ex);
2677 struct ceph_bio_iter *it = arg;
2678
2679 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2680 obj_req->bio_pos = *it;
2681 ceph_bio_iter_advance(it, bytes);
2682 }
2683
2684 static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2685 {
2686 struct rbd_obj_request *obj_req =
2687 container_of(ex, struct rbd_obj_request, ex);
2688 struct ceph_bio_iter *it = arg;
2689
2690 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2691 ceph_bio_iter_advance_step(it, bytes, ({
2692 obj_req->bvec_count++;
2693 }));
2694
2695 }
2696
2697 static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2698 {
2699 struct rbd_obj_request *obj_req =
2700 container_of(ex, struct rbd_obj_request, ex);
2701 struct ceph_bio_iter *it = arg;
2702
2703 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2704 ceph_bio_iter_advance_step(it, bytes, ({
2705 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2706 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2707 }));
2708 }
2709
2710 static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2711 struct ceph_file_extent *img_extents,
2712 u32 num_img_extents,
2713 struct ceph_bio_iter *bio_pos)
2714 {
2715 struct rbd_img_fill_ctx fctx = {
2716 .pos_type = OBJ_REQUEST_BIO,
2717 .pos = (union rbd_img_fill_iter *)bio_pos,
2718 .set_pos_fn = set_bio_pos,
2719 .count_fn = count_bio_bvecs,
2720 .copy_fn = copy_bio_bvecs,
2721 };
2722
2723 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2724 &fctx);
2725 }
2726
2727 static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2728 u64 off, u64 len, struct bio *bio)
2729 {
2730 struct ceph_file_extent ex = { off, len };
2731 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2732
2733 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2734 }
2735
2736 static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2737 {
2738 struct rbd_obj_request *obj_req =
2739 container_of(ex, struct rbd_obj_request, ex);
2740 struct ceph_bvec_iter *it = arg;
2741
2742 obj_req->bvec_pos = *it;
2743 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2744 ceph_bvec_iter_advance(it, bytes);
2745 }
2746
2747 static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2748 {
2749 struct rbd_obj_request *obj_req =
2750 container_of(ex, struct rbd_obj_request, ex);
2751 struct ceph_bvec_iter *it = arg;
2752
2753 ceph_bvec_iter_advance_step(it, bytes, ({
2754 obj_req->bvec_count++;
2755 }));
2756 }
2757
2758 static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2759 {
2760 struct rbd_obj_request *obj_req =
2761 container_of(ex, struct rbd_obj_request, ex);
2762 struct ceph_bvec_iter *it = arg;
2763
2764 ceph_bvec_iter_advance_step(it, bytes, ({
2765 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2766 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2767 }));
2768 }
2769
2770 static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2771 struct ceph_file_extent *img_extents,
2772 u32 num_img_extents,
2773 struct ceph_bvec_iter *bvec_pos)
2774 {
2775 struct rbd_img_fill_ctx fctx = {
2776 .pos_type = OBJ_REQUEST_BVECS,
2777 .pos = (union rbd_img_fill_iter *)bvec_pos,
2778 .set_pos_fn = set_bvec_pos,
2779 .count_fn = count_bvecs,
2780 .copy_fn = copy_bvecs,
2781 };
2782
2783 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2784 &fctx);
2785 }
2786
2787 static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2788 struct ceph_file_extent *img_extents,
2789 u32 num_img_extents,
2790 struct bio_vec *bvecs)
2791 {
2792 struct ceph_bvec_iter it = {
2793 .bvecs = bvecs,
2794 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2795 num_img_extents) },
2796 };
2797
2798 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2799 &it);
2800 }
2801
2802 static void rbd_img_handle_request_work(struct work_struct *work)
2803 {
2804 struct rbd_img_request *img_req =
2805 container_of(work, struct rbd_img_request, work);
2806
2807 rbd_img_handle_request(img_req, img_req->work_result);
2808 }
2809
2810 static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2811 {
2812 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2813 img_req->work_result = result;
2814 queue_work(rbd_wq, &img_req->work);
2815 }
2816
2817 static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2818 {
2819 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2820
2821 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2822 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2823 return true;
2824 }
2825
2826 dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2827 obj_req->ex.oe_objno);
2828 return false;
2829 }
2830
2831 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2832 {
2833 struct ceph_osd_request *osd_req;
2834 int ret;
2835
2836 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2837 if (IS_ERR(osd_req))
2838 return PTR_ERR(osd_req);
2839
2840 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2841 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2842 rbd_osd_setup_data(osd_req, 0);
2843 rbd_osd_format_read(osd_req);
2844
2845 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2846 if (ret)
2847 return ret;
2848
2849 rbd_osd_submit(osd_req);
2850 return 0;
2851 }
2852
2853 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2854 {
2855 struct rbd_img_request *img_req = obj_req->img_request;
2856 struct rbd_device *parent = img_req->rbd_dev->parent;
2857 struct rbd_img_request *child_img_req;
2858 int ret;
2859
2860 child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2861 if (!child_img_req)
2862 return -ENOMEM;
2863
2864 rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2865 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2866 child_img_req->obj_request = obj_req;
2867
2868 down_read(&parent->header_rwsem);
2869 rbd_img_capture_header(child_img_req);
2870 up_read(&parent->header_rwsem);
2871
2872 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2873 obj_req);
2874
2875 if (!rbd_img_is_write(img_req)) {
2876 switch (img_req->data_type) {
2877 case OBJ_REQUEST_BIO:
2878 ret = __rbd_img_fill_from_bio(child_img_req,
2879 obj_req->img_extents,
2880 obj_req->num_img_extents,
2881 &obj_req->bio_pos);
2882 break;
2883 case OBJ_REQUEST_BVECS:
2884 case OBJ_REQUEST_OWN_BVECS:
2885 ret = __rbd_img_fill_from_bvecs(child_img_req,
2886 obj_req->img_extents,
2887 obj_req->num_img_extents,
2888 &obj_req->bvec_pos);
2889 break;
2890 default:
2891 BUG();
2892 }
2893 } else {
2894 ret = rbd_img_fill_from_bvecs(child_img_req,
2895 obj_req->img_extents,
2896 obj_req->num_img_extents,
2897 obj_req->copyup_bvecs);
2898 }
2899 if (ret) {
2900 rbd_img_request_destroy(child_img_req);
2901 return ret;
2902 }
2903
2904 /* avoid parent chain recursion */
2905 rbd_img_schedule(child_img_req, 0);
2906 return 0;
2907 }
2908
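/*
 * Read state machine: RBD_OBJ_READ_START issues the OSD read (or
 * synthesizes -ENOENT when the object map says the object does not
 * exist), RBD_OBJ_READ_OBJECT handles the result and may redirect a
 * hole to the parent image, and RBD_OBJ_READ_PARENT zero-fills
 * whatever lies beyond the parent overlap.  Returns true when the
 * object request is done.
 */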
2909 static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2910 {
2911 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2912 int ret;
2913
2914 again:
2915 switch (obj_req->read_state) {
2916 case RBD_OBJ_READ_START:
2917 rbd_assert(!*result);
2918
2919 if (!rbd_obj_may_exist(obj_req)) {
2920 *result = -ENOENT;
2921 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2922 goto again;
2923 }
2924
2925 ret = rbd_obj_read_object(obj_req);
2926 if (ret) {
2927 *result = ret;
2928 return true;
2929 }
2930 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2931 return false;
2932 case RBD_OBJ_READ_OBJECT:
2933 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2934 /* reverse map this object extent onto the parent */
2935 ret = rbd_obj_calc_img_extents(obj_req, false);
2936 if (ret) {
2937 *result = ret;
2938 return true;
2939 }
2940 if (obj_req->num_img_extents) {
2941 ret = rbd_obj_read_from_parent(obj_req);
2942 if (ret) {
2943 *result = ret;
2944 return true;
2945 }
2946 obj_req->read_state = RBD_OBJ_READ_PARENT;
2947 return false;
2948 }
2949 }
2950
2951 /*
2952 * -ENOENT means a hole in the image -- zero-fill the entire
2953 * length of the request. A short read also implies zero-fill
2954 * to the end of the request.
2955 */
2956 if (*result == -ENOENT) {
2957 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2958 *result = 0;
2959 } else if (*result >= 0) {
2960 if (*result < obj_req->ex.oe_len)
2961 rbd_obj_zero_range(obj_req, *result,
2962 obj_req->ex.oe_len - *result);
2963 else
2964 rbd_assert(*result == obj_req->ex.oe_len);
2965 *result = 0;
2966 }
2967 return true;
2968 case RBD_OBJ_READ_PARENT:
2969 /*
2970 * The parent image is read only up to the overlap -- zero-fill
2971 * from the overlap to the end of the request.
2972 */
2973 if (!*result) {
2974 u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2975
2976 if (obj_overlap < obj_req->ex.oe_len)
2977 rbd_obj_zero_range(obj_req, obj_overlap,
2978 obj_req->ex.oe_len - obj_overlap);
2979 }
2980 return true;
2981 default:
2982 BUG();
2983 }
2984 }
2985
2986 static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2987 {
2988 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2989
2990 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2991 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2992
2993 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2994 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2995 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2996 return true;
2997 }
2998
2999 return false;
3000 }
3001
3002 /*
3003 * Return:
3004 * 0 - object map update sent
3005 * 1 - object map update isn't needed
3006 * <0 - error
3007 */
3008 static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3009 {
3010 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3011 u8 new_state;
3012
3013 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3014 return 1;
3015
3016 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3017 new_state = OBJECT_PENDING;
3018 else
3019 new_state = OBJECT_EXISTS;
3020
3021 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3022 }
3023
3024 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3025 {
3026 struct ceph_osd_request *osd_req;
3027 int num_ops = count_write_ops(obj_req);
3028 int which = 0;
3029 int ret;
3030
3031 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3032 num_ops++; /* stat */
3033
3034 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3035 if (IS_ERR(osd_req))
3036 return PTR_ERR(osd_req);
3037
3038 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3039 ret = rbd_osd_setup_stat(osd_req, which++);
3040 if (ret)
3041 return ret;
3042 }
3043
3044 rbd_osd_setup_write_ops(osd_req, which);
3045 rbd_osd_format_write(osd_req);
3046
3047 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3048 if (ret)
3049 return ret;
3050
3051 rbd_osd_submit(osd_req);
3052 return 0;
3053 }
3054
3055 /*
3056 * copyup_bvecs pages are never highmem pages
3057 */
3058 static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3059 {
3060 struct ceph_bvec_iter it = {
3061 .bvecs = bvecs,
3062 .iter = { .bi_size = bytes },
3063 };
3064
3065 ceph_bvec_iter_advance_step(&it, bytes, ({
3066 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3067 bv.bv_len))
3068 return false;
3069 }));
3070 return true;
3071 }
3072
3073 #define MODS_ONLY U32_MAX
3074
3075 static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3076 u32 bytes)
3077 {
3078 struct ceph_osd_request *osd_req;
3079 int ret;
3080
3081 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3082 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3083
3084 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3085 if (IS_ERR(osd_req))
3086 return PTR_ERR(osd_req);
3087
3088 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3089 if (ret)
3090 return ret;
3091
3092 rbd_osd_format_write(osd_req);
3093
3094 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3095 if (ret)
3096 return ret;
3097
3098 rbd_osd_submit(osd_req);
3099 return 0;
3100 }
3101
3102 static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3103 u32 bytes)
3104 {
3105 struct ceph_osd_request *osd_req;
3106 int num_ops = count_write_ops(obj_req);
3107 int which = 0;
3108 int ret;
3109
3110 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3111
3112 if (bytes != MODS_ONLY)
3113 num_ops++; /* copyup */
3114
3115 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3116 if (IS_ERR(osd_req))
3117 return PTR_ERR(osd_req);
3118
3119 if (bytes != MODS_ONLY) {
3120 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3121 if (ret)
3122 return ret;
3123 }
3124
3125 rbd_osd_setup_write_ops(osd_req, which);
3126 rbd_osd_format_write(osd_req);
3127
3128 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3129 if (ret)
3130 return ret;
3131
3132 rbd_osd_submit(osd_req);
3133 return 0;
3134 }
3135
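/*
 * Allocate page-sized bvecs covering @obj_overlap bytes of parent
 * data for the copyup.  A sketch, assuming 4K pages: a 9K overlap
 * yields three bvecs of 4K, 4K and 1K.
 */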
3136 static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3137 {
3138 u32 i;
3139
3140 rbd_assert(!obj_req->copyup_bvecs);
3141 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3142 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3143 sizeof(*obj_req->copyup_bvecs),
3144 GFP_NOIO);
3145 if (!obj_req->copyup_bvecs)
3146 return -ENOMEM;
3147
3148 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3149 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3150
3151 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3152 if (!obj_req->copyup_bvecs[i].bv_page)
3153 return -ENOMEM;
3154
3155 obj_req->copyup_bvecs[i].bv_offset = 0;
3156 obj_req->copyup_bvecs[i].bv_len = len;
3157 obj_overlap -= len;
3158 }
3159
3160 rbd_assert(!obj_overlap);
3161 return 0;
3162 }
3163
3164 /*
3165 * The target object doesn't exist. Read the data for the entire
3166 * target object up to the overlap point (if any) from the parent,
3167 * so we can use it for a copyup.
3168 */
3169 static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3170 {
3171 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3172 int ret;
3173
3174 rbd_assert(obj_req->num_img_extents);
3175 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3176 rbd_dev->parent_overlap);
3177 if (!obj_req->num_img_extents) {
3178 /*
3179 * The overlap has become 0 (most likely because the
3180 * image has been flattened). Re-submit the original write
3181 * request -- pass MODS_ONLY since the copyup isn't needed
3182 * anymore.
3183 */
3184 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3185 }
3186
3187 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3188 if (ret)
3189 return ret;
3190
3191 return rbd_obj_read_from_parent(obj_req);
3192 }
3193
3194 static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3195 {
3196 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3197 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3198 u8 new_state;
3199 u32 i;
3200 int ret;
3201
3202 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3203
3204 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3205 return;
3206
3207 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3208 return;
3209
3210 for (i = 0; i < snapc->num_snaps; i++) {
3211 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3212 i + 1 < snapc->num_snaps)
3213 new_state = OBJECT_EXISTS_CLEAN;
3214 else
3215 new_state = OBJECT_EXISTS;
3216
3217 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3218 new_state, NULL);
3219 if (ret < 0) {
3220 obj_req->pending.result = ret;
3221 return;
3222 }
3223
3224 rbd_assert(!ret);
3225 obj_req->pending.num_pending++;
3226 }
3227 }
3228
3229 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3230 {
3231 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3232 int ret;
3233
3234 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3235
3236 /*
3237 * Only send non-zero copyup data to save some I/O and network
3238 * bandwidth -- zero copyup data is equivalent to the object not
3239 * existing.
3240 */
3241 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3242 bytes = 0;
3243
3244 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3245 /*
3246 * Send a copyup request with an empty snapshot context to
3247 * deep-copyup the object through all existing snapshots.
3248 * A second request with the current snapshot context will be
3249 * sent for the actual modification.
3250 */
3251 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3252 if (ret) {
3253 obj_req->pending.result = ret;
3254 return;
3255 }
3256
3257 obj_req->pending.num_pending++;
3258 bytes = MODS_ONLY;
3259 }
3260
3261 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3262 if (ret) {
3263 obj_req->pending.result = ret;
3264 return;
3265 }
3266
3267 obj_req->pending.num_pending++;
3268 }
3269
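/*
 * Copyup state machine: START reads the parent data (or resubmits the
 * modification directly if the overlap has vanished), READ_PARENT
 * checks for all-zero data, (__)COPYUP_OBJECT_MAPS updates the object
 * map of every affected snapshot, and (__)COPYUP_WRITE_OBJECT sends
 * the copyup -- deep-copyup with an empty snapc first if snapshots
 * exist -- together with the original modification.
 */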
3270 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3271 {
3272 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3273 int ret;
3274
3275 again:
3276 switch (obj_req->copyup_state) {
3277 case RBD_OBJ_COPYUP_START:
3278 rbd_assert(!*result);
3279
3280 ret = rbd_obj_copyup_read_parent(obj_req);
3281 if (ret) {
3282 *result = ret;
3283 return true;
3284 }
3285 if (obj_req->num_img_extents)
3286 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3287 else
3288 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3289 return false;
3290 case RBD_OBJ_COPYUP_READ_PARENT:
3291 if (*result)
3292 return true;
3293
3294 if (is_zero_bvecs(obj_req->copyup_bvecs,
3295 rbd_obj_img_extents_bytes(obj_req))) {
3296 dout("%s %p detected zeros\n", __func__, obj_req);
3297 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3298 }
3299
3300 rbd_obj_copyup_object_maps(obj_req);
3301 if (!obj_req->pending.num_pending) {
3302 *result = obj_req->pending.result;
3303 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3304 goto again;
3305 }
3306 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3307 return false;
3308 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3309 if (!pending_result_dec(&obj_req->pending, result))
3310 return false;
3311 fallthrough;
3312 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3313 if (*result) {
3314 rbd_warn(rbd_dev, "snap object map update failed: %d",
3315 *result);
3316 return true;
3317 }
3318
3319 rbd_obj_copyup_write_object(obj_req);
3320 if (!obj_req->pending.num_pending) {
3321 *result = obj_req->pending.result;
3322 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3323 goto again;
3324 }
3325 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3326 return false;
3327 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3328 if (!pending_result_dec(&obj_req->pending, result))
3329 return false;
3330 fallthrough;
3331 case RBD_OBJ_COPYUP_WRITE_OBJECT:
3332 return true;
3333 default:
3334 BUG();
3335 }
3336 }
3337
3338 /*
3339 * Return:
3340 * 0 - object map update sent
3341 * 1 - object map update isn't needed
3342 * <0 - error
3343 */
3344 static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3345 {
3346 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3347 u8 current_state = OBJECT_PENDING;
3348
3349 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3350 return 1;
3351
3352 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3353 return 1;
3354
3355 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3356 &current_state);
3357 }
3358
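/*
 * Write state machine: START marks the object in the object map
 * (PRE_OBJECT_MAP), OBJECT issues the write/discard/zeroout,
 * (__)COPYUP pulls parent data for a nonexistent object when copyup
 * is enabled, and POST_OBJECT_MAP marks a deleted object nonexistent.
 * Object map steps are skipped when the feature is not enabled.
 */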
3359 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3360 {
3361 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3362 int ret;
3363
3364 again:
3365 switch (obj_req->write_state) {
3366 case RBD_OBJ_WRITE_START:
3367 rbd_assert(!*result);
3368
3369 rbd_obj_set_copyup_enabled(obj_req);
3370 if (rbd_obj_write_is_noop(obj_req))
3371 return true;
3372
3373 ret = rbd_obj_write_pre_object_map(obj_req);
3374 if (ret < 0) {
3375 *result = ret;
3376 return true;
3377 }
3378 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3379 if (ret > 0)
3380 goto again;
3381 return false;
3382 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3383 if (*result) {
3384 rbd_warn(rbd_dev, "pre object map update failed: %d",
3385 *result);
3386 return true;
3387 }
3388 ret = rbd_obj_write_object(obj_req);
3389 if (ret) {
3390 *result = ret;
3391 return true;
3392 }
3393 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3394 return false;
3395 case RBD_OBJ_WRITE_OBJECT:
3396 if (*result == -ENOENT) {
3397 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3398 *result = 0;
3399 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3400 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3401 goto again;
3402 }
3403 /*
3404 * On a non-existent object:
3405 * delete - -ENOENT, truncate/zero - 0
3406 */
3407 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3408 *result = 0;
3409 }
3410 if (*result)
3411 return true;
3412
3413 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3414 goto again;
3415 case __RBD_OBJ_WRITE_COPYUP:
3416 if (!rbd_obj_advance_copyup(obj_req, result))
3417 return false;
3418 fallthrough;
3419 case RBD_OBJ_WRITE_COPYUP:
3420 if (*result) {
3421 rbd_warn(rbd_dev, "copyup failed: %d", *result);
3422 return true;
3423 }
3424 ret = rbd_obj_write_post_object_map(obj_req);
3425 if (ret < 0) {
3426 *result = ret;
3427 return true;
3428 }
3429 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3430 if (ret > 0)
3431 goto again;
3432 return false;
3433 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3434 if (*result)
3435 rbd_warn(rbd_dev, "post object map update failed: %d",
3436 *result);
3437 return true;
3438 default:
3439 BUG();
3440 }
3441 }
3442
3443 /*
3444 * Return true if @obj_req is completed.
3445 */
3446 static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3447 int *result)
3448 {
3449 struct rbd_img_request *img_req = obj_req->img_request;
3450 struct rbd_device *rbd_dev = img_req->rbd_dev;
3451 bool done;
3452
3453 mutex_lock(&obj_req->state_mutex);
3454 if (!rbd_img_is_write(img_req))
3455 done = rbd_obj_advance_read(obj_req, result);
3456 else
3457 done = rbd_obj_advance_write(obj_req, result);
3458 mutex_unlock(&obj_req->state_mutex);
3459
3460 if (done && *result) {
3461 rbd_assert(*result < 0);
3462 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3463 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3464 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3465 }
3466 return done;
3467 }
3468
3469 /*
3470 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3471 * recursion.
3472 */
3473 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3474 {
3475 if (__rbd_obj_handle_request(obj_req, &result))
3476 rbd_img_handle_request(obj_req->img_request, result);
3477 }
3478
3479 static bool need_exclusive_lock(struct rbd_img_request *img_req)
3480 {
3481 struct rbd_device *rbd_dev = img_req->rbd_dev;
3482
3483 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3484 return false;
3485
3486 if (rbd_is_ro(rbd_dev))
3487 return false;
3488
3489 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3490 if (rbd_dev->opts->lock_on_read ||
3491 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3492 return true;
3493
3494 return rbd_img_is_write(img_req);
3495 }
3496
3497 static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3498 {
3499 struct rbd_device *rbd_dev = img_req->rbd_dev;
3500 bool locked;
3501
3502 lockdep_assert_held(&rbd_dev->lock_rwsem);
3503 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3504 spin_lock(&rbd_dev->lock_lists_lock);
3505 rbd_assert(list_empty(&img_req->lock_item));
3506 if (!locked)
3507 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3508 else
3509 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3510 spin_unlock(&rbd_dev->lock_lists_lock);
3511 return locked;
3512 }
3513
3514 static void rbd_lock_del_request(struct rbd_img_request *img_req)
3515 {
3516 struct rbd_device *rbd_dev = img_req->rbd_dev;
3517 bool need_wakeup;
3518
3519 lockdep_assert_held(&rbd_dev->lock_rwsem);
3520 spin_lock(&rbd_dev->lock_lists_lock);
3521 rbd_assert(!list_empty(&img_req->lock_item));
3522 list_del_init(&img_req->lock_item);
3523 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3524 list_empty(&rbd_dev->running_list));
3525 spin_unlock(&rbd_dev->lock_lists_lock);
3526 if (need_wakeup)
3527 complete(&rbd_dev->releasing_wait);
3528 }
3529
3530 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3531 {
3532 struct rbd_device *rbd_dev = img_req->rbd_dev;
3533
3534 if (!need_exclusive_lock(img_req))
3535 return 1;
3536
3537 if (rbd_lock_add_request(img_req))
3538 return 1;
3539
3540 if (rbd_dev->opts->exclusive) {
3541 WARN_ON(1); /* lock got released? */
3542 return -EROFS;
3543 }
3544
3545 /*
3546 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3547 * and cancel_delayed_work() in wake_lock_waiters().
3548 */
3549 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3550 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3551 return 0;
3552 }
3553
3554 static void rbd_img_object_requests(struct rbd_img_request *img_req)
3555 {
3556 struct rbd_device *rbd_dev = img_req->rbd_dev;
3557 struct rbd_obj_request *obj_req;
3558
3559 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3560 rbd_assert(!need_exclusive_lock(img_req) ||
3561 __rbd_is_lock_owner(rbd_dev));
3562
3563 if (rbd_img_is_write(img_req)) {
3564 rbd_assert(!img_req->snapc);
3565 down_read(&rbd_dev->header_rwsem);
3566 img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
3567 up_read(&rbd_dev->header_rwsem);
3568 }
3569
3570 for_each_obj_request(img_req, obj_req) {
3571 int result = 0;
3572
3573 if (__rbd_obj_handle_request(obj_req, &result)) {
3574 if (result) {
3575 img_req->pending.result = result;
3576 return;
3577 }
3578 } else {
3579 img_req->pending.num_pending++;
3580 }
3581 }
3582 }
3583
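/*
 * Image request state machine: START acquires the exclusive lock if
 * one is needed, EXCLUSIVE_LOCK kicks off all object requests, and
 * (__)OBJECT_REQUESTS waits for them to finish.  Returns true when
 * the image request is done.
 */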
3584 static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3585 {
3586 int ret;
3587
3588 again:
3589 switch (img_req->state) {
3590 case RBD_IMG_START:
3591 rbd_assert(!*result);
3592
3593 ret = rbd_img_exclusive_lock(img_req);
3594 if (ret < 0) {
3595 *result = ret;
3596 return true;
3597 }
3598 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3599 if (ret > 0)
3600 goto again;
3601 return false;
3602 case RBD_IMG_EXCLUSIVE_LOCK:
3603 if (*result)
3604 return true;
3605
3606 rbd_img_object_requests(img_req);
3607 if (!img_req->pending.num_pending) {
3608 *result = img_req->pending.result;
3609 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3610 goto again;
3611 }
3612 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3613 return false;
3614 case __RBD_IMG_OBJECT_REQUESTS:
3615 if (!pending_result_dec(&img_req->pending, result))
3616 return false;
3617 fallthrough;
3618 case RBD_IMG_OBJECT_REQUESTS:
3619 return true;
3620 default:
3621 BUG();
3622 }
3623 }
3624
3625 /*
3626 * Return true if @img_req is completed.
3627 */
3628 static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3629 int *result)
3630 {
3631 struct rbd_device *rbd_dev = img_req->rbd_dev;
3632 bool done;
3633
3634 if (need_exclusive_lock(img_req)) {
3635 down_read(&rbd_dev->lock_rwsem);
3636 mutex_lock(&img_req->state_mutex);
3637 done = rbd_img_advance(img_req, result);
3638 if (done)
3639 rbd_lock_del_request(img_req);
3640 mutex_unlock(&img_req->state_mutex);
3641 up_read(&rbd_dev->lock_rwsem);
3642 } else {
3643 mutex_lock(&img_req->state_mutex);
3644 done = rbd_img_advance(img_req, result);
3645 mutex_unlock(&img_req->state_mutex);
3646 }
3647
3648 if (done && *result) {
3649 rbd_assert(*result < 0);
3650 rbd_warn(rbd_dev, "%s%s result %d",
3651 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3652 obj_op_name(img_req->op_type), *result);
3653 }
3654 return done;
3655 }
3656
3657 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3658 {
3659 again:
3660 if (!__rbd_img_handle_request(img_req, &result))
3661 return;
3662
3663 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3664 struct rbd_obj_request *obj_req = img_req->obj_request;
3665
3666 rbd_img_request_destroy(img_req);
3667 if (__rbd_obj_handle_request(obj_req, &result)) {
3668 img_req = obj_req->img_request;
3669 goto again;
3670 }
3671 } else {
3672 struct request *rq = blk_mq_rq_from_pdu(img_req);
3673
3674 rbd_img_request_destroy(img_req);
3675 blk_mq_end_request(rq, errno_to_blk_status(result));
3676 }
3677 }
3678
3679 static const struct rbd_client_id rbd_empty_cid;
3680
3681 static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3682 const struct rbd_client_id *rhs)
3683 {
3684 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3685 }
3686
3687 static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3688 {
3689 struct rbd_client_id cid;
3690
3691 mutex_lock(&rbd_dev->watch_mutex);
3692 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3693 cid.handle = rbd_dev->watch_cookie;
3694 mutex_unlock(&rbd_dev->watch_mutex);
3695 return cid;
3696 }
3697
3698 /*
3699 * lock_rwsem must be held for write
3700 */
3701 static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3702 const struct rbd_client_id *cid)
3703 {
3704 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3705 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3706 cid->gid, cid->handle);
3707 rbd_dev->owner_cid = *cid; /* struct */
3708 }
3709
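/*
 * The lock cookie ties the exclusive lock to our watch.  It has the form
 * "<RBD_LOCK_COOKIE_PREFIX> <watch_cookie>", e.g. "auto 139967413"
 * assuming the prefix is defined as "auto" elsewhere in this file.
 */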
3710 static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3711 {
3712 mutex_lock(&rbd_dev->watch_mutex);
3713 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3714 mutex_unlock(&rbd_dev->watch_mutex);
3715 }
3716
3717 static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3718 {
3719 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3720
3721 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3722 strcpy(rbd_dev->lock_cookie, cookie);
3723 rbd_set_owner_cid(rbd_dev, &cid);
3724 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3725 }
3726
3727 /*
3728 * lock_rwsem must be held for write
3729 */
3730 static int rbd_lock(struct rbd_device *rbd_dev)
3731 {
3732 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3733 char cookie[32];
3734 int ret;
3735
3736 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3737 rbd_dev->lock_cookie[0] != '\0');
3738
3739 format_lock_cookie(rbd_dev, cookie);
3740 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3741 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3742 RBD_LOCK_TAG, "", 0);
3743 if (ret)
3744 return ret;
3745
3746 __rbd_lock(rbd_dev, cookie);
3747 return 0;
3748 }
3749
3750 /*
3751 * lock_rwsem must be held for write
3752 */
3753 static void rbd_unlock(struct rbd_device *rbd_dev)
3754 {
3755 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3756 int ret;
3757
3758 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3759 rbd_dev->lock_cookie[0] == '\0');
3760
3761 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3762 RBD_LOCK_NAME, rbd_dev->lock_cookie);
3763 if (ret && ret != -ENOENT)
3764 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3765
3766 	/* treat errors as if the image were unlocked */
3767 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3768 rbd_dev->lock_cookie[0] = '\0';
3769 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3770 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3771 }
3772
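/*
 * The NotifyMessage built below is a 6-byte encoding header
 * (struct_v = 2, struct_compat = 1, le32 payload length) followed by the
 * le32 notify_op and the two le64 client id fields, hence the
 * 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN on-stack buffer size.
 */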
3773 static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3774 enum rbd_notify_op notify_op,
3775 struct page ***preply_pages,
3776 size_t *preply_len)
3777 {
3778 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3779 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3780 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3781 int buf_size = sizeof(buf);
3782 void *p = buf;
3783
3784 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3785
3786 /* encode *LockPayload NotifyMessage (op + ClientId) */
3787 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3788 ceph_encode_32(&p, notify_op);
3789 ceph_encode_64(&p, cid.gid);
3790 ceph_encode_64(&p, cid.handle);
3791
3792 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3793 &rbd_dev->header_oloc, buf, buf_size,
3794 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3795 }
3796
3797 static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3798 enum rbd_notify_op notify_op)
3799 {
3800 __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3801 }
3802
3803 static void rbd_notify_acquired_lock(struct work_struct *work)
3804 {
3805 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3806 acquired_lock_work);
3807
3808 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3809 }
3810
3811 static void rbd_notify_released_lock(struct work_struct *work)
3812 {
3813 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3814 released_lock_work);
3815
3816 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3817 }
3818
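/*
 * rbd_request_lock() sends a REQUEST_LOCK notification and parses the
 * reply, which (as decoded below) begins with a le32 ack count followed
 * by one (gid, cookie, payload) triple per acker.  Only the current lock
 * owner is expected to attach a non-empty payload: an encoded
 * ResponseMessage whose le32 result says whether it will release the
 * lock (0) or refuses to do so (-EROFS).
 */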
3819 static int rbd_request_lock(struct rbd_device *rbd_dev)
3820 {
3821 struct page **reply_pages;
3822 size_t reply_len;
3823 bool lock_owner_responded = false;
3824 int ret;
3825
3826 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3827
3828 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3829 &reply_pages, &reply_len);
3830 if (ret && ret != -ETIMEDOUT) {
3831 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3832 goto out;
3833 }
3834
3835 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3836 void *p = page_address(reply_pages[0]);
3837 void *const end = p + reply_len;
3838 u32 n;
3839
3840 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3841 while (n--) {
3842 u8 struct_v;
3843 u32 len;
3844
3845 ceph_decode_need(&p, end, 8 + 8, e_inval);
3846 p += 8 + 8; /* skip gid and cookie */
3847
3848 ceph_decode_32_safe(&p, end, len, e_inval);
3849 if (!len)
3850 continue;
3851
3852 if (lock_owner_responded) {
3853 rbd_warn(rbd_dev,
3854 "duplicate lock owners detected");
3855 ret = -EIO;
3856 goto out;
3857 }
3858
3859 lock_owner_responded = true;
3860 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3861 &struct_v, &len);
3862 if (ret) {
3863 rbd_warn(rbd_dev,
3864 "failed to decode ResponseMessage: %d",
3865 ret);
3866 goto e_inval;
3867 }
3868
3869 ret = ceph_decode_32(&p);
3870 }
3871 }
3872
3873 if (!lock_owner_responded) {
3874 rbd_warn(rbd_dev, "no lock owners detected");
3875 ret = -ETIMEDOUT;
3876 }
3877
3878 out:
3879 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3880 return ret;
3881
3882 e_inval:
3883 ret = -EINVAL;
3884 goto out;
3885 }
3886
3887 /*
3888  * Wake whoever is waiting for the lock: either image request
3889  * state machine(s) or rbd_add_acquire_lock() (i.e. "rbd map").
3890 */
3891 static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3892 {
3893 struct rbd_img_request *img_req;
3894
3895 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3896 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3897
3898 cancel_delayed_work(&rbd_dev->lock_dwork);
3899 if (!completion_done(&rbd_dev->acquire_wait)) {
3900 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3901 list_empty(&rbd_dev->running_list));
3902 rbd_dev->acquire_err = result;
3903 complete_all(&rbd_dev->acquire_wait);
3904 return;
3905 }
3906
3907 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3908 mutex_lock(&img_req->state_mutex);
3909 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3910 rbd_img_schedule(img_req, result);
3911 mutex_unlock(&img_req->state_mutex);
3912 }
3913
3914 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3915 }
3916
3917 static int get_lock_owner_info(struct rbd_device *rbd_dev,
3918 struct ceph_locker **lockers, u32 *num_lockers)
3919 {
3920 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3921 u8 lock_type;
3922 char *lock_tag;
3923 int ret;
3924
3925 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3926
3927 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3928 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3929 &lock_type, &lock_tag, lockers, num_lockers);
3930 if (ret)
3931 return ret;
3932
3933 if (*num_lockers == 0) {
3934 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3935 goto out;
3936 }
3937
3938 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3939 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3940 lock_tag);
3941 ret = -EBUSY;
3942 goto out;
3943 }
3944
3945 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3946 rbd_warn(rbd_dev, "shared lock type detected");
3947 ret = -EBUSY;
3948 goto out;
3949 }
3950
3951 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3952 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3953 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3954 (*lockers)[0].id.cookie);
3955 ret = -EBUSY;
3956 goto out;
3957 }
3958
3959 out:
3960 kfree(lock_tag);
3961 return ret;
3962 }
3963
3964 static int find_watcher(struct rbd_device *rbd_dev,
3965 const struct ceph_locker *locker)
3966 {
3967 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3968 struct ceph_watch_item *watchers;
3969 u32 num_watchers;
3970 u64 cookie;
3971 int i;
3972 int ret;
3973
3974 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3975 &rbd_dev->header_oloc, &watchers,
3976 &num_watchers);
3977 if (ret)
3978 return ret;
3979
3980 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3981 for (i = 0; i < num_watchers; i++) {
3982 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3983 sizeof(locker->info.addr)) &&
3984 watchers[i].cookie == cookie) {
3985 struct rbd_client_id cid = {
3986 .gid = le64_to_cpu(watchers[i].name.num),
3987 .handle = cookie,
3988 };
3989
3990 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3991 rbd_dev, cid.gid, cid.handle);
3992 rbd_set_owner_cid(rbd_dev, &cid);
3993 ret = 1;
3994 goto out;
3995 }
3996 }
3997
3998 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3999 ret = 0;
4000 out:
4001 kfree(watchers);
4002 return ret;
4003 }
4004
4005 /*
4006 * lock_rwsem must be held for write
4007 */
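/*
 * The loop below tries to take the lock; if it is busy, it looks up the
 * current locker and checks for a matching watcher.  A locker with no
 * watcher is presumed dead, so it is blocklisted and its lock broken
 * before retrying.  If a live watcher is found, 1 is returned so that
 * the caller falls back to a REQUEST_LOCK notification.
 */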
4008 static int rbd_try_lock(struct rbd_device *rbd_dev)
4009 {
4010 struct ceph_client *client = rbd_dev->rbd_client->client;
4011 struct ceph_locker *lockers;
4012 u32 num_lockers;
4013 int ret;
4014
4015 for (;;) {
4016 ret = rbd_lock(rbd_dev);
4017 if (ret != -EBUSY)
4018 return ret;
4019
4020 /* determine if the current lock holder is still alive */
4021 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4022 if (ret)
4023 return ret;
4024
4025 if (num_lockers == 0)
4026 goto again;
4027
4028 ret = find_watcher(rbd_dev, lockers);
4029 if (ret)
4030 goto out; /* request lock or error */
4031
4032 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4033 ENTITY_NAME(lockers[0].id.name));
4034
4035 ret = ceph_monc_blocklist_add(&client->monc,
4036 &lockers[0].info.addr);
4037 if (ret) {
4038 rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
4039 ENTITY_NAME(lockers[0].id.name), ret);
4040 goto out;
4041 }
4042
4043 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4044 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4045 lockers[0].id.cookie,
4046 &lockers[0].id.name);
4047 if (ret && ret != -ENOENT)
4048 goto out;
4049
4050 again:
4051 ceph_free_lockers(lockers, num_lockers);
4052 }
4053
4054 out:
4055 ceph_free_lockers(lockers, num_lockers);
4056 return ret;
4057 }
4058
4059 static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4060 {
4061 int ret;
4062
4063 ret = rbd_dev_refresh(rbd_dev);
4064 if (ret)
4065 return ret;
4066
4067 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4068 ret = rbd_object_map_open(rbd_dev);
4069 if (ret)
4070 return ret;
4071 }
4072
4073 return 0;
4074 }
4075
4076 /*
4077 * Return:
4078 * 0 - lock acquired
4079 * 1 - caller should call rbd_request_lock()
4080 * <0 - error
4081 */
4082 static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4083 {
4084 int ret;
4085
4086 down_read(&rbd_dev->lock_rwsem);
4087 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4088 rbd_dev->lock_state);
4089 if (__rbd_is_lock_owner(rbd_dev)) {
4090 up_read(&rbd_dev->lock_rwsem);
4091 return 0;
4092 }
4093
4094 up_read(&rbd_dev->lock_rwsem);
4095 down_write(&rbd_dev->lock_rwsem);
4096 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4097 rbd_dev->lock_state);
4098 if (__rbd_is_lock_owner(rbd_dev)) {
4099 up_write(&rbd_dev->lock_rwsem);
4100 return 0;
4101 }
4102
4103 ret = rbd_try_lock(rbd_dev);
4104 if (ret < 0) {
4105 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4106 if (ret == -EBLOCKLISTED)
4107 goto out;
4108
4109 ret = 1; /* request lock anyway */
4110 }
4111 if (ret > 0) {
4112 up_write(&rbd_dev->lock_rwsem);
4113 return ret;
4114 }
4115
4116 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4117 rbd_assert(list_empty(&rbd_dev->running_list));
4118
4119 ret = rbd_post_acquire_action(rbd_dev);
4120 if (ret) {
4121 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4122 /*
4123 * Can't stay in RBD_LOCK_STATE_LOCKED because
4124 * rbd_lock_add_request() would let the request through,
4125 * assuming that e.g. object map is locked and loaded.
4126 */
4127 rbd_unlock(rbd_dev);
4128 }
4129
4130 out:
4131 wake_lock_waiters(rbd_dev, ret);
4132 up_write(&rbd_dev->lock_rwsem);
4133 return ret;
4134 }
4135
4136 static void rbd_acquire_lock(struct work_struct *work)
4137 {
4138 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4139 struct rbd_device, lock_dwork);
4140 int ret;
4141
4142 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4143 again:
4144 ret = rbd_try_acquire_lock(rbd_dev);
4145 if (ret <= 0) {
4146 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4147 return;
4148 }
4149
4150 ret = rbd_request_lock(rbd_dev);
4151 if (ret == -ETIMEDOUT) {
4152 goto again; /* treat this as a dead client */
4153 } else if (ret == -EROFS) {
4154 rbd_warn(rbd_dev, "peer will not release lock");
4155 down_write(&rbd_dev->lock_rwsem);
4156 wake_lock_waiters(rbd_dev, ret);
4157 up_write(&rbd_dev->lock_rwsem);
4158 } else if (ret < 0) {
4159 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4160 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4161 RBD_RETRY_DELAY);
4162 } else {
4163 /*
4164 * lock owner acked, but resend if we don't see them
4165 * release the lock
4166 */
4167 dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4168 rbd_dev);
4169 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4170 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4171 }
4172 }
4173
4174 static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4175 {
4176 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4177 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4178
4179 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4180 return false;
4181
4182 /*
4183 * Ensure that all in-flight IO is flushed.
4184 */
4185 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4186 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4187 if (list_empty(&rbd_dev->running_list))
4188 return true;
4189
4190 up_write(&rbd_dev->lock_rwsem);
4191 wait_for_completion(&rbd_dev->releasing_wait);
4192
4193 down_write(&rbd_dev->lock_rwsem);
4194 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4195 return false;
4196
4197 rbd_assert(list_empty(&rbd_dev->running_list));
4198 return true;
4199 }
4200
4201 static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4202 {
4203 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4204 rbd_object_map_close(rbd_dev);
4205 }
4206
4207 static void __rbd_release_lock(struct rbd_device *rbd_dev)
4208 {
4209 rbd_assert(list_empty(&rbd_dev->running_list));
4210
4211 rbd_pre_release_action(rbd_dev);
4212 rbd_unlock(rbd_dev);
4213 }
4214
4215 /*
4216 * lock_rwsem must be held for write
4217 */
4218 static void rbd_release_lock(struct rbd_device *rbd_dev)
4219 {
4220 if (!rbd_quiesce_lock(rbd_dev))
4221 return;
4222
4223 __rbd_release_lock(rbd_dev);
4224
4225 /*
4226 * Give others a chance to grab the lock - we would re-acquire
4227 * almost immediately if we got new IO while draining the running
4228 * list otherwise. We need to ack our own notifications, so this
4229 * lock_dwork will be requeued from rbd_handle_released_lock() by
4230 * way of maybe_kick_acquire().
4231 */
4232 cancel_delayed_work(&rbd_dev->lock_dwork);
4233 }
4234
4235 static void rbd_release_lock_work(struct work_struct *work)
4236 {
4237 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4238 unlock_work);
4239
4240 down_write(&rbd_dev->lock_rwsem);
4241 rbd_release_lock(rbd_dev);
4242 up_write(&rbd_dev->lock_rwsem);
4243 }
4244
4245 static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4246 {
4247 bool have_requests;
4248
4249 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4250 if (__rbd_is_lock_owner(rbd_dev))
4251 return;
4252
4253 spin_lock(&rbd_dev->lock_lists_lock);
4254 have_requests = !list_empty(&rbd_dev->acquiring_list);
4255 spin_unlock(&rbd_dev->lock_lists_lock);
4256 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4257 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4258 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4259 }
4260 }
4261
4262 static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4263 void **p)
4264 {
4265 struct rbd_client_id cid = { 0 };
4266
4267 if (struct_v >= 2) {
4268 cid.gid = ceph_decode_64(p);
4269 cid.handle = ceph_decode_64(p);
4270 }
4271
4272 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4273 cid.handle);
4274 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4275 down_write(&rbd_dev->lock_rwsem);
4276 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4277 dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
4278 __func__, rbd_dev, cid.gid, cid.handle);
4279 } else {
4280 rbd_set_owner_cid(rbd_dev, &cid);
4281 }
4282 downgrade_write(&rbd_dev->lock_rwsem);
4283 } else {
4284 down_read(&rbd_dev->lock_rwsem);
4285 }
4286
4287 maybe_kick_acquire(rbd_dev);
4288 up_read(&rbd_dev->lock_rwsem);
4289 }
4290
4291 static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4292 void **p)
4293 {
4294 struct rbd_client_id cid = { 0 };
4295
4296 if (struct_v >= 2) {
4297 cid.gid = ceph_decode_64(p);
4298 cid.handle = ceph_decode_64(p);
4299 }
4300
4301 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4302 cid.handle);
4303 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4304 down_write(&rbd_dev->lock_rwsem);
4305 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4306 dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
4307 __func__, rbd_dev, cid.gid, cid.handle,
4308 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4309 } else {
4310 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4311 }
4312 downgrade_write(&rbd_dev->lock_rwsem);
4313 } else {
4314 down_read(&rbd_dev->lock_rwsem);
4315 }
4316
4317 maybe_kick_acquire(rbd_dev);
4318 up_read(&rbd_dev->lock_rwsem);
4319 }
4320
4321 /*
4322 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4323 * ResponseMessage is needed.
4324 */
4325 static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4326 void **p)
4327 {
4328 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4329 struct rbd_client_id cid = { 0 };
4330 int result = 1;
4331
4332 if (struct_v >= 2) {
4333 cid.gid = ceph_decode_64(p);
4334 cid.handle = ceph_decode_64(p);
4335 }
4336
4337 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4338 cid.handle);
4339 if (rbd_cid_equal(&cid, &my_cid))
4340 return result;
4341
4342 down_read(&rbd_dev->lock_rwsem);
4343 if (__rbd_is_lock_owner(rbd_dev)) {
4344 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4345 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4346 goto out_unlock;
4347
4348 /*
4349 * encode ResponseMessage(0) so the peer can detect
4350 * a missing owner
4351 */
4352 result = 0;
4353
4354 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4355 if (!rbd_dev->opts->exclusive) {
4356 dout("%s rbd_dev %p queueing unlock_work\n",
4357 __func__, rbd_dev);
4358 queue_work(rbd_dev->task_wq,
4359 &rbd_dev->unlock_work);
4360 } else {
4361 /* refuse to release the lock */
4362 result = -EROFS;
4363 }
4364 }
4365 }
4366
4367 out_unlock:
4368 up_read(&rbd_dev->lock_rwsem);
4369 return result;
4370 }
4371
4372 static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4373 u64 notify_id, u64 cookie, s32 *result)
4374 {
4375 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4376 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4377 int buf_size = sizeof(buf);
4378 int ret;
4379
4380 if (result) {
4381 void *p = buf;
4382
4383 /* encode ResponseMessage */
4384 ceph_start_encoding(&p, 1, 1,
4385 buf_size - CEPH_ENCODING_START_BLK_LEN);
4386 ceph_encode_32(&p, *result);
4387 } else {
4388 buf_size = 0;
4389 }
4390
4391 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4392 &rbd_dev->header_oloc, notify_id, cookie,
4393 buf, buf_size);
4394 if (ret)
4395 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4396 }
4397
4398 static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4399 u64 cookie)
4400 {
4401 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4402 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4403 }
4404
4405 static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4406 u64 notify_id, u64 cookie, s32 result)
4407 {
4408 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4409 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4410 }
4411
4412 static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4413 u64 notifier_id, void *data, size_t data_len)
4414 {
4415 struct rbd_device *rbd_dev = arg;
4416 void *p = data;
4417 void *const end = p + data_len;
4418 u8 struct_v = 0;
4419 u32 len;
4420 u32 notify_op;
4421 int ret;
4422
4423 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4424 __func__, rbd_dev, cookie, notify_id, data_len);
4425 if (data_len) {
4426 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4427 &struct_v, &len);
4428 if (ret) {
4429 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4430 ret);
4431 return;
4432 }
4433
4434 notify_op = ceph_decode_32(&p);
4435 } else {
4436 /* legacy notification for header updates */
4437 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4438 len = 0;
4439 }
4440
4441 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4442 switch (notify_op) {
4443 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4444 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4445 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4446 break;
4447 case RBD_NOTIFY_OP_RELEASED_LOCK:
4448 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4449 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4450 break;
4451 case RBD_NOTIFY_OP_REQUEST_LOCK:
4452 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4453 if (ret <= 0)
4454 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4455 cookie, ret);
4456 else
4457 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4458 break;
4459 case RBD_NOTIFY_OP_HEADER_UPDATE:
4460 ret = rbd_dev_refresh(rbd_dev);
4461 if (ret)
4462 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4463
4464 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4465 break;
4466 default:
4467 if (rbd_is_lock_owner(rbd_dev))
4468 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4469 cookie, -EOPNOTSUPP);
4470 else
4471 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4472 break;
4473 }
4474 }
4475
4476 static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4477
4478 static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4479 {
4480 struct rbd_device *rbd_dev = arg;
4481
4482 rbd_warn(rbd_dev, "encountered watch error: %d", err);
4483
4484 down_write(&rbd_dev->lock_rwsem);
4485 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4486 up_write(&rbd_dev->lock_rwsem);
4487
4488 mutex_lock(&rbd_dev->watch_mutex);
4489 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4490 __rbd_unregister_watch(rbd_dev);
4491 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4492
4493 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4494 }
4495 mutex_unlock(&rbd_dev->watch_mutex);
4496 }
4497
4498 /*
4499 * watch_mutex must be locked
4500 */
4501 static int __rbd_register_watch(struct rbd_device *rbd_dev)
4502 {
4503 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4504 struct ceph_osd_linger_request *handle;
4505
4506 rbd_assert(!rbd_dev->watch_handle);
4507 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4508
4509 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4510 &rbd_dev->header_oloc, rbd_watch_cb,
4511 rbd_watch_errcb, rbd_dev);
4512 if (IS_ERR(handle))
4513 return PTR_ERR(handle);
4514
4515 rbd_dev->watch_handle = handle;
4516 return 0;
4517 }
4518
4519 /*
4520 * watch_mutex must be locked
4521 */
4522 static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4523 {
4524 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4525 int ret;
4526
4527 rbd_assert(rbd_dev->watch_handle);
4528 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4529
4530 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4531 if (ret)
4532 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4533
4534 rbd_dev->watch_handle = NULL;
4535 }
4536
4537 static int rbd_register_watch(struct rbd_device *rbd_dev)
4538 {
4539 int ret;
4540
4541 mutex_lock(&rbd_dev->watch_mutex);
4542 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4543 ret = __rbd_register_watch(rbd_dev);
4544 if (ret)
4545 goto out;
4546
4547 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4548 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4549
4550 out:
4551 mutex_unlock(&rbd_dev->watch_mutex);
4552 return ret;
4553 }
4554
4555 static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4556 {
4557 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4558
4559 cancel_work_sync(&rbd_dev->acquired_lock_work);
4560 cancel_work_sync(&rbd_dev->released_lock_work);
4561 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4562 cancel_work_sync(&rbd_dev->unlock_work);
4563 }
4564
4565 /*
4566 * header_rwsem must not be held to avoid a deadlock with
4567 * rbd_dev_refresh() when flushing notifies.
4568 */
4569 static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4570 {
4571 cancel_tasks_sync(rbd_dev);
4572
4573 mutex_lock(&rbd_dev->watch_mutex);
4574 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4575 __rbd_unregister_watch(rbd_dev);
4576 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4577 mutex_unlock(&rbd_dev->watch_mutex);
4578
4579 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4580 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4581 }
4582
4583 /*
4584 * lock_rwsem must be held for write
4585 */
4586 static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4587 {
4588 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4589 char cookie[32];
4590 int ret;
4591
4592 if (!rbd_quiesce_lock(rbd_dev))
4593 return;
4594
4595 format_lock_cookie(rbd_dev, cookie);
4596 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4597 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4598 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4599 RBD_LOCK_TAG, cookie);
4600 if (ret) {
4601 if (ret != -EOPNOTSUPP)
4602 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4603 ret);
4604
4605 /*
4606 * Lock cookie cannot be updated on older OSDs, so do
4607 * a manual release and queue an acquire.
4608 */
4609 __rbd_release_lock(rbd_dev);
4610 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4611 } else {
4612 __rbd_lock(rbd_dev, cookie);
4613 wake_lock_waiters(rbd_dev, 0);
4614 }
4615 }
4616
4617 static void rbd_reregister_watch(struct work_struct *work)
4618 {
4619 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4620 struct rbd_device, watch_dwork);
4621 int ret;
4622
4623 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4624
4625 mutex_lock(&rbd_dev->watch_mutex);
4626 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4627 mutex_unlock(&rbd_dev->watch_mutex);
4628 return;
4629 }
4630
4631 ret = __rbd_register_watch(rbd_dev);
4632 if (ret) {
4633 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4634 if (ret != -EBLOCKLISTED && ret != -ENOENT) {
4635 queue_delayed_work(rbd_dev->task_wq,
4636 &rbd_dev->watch_dwork,
4637 RBD_RETRY_DELAY);
4638 mutex_unlock(&rbd_dev->watch_mutex);
4639 return;
4640 }
4641
4642 mutex_unlock(&rbd_dev->watch_mutex);
4643 down_write(&rbd_dev->lock_rwsem);
4644 wake_lock_waiters(rbd_dev, ret);
4645 up_write(&rbd_dev->lock_rwsem);
4646 return;
4647 }
4648
4649 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4650 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4651 mutex_unlock(&rbd_dev->watch_mutex);
4652
4653 down_write(&rbd_dev->lock_rwsem);
4654 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4655 rbd_reacquire_lock(rbd_dev);
4656 up_write(&rbd_dev->lock_rwsem);
4657
4658 ret = rbd_dev_refresh(rbd_dev);
4659 if (ret)
4660 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4661 }
4662
4663 /*
4664  * Synchronous osd object method call.  Returns the number of bytes
4665  * returned in the inbound buffer, or a negative error code.
4666 */
4667 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4668 struct ceph_object_id *oid,
4669 struct ceph_object_locator *oloc,
4670 const char *method_name,
4671 const void *outbound,
4672 size_t outbound_size,
4673 void *inbound,
4674 size_t inbound_size)
4675 {
4676 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4677 struct page *req_page = NULL;
4678 struct page *reply_page;
4679 int ret;
4680
4681 /*
4682 	 * Method calls are ultimately read operations.  The result
4683 	 * should be placed into the inbound buffer provided.  They
4684 	 * may also supply outbound data: parameters for the object
4685 	 * method.  Currently, if outbound data is present, it is a
4686 	 * snapshot id.
4687 */
4688 if (outbound) {
4689 if (outbound_size > PAGE_SIZE)
4690 return -E2BIG;
4691
4692 req_page = alloc_page(GFP_KERNEL);
4693 if (!req_page)
4694 return -ENOMEM;
4695
4696 memcpy(page_address(req_page), outbound, outbound_size);
4697 }
4698
4699 reply_page = alloc_page(GFP_KERNEL);
4700 if (!reply_page) {
4701 if (req_page)
4702 __free_page(req_page);
4703 return -ENOMEM;
4704 }
4705
4706 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4707 CEPH_OSD_FLAG_READ, req_page, outbound_size,
4708 &reply_page, &inbound_size);
4709 if (!ret) {
4710 memcpy(inbound, page_address(reply_page), inbound_size);
4711 ret = inbound_size;
4712 }
4713
4714 if (req_page)
4715 __free_page(req_page);
4716 __free_page(reply_page);
4717 return ret;
4718 }
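/*
 * A typical caller passes a small on-stack buffer in each direction, as
 * _rbd_dev_v2_snap_size() does further down for the "get_size" method:
 *
 *	__le64 snapid = cpu_to_le64(snap_id);
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 *
 * A non-negative return value is the number of reply bytes copied into
 * the inbound buffer.
 */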
4719
4720 static void rbd_queue_workfn(struct work_struct *work)
4721 {
4722 struct rbd_img_request *img_request =
4723 container_of(work, struct rbd_img_request, work);
4724 struct rbd_device *rbd_dev = img_request->rbd_dev;
4725 enum obj_operation_type op_type = img_request->op_type;
4726 struct request *rq = blk_mq_rq_from_pdu(img_request);
4727 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4728 u64 length = blk_rq_bytes(rq);
4729 u64 mapping_size;
4730 int result;
4731
4732 /* Ignore/skip any zero-length requests */
4733 if (!length) {
4734 dout("%s: zero-length request\n", __func__);
4735 result = 0;
4736 goto err_img_request;
4737 }
4738
4739 blk_mq_start_request(rq);
4740
4741 down_read(&rbd_dev->header_rwsem);
4742 mapping_size = rbd_dev->mapping.size;
4743 rbd_img_capture_header(img_request);
4744 up_read(&rbd_dev->header_rwsem);
4745
4746 if (offset + length > mapping_size) {
4747 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4748 length, mapping_size);
4749 result = -EIO;
4750 goto err_img_request;
4751 }
4752
4753 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4754 img_request, obj_op_name(op_type), offset, length);
4755
4756 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4757 result = rbd_img_fill_nodata(img_request, offset, length);
4758 else
4759 result = rbd_img_fill_from_bio(img_request, offset, length,
4760 rq->bio);
4761 if (result)
4762 goto err_img_request;
4763
4764 rbd_img_handle_request(img_request, 0);
4765 return;
4766
4767 err_img_request:
4768 rbd_img_request_destroy(img_request);
4769 if (result)
4770 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4771 obj_op_name(op_type), length, offset, result);
4772 blk_mq_end_request(rq, errno_to_blk_status(result));
4773 }
4774
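/*
 * blk-mq entry point.  rbd_queue_rq() only classifies the request and
 * initializes the per-request rbd_img_request (carved out of the blk-mq
 * pdu via tag_set.cmd_size), then hands off to rbd_queue_workfn() above.
 * The work function fills the object requests and kicks the image
 * request state machine; completion ultimately happens in
 * rbd_img_handle_request() via blk_mq_end_request().
 */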
4775 static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4776 const struct blk_mq_queue_data *bd)
4777 {
4778 struct rbd_device *rbd_dev = hctx->queue->queuedata;
4779 struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4780 enum obj_operation_type op_type;
4781
4782 switch (req_op(bd->rq)) {
4783 case REQ_OP_DISCARD:
4784 op_type = OBJ_OP_DISCARD;
4785 break;
4786 case REQ_OP_WRITE_ZEROES:
4787 op_type = OBJ_OP_ZEROOUT;
4788 break;
4789 case REQ_OP_WRITE:
4790 op_type = OBJ_OP_WRITE;
4791 break;
4792 case REQ_OP_READ:
4793 op_type = OBJ_OP_READ;
4794 break;
4795 default:
4796 rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4797 return BLK_STS_IOERR;
4798 }
4799
4800 rbd_img_request_init(img_req, rbd_dev, op_type);
4801
4802 if (rbd_img_is_write(img_req)) {
4803 if (rbd_is_ro(rbd_dev)) {
4804 rbd_warn(rbd_dev, "%s on read-only mapping",
4805 obj_op_name(img_req->op_type));
4806 return BLK_STS_IOERR;
4807 }
4808 rbd_assert(!rbd_is_snap(rbd_dev));
4809 }
4810
4811 INIT_WORK(&img_req->work, rbd_queue_workfn);
4812 queue_work(rbd_wq, &img_req->work);
4813 return BLK_STS_OK;
4814 }
4815
4816 static void rbd_free_disk(struct rbd_device *rbd_dev)
4817 {
4818 blk_cleanup_queue(rbd_dev->disk->queue);
4819 blk_mq_free_tag_set(&rbd_dev->tag_set);
4820 put_disk(rbd_dev->disk);
4821 rbd_dev->disk = NULL;
4822 }
4823
4824 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4825 struct ceph_object_id *oid,
4826 struct ceph_object_locator *oloc,
4827 void *buf, int buf_len)
4828
4829 {
4830 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4831 struct ceph_osd_request *req;
4832 struct page **pages;
4833 int num_pages = calc_pages_for(0, buf_len);
4834 int ret;
4835
4836 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4837 if (!req)
4838 return -ENOMEM;
4839
4840 ceph_oid_copy(&req->r_base_oid, oid);
4841 ceph_oloc_copy(&req->r_base_oloc, oloc);
4842 req->r_flags = CEPH_OSD_FLAG_READ;
4843
4844 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4845 if (IS_ERR(pages)) {
4846 ret = PTR_ERR(pages);
4847 goto out_req;
4848 }
4849
4850 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4851 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4852 true);
4853
4854 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4855 if (ret)
4856 goto out_req;
4857
4858 ceph_osdc_start_request(osdc, req, false);
4859 ret = ceph_osdc_wait_request(osdc, req);
4860 if (ret >= 0)
4861 ceph_copy_from_page_vector(pages, buf, 0, ret);
4862
4863 out_req:
4864 ceph_osdc_put_request(req);
4865 return ret;
4866 }
4867
4868 /*
4869 * Read the complete header for the given rbd device. On successful
4870 * return, the rbd_dev->header field will contain up-to-date
4871 * information about the image.
4872 */
4873 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4874 {
4875 struct rbd_image_header_ondisk *ondisk = NULL;
4876 u32 snap_count = 0;
4877 u64 names_size = 0;
4878 u32 want_count;
4879 int ret;
4880
4881 /*
4882 * The complete header will include an array of its 64-bit
4883 * snapshot ids, followed by the names of those snapshots as
4884 * a contiguous block of NUL-terminated strings. Note that
4885 * the number of snapshots could change by the time we read
4886 * it in, in which case we re-read it.
4887 */
4888 do {
4889 size_t size;
4890
4891 kfree(ondisk);
4892
4893 size = sizeof (*ondisk);
4894 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4895 size += names_size;
4896 ondisk = kmalloc(size, GFP_KERNEL);
4897 if (!ondisk)
4898 return -ENOMEM;
4899
4900 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4901 &rbd_dev->header_oloc, ondisk, size);
4902 if (ret < 0)
4903 goto out;
4904 if ((size_t)ret < size) {
4905 ret = -ENXIO;
4906 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4907 size, ret);
4908 goto out;
4909 }
4910 if (!rbd_dev_ondisk_valid(ondisk)) {
4911 ret = -ENXIO;
4912 rbd_warn(rbd_dev, "invalid header");
4913 goto out;
4914 }
4915
4916 names_size = le64_to_cpu(ondisk->snap_names_len);
4917 want_count = snap_count;
4918 snap_count = le32_to_cpu(ondisk->snap_count);
4919 } while (snap_count != want_count);
4920
4921 ret = rbd_header_from_disk(rbd_dev, ondisk);
4922 out:
4923 kfree(ondisk);
4924
4925 return ret;
4926 }
4927
4928 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4929 {
4930 sector_t size;
4931
4932 /*
4933 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4934 * try to update its size. If REMOVING is set, updating size
4935 * is just useless work since the device can't be opened.
4936 */
4937 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4938 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
4939 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4940 dout("setting size to %llu sectors", (unsigned long long)size);
4941 set_capacity(rbd_dev->disk, size);
4942 revalidate_disk_size(rbd_dev->disk, true);
4943 }
4944 }
4945
4946 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
4947 {
4948 u64 mapping_size;
4949 int ret;
4950
4951 down_write(&rbd_dev->header_rwsem);
4952 mapping_size = rbd_dev->mapping.size;
4953
4954 ret = rbd_dev_header_info(rbd_dev);
4955 if (ret)
4956 goto out;
4957
4958 /*
4959 * If there is a parent, see if it has disappeared due to the
4960 * mapped image getting flattened.
4961 */
4962 if (rbd_dev->parent) {
4963 ret = rbd_dev_v2_parent_info(rbd_dev);
4964 if (ret)
4965 goto out;
4966 }
4967
4968 rbd_assert(!rbd_is_snap(rbd_dev));
4969 rbd_dev->mapping.size = rbd_dev->header.image_size;
4970
4971 out:
4972 up_write(&rbd_dev->header_rwsem);
4973 if (!ret && mapping_size != rbd_dev->mapping.size)
4974 rbd_dev_update_size(rbd_dev);
4975
4976 return ret;
4977 }
4978
4979 static const struct blk_mq_ops rbd_mq_ops = {
4980 .queue_rq = rbd_queue_rq,
4981 };
4982
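/*
 * Queue limits below are derived from the object set size.  As an
 * illustration (actual values come from the image header): with 4 MiB
 * objects and a stripe count of 1, objset_bytes is 4 MiB, so
 * max_hw_sectors and the discard/write-zeroes limits work out to
 * 8192 sectors.
 */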
4983 static int rbd_init_disk(struct rbd_device *rbd_dev)
4984 {
4985 struct gendisk *disk;
4986 struct request_queue *q;
4987 unsigned int objset_bytes =
4988 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
4989 int err;
4990
4991 /* create gendisk info */
4992 disk = alloc_disk(single_major ?
4993 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4994 RBD_MINORS_PER_MAJOR);
4995 if (!disk)
4996 return -ENOMEM;
4997
4998 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4999 rbd_dev->dev_id);
5000 disk->major = rbd_dev->major;
5001 disk->first_minor = rbd_dev->minor;
5002 if (single_major)
5003 disk->flags |= GENHD_FL_EXT_DEVT;
5004 disk->fops = &rbd_bd_ops;
5005 disk->private_data = rbd_dev;
5006
5007 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5008 rbd_dev->tag_set.ops = &rbd_mq_ops;
5009 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
5010 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
5011 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
5012 rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
5013 rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
5014
5015 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5016 if (err)
5017 goto out_disk;
5018
5019 q = blk_mq_init_queue(&rbd_dev->tag_set);
5020 if (IS_ERR(q)) {
5021 err = PTR_ERR(q);
5022 goto out_tag_set;
5023 }
5024
5025 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5026 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5027
5028 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5029 q->limits.max_sectors = queue_max_hw_sectors(q);
5030 blk_queue_max_segments(q, USHRT_MAX);
5031 blk_queue_max_segment_size(q, UINT_MAX);
5032 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5033 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5034
5035 if (rbd_dev->opts->trim) {
5036 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5037 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5038 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5039 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5040 }
5041
5042 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5043 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
5044
5045 /*
5046 * disk_release() expects a queue ref from add_disk() and will
5047 * put it. Hold an extra ref until add_disk() is called.
5048 */
5049 WARN_ON(!blk_get_queue(q));
5050 disk->queue = q;
5051 q->queuedata = rbd_dev;
5052
5053 rbd_dev->disk = disk;
5054
5055 return 0;
5056 out_tag_set:
5057 blk_mq_free_tag_set(&rbd_dev->tag_set);
5058 out_disk:
5059 put_disk(disk);
5060 return err;
5061 }
5062
5063 /*
5064 sysfs
5065 */
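/*
 * These attributes are expected to appear under
 * /sys/bus/rbd/devices/<dev-id>/ once the device is registered, e.g.
 * reading /sys/bus/rbd/devices/0/size should report the mapped image
 * size in bytes.
 */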
5066
5067 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5068 {
5069 return container_of(dev, struct rbd_device, dev);
5070 }
5071
5072 static ssize_t rbd_size_show(struct device *dev,
5073 struct device_attribute *attr, char *buf)
5074 {
5075 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5076
5077 return sprintf(buf, "%llu\n",
5078 (unsigned long long)rbd_dev->mapping.size);
5079 }
5080
5081 static ssize_t rbd_features_show(struct device *dev,
5082 struct device_attribute *attr, char *buf)
5083 {
5084 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5085
5086 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
5087 }
5088
5089 static ssize_t rbd_major_show(struct device *dev,
5090 struct device_attribute *attr, char *buf)
5091 {
5092 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5093
5094 if (rbd_dev->major)
5095 return sprintf(buf, "%d\n", rbd_dev->major);
5096
5097 return sprintf(buf, "(none)\n");
5098 }
5099
5100 static ssize_t rbd_minor_show(struct device *dev,
5101 struct device_attribute *attr, char *buf)
5102 {
5103 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5104
5105 return sprintf(buf, "%d\n", rbd_dev->minor);
5106 }
5107
5108 static ssize_t rbd_client_addr_show(struct device *dev,
5109 struct device_attribute *attr, char *buf)
5110 {
5111 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5112 struct ceph_entity_addr *client_addr =
5113 ceph_client_addr(rbd_dev->rbd_client->client);
5114
5115 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5116 le32_to_cpu(client_addr->nonce));
5117 }
5118
5119 static ssize_t rbd_client_id_show(struct device *dev,
5120 struct device_attribute *attr, char *buf)
5121 {
5122 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5123
5124 return sprintf(buf, "client%lld\n",
5125 ceph_client_gid(rbd_dev->rbd_client->client));
5126 }
5127
5128 static ssize_t rbd_cluster_fsid_show(struct device *dev,
5129 struct device_attribute *attr, char *buf)
5130 {
5131 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5132
5133 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5134 }
5135
5136 static ssize_t rbd_config_info_show(struct device *dev,
5137 struct device_attribute *attr, char *buf)
5138 {
5139 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5140
5141 if (!capable(CAP_SYS_ADMIN))
5142 return -EPERM;
5143
5144 return sprintf(buf, "%s\n", rbd_dev->config_info);
5145 }
5146
5147 static ssize_t rbd_pool_show(struct device *dev,
5148 struct device_attribute *attr, char *buf)
5149 {
5150 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5151
5152 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5153 }
5154
5155 static ssize_t rbd_pool_id_show(struct device *dev,
5156 struct device_attribute *attr, char *buf)
5157 {
5158 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5159
5160 return sprintf(buf, "%llu\n",
5161 (unsigned long long) rbd_dev->spec->pool_id);
5162 }
5163
5164 static ssize_t rbd_pool_ns_show(struct device *dev,
5165 struct device_attribute *attr, char *buf)
5166 {
5167 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5168
5169 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5170 }
5171
5172 static ssize_t rbd_name_show(struct device *dev,
5173 struct device_attribute *attr, char *buf)
5174 {
5175 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5176
5177 if (rbd_dev->spec->image_name)
5178 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5179
5180 return sprintf(buf, "(unknown)\n");
5181 }
5182
5183 static ssize_t rbd_image_id_show(struct device *dev,
5184 struct device_attribute *attr, char *buf)
5185 {
5186 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5187
5188 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5189 }
5190
5191 /*
5192 * Shows the name of the currently-mapped snapshot (or
5193 * RBD_SNAP_HEAD_NAME for the base image).
5194 */
5195 static ssize_t rbd_snap_show(struct device *dev,
5196 struct device_attribute *attr,
5197 char *buf)
5198 {
5199 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5200
5201 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5202 }
5203
5204 static ssize_t rbd_snap_id_show(struct device *dev,
5205 struct device_attribute *attr, char *buf)
5206 {
5207 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5208
5209 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5210 }
5211
5212 /*
5213 * For a v2 image, shows the chain of parent images, separated by empty
5214 * lines. For v1 images or if there is no parent, shows "(no parent
5215 * image)".
5216 */
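/*
 * Example output for a single-level clone (illustrative values only):
 *
 *	pool_id 2
 *	pool_name rbd
 *	pool_ns
 *	image_id 86e63a2b5f66
 *	image_name parent-img
 *	snap_id 4
 *	snap_name snap1
 *	overlap 10737418240
 */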
5217 static ssize_t rbd_parent_show(struct device *dev,
5218 struct device_attribute *attr,
5219 char *buf)
5220 {
5221 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5222 ssize_t count = 0;
5223
5224 if (!rbd_dev->parent)
5225 return sprintf(buf, "(no parent image)\n");
5226
5227 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5228 struct rbd_spec *spec = rbd_dev->parent_spec;
5229
5230 count += sprintf(&buf[count], "%s"
5231 "pool_id %llu\npool_name %s\n"
5232 "pool_ns %s\n"
5233 "image_id %s\nimage_name %s\n"
5234 "snap_id %llu\nsnap_name %s\n"
5235 "overlap %llu\n",
5236 !count ? "" : "\n", /* first? */
5237 spec->pool_id, spec->pool_name,
5238 spec->pool_ns ?: "",
5239 spec->image_id, spec->image_name ?: "(unknown)",
5240 spec->snap_id, spec->snap_name,
5241 rbd_dev->parent_overlap);
5242 }
5243
5244 return count;
5245 }
5246
5247 static ssize_t rbd_image_refresh(struct device *dev,
5248 struct device_attribute *attr,
5249 const char *buf,
5250 size_t size)
5251 {
5252 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5253 int ret;
5254
5255 if (!capable(CAP_SYS_ADMIN))
5256 return -EPERM;
5257
5258 ret = rbd_dev_refresh(rbd_dev);
5259 if (ret)
5260 return ret;
5261
5262 return size;
5263 }
5264
5265 static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5266 static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5267 static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5268 static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5269 static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5270 static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5271 static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5272 static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5273 static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5274 static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5275 static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5276 static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5277 static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5278 static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5279 static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5280 static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5281 static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5282
5283 static struct attribute *rbd_attrs[] = {
5284 &dev_attr_size.attr,
5285 &dev_attr_features.attr,
5286 &dev_attr_major.attr,
5287 &dev_attr_minor.attr,
5288 &dev_attr_client_addr.attr,
5289 &dev_attr_client_id.attr,
5290 &dev_attr_cluster_fsid.attr,
5291 &dev_attr_config_info.attr,
5292 &dev_attr_pool.attr,
5293 &dev_attr_pool_id.attr,
5294 &dev_attr_pool_ns.attr,
5295 &dev_attr_name.attr,
5296 &dev_attr_image_id.attr,
5297 &dev_attr_current_snap.attr,
5298 &dev_attr_snap_id.attr,
5299 &dev_attr_parent.attr,
5300 &dev_attr_refresh.attr,
5301 NULL
5302 };
5303
5304 static struct attribute_group rbd_attr_group = {
5305 .attrs = rbd_attrs,
5306 };
5307
5308 static const struct attribute_group *rbd_attr_groups[] = {
5309 &rbd_attr_group,
5310 NULL
5311 };
5312
5313 static void rbd_dev_release(struct device *dev);
5314
5315 static const struct device_type rbd_device_type = {
5316 .name = "rbd",
5317 .groups = rbd_attr_groups,
5318 .release = rbd_dev_release,
5319 };
5320
5321 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5322 {
5323 kref_get(&spec->kref);
5324
5325 return spec;
5326 }
5327
5328 static void rbd_spec_free(struct kref *kref);
5329 static void rbd_spec_put(struct rbd_spec *spec)
5330 {
5331 if (spec)
5332 kref_put(&spec->kref, rbd_spec_free);
5333 }
5334
5335 static struct rbd_spec *rbd_spec_alloc(void)
5336 {
5337 struct rbd_spec *spec;
5338
5339 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5340 if (!spec)
5341 return NULL;
5342
5343 spec->pool_id = CEPH_NOPOOL;
5344 spec->snap_id = CEPH_NOSNAP;
5345 kref_init(&spec->kref);
5346
5347 return spec;
5348 }
5349
5350 static void rbd_spec_free(struct kref *kref)
5351 {
5352 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5353
5354 kfree(spec->pool_name);
5355 kfree(spec->pool_ns);
5356 kfree(spec->image_id);
5357 kfree(spec->image_name);
5358 kfree(spec->snap_name);
5359 kfree(spec);
5360 }
5361
5362 static void rbd_dev_free(struct rbd_device *rbd_dev)
5363 {
5364 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5365 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5366
5367 ceph_oid_destroy(&rbd_dev->header_oid);
5368 ceph_oloc_destroy(&rbd_dev->header_oloc);
5369 kfree(rbd_dev->config_info);
5370
5371 rbd_put_client(rbd_dev->rbd_client);
5372 rbd_spec_put(rbd_dev->spec);
5373 kfree(rbd_dev->opts);
5374 kfree(rbd_dev);
5375 }
5376
5377 static void rbd_dev_release(struct device *dev)
5378 {
5379 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5380 bool need_put = !!rbd_dev->opts;
5381
5382 if (need_put) {
5383 destroy_workqueue(rbd_dev->task_wq);
5384 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5385 }
5386
5387 rbd_dev_free(rbd_dev);
5388
5389 /*
5390 	 * This is racy, but way better than putting the module_put()
5391 	 * outside of the release callback.  The race window is pretty small, so
5392 * doing something similar to dm (dm-builtin.c) is overkill.
5393 */
5394 if (need_put)
5395 module_put(THIS_MODULE);
5396 }
5397
5398 static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
5399 {
5400 struct rbd_device *rbd_dev;
5401
5402 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5403 if (!rbd_dev)
5404 return NULL;
5405
5406 spin_lock_init(&rbd_dev->lock);
5407 INIT_LIST_HEAD(&rbd_dev->node);
5408 init_rwsem(&rbd_dev->header_rwsem);
5409
5410 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5411 ceph_oid_init(&rbd_dev->header_oid);
5412 rbd_dev->header_oloc.pool = spec->pool_id;
5413 if (spec->pool_ns) {
5414 WARN_ON(!*spec->pool_ns);
5415 rbd_dev->header_oloc.pool_ns =
5416 ceph_find_or_create_string(spec->pool_ns,
5417 strlen(spec->pool_ns));
5418 }
5419
5420 mutex_init(&rbd_dev->watch_mutex);
5421 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5422 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5423
5424 init_rwsem(&rbd_dev->lock_rwsem);
5425 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5426 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5427 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5428 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5429 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5430 spin_lock_init(&rbd_dev->lock_lists_lock);
5431 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5432 INIT_LIST_HEAD(&rbd_dev->running_list);
5433 init_completion(&rbd_dev->acquire_wait);
5434 init_completion(&rbd_dev->releasing_wait);
5435
5436 spin_lock_init(&rbd_dev->object_map_lock);
5437
5438 rbd_dev->dev.bus = &rbd_bus_type;
5439 rbd_dev->dev.type = &rbd_device_type;
5440 rbd_dev->dev.parent = &rbd_root_dev;
5441 device_initialize(&rbd_dev->dev);
5442
5443 return rbd_dev;
5444 }
5445
5446 /*
5447 * Create a mapping rbd_dev.
5448 */
5449 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5450 struct rbd_spec *spec,
5451 struct rbd_options *opts)
5452 {
5453 struct rbd_device *rbd_dev;
5454
5455 rbd_dev = __rbd_dev_create(spec);
5456 if (!rbd_dev)
5457 return NULL;
5458
5459 /* get an id and fill in device name */
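	/*
	 * For example (illustrative), dev_id 0 yields the device name
	 * "rbd0" and the ordered workqueue "rbd0-tasks" below.
	 */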
5460 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5461 minor_to_rbd_dev_id(1 << MINORBITS),
5462 GFP_KERNEL);
5463 if (rbd_dev->dev_id < 0)
5464 goto fail_rbd_dev;
5465
5466 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5467 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5468 rbd_dev->name);
5469 if (!rbd_dev->task_wq)
5470 goto fail_dev_id;
5471
5472 /* we have a ref from do_rbd_add() */
5473 __module_get(THIS_MODULE);
5474
5475 rbd_dev->rbd_client = rbdc;
5476 rbd_dev->spec = spec;
5477 rbd_dev->opts = opts;
5478
5479 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5480 return rbd_dev;
5481
5482 fail_dev_id:
5483 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5484 fail_rbd_dev:
5485 rbd_dev_free(rbd_dev);
5486 return NULL;
5487 }
5488
5489 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5490 {
5491 if (rbd_dev)
5492 put_device(&rbd_dev->dev);
5493 }
5494
5495 /*
5496 * Get the size and object order for an image snapshot, or, if
5497 * snap_id is CEPH_NOSNAP, get this information for the base
5498 * image.
5499 */
5500 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5501 u8 *order, u64 *snap_size)
5502 {
5503 __le64 snapid = cpu_to_le64(snap_id);
5504 int ret;
5505 struct {
5506 u8 order;
5507 __le64 size;
5508 } __attribute__ ((packed)) size_buf = { 0 };
5509
5510 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5511 &rbd_dev->header_oloc, "get_size",
5512 &snapid, sizeof(snapid),
5513 &size_buf, sizeof(size_buf));
5514 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5515 if (ret < 0)
5516 return ret;
5517 if (ret < sizeof (size_buf))
5518 return -ERANGE;
5519
5520 if (order) {
5521 *order = size_buf.order;
5522 dout(" order %u", (unsigned int)*order);
5523 }
5524 *snap_size = le64_to_cpu(size_buf.size);
5525
5526 dout(" snap_id 0x%016llx snap_size = %llu\n",
5527 (unsigned long long)snap_id,
5528 (unsigned long long)*snap_size);
5529
5530 return 0;
5531 }
5532
5533 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5534 {
5535 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5536 &rbd_dev->header.obj_order,
5537 &rbd_dev->header.image_size);
5538 }
5539
5540 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5541 {
5542 size_t size;
5543 void *reply_buf;
5544 int ret;
5545 void *p;
5546
5547 /* Response will be an encoded string, which includes a length */
5548 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5549 reply_buf = kzalloc(size, GFP_KERNEL);
5550 if (!reply_buf)
5551 return -ENOMEM;
5552
5553 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5554 &rbd_dev->header_oloc, "get_object_prefix",
5555 NULL, 0, reply_buf, size);
5556 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5557 if (ret < 0)
5558 goto out;
5559
5560 p = reply_buf;
5561 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5562 p + ret, NULL, GFP_NOIO);
5563 ret = 0;
5564
5565 if (IS_ERR(rbd_dev->header.object_prefix)) {
5566 ret = PTR_ERR(rbd_dev->header.object_prefix);
5567 rbd_dev->header.object_prefix = NULL;
5568 } else {
5569 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5570 }
5571 out:
5572 kfree(reply_buf);
5573
5574 return ret;
5575 }
5576
5577 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5578 bool read_only, u64 *snap_features)
5579 {
5580 struct {
5581 __le64 snap_id;
5582 u8 read_only;
5583 } features_in;
5584 struct {
5585 __le64 features;
5586 __le64 incompat;
5587 } __attribute__ ((packed)) features_buf = { 0 };
5588 u64 unsup;
5589 int ret;
5590
5591 features_in.snap_id = cpu_to_le64(snap_id);
5592 features_in.read_only = read_only;
5593
5594 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5595 &rbd_dev->header_oloc, "get_features",
5596 &features_in, sizeof(features_in),
5597 &features_buf, sizeof(features_buf));
5598 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5599 if (ret < 0)
5600 return ret;
5601 if (ret < sizeof (features_buf))
5602 return -ERANGE;
5603
5604 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5605 if (unsup) {
5606 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5607 unsup);
5608 return -ENXIO;
5609 }
5610
5611 *snap_features = le64_to_cpu(features_buf.features);
5612
5613 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5614 (unsigned long long)snap_id,
5615 (unsigned long long)*snap_features,
5616 (unsigned long long)le64_to_cpu(features_buf.incompat));
5617
5618 return 0;
5619 }
5620
5621 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5622 {
5623 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5624 rbd_is_ro(rbd_dev),
5625 &rbd_dev->header.features);
5626 }
5627
5628 /*
5629 * These are generic image flags, but since they are used only for
5630 * object map, store them in rbd_dev->object_map_flags.
5631 *
5632 * For the same reason, this function is called only on object map
5633 * (re)load and not on header refresh.
5634 */
5635 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5636 {
5637 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5638 __le64 flags;
5639 int ret;
5640
5641 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5642 &rbd_dev->header_oloc, "get_flags",
5643 &snapid, sizeof(snapid),
5644 &flags, sizeof(flags));
5645 if (ret < 0)
5646 return ret;
5647 if (ret < sizeof(flags))
5648 return -EBADMSG;
5649
5650 rbd_dev->object_map_flags = le64_to_cpu(flags);
5651 return 0;
5652 }
5653
5654 struct parent_image_info {
5655 u64 pool_id;
5656 const char *pool_ns;
5657 const char *image_id;
5658 u64 snap_id;
5659
5660 bool has_overlap;
5661 u64 overlap;
5662 };
5663
5664 /*
5665 * The caller is responsible for @pii.
5666 */
5667 static int decode_parent_image_spec(void **p, void *end,
5668 struct parent_image_info *pii)
5669 {
5670 u8 struct_v;
5671 u32 struct_len;
5672 int ret;
5673
5674 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5675 &struct_v, &struct_len);
5676 if (ret)
5677 return ret;
5678
5679 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5680 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5681 if (IS_ERR(pii->pool_ns)) {
5682 ret = PTR_ERR(pii->pool_ns);
5683 pii->pool_ns = NULL;
5684 return ret;
5685 }
5686 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5687 if (IS_ERR(pii->image_id)) {
5688 ret = PTR_ERR(pii->image_id);
5689 pii->image_id = NULL;
5690 return ret;
5691 }
5692 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5693 return 0;
5694
5695 e_inval:
5696 return -EINVAL;
5697 }
5698
5699 static int __get_parent_info(struct rbd_device *rbd_dev,
5700 struct page *req_page,
5701 struct page *reply_page,
5702 struct parent_image_info *pii)
5703 {
5704 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5705 size_t reply_len = PAGE_SIZE;
5706 void *p, *end;
5707 int ret;
5708
5709 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5710 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5711 req_page, sizeof(u64), &reply_page, &reply_len);
5712 if (ret)
5713 return ret == -EOPNOTSUPP ? 1 : ret;
5714
5715 p = page_address(reply_page);
5716 end = p + reply_len;
5717 ret = decode_parent_image_spec(&p, end, pii);
5718 if (ret)
5719 return ret;
5720
5721 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5722 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5723 req_page, sizeof(u64), &reply_page, &reply_len);
5724 if (ret)
5725 return ret;
5726
5727 p = page_address(reply_page);
5728 end = p + reply_len;
5729 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5730 if (pii->has_overlap)
5731 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5732
5733 return 0;
5734
5735 e_inval:
5736 return -EINVAL;
5737 }
5738
5739 /*
5740 * The caller is responsible for @pii.
5741 */
5742 static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5743 struct page *req_page,
5744 struct page *reply_page,
5745 struct parent_image_info *pii)
5746 {
5747 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5748 size_t reply_len = PAGE_SIZE;
5749 void *p, *end;
5750 int ret;
5751
5752 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5753 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5754 req_page, sizeof(u64), &reply_page, &reply_len);
5755 if (ret)
5756 return ret;
5757
5758 p = page_address(reply_page);
5759 end = p + reply_len;
5760 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5761 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5762 if (IS_ERR(pii->image_id)) {
5763 ret = PTR_ERR(pii->image_id);
5764 pii->image_id = NULL;
5765 return ret;
5766 }
5767 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5768 pii->has_overlap = true;
5769 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5770
5771 return 0;
5772
5773 e_inval:
5774 return -EINVAL;
5775 }
5776
5777 static int get_parent_info(struct rbd_device *rbd_dev,
5778 struct parent_image_info *pii)
5779 {
5780 struct page *req_page, *reply_page;
5781 void *p;
5782 int ret;
5783
5784 req_page = alloc_page(GFP_KERNEL);
5785 if (!req_page)
5786 return -ENOMEM;
5787
5788 reply_page = alloc_page(GFP_KERNEL);
5789 if (!reply_page) {
5790 __free_page(req_page);
5791 return -ENOMEM;
5792 }
5793
5794 p = page_address(req_page);
5795 ceph_encode_64(&p, rbd_dev->spec->snap_id);
5796 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5797 if (ret > 0)
5798 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5799 pii);
5800
5801 __free_page(req_page);
5802 __free_page(reply_page);
5803 return ret;
5804 }
5805
5806 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5807 {
5808 struct rbd_spec *parent_spec;
5809 struct parent_image_info pii = { 0 };
5810 int ret;
5811
5812 parent_spec = rbd_spec_alloc();
5813 if (!parent_spec)
5814 return -ENOMEM;
5815
5816 ret = get_parent_info(rbd_dev, &pii);
5817 if (ret)
5818 goto out_err;
5819
5820 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5821 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5822 pii.has_overlap, pii.overlap);
5823
5824 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5825 /*
5826 * Either the parent never existed, or we have a
5827 * record of it but the image got flattened so it no
5828 * longer has a parent. When the parent of a
5829 * layered image disappears we immediately set the
5830 * overlap to 0. The effect of this is that all new
5831 * requests will be treated as if the image had no
5832 * parent.
5833 *
5834 * If !pii.has_overlap, the parent image spec is not
5835 * applicable. It's there to avoid duplication in each
5836 * snapshot record.
5837 */
5838 if (rbd_dev->parent_overlap) {
5839 rbd_dev->parent_overlap = 0;
5840 rbd_dev_parent_put(rbd_dev);
5841 pr_info("%s: clone image has been flattened\n",
5842 rbd_dev->disk->disk_name);
5843 }
5844
5845 goto out; /* No parent? No problem. */
5846 }
5847
5848 /* The ceph file layout needs to fit pool id in 32 bits */
5849
5850 ret = -EIO;
5851 if (pii.pool_id > (u64)U32_MAX) {
5852 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5853 (unsigned long long)pii.pool_id, U32_MAX);
5854 goto out_err;
5855 }
5856
5857 /*
5858 * The parent won't change (except when the clone is
5859 * flattened, which is handled above). So we only need to
5860 * record the parent spec if we have not already done so.
5861 */
5862 if (!rbd_dev->parent_spec) {
5863 parent_spec->pool_id = pii.pool_id;
5864 if (pii.pool_ns && *pii.pool_ns) {
5865 parent_spec->pool_ns = pii.pool_ns;
5866 pii.pool_ns = NULL;
5867 }
5868 parent_spec->image_id = pii.image_id;
5869 pii.image_id = NULL;
5870 parent_spec->snap_id = pii.snap_id;
5871
5872 rbd_dev->parent_spec = parent_spec;
5873 parent_spec = NULL; /* rbd_dev now owns this */
5874 }
5875
5876 /*
5877 * We always update the parent overlap. If it's zero we issue
5878 * a warning, as we will proceed as if there was no parent.
5879 */
5880 if (!pii.overlap) {
5881 if (parent_spec) {
5882 /* refresh, careful to warn just once */
5883 if (rbd_dev->parent_overlap)
5884 rbd_warn(rbd_dev,
5885 "clone now standalone (overlap became 0)");
5886 } else {
5887 /* initial probe */
5888 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
5889 }
5890 }
5891 rbd_dev->parent_overlap = pii.overlap;
5892
5893 out:
5894 ret = 0;
5895 out_err:
5896 kfree(pii.pool_ns);
5897 kfree(pii.image_id);
5898 rbd_spec_put(parent_spec);
5899 return ret;
5900 }
5901
5902 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5903 {
5904 struct {
5905 __le64 stripe_unit;
5906 __le64 stripe_count;
5907 } __attribute__ ((packed)) striping_info_buf = { 0 };
5908 size_t size = sizeof (striping_info_buf);
5909 void *p;
5910 int ret;
5911
5912 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5913 &rbd_dev->header_oloc, "get_stripe_unit_count",
5914 NULL, 0, &striping_info_buf, size);
5915 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5916 if (ret < 0)
5917 return ret;
5918 if (ret < size)
5919 return -ERANGE;
5920
5921 p = &striping_info_buf;
5922 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5923 rbd_dev->header.stripe_count = ceph_decode_64(&p);
5924 return 0;
5925 }
5926
5927 static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5928 {
5929 __le64 data_pool_id;
5930 int ret;
5931
5932 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5933 &rbd_dev->header_oloc, "get_data_pool",
5934 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5935 if (ret < 0)
5936 return ret;
5937 if (ret < sizeof(data_pool_id))
5938 return -EBADMSG;
5939
5940 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5941 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5942 return 0;
5943 }
5944
5945 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5946 {
5947 CEPH_DEFINE_OID_ONSTACK(oid);
5948 size_t image_id_size;
5949 char *image_id;
5950 void *p;
5951 void *end;
5952 size_t size;
5953 void *reply_buf = NULL;
5954 size_t len = 0;
5955 char *image_name = NULL;
5956 int ret;
5957
5958 rbd_assert(!rbd_dev->spec->image_name);
5959
5960 len = strlen(rbd_dev->spec->image_id);
5961 image_id_size = sizeof (__le32) + len;
5962 image_id = kmalloc(image_id_size, GFP_KERNEL);
5963 if (!image_id)
5964 return NULL;
5965
5966 p = image_id;
5967 end = image_id + image_id_size;
5968 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
5969
5970 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5971 reply_buf = kmalloc(size, GFP_KERNEL);
5972 if (!reply_buf)
5973 goto out;
5974
5975 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5976 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5977 "dir_get_name", image_id, image_id_size,
5978 reply_buf, size);
5979 if (ret < 0)
5980 goto out;
5981 p = reply_buf;
5982 end = reply_buf + ret;
5983
5984 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5985 if (IS_ERR(image_name))
5986 image_name = NULL;
5987 else
5988 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5989 out:
5990 kfree(reply_buf);
5991 kfree(image_id);
5992
5993 return image_name;
5994 }
5995
5996 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5997 {
5998 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5999 const char *snap_name;
6000 u32 which = 0;
6001
6002 /* Skip over names until we find the one we are looking for */
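	/*
	 * Format 1 snapshot names are stored back to back as NUL-terminated
	 * strings in header.snap_names, in the same order as the ids in
	 * snapc->snaps; e.g. (illustrative) "one\0two\0" pairs "one" with
	 * snapc->snaps[0] and "two" with snapc->snaps[1].
	 */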
6003
6004 snap_name = rbd_dev->header.snap_names;
6005 while (which < snapc->num_snaps) {
6006 if (!strcmp(name, snap_name))
6007 return snapc->snaps[which];
6008 snap_name += strlen(snap_name) + 1;
6009 which++;
6010 }
6011 return CEPH_NOSNAP;
6012 }
6013
6014 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6015 {
6016 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6017 u32 which;
6018 bool found = false;
6019 u64 snap_id;
6020
6021 for (which = 0; !found && which < snapc->num_snaps; which++) {
6022 const char *snap_name;
6023
6024 snap_id = snapc->snaps[which];
6025 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6026 if (IS_ERR(snap_name)) {
6027 /* ignore no-longer existing snapshots */
6028 if (PTR_ERR(snap_name) == -ENOENT)
6029 continue;
6030 else
6031 break;
6032 }
6033 found = !strcmp(name, snap_name);
6034 kfree(snap_name);
6035 }
6036 return found ? snap_id : CEPH_NOSNAP;
6037 }
6038
6039 /*
6040 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6041 * no snapshot by that name is found, or if an error occurs.
6042 */
6043 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6044 {
6045 if (rbd_dev->image_format == 1)
6046 return rbd_v1_snap_id_by_name(rbd_dev, name);
6047
6048 return rbd_v2_snap_id_by_name(rbd_dev, name);
6049 }
6050
6051 /*
6052 * An image being mapped will have everything but the snap id.
6053 */
6054 static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6055 {
6056 struct rbd_spec *spec = rbd_dev->spec;
6057
6058 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6059 rbd_assert(spec->image_id && spec->image_name);
6060 rbd_assert(spec->snap_name);
6061
6062 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6063 u64 snap_id;
6064
6065 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6066 if (snap_id == CEPH_NOSNAP)
6067 return -ENOENT;
6068
6069 spec->snap_id = snap_id;
6070 } else {
6071 spec->snap_id = CEPH_NOSNAP;
6072 }
6073
6074 return 0;
6075 }
6076
6077 /*
6078 * A parent image will have all ids but none of the names.
6079 *
6080 * All names in an rbd spec are dynamically allocated. It's OK if we
6081 * can't figure out the name for an image id.
6082 */
6083 static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6084 {
6085 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6086 struct rbd_spec *spec = rbd_dev->spec;
6087 const char *pool_name;
6088 const char *image_name;
6089 const char *snap_name;
6090 int ret;
6091
6092 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6093 rbd_assert(spec->image_id);
6094 rbd_assert(spec->snap_id != CEPH_NOSNAP);
6095
6096 /* Get the pool name; we have to make our own copy of this */
6097
6098 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6099 if (!pool_name) {
6100 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6101 return -EIO;
6102 }
6103 pool_name = kstrdup(pool_name, GFP_KERNEL);
6104 if (!pool_name)
6105 return -ENOMEM;
6106
6107 /* Fetch the image name; tolerate failure here */
6108
6109 image_name = rbd_dev_image_name(rbd_dev);
6110 if (!image_name)
6111 rbd_warn(rbd_dev, "unable to get image name");
6112
6113 /* Fetch the snapshot name */
6114
6115 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6116 if (IS_ERR(snap_name)) {
6117 ret = PTR_ERR(snap_name);
6118 goto out_err;
6119 }
6120
6121 spec->pool_name = pool_name;
6122 spec->image_name = image_name;
6123 spec->snap_name = snap_name;
6124
6125 return 0;
6126
6127 out_err:
6128 kfree(image_name);
6129 kfree(pool_name);
6130 return ret;
6131 }
6132
6133 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6134 {
6135 size_t size;
6136 int ret;
6137 void *reply_buf;
6138 void *p;
6139 void *end;
6140 u64 seq;
6141 u32 snap_count;
6142 struct ceph_snap_context *snapc;
6143 u32 i;
6144
6145 /*
6146 * We'll need room for the seq value (maximum snapshot id),
6147 * snapshot count, and array of that many snapshot ids.
6148 * For now we have a fixed upper limit on the number we're
6149 * prepared to receive.
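 *
 * A worked example, assuming the usual 4 KiB PAGE_SIZE: the buffer
 * below is sizeof(__le64) + sizeof(__le32) + 510 * sizeof(__le64)
 * = 8 + 4 + 4080 = 4092 bytes, which fits in a single page.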
6150 */
6151 size = sizeof (__le64) + sizeof (__le32) +
6152 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6153 reply_buf = kzalloc(size, GFP_KERNEL);
6154 if (!reply_buf)
6155 return -ENOMEM;
6156
6157 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6158 &rbd_dev->header_oloc, "get_snapcontext",
6159 NULL, 0, reply_buf, size);
6160 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6161 if (ret < 0)
6162 goto out;
6163
6164 p = reply_buf;
6165 end = reply_buf + ret;
6166 ret = -ERANGE;
6167 ceph_decode_64_safe(&p, end, seq, out);
6168 ceph_decode_32_safe(&p, end, snap_count, out);
6169
6170 /*
6171 * Make sure the reported number of snapshot ids wouldn't go
6172 * beyond the end of our buffer. But before checking that,
6173 * make sure the computed size of the snapshot context we
6174 * allocate is representable in a size_t.
6175 */
6176 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6177 / sizeof (u64)) {
6178 ret = -EINVAL;
6179 goto out;
6180 }
6181 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6182 goto out;
6183 ret = 0;
6184
6185 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6186 if (!snapc) {
6187 ret = -ENOMEM;
6188 goto out;
6189 }
6190 snapc->seq = seq;
6191 for (i = 0; i < snap_count; i++)
6192 snapc->snaps[i] = ceph_decode_64(&p);
6193
6194 ceph_put_snap_context(rbd_dev->header.snapc);
6195 rbd_dev->header.snapc = snapc;
6196
6197 dout(" snap context seq = %llu, snap_count = %u\n",
6198 (unsigned long long)seq, (unsigned int)snap_count);
6199 out:
6200 kfree(reply_buf);
6201
6202 return ret;
6203 }
6204
6205 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6206 u64 snap_id)
6207 {
6208 size_t size;
6209 void *reply_buf;
6210 __le64 snapid;
6211 int ret;
6212 void *p;
6213 void *end;
6214 char *snap_name;
6215
6216 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6217 reply_buf = kmalloc(size, GFP_KERNEL);
6218 if (!reply_buf)
6219 return ERR_PTR(-ENOMEM);
6220
6221 snapid = cpu_to_le64(snap_id);
6222 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6223 &rbd_dev->header_oloc, "get_snapshot_name",
6224 &snapid, sizeof(snapid), reply_buf, size);
6225 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6226 if (ret < 0) {
6227 snap_name = ERR_PTR(ret);
6228 goto out;
6229 }
6230
6231 p = reply_buf;
6232 end = reply_buf + ret;
6233 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6234 if (IS_ERR(snap_name))
6235 goto out;
6236
6237 dout(" snap_id 0x%016llx snap_name = %s\n",
6238 (unsigned long long)snap_id, snap_name);
6239 out:
6240 kfree(reply_buf);
6241
6242 return snap_name;
6243 }
6244
6245 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6246 {
6247 bool first_time = rbd_dev->header.object_prefix == NULL;
6248 int ret;
6249
6250 ret = rbd_dev_v2_image_size(rbd_dev);
6251 if (ret)
6252 return ret;
6253
6254 if (first_time) {
6255 ret = rbd_dev_v2_header_onetime(rbd_dev);
6256 if (ret)
6257 return ret;
6258 }
6259
6260 ret = rbd_dev_v2_snap_context(rbd_dev);
6261 if (ret && first_time) {
6262 kfree(rbd_dev->header.object_prefix);
6263 rbd_dev->header.object_prefix = NULL;
6264 }
6265
6266 return ret;
6267 }
6268
6269 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6270 {
6271 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6272
6273 if (rbd_dev->image_format == 1)
6274 return rbd_dev_v1_header_info(rbd_dev);
6275
6276 return rbd_dev_v2_header_info(rbd_dev);
6277 }
6278
6279 /*
6280 * Skips over white space at *buf, and updates *buf to point to the
6281 * first found non-space character (if any). Returns the length of
6282 * the token (string of non-white space characters) found. Note
6283 * that *buf must be terminated with '\0'.
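 *
 * An illustrative example: with *buf pointing at "  pool image",
 * next_token() advances *buf to "pool image" and returns 4, the
 * length of "pool".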
6284 */
6285 static inline size_t next_token(const char **buf)
6286 {
6287 /*
6288 * These are the characters that produce nonzero for
6289 * isspace() in the "C" and "POSIX" locales.
6290 */
6291 const char *spaces = " \f\n\r\t\v";
6292
6293 *buf += strspn(*buf, spaces); /* Find start of token */
6294
6295 return strcspn(*buf, spaces); /* Return token length */
6296 }
6297
6298 /*
6299 * Finds the next token in *buf, dynamically allocates a buffer big
6300 * enough to hold a copy of it, and copies the token into the new
6301 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6302 * that a duplicate buffer is created even for a zero-length token.
6303 *
6304 * Returns a pointer to the newly-allocated duplicate, or a null
6305 * pointer if memory for the duplicate was not available. If
6306 * the lenp argument is a non-null pointer, the length of the token
6307 * (not including the '\0') is returned in *lenp.
6308 *
6309 * If successful, the *buf pointer will be updated to point beyond
6310 * the end of the found token.
6311 *
6312 * Note: uses GFP_KERNEL for allocation.
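 *
 * An illustrative example: with *buf pointing at "pool image",
 * dup_token() returns a newly allocated copy of "pool", stores 4 in
 * *lenp (if lenp is non-NULL) and leaves *buf pointing at " image".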
6313 */
6314 static inline char *dup_token(const char **buf, size_t *lenp)
6315 {
6316 char *dup;
6317 size_t len;
6318
6319 len = next_token(buf);
6320 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6321 if (!dup)
6322 return NULL;
6323 *(dup + len) = '\0';
6324 *buf += len;
6325
6326 if (lenp)
6327 *lenp = len;
6328
6329 return dup;
6330 }
6331
6332 static int rbd_parse_param(struct fs_parameter *param,
6333 struct rbd_parse_opts_ctx *pctx)
6334 {
6335 struct rbd_options *opt = pctx->opts;
6336 struct fs_parse_result result;
6337 struct p_log log = {.prefix = "rbd"};
6338 int token, ret;
6339
6340 ret = ceph_parse_param(param, pctx->copts, NULL);
6341 if (ret != -ENOPARAM)
6342 return ret;
6343
6344 token = __fs_parse(&log, rbd_parameters, param, &result);
6345 dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6346 if (token < 0) {
6347 if (token == -ENOPARAM)
6348 return inval_plog(&log, "Unknown parameter '%s'",
6349 param->key);
6350 return token;
6351 }
6352
6353 switch (token) {
6354 case Opt_queue_depth:
6355 if (result.uint_32 < 1)
6356 goto out_of_range;
6357 opt->queue_depth = result.uint_32;
6358 break;
6359 case Opt_alloc_size:
6360 if (result.uint_32 < SECTOR_SIZE)
6361 goto out_of_range;
6362 if (!is_power_of_2(result.uint_32))
6363 return inval_plog(&log, "alloc_size must be a power of 2");
6364 opt->alloc_size = result.uint_32;
6365 break;
6366 case Opt_lock_timeout:
6367 /* 0 is "wait forever" (i.e. infinite timeout) */
6368 if (result.uint_32 > INT_MAX / 1000)
6369 goto out_of_range;
6370 opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6371 break;
6372 case Opt_pool_ns:
6373 kfree(pctx->spec->pool_ns);
6374 pctx->spec->pool_ns = param->string;
6375 param->string = NULL;
6376 break;
6377 case Opt_compression_hint:
6378 switch (result.uint_32) {
6379 case Opt_compression_hint_none:
6380 opt->alloc_hint_flags &=
6381 ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6382 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6383 break;
6384 case Opt_compression_hint_compressible:
6385 opt->alloc_hint_flags |=
6386 CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6387 opt->alloc_hint_flags &=
6388 ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6389 break;
6390 case Opt_compression_hint_incompressible:
6391 opt->alloc_hint_flags |=
6392 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6393 opt->alloc_hint_flags &=
6394 ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6395 break;
6396 default:
6397 BUG();
6398 }
6399 break;
6400 case Opt_read_only:
6401 opt->read_only = true;
6402 break;
6403 case Opt_read_write:
6404 opt->read_only = false;
6405 break;
6406 case Opt_lock_on_read:
6407 opt->lock_on_read = true;
6408 break;
6409 case Opt_exclusive:
6410 opt->exclusive = true;
6411 break;
6412 case Opt_notrim:
6413 opt->trim = false;
6414 break;
6415 default:
6416 BUG();
6417 }
6418
6419 return 0;
6420
6421 out_of_range:
6422 return inval_plog(&log, "%s out of range", param->key);
6423 }
6424
6425 /*
6426 * This duplicates most of generic_parse_monolithic(), untying it from
6427 * fs_context and skipping standard superblock and security options.
6428 */
6429 static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6430 {
6431 char *key;
6432 int ret = 0;
6433
6434 dout("%s '%s'\n", __func__, options);
6435 while ((key = strsep(&options, ",")) != NULL) {
6436 if (*key) {
6437 struct fs_parameter param = {
6438 .key = key,
6439 .type = fs_value_is_flag,
6440 };
6441 char *value = strchr(key, '=');
6442 size_t v_len = 0;
6443
6444 if (value) {
6445 if (value == key)
6446 continue;
6447 *value++ = 0;
6448 v_len = strlen(value);
6449 param.string = kmemdup_nul(value, v_len,
6450 GFP_KERNEL);
6451 if (!param.string)
6452 return -ENOMEM;
6453 param.type = fs_value_is_string;
6454 }
6455 param.size = v_len;
6456
6457 ret = rbd_parse_param(&param, pctx);
6458 kfree(param.string);
6459 if (ret)
6460 break;
6461 }
6462 }
6463
6464 return ret;
6465 }
6466
6467 /*
6468 * Parse the options provided for an "rbd add" (i.e., rbd image
6469 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6470 * and the data written is passed here via a NUL-terminated buffer.
6471 * Returns 0 if successful or an error code otherwise.
6472 *
6473 * The information extracted from these options is recorded in
6474 * the other parameters which return dynamically-allocated
6475 * structures:
6476 * ceph_opts
6477 * The address of a pointer that will refer to a ceph options
6478 * structure. Caller must release the returned pointer using
6479 * ceph_destroy_options() when it is no longer needed.
6480 * rbd_opts
6481 * Address of an rbd options pointer. Fully initialized by
6482 * this function; caller must release with kfree().
6483 * spec
6484 * Address of an rbd image specification pointer. Fully
6485 * initialized by this function based on parsed options.
6486 * Caller must release with rbd_spec_put().
6487 *
6488 * The options passed take this form:
6489 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6490 * where:
6491 * <mon_addrs>
6492 * A comma-separated list of one or more monitor addresses.
6493 * A monitor address is an ip address, optionally followed
6494 * by a port number (separated by a colon).
6495 * I.e.: ip1[:port1][,ip2[:port2]...]
6496 * <options>
6497 * A comma-separated list of ceph and/or rbd options.
6498 * <pool_name>
6499 * The name of the rados pool containing the rbd image.
6500 * <image_name>
6501 * The name of the image in that pool to map.
6502 * <snap_name>
6503 * An optional snapshot name. If provided, the mapping will
6504 * present data from the image at the time that snapshot was
6505 * created. The image head is used if no snapshot name is
6506 * provided. Snapshot mappings are always read-only.
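 *
 * An illustrative example (hypothetical monitor address, credentials
 * and names) of a line written to /sys/bus/rbd/add:
 *
 *   1.2.3.4:6789 name=admin,secret=<key> mypool myimage mysnap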
6507 */
6508 static int rbd_add_parse_args(const char *buf,
6509 struct ceph_options **ceph_opts,
6510 struct rbd_options **opts,
6511 struct rbd_spec **rbd_spec)
6512 {
6513 size_t len;
6514 char *options;
6515 const char *mon_addrs;
6516 char *snap_name;
6517 size_t mon_addrs_size;
6518 struct rbd_parse_opts_ctx pctx = { 0 };
6519 int ret;
6520
6521 /* The first four tokens are required */
6522
6523 len = next_token(&buf);
6524 if (!len) {
6525 rbd_warn(NULL, "no monitor address(es) provided");
6526 return -EINVAL;
6527 }
6528 mon_addrs = buf;
6529 mon_addrs_size = len;
6530 buf += len;
6531
6532 ret = -EINVAL;
6533 options = dup_token(&buf, NULL);
6534 if (!options)
6535 return -ENOMEM;
6536 if (!*options) {
6537 rbd_warn(NULL, "no options provided");
6538 goto out_err;
6539 }
6540
6541 pctx.spec = rbd_spec_alloc();
6542 if (!pctx.spec)
6543 goto out_mem;
6544
6545 pctx.spec->pool_name = dup_token(&buf, NULL);
6546 if (!pctx.spec->pool_name)
6547 goto out_mem;
6548 if (!*pctx.spec->pool_name) {
6549 rbd_warn(NULL, "no pool name provided");
6550 goto out_err;
6551 }
6552
6553 pctx.spec->image_name = dup_token(&buf, NULL);
6554 if (!pctx.spec->image_name)
6555 goto out_mem;
6556 if (!*pctx.spec->image_name) {
6557 rbd_warn(NULL, "no image name provided");
6558 goto out_err;
6559 }
6560
6561 /*
6562 * Snapshot name is optional; default is to use "-"
6563 * (indicating the head/no snapshot).
6564 */
6565 len = next_token(&buf);
6566 if (!len) {
6567 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6568 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6569 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
6570 ret = -ENAMETOOLONG;
6571 goto out_err;
6572 }
6573 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6574 if (!snap_name)
6575 goto out_mem;
6576 *(snap_name + len) = '\0';
6577 pctx.spec->snap_name = snap_name;
6578
6579 pctx.copts = ceph_alloc_options();
6580 if (!pctx.copts)
6581 goto out_mem;
6582
6583 /* Initialize all rbd options to the defaults */
6584
6585 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6586 if (!pctx.opts)
6587 goto out_mem;
6588
6589 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6590 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
6591 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6592 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6593 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6594 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6595 pctx.opts->trim = RBD_TRIM_DEFAULT;
6596
6597 ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6598 if (ret)
6599 goto out_err;
6600
6601 ret = rbd_parse_options(options, &pctx);
6602 if (ret)
6603 goto out_err;
6604
6605 *ceph_opts = pctx.copts;
6606 *opts = pctx.opts;
6607 *rbd_spec = pctx.spec;
6608 kfree(options);
6609 return 0;
6610
6611 out_mem:
6612 ret = -ENOMEM;
6613 out_err:
6614 kfree(pctx.opts);
6615 ceph_destroy_options(pctx.copts);
6616 rbd_spec_put(pctx.spec);
6617 kfree(options);
6618 return ret;
6619 }
6620
6621 static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6622 {
6623 down_write(&rbd_dev->lock_rwsem);
6624 if (__rbd_is_lock_owner(rbd_dev))
6625 __rbd_release_lock(rbd_dev);
6626 up_write(&rbd_dev->lock_rwsem);
6627 }
6628
6629 /*
6630 * If the wait is interrupted, an error is returned even if the lock
6631 * was successfully acquired. rbd_dev_image_unlock() will release it
6632 * if needed.
6633 */
6634 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6635 {
6636 long ret;
6637
6638 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6639 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6640 return 0;
6641
6642 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6643 return -EINVAL;
6644 }
6645
6646 if (rbd_is_ro(rbd_dev))
6647 return 0;
6648
6649 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6650 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6651 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6652 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6653 if (ret > 0) {
6654 ret = rbd_dev->acquire_err;
6655 } else {
6656 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6657 if (!ret)
6658 ret = -ETIMEDOUT;
6659 }
6660
6661 if (ret) {
6662 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6663 return ret;
6664 }
6665
6666 /*
6667 * The lock may have been released by now, unless automatic lock
6668 * transitions are disabled.
6669 */
6670 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6671 return 0;
6672 }
6673
6674 /*
6675 * An rbd format 2 image has a unique identifier, distinct from the
6676 * name given to it by the user. Internally, that identifier is
6677 * what's used to specify the names of objects related to the image.
6678 *
6679 * A special "rbd id" object is used to map an rbd image name to its
6680 * id. If that object doesn't exist, then there is no v2 rbd image
6681 * with the supplied name.
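 * The id object name is RBD_ID_PREFIX followed by the image name,
 * so an image called "myimage" would (illustratively) map to an
 * object named something like "rbd_id.myimage".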
6682 *
6683 * This function will record the given rbd_dev's image_id field if
6684 * it can be determined, and in that case will return 0. If any
6685 * errors occur a negative errno will be returned and the rbd_dev's
6686 * image_id field will be unchanged (and should be NULL).
6687 */
6688 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6689 {
6690 int ret;
6691 size_t size;
6692 CEPH_DEFINE_OID_ONSTACK(oid);
6693 void *response;
6694 char *image_id;
6695
6696 /*
6697 * When probing a parent image, the image id is already
6698 * known (and the image name likely is not). There's no
6699 * need to fetch the image id again in this case. We
6700 * do still need to set the image format though.
6701 */
6702 if (rbd_dev->spec->image_id) {
6703 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6704
6705 return 0;
6706 }
6707
6708 /*
6709 * First, see if the format 2 image id file exists, and if
6710 * so, get the image's persistent id from it.
6711 */
6712 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6713 rbd_dev->spec->image_name);
6714 if (ret)
6715 return ret;
6716
6717 dout("rbd id object name is %s\n", oid.name);
6718
6719 /* Response will be an encoded string, which includes a length */
6720 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6721 response = kzalloc(size, GFP_NOIO);
6722 if (!response) {
6723 ret = -ENOMEM;
6724 goto out;
6725 }
6726
6727 /* If it doesn't exist we'll assume it's a format 1 image */
6728
6729 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6730 "get_id", NULL, 0,
6731 response, size);
6732 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6733 if (ret == -ENOENT) {
6734 image_id = kstrdup("", GFP_KERNEL);
6735 ret = image_id ? 0 : -ENOMEM;
6736 if (!ret)
6737 rbd_dev->image_format = 1;
6738 } else if (ret >= 0) {
6739 void *p = response;
6740
6741 image_id = ceph_extract_encoded_string(&p, p + ret,
6742 NULL, GFP_NOIO);
6743 ret = PTR_ERR_OR_ZERO(image_id);
6744 if (!ret)
6745 rbd_dev->image_format = 2;
6746 }
6747
6748 if (!ret) {
6749 rbd_dev->spec->image_id = image_id;
6750 dout("image_id is %s\n", image_id);
6751 }
6752 out:
6753 kfree(response);
6754 ceph_oid_destroy(&oid);
6755 return ret;
6756 }
6757
6758 /*
6759 * Undo whatever state changes are made by a v1 or v2 header info
6760 * call.
6761 */
6762 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6763 {
6764 struct rbd_image_header *header;
6765
6766 rbd_dev_parent_put(rbd_dev);
6767 rbd_object_map_free(rbd_dev);
6768 rbd_dev_mapping_clear(rbd_dev);
6769
6770 /* Free dynamic fields from the header, then zero it out */
6771
6772 header = &rbd_dev->header;
6773 ceph_put_snap_context(header->snapc);
6774 kfree(header->snap_sizes);
6775 kfree(header->snap_names);
6776 kfree(header->object_prefix);
6777 memset(header, 0, sizeof (*header));
6778 }
6779
6780 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6781 {
6782 int ret;
6783
6784 ret = rbd_dev_v2_object_prefix(rbd_dev);
6785 if (ret)
6786 goto out_err;
6787
6788 /*
6789 * Get and check the features for the image. Currently the
6790 * features are assumed to never change.
6791 */
6792 ret = rbd_dev_v2_features(rbd_dev);
6793 if (ret)
6794 goto out_err;
6795
6796 /* If the image supports fancy striping, get its parameters */
6797
6798 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6799 ret = rbd_dev_v2_striping_info(rbd_dev);
6800 if (ret < 0)
6801 goto out_err;
6802 }
6803
6804 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6805 ret = rbd_dev_v2_data_pool(rbd_dev);
6806 if (ret)
6807 goto out_err;
6808 }
6809
6810 rbd_init_layout(rbd_dev);
6811 return 0;
6812
6813 out_err:
6814 rbd_dev->header.features = 0;
6815 kfree(rbd_dev->header.object_prefix);
6816 rbd_dev->header.object_prefix = NULL;
6817 return ret;
6818 }
6819
6820 /*
6821 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6822 * rbd_dev_image_probe() recursion depth, which means it's also the
6823 * length of the already discovered part of the parent chain.
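 *
 * For example, probing a clone of a clone probes the intermediate
 * image at depth 1 and the base image at depth 2; chains longer than
 * RBD_MAX_PARENT_CHAIN_LEN are rejected below.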
6824 */
6825 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6826 {
6827 struct rbd_device *parent = NULL;
6828 int ret;
6829
6830 if (!rbd_dev->parent_spec)
6831 return 0;
6832
6833 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6834 pr_info("parent chain is too long (%d)\n", depth);
6835 ret = -EINVAL;
6836 goto out_err;
6837 }
6838
6839 parent = __rbd_dev_create(rbd_dev->parent_spec);
6840 if (!parent) {
6841 ret = -ENOMEM;
6842 goto out_err;
6843 }
6844
6845 /*
6846 * Images related by parent/child relationships always share
6847 * rbd_client and spec/parent_spec, so bump their refcounts.
6848 */
6849 parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
6850 parent->spec = rbd_spec_get(rbd_dev->parent_spec);
6851
6852 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6853
6854 ret = rbd_dev_image_probe(parent, depth);
6855 if (ret < 0)
6856 goto out_err;
6857
6858 rbd_dev->parent = parent;
6859 atomic_set(&rbd_dev->parent_ref, 1);
6860 return 0;
6861
6862 out_err:
6863 rbd_dev_unparent(rbd_dev);
6864 rbd_dev_destroy(parent);
6865 return ret;
6866 }
6867
6868 static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6869 {
6870 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6871 rbd_free_disk(rbd_dev);
6872 if (!single_major)
6873 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6874 }
6875
6876 /*
6877 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6878 * upon return.
6879 */
6880 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6881 {
6882 int ret;
6883
6884 /* Record our major and minor device numbers. */
6885
6886 if (!single_major) {
6887 ret = register_blkdev(0, rbd_dev->name);
6888 if (ret < 0)
6889 goto err_out_unlock;
6890
6891 rbd_dev->major = ret;
6892 rbd_dev->minor = 0;
6893 } else {
6894 rbd_dev->major = rbd_major;
6895 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6896 }
6897
6898 /* Set up the blkdev mapping. */
6899
6900 ret = rbd_init_disk(rbd_dev);
6901 if (ret)
6902 goto err_out_blkdev;
6903
6904 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6905 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6906
6907 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6908 if (ret)
6909 goto err_out_disk;
6910
6911 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6912 up_write(&rbd_dev->header_rwsem);
6913 return 0;
6914
6915 err_out_disk:
6916 rbd_free_disk(rbd_dev);
6917 err_out_blkdev:
6918 if (!single_major)
6919 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6920 err_out_unlock:
6921 up_write(&rbd_dev->header_rwsem);
6922 return ret;
6923 }
6924
6925 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6926 {
6927 struct rbd_spec *spec = rbd_dev->spec;
6928 int ret;
6929
6930 /* Record the header object name for this rbd image. */
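	/*
	 * For a format 1 image this is "<image_name>" RBD_SUFFIX, for a
	 * format 2 image it is RBD_HEADER_PREFIX "<image_id>" (typically
	 * "<image_name>.rbd" and "rbd_header.<image_id>" respectively).
	 */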
6931
6932 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6933 if (rbd_dev->image_format == 1)
6934 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6935 spec->image_name, RBD_SUFFIX);
6936 else
6937 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6938 RBD_HEADER_PREFIX, spec->image_id);
6939
6940 return ret;
6941 }
6942
6943 static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6944 {
6945 if (!is_snap) {
6946 pr_info("image %s/%s%s%s does not exist\n",
6947 rbd_dev->spec->pool_name,
6948 rbd_dev->spec->pool_ns ?: "",
6949 rbd_dev->spec->pool_ns ? "/" : "",
6950 rbd_dev->spec->image_name);
6951 } else {
6952 pr_info("snap %s/%s%s%s@%s does not exist\n",
6953 rbd_dev->spec->pool_name,
6954 rbd_dev->spec->pool_ns ?: "",
6955 rbd_dev->spec->pool_ns ? "/" : "",
6956 rbd_dev->spec->image_name,
6957 rbd_dev->spec->snap_name);
6958 }
6959 }
6960
6961 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6962 {
6963 if (!rbd_is_ro(rbd_dev))
6964 rbd_unregister_watch(rbd_dev);
6965
6966 rbd_dev_unprobe(rbd_dev);
6967 rbd_dev->image_format = 0;
6968 kfree(rbd_dev->spec->image_id);
6969 rbd_dev->spec->image_id = NULL;
6970 }
6971
6972 /*
6973 * Probe for the existence of the header object for the given rbd
6974 * device. If this image is the one being mapped (i.e., not a
6975 * parent), initiate a watch on its header object before using that
6976 * object to get detailed information about the rbd image.
6977 *
6978 * On success, returns with header_rwsem held for write if called
6979 * with @depth == 0.
6980 */
6981 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6982 {
6983 bool need_watch = !rbd_is_ro(rbd_dev);
6984 int ret;
6985
6986 /*
6987 * Get the id from the image id object. Unless there's an
6988 * error, rbd_dev->spec->image_id will be filled in with
6989 * a dynamically-allocated string, and rbd_dev->image_format
6990 * will be set to either 1 or 2.
6991 */
6992 ret = rbd_dev_image_id(rbd_dev);
6993 if (ret)
6994 return ret;
6995
6996 ret = rbd_dev_header_name(rbd_dev);
6997 if (ret)
6998 goto err_out_format;
6999
7000 if (need_watch) {
7001 ret = rbd_register_watch(rbd_dev);
7002 if (ret) {
7003 if (ret == -ENOENT)
7004 rbd_print_dne(rbd_dev, false);
7005 goto err_out_format;
7006 }
7007 }
7008
7009 if (!depth)
7010 down_write(&rbd_dev->header_rwsem);
7011
7012 ret = rbd_dev_header_info(rbd_dev);
7013 if (ret) {
7014 if (ret == -ENOENT && !need_watch)
7015 rbd_print_dne(rbd_dev, false);
7016 goto err_out_probe;
7017 }
7018
7019 /*
7020 * If this image is the one being mapped, we have pool name and
7021 * id, image name and id, and snap name - need to fill snap id.
7022 * Otherwise this is a parent image, identified by pool, image
7023 * and snap ids - need to fill in names for those ids.
7024 */
7025 if (!depth)
7026 ret = rbd_spec_fill_snap_id(rbd_dev);
7027 else
7028 ret = rbd_spec_fill_names(rbd_dev);
7029 if (ret) {
7030 if (ret == -ENOENT)
7031 rbd_print_dne(rbd_dev, true);
7032 goto err_out_probe;
7033 }
7034
7035 ret = rbd_dev_mapping_set(rbd_dev);
7036 if (ret)
7037 goto err_out_probe;
7038
7039 if (rbd_is_snap(rbd_dev) &&
7040 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
7041 ret = rbd_object_map_load(rbd_dev);
7042 if (ret)
7043 goto err_out_probe;
7044 }
7045
7046 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7047 ret = rbd_dev_v2_parent_info(rbd_dev);
7048 if (ret)
7049 goto err_out_probe;
7050 }
7051
7052 ret = rbd_dev_probe_parent(rbd_dev, depth);
7053 if (ret)
7054 goto err_out_probe;
7055
7056 dout("discovered format %u image, header name is %s\n",
7057 rbd_dev->image_format, rbd_dev->header_oid.name);
7058 return 0;
7059
7060 err_out_probe:
7061 if (!depth)
7062 up_write(&rbd_dev->header_rwsem);
7063 if (need_watch)
7064 rbd_unregister_watch(rbd_dev);
7065 rbd_dev_unprobe(rbd_dev);
7066 err_out_format:
7067 rbd_dev->image_format = 0;
7068 kfree(rbd_dev->spec->image_id);
7069 rbd_dev->spec->image_id = NULL;
7070 return ret;
7071 }
7072
7073 static ssize_t do_rbd_add(struct bus_type *bus,
7074 const char *buf,
7075 size_t count)
7076 {
7077 struct rbd_device *rbd_dev = NULL;
7078 struct ceph_options *ceph_opts = NULL;
7079 struct rbd_options *rbd_opts = NULL;
7080 struct rbd_spec *spec = NULL;
7081 struct rbd_client *rbdc;
7082 int rc;
7083
7084 if (!capable(CAP_SYS_ADMIN))
7085 return -EPERM;
7086
7087 if (!try_module_get(THIS_MODULE))
7088 return -ENODEV;
7089
7090 /* parse add command */
7091 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7092 if (rc < 0)
7093 goto out;
7094
7095 rbdc = rbd_get_client(ceph_opts);
7096 if (IS_ERR(rbdc)) {
7097 rc = PTR_ERR(rbdc);
7098 goto err_out_args;
7099 }
7100
7101 /* pick the pool */
7102 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
7103 if (rc < 0) {
7104 if (rc == -ENOENT)
7105 pr_info("pool %s does not exist\n", spec->pool_name);
7106 goto err_out_client;
7107 }
7108 spec->pool_id = (u64)rc;
7109
7110 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7111 if (!rbd_dev) {
7112 rc = -ENOMEM;
7113 goto err_out_client;
7114 }
7115 rbdc = NULL; /* rbd_dev now owns this */
7116 spec = NULL; /* rbd_dev now owns this */
7117 rbd_opts = NULL; /* rbd_dev now owns this */
7118
7119 /* if we are mapping a snapshot it will be a read-only mapping */
7120 if (rbd_dev->opts->read_only ||
7121 strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
7122 __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
7123
7124 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7125 if (!rbd_dev->config_info) {
7126 rc = -ENOMEM;
7127 goto err_out_rbd_dev;
7128 }
7129
7130 rc = rbd_dev_image_probe(rbd_dev, 0);
7131 if (rc < 0)
7132 goto err_out_rbd_dev;
7133
7134 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7135 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7136 rbd_dev->layout.object_size);
7137 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7138 }
7139
7140 rc = rbd_dev_device_setup(rbd_dev);
7141 if (rc)
7142 goto err_out_image_probe;
7143
7144 rc = rbd_add_acquire_lock(rbd_dev);
7145 if (rc)
7146 goto err_out_image_lock;
7147
7148 /* Everything's ready. Announce the disk to the world. */
7149
7150 rc = device_add(&rbd_dev->dev);
7151 if (rc)
7152 goto err_out_image_lock;
7153
7154 device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
7155 /* see rbd_init_disk() */
7156 blk_put_queue(rbd_dev->disk->queue);
7157
7158 spin_lock(&rbd_dev_list_lock);
7159 list_add_tail(&rbd_dev->node, &rbd_dev_list);
7160 spin_unlock(&rbd_dev_list_lock);
7161
7162 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7163 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7164 rbd_dev->header.features);
7165 rc = count;
7166 out:
7167 module_put(THIS_MODULE);
7168 return rc;
7169
7170 err_out_image_lock:
7171 rbd_dev_image_unlock(rbd_dev);
7172 rbd_dev_device_release(rbd_dev);
7173 err_out_image_probe:
7174 rbd_dev_image_release(rbd_dev);
7175 err_out_rbd_dev:
7176 rbd_dev_destroy(rbd_dev);
7177 err_out_client:
7178 rbd_put_client(rbdc);
7179 err_out_args:
7180 rbd_spec_put(spec);
7181 kfree(rbd_opts);
7182 goto out;
7183 }
7184
7185 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7186 {
7187 if (single_major)
7188 return -EINVAL;
7189
7190 return do_rbd_add(bus, buf, count);
7191 }
7192
7193 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7194 size_t count)
7195 {
7196 return do_rbd_add(bus, buf, count);
7197 }
7198
7199 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7200 {
7201 while (rbd_dev->parent) {
7202 struct rbd_device *first = rbd_dev;
7203 struct rbd_device *second = first->parent;
7204 struct rbd_device *third;
7205
7206 /*
7207 * Follow to the parent with no grandparent and
7208 * remove it.
7209 */
7210 while (second && (third = second->parent)) {
7211 first = second;
7212 second = third;
7213 }
7214 rbd_assert(second);
7215 rbd_dev_image_release(second);
7216 rbd_dev_destroy(second);
7217 first->parent = NULL;
7218 first->parent_overlap = 0;
7219
7220 rbd_assert(first->parent_spec);
7221 rbd_spec_put(first->parent_spec);
7222 first->parent_spec = NULL;
7223 }
7224 }
7225
7226 static ssize_t do_rbd_remove(struct bus_type *bus,
7227 const char *buf,
7228 size_t count)
7229 {
7230 struct rbd_device *rbd_dev = NULL;
7231 struct list_head *tmp;
7232 int dev_id;
7233 char opt_buf[6];
7234 bool force = false;
7235 int ret;
7236
7237 if (!capable(CAP_SYS_ADMIN))
7238 return -EPERM;
7239
7240 dev_id = -1;
7241 opt_buf[0] = '\0';
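	/*
	 * The input has the form "<dev-id> [force]"; e.g. (illustrative)
	 * writing "0 force" to /sys/bus/rbd/remove force-removes rbd0.
	 */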
7242 sscanf(buf, "%d %5s", &dev_id, opt_buf);
7243 if (dev_id < 0) {
7244 pr_err("dev_id out of range\n");
7245 return -EINVAL;
7246 }
7247 if (opt_buf[0] != '\0') {
7248 if (!strcmp(opt_buf, "force")) {
7249 force = true;
7250 } else {
7251 pr_err("bad remove option at '%s'\n", opt_buf);
7252 return -EINVAL;
7253 }
7254 }
7255
7256 ret = -ENOENT;
7257 spin_lock(&rbd_dev_list_lock);
7258 list_for_each(tmp, &rbd_dev_list) {
7259 rbd_dev = list_entry(tmp, struct rbd_device, node);
7260 if (rbd_dev->dev_id == dev_id) {
7261 ret = 0;
7262 break;
7263 }
7264 }
7265 if (!ret) {
7266 spin_lock_irq(&rbd_dev->lock);
7267 if (rbd_dev->open_count && !force)
7268 ret = -EBUSY;
7269 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7270 &rbd_dev->flags))
7271 ret = -EINPROGRESS;
7272 spin_unlock_irq(&rbd_dev->lock);
7273 }
7274 spin_unlock(&rbd_dev_list_lock);
7275 if (ret)
7276 return ret;
7277
7278 if (force) {
7279 /*
7280 * Prevent new IO from being queued and wait for existing
7281 * IO to complete/fail.
7282 */
7283 blk_mq_freeze_queue(rbd_dev->disk->queue);
7284 blk_set_queue_dying(rbd_dev->disk->queue);
7285 }
7286
7287 del_gendisk(rbd_dev->disk);
7288 spin_lock(&rbd_dev_list_lock);
7289 list_del_init(&rbd_dev->node);
7290 spin_unlock(&rbd_dev_list_lock);
7291 device_del(&rbd_dev->dev);
7292
7293 rbd_dev_image_unlock(rbd_dev);
7294 rbd_dev_device_release(rbd_dev);
7295 rbd_dev_image_release(rbd_dev);
7296 rbd_dev_destroy(rbd_dev);
7297 return count;
7298 }
7299
7300 static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7301 {
7302 if (single_major)
7303 return -EINVAL;
7304
7305 return do_rbd_remove(bus, buf, count);
7306 }
7307
7308 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7309 size_t count)
7310 {
7311 return do_rbd_remove(bus, buf, count);
7312 }
7313
7314 /*
7315 * create control files in sysfs
7316 * /sys/bus/rbd/...
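 *
 * In this driver these are the add/remove attributes defined above:
 * /sys/bus/rbd/add and /sys/bus/rbd/remove, plus their
 * *_single_major counterparts used with the single_major module
 * parameter.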
7317 */
7318 static int __init rbd_sysfs_init(void)
7319 {
7320 int ret;
7321
7322 ret = device_register(&rbd_root_dev);
7323 if (ret < 0)
7324 return ret;
7325
7326 ret = bus_register(&rbd_bus_type);
7327 if (ret < 0)
7328 device_unregister(&rbd_root_dev);
7329
7330 return ret;
7331 }
7332
7333 static void __exit rbd_sysfs_cleanup(void)
7334 {
7335 bus_unregister(&rbd_bus_type);
7336 device_unregister(&rbd_root_dev);
7337 }
7338
7339 static int __init rbd_slab_init(void)
7340 {
7341 rbd_assert(!rbd_img_request_cache);
7342 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7343 if (!rbd_img_request_cache)
7344 return -ENOMEM;
7345
7346 rbd_assert(!rbd_obj_request_cache);
7347 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7348 if (!rbd_obj_request_cache)
7349 goto out_err;
7350
7351 return 0;
7352
7353 out_err:
7354 kmem_cache_destroy(rbd_img_request_cache);
7355 rbd_img_request_cache = NULL;
7356 return -ENOMEM;
7357 }
7358
7359 static void rbd_slab_exit(void)
7360 {
7361 rbd_assert(rbd_obj_request_cache);
7362 kmem_cache_destroy(rbd_obj_request_cache);
7363 rbd_obj_request_cache = NULL;
7364
7365 rbd_assert(rbd_img_request_cache);
7366 kmem_cache_destroy(rbd_img_request_cache);
7367 rbd_img_request_cache = NULL;
7368 }
7369
7370 static int __init rbd_init(void)
7371 {
7372 int rc;
7373
7374 if (!libceph_compatible(NULL)) {
7375 rbd_warn(NULL, "libceph incompatibility (quitting)");
7376 return -EINVAL;
7377 }
7378
7379 rc = rbd_slab_init();
7380 if (rc)
7381 return rc;
7382
7383 /*
7384 * The number of active work items is limited by the number of
7385 * rbd devices * queue depth, so leave @max_active at default.
7386 */
7387 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7388 if (!rbd_wq) {
7389 rc = -ENOMEM;
7390 goto err_out_slab;
7391 }
7392
7393 if (single_major) {
7394 rbd_major = register_blkdev(0, RBD_DRV_NAME);
7395 if (rbd_major < 0) {
7396 rc = rbd_major;
7397 goto err_out_wq;
7398 }
7399 }
7400
7401 rc = rbd_sysfs_init();
7402 if (rc)
7403 goto err_out_blkdev;
7404
7405 if (single_major)
7406 pr_info("loaded (major %d)\n", rbd_major);
7407 else
7408 pr_info("loaded\n");
7409
7410 return 0;
7411
7412 err_out_blkdev:
7413 if (single_major)
7414 unregister_blkdev(rbd_major, RBD_DRV_NAME);
7415 err_out_wq:
7416 destroy_workqueue(rbd_wq);
7417 err_out_slab:
7418 rbd_slab_exit();
7419 return rc;
7420 }
7421
7422 static void __exit rbd_exit(void)
7423 {
7424 ida_destroy(&rbd_dev_id_ida);
7425 rbd_sysfs_cleanup();
7426 if (single_major)
7427 unregister_blkdev(rbd_major, RBD_DRV_NAME);
7428 destroy_workqueue(rbd_wq);
7429 rbd_slab_exit();
7430 }
7431
7432 module_init(rbd_init);
7433 module_exit(rbd_exit);
7434
7435 MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7436 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7437 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7438 /* following authorship retained from original osdblk.c */
7439 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7440
7441 MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7442 MODULE_LICENSE("GPL");
7443