1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * NVM Express device driver
4 * Copyright (c) 2011-2014, Intel Corporation.
5 */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/compat.h>
10 #include <linux/delay.h>
11 #include <linux/errno.h>
12 #include <linux/hdreg.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/backing-dev.h>
16 #include <linux/slab.h>
17 #include <linux/types.h>
18 #include <linux/pr.h>
19 #include <linux/ptrace.h>
20 #include <linux/nvme_ioctl.h>
21 #include <linux/pm_qos.h>
22 #include <asm/unaligned.h>
23
24 #include "nvme.h"
25 #include "fabrics.h"
26
27 #define CREATE_TRACE_POINTS
28 #include "trace.h"
29
30 #define NVME_MINORS (1U << MINORBITS)
31
32 unsigned int admin_timeout = 60;
33 module_param(admin_timeout, uint, 0644);
34 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
35 EXPORT_SYMBOL_GPL(admin_timeout);
36
37 unsigned int nvme_io_timeout = 30;
38 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
39 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
40 EXPORT_SYMBOL_GPL(nvme_io_timeout);
41
42 static unsigned char shutdown_timeout = 5;
43 module_param(shutdown_timeout, byte, 0644);
44 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
45
46 static u8 nvme_max_retries = 5;
47 module_param_named(max_retries, nvme_max_retries, byte, 0644);
48 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
49
50 static unsigned long default_ps_max_latency_us = 100000;
51 module_param(default_ps_max_latency_us, ulong, 0644);
52 MODULE_PARM_DESC(default_ps_max_latency_us,
53 "max power saving latency for new devices; use PM QOS to change per device");
54
55 static bool force_apst;
56 module_param(force_apst, bool, 0644);
57 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
58
59 static bool streams;
60 module_param(streams, bool, 0644);
61 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
62
63 /*
64 * nvme_wq - hosts nvme related works that are not reset or delete
65 * nvme_reset_wq - hosts nvme reset works
66 * nvme_delete_wq - hosts nvme delete works
67 *
68 * nvme_wq will host works such as scan, aen handling, fw activation,
69 * keep-alive, periodic reconnects etc. nvme_reset_wq
70 * runs reset works which also flush works hosted on nvme_wq for
71 * serialization purposes. nvme_delete_wq hosts controller deletion
72 * works, which flush reset works for serialization.
73 */
74 struct workqueue_struct *nvme_wq;
75 EXPORT_SYMBOL_GPL(nvme_wq);
76
77 struct workqueue_struct *nvme_reset_wq;
78 EXPORT_SYMBOL_GPL(nvme_reset_wq);
79
80 struct workqueue_struct *nvme_delete_wq;
81 EXPORT_SYMBOL_GPL(nvme_delete_wq);
82
83 static LIST_HEAD(nvme_subsystems);
84 static DEFINE_MUTEX(nvme_subsystems_lock);
85
86 static DEFINE_IDA(nvme_instance_ida);
87 static dev_t nvme_chr_devt;
88 static struct class *nvme_class;
89 static struct class *nvme_subsys_class;
90
91 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
92 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
93 unsigned nsid);
94
95 static void nvme_update_bdev_size(struct gendisk *disk)
96 {
97 struct block_device *bdev = bdget_disk(disk, 0);
98
99 if (bdev) {
100 bd_set_nr_sectors(bdev, get_capacity(disk));
101 bdput(bdev);
102 }
103 }
104
105 static void nvme_queue_scan(struct nvme_ctrl *ctrl)
106 {
107 /*
108 * Only queue new scan work when admin and IO queues are both alive
109 */
110 if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
111 queue_work(nvme_wq, &ctrl->scan_work);
112 }
113
114 /*
115 * Use this function to proceed with scheduling reset_work for a controller
116 * that had previously been set to the resetting state. This is intended for
117 * code paths that can't be interrupted by other reset attempts. A hot removal
118 * may prevent this from succeeding.
119 */
120 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
121 {
122 if (ctrl->state != NVME_CTRL_RESETTING)
123 return -EBUSY;
124 if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
125 return -EBUSY;
126 return 0;
127 }
128 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
129
130 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
131 {
132 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
133 return -EBUSY;
134 if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
135 return -EBUSY;
136 return 0;
137 }
138 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
139
140 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
141 {
142 int ret;
143
144 ret = nvme_reset_ctrl(ctrl);
145 if (!ret) {
146 flush_work(&ctrl->reset_work);
147 if (ctrl->state != NVME_CTRL_LIVE)
148 ret = -ENETRESET;
149 }
150
151 return ret;
152 }
153 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
154
155 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
156 {
157 dev_info(ctrl->device,
158 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
159
160 flush_work(&ctrl->reset_work);
161 nvme_stop_ctrl(ctrl);
162 nvme_remove_namespaces(ctrl);
163 ctrl->ops->delete_ctrl(ctrl);
164 nvme_uninit_ctrl(ctrl);
165 }
166
167 static void nvme_delete_ctrl_work(struct work_struct *work)
168 {
169 struct nvme_ctrl *ctrl =
170 container_of(work, struct nvme_ctrl, delete_work);
171
172 nvme_do_delete_ctrl(ctrl);
173 }
174
175 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
176 {
177 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
178 return -EBUSY;
179 if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
180 return -EBUSY;
181 return 0;
182 }
183 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
184
185 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
186 {
187 /*
188 * Keep a reference until nvme_do_delete_ctrl() completes,
189 * since ->delete_ctrl can free the controller.
190 */
191 nvme_get_ctrl(ctrl);
192 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
193 nvme_do_delete_ctrl(ctrl);
194 nvme_put_ctrl(ctrl);
195 }
196
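/*
 * Map an NVMe completion status (SCT + SC, i.e. status & 0x7ff) to a
 * block layer status so the rest of the stack can act on it; anything
 * not recognised here is reported as BLK_STS_IOERR.
 */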
197 static blk_status_t nvme_error_status(u16 status)
198 {
199 switch (status & 0x7ff) {
200 case NVME_SC_SUCCESS:
201 return BLK_STS_OK;
202 case NVME_SC_CAP_EXCEEDED:
203 return BLK_STS_NOSPC;
204 case NVME_SC_LBA_RANGE:
205 case NVME_SC_CMD_INTERRUPTED:
206 case NVME_SC_NS_NOT_READY:
207 return BLK_STS_TARGET;
208 case NVME_SC_BAD_ATTRIBUTES:
209 case NVME_SC_ONCS_NOT_SUPPORTED:
210 case NVME_SC_INVALID_OPCODE:
211 case NVME_SC_INVALID_FIELD:
212 case NVME_SC_INVALID_NS:
213 return BLK_STS_NOTSUPP;
214 case NVME_SC_WRITE_FAULT:
215 case NVME_SC_READ_ERROR:
216 case NVME_SC_UNWRITTEN_BLOCK:
217 case NVME_SC_ACCESS_DENIED:
218 case NVME_SC_READ_ONLY:
219 case NVME_SC_COMPARE_FAILED:
220 return BLK_STS_MEDIUM;
221 case NVME_SC_GUARD_CHECK:
222 case NVME_SC_APPTAG_CHECK:
223 case NVME_SC_REFTAG_CHECK:
224 case NVME_SC_INVALID_PI:
225 return BLK_STS_PROTECTION;
226 case NVME_SC_RESERVATION_CONFLICT:
227 return BLK_STS_NEXUS;
228 case NVME_SC_HOST_PATH_ERROR:
229 return BLK_STS_TRANSPORT;
230 case NVME_SC_ZONE_TOO_MANY_ACTIVE:
231 return BLK_STS_ZONE_ACTIVE_RESOURCE;
232 case NVME_SC_ZONE_TOO_MANY_OPEN:
233 return BLK_STS_ZONE_OPEN_RESOURCE;
234 default:
235 return BLK_STS_IOERR;
236 }
237 }
238
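/*
 * Requeue a failed command, honouring the Command Retry Delay (CRD)
 * field of the status: a non-zero CRD selects one of the controller's
 * CRDT values, which are specified in units of 100 milliseconds.
 */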
239 static void nvme_retry_req(struct request *req)
240 {
241 struct nvme_ns *ns = req->q->queuedata;
242 unsigned long delay = 0;
243 u16 crd;
244
245 /* The mask and shift result must be <= 3 */
246 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
247 if (ns && crd)
248 delay = ns->ctrl->crdt[crd - 1] * 100;
249
250 nvme_req(req)->retries++;
251 blk_mq_requeue_request(req, false);
252 blk_mq_delay_kick_requeue_list(req->q, delay);
253 }
254
255 enum nvme_disposition {
256 COMPLETE,
257 RETRY,
258 FAILOVER,
259 };
260
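/*
 * Decide what to do with a completed request: finish it as-is, retry it
 * on the same queue, or (for multipath requests that failed with a
 * path-related error) fail it over to another path.
 */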
261 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
262 {
263 if (likely(nvme_req(req)->status == 0))
264 return COMPLETE;
265
266 if (blk_noretry_request(req) ||
267 (nvme_req(req)->status & NVME_SC_DNR) ||
268 nvme_req(req)->retries >= nvme_max_retries)
269 return COMPLETE;
270
271 if (req->cmd_flags & REQ_NVME_MPATH) {
272 if (nvme_is_path_error(nvme_req(req)->status) ||
273 blk_queue_dying(req->q))
274 return FAILOVER;
275 } else {
276 if (blk_queue_dying(req->q))
277 return COMPLETE;
278 }
279
280 return RETRY;
281 }
282
283 static inline void nvme_end_req(struct request *req)
284 {
285 blk_status_t status = nvme_error_status(nvme_req(req)->status);
286
287 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
288 req_op(req) == REQ_OP_ZONE_APPEND)
289 req->__sector = nvme_lba_to_sect(req->q->queuedata,
290 le64_to_cpu(nvme_req(req)->result.u64));
291
292 nvme_trace_bio_complete(req, status);
293 blk_mq_end_request(req, status);
294 }
295
296 void nvme_complete_rq(struct request *req)
297 {
298 trace_nvme_complete_rq(req);
299 nvme_cleanup_cmd(req);
300
301 if (nvme_req(req)->ctrl->kas)
302 nvme_req(req)->ctrl->comp_seen = true;
303
304 switch (nvme_decide_disposition(req)) {
305 case COMPLETE:
306 nvme_end_req(req);
307 return;
308 case RETRY:
309 nvme_retry_req(req);
310 return;
311 case FAILOVER:
312 nvme_failover_req(req);
313 return;
314 }
315 }
316 EXPORT_SYMBOL_GPL(nvme_complete_rq);
317
318 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
319 {
320 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
321 "Cancelling I/O %d", req->tag);
322
323 /* don't abort a request that has already completed */
324 if (blk_mq_request_completed(req))
325 return true;
326
327 nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
328 nvme_req(req)->flags |= NVME_REQ_CANCELLED;
329 blk_mq_complete_request(req);
330 return true;
331 }
332 EXPORT_SYMBOL_GPL(nvme_cancel_request);
333
334 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
335 {
336 if (ctrl->tagset) {
337 blk_mq_tagset_busy_iter(ctrl->tagset,
338 nvme_cancel_request, ctrl);
339 blk_mq_tagset_wait_completed_request(ctrl->tagset);
340 }
341 }
342 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
343
344 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
345 {
346 if (ctrl->admin_tagset) {
347 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
348 nvme_cancel_request, ctrl);
349 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
350 }
351 }
352 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
353
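/*
 * Controller state machine: only the transitions enumerated below are
 * legal. Returns true if the state was changed; waiters on state_wq
 * (e.g. nvme_wait_reset()) are woken on every successful transition.
 */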
354 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
355 enum nvme_ctrl_state new_state)
356 {
357 enum nvme_ctrl_state old_state;
358 unsigned long flags;
359 bool changed = false;
360
361 spin_lock_irqsave(&ctrl->lock, flags);
362
363 old_state = ctrl->state;
364 switch (new_state) {
365 case NVME_CTRL_LIVE:
366 switch (old_state) {
367 case NVME_CTRL_NEW:
368 case NVME_CTRL_RESETTING:
369 case NVME_CTRL_CONNECTING:
370 changed = true;
371 fallthrough;
372 default:
373 break;
374 }
375 break;
376 case NVME_CTRL_RESETTING:
377 switch (old_state) {
378 case NVME_CTRL_NEW:
379 case NVME_CTRL_LIVE:
380 changed = true;
381 fallthrough;
382 default:
383 break;
384 }
385 break;
386 case NVME_CTRL_CONNECTING:
387 switch (old_state) {
388 case NVME_CTRL_NEW:
389 case NVME_CTRL_RESETTING:
390 changed = true;
391 fallthrough;
392 default:
393 break;
394 }
395 break;
396 case NVME_CTRL_DELETING:
397 switch (old_state) {
398 case NVME_CTRL_LIVE:
399 case NVME_CTRL_RESETTING:
400 case NVME_CTRL_CONNECTING:
401 changed = true;
402 fallthrough;
403 default:
404 break;
405 }
406 break;
407 case NVME_CTRL_DELETING_NOIO:
408 switch (old_state) {
409 case NVME_CTRL_DELETING:
410 case NVME_CTRL_DEAD:
411 changed = true;
412 fallthrough;
413 default:
414 break;
415 }
416 break;
417 case NVME_CTRL_DEAD:
418 switch (old_state) {
419 case NVME_CTRL_DELETING:
420 changed = true;
421 fallthrough;
422 default:
423 break;
424 }
425 break;
426 default:
427 break;
428 }
429
430 if (changed) {
431 ctrl->state = new_state;
432 wake_up_all(&ctrl->state_wq);
433 }
434
435 spin_unlock_irqrestore(&ctrl->lock, flags);
436 if (changed && ctrl->state == NVME_CTRL_LIVE)
437 nvme_kick_requeue_lists(ctrl);
438 return changed;
439 }
440 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
441
442 /*
443 * Returns true for sink states that can't ever transition back to live.
444 */
445 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
446 {
447 switch (ctrl->state) {
448 case NVME_CTRL_NEW:
449 case NVME_CTRL_LIVE:
450 case NVME_CTRL_RESETTING:
451 case NVME_CTRL_CONNECTING:
452 return false;
453 case NVME_CTRL_DELETING:
454 case NVME_CTRL_DELETING_NOIO:
455 case NVME_CTRL_DEAD:
456 return true;
457 default:
458 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
459 return true;
460 }
461 }
462
463 /*
464 * Waits for the controller state to be resetting, or returns false if it is
465 * not possible to ever transition to that state.
466 */
467 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
468 {
469 wait_event(ctrl->state_wq,
470 nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
471 nvme_state_terminal(ctrl));
472 return ctrl->state == NVME_CTRL_RESETTING;
473 }
474 EXPORT_SYMBOL_GPL(nvme_wait_reset);
475
476 static void nvme_free_ns_head(struct kref *ref)
477 {
478 struct nvme_ns_head *head =
479 container_of(ref, struct nvme_ns_head, ref);
480
481 nvme_mpath_remove_disk(head);
482 ida_simple_remove(&head->subsys->ns_ida, head->instance);
483 cleanup_srcu_struct(&head->srcu);
484 nvme_put_subsystem(head->subsys);
485 kfree(head);
486 }
487
488 static void nvme_put_ns_head(struct nvme_ns_head *head)
489 {
490 kref_put(&head->ref, nvme_free_ns_head);
491 }
492
493 static void nvme_free_ns(struct kref *kref)
494 {
495 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
496
497 if (ns->ndev)
498 nvme_nvm_unregister(ns);
499
500 put_disk(ns->disk);
501 nvme_put_ns_head(ns->head);
502 nvme_put_ctrl(ns->ctrl);
503 kfree(ns);
504 }
505
506 void nvme_put_ns(struct nvme_ns *ns)
507 {
508 kref_put(&ns->kref, nvme_free_ns);
509 }
510 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
511
512 static inline void nvme_clear_nvme_request(struct request *req)
513 {
514 nvme_req(req)->retries = 0;
515 nvme_req(req)->flags = 0;
516 req->rq_flags |= RQF_DONTPREP;
517 }
518
519 static inline unsigned int nvme_req_op(struct nvme_command *cmd)
520 {
521 return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
522 }
523
524 static inline void nvme_init_request(struct request *req,
525 struct nvme_command *cmd)
526 {
527 if (req->q->queuedata)
528 req->timeout = NVME_IO_TIMEOUT;
529 else /* no queuedata implies admin queue */
530 req->timeout = ADMIN_TIMEOUT;
531
532 req->cmd_flags |= REQ_FAILFAST_DRIVER;
533 nvme_clear_nvme_request(req);
534 nvme_req(req)->cmd = cmd;
535 }
536
537 struct request *nvme_alloc_request(struct request_queue *q,
538 struct nvme_command *cmd, blk_mq_req_flags_t flags)
539 {
540 struct request *req;
541
542 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
543 if (!IS_ERR(req))
544 nvme_init_request(req, cmd);
545 return req;
546 }
547 EXPORT_SYMBOL_GPL(nvme_alloc_request);
548
549 struct request *nvme_alloc_request_qid(struct request_queue *q,
550 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
551 {
552 struct request *req;
553
554 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
555 qid ? qid - 1 : 0);
556 if (!IS_ERR(req))
557 nvme_init_request(req, cmd);
558 return req;
559 }
560 EXPORT_SYMBOL_GPL(nvme_alloc_request_qid);
561
562 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
563 {
564 struct nvme_command c;
565
566 memset(&c, 0, sizeof(c));
567
568 c.directive.opcode = nvme_admin_directive_send;
569 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
570 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
571 c.directive.dtype = NVME_DIR_IDENTIFY;
572 c.directive.tdtype = NVME_DIR_STREAMS;
573 c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
574
575 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
576 }
577
578 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
579 {
580 return nvme_toggle_streams(ctrl, false);
581 }
582
583 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
584 {
585 return nvme_toggle_streams(ctrl, true);
586 }
587
588 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
589 struct streams_directive_params *s, u32 nsid)
590 {
591 struct nvme_command c;
592
593 memset(&c, 0, sizeof(c));
594 memset(s, 0, sizeof(*s));
595
596 c.directive.opcode = nvme_admin_directive_recv;
597 c.directive.nsid = cpu_to_le32(nsid);
598 c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
599 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
600 c.directive.dtype = NVME_DIR_STREAMS;
601
602 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
603 }
604
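/*
 * Enable the Streams directive if the controller supports directives and
 * the 'streams' module parameter is set, reserving up to
 * BLK_MAX_WRITE_HINTS - 1 streams for write-hint based data placement.
 */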
605 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
606 {
607 struct streams_directive_params s;
608 int ret;
609
610 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
611 return 0;
612 if (!streams)
613 return 0;
614
615 ret = nvme_enable_streams(ctrl);
616 if (ret)
617 return ret;
618
619 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
620 if (ret)
621 goto out_disable_stream;
622
623 ctrl->nssa = le16_to_cpu(s.nssa);
624 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
625 dev_info(ctrl->device, "too few streams (%u) available\n",
626 ctrl->nssa);
627 goto out_disable_stream;
628 }
629
630 ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
631 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
632 return 0;
633
634 out_disable_stream:
635 nvme_disable_streams(ctrl);
636 return ret;
637 }
638
639 /*
640 * Check if 'req' has a write hint associated with it. If it does, assign
641 * a valid namespace stream to the write.
642 */
643 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
644 struct request *req, u16 *control,
645 u32 *dsmgmt)
646 {
647 enum rw_hint streamid = req->write_hint;
648
649 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
650 streamid = 0;
651 else {
652 streamid--;
653 if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
654 return;
655
656 *control |= NVME_RW_DTYPE_STREAMS;
657 *dsmgmt |= streamid << 16;
658 }
659
660 if (streamid < ARRAY_SIZE(req->q->write_hints))
661 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
662 }
663
664 static inline void nvme_setup_passthrough(struct request *req,
665 struct nvme_command *cmd)
666 {
667 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
668 /* passthru commands should let the driver set the SGL flags */
669 cmd->common.flags &= ~NVME_CMD_SGL_ALL;
670 }
671
672 static inline void nvme_setup_flush(struct nvme_ns *ns,
673 struct nvme_command *cmnd)
674 {
675 cmnd->common.opcode = nvme_cmd_flush;
676 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
677 }
678
679 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
680 struct nvme_command *cmnd)
681 {
682 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
683 struct nvme_dsm_range *range;
684 struct bio *bio;
685
686 /*
687 * Some devices do not consider the DSM 'Number of Ranges' field when
688 * determining how much data to DMA. Always allocate memory for the maximum
689 * number of segments to prevent the device from reading beyond the end of the buffer.
690 */
691 static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
692
693 range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
694 if (!range) {
695 /*
696 * If we fail to allocate our range, fall back to the controller
697 * discard page. If that's also busy, it's safe to return
698 * busy, as we know we can make progress once that's freed.
699 */
700 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
701 return BLK_STS_RESOURCE;
702
703 range = page_address(ns->ctrl->discard_page);
704 }
705
706 __rq_for_each_bio(bio, req) {
707 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
708 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
709
710 if (n < segments) {
711 range[n].cattr = cpu_to_le32(0);
712 range[n].nlb = cpu_to_le32(nlb);
713 range[n].slba = cpu_to_le64(slba);
714 }
715 n++;
716 }
717
718 if (WARN_ON_ONCE(n != segments)) {
719 if (virt_to_page(range) == ns->ctrl->discard_page)
720 clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
721 else
722 kfree(range);
723 return BLK_STS_IOERR;
724 }
725
726 cmnd->dsm.opcode = nvme_cmd_dsm;
727 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
728 cmnd->dsm.nr = cpu_to_le32(segments - 1);
729 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
730
731 req->special_vec.bv_page = virt_to_page(range);
732 req->special_vec.bv_offset = offset_in_page(range);
733 req->special_vec.bv_len = alloc_size;
734 req->rq_flags |= RQF_SPECIAL_PAYLOAD;
735
736 return BLK_STS_OK;
737 }
738
739 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
740 struct request *req, struct nvme_command *cmnd)
741 {
742 if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
743 return nvme_setup_discard(ns, req, cmnd);
744
745 cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
746 cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
747 cmnd->write_zeroes.slba =
748 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
749 cmnd->write_zeroes.length =
750 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
751 if (nvme_ns_has_pi(ns))
752 cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
753 else
754 cmnd->write_zeroes.control = 0;
755 return BLK_STS_OK;
756 }
757
758 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
759 struct request *req, struct nvme_command *cmnd,
760 enum nvme_opcode op)
761 {
762 struct nvme_ctrl *ctrl = ns->ctrl;
763 u16 control = 0;
764 u32 dsmgmt = 0;
765
766 if (req->cmd_flags & REQ_FUA)
767 control |= NVME_RW_FUA;
768 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
769 control |= NVME_RW_LR;
770
771 if (req->cmd_flags & REQ_RAHEAD)
772 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
773
774 cmnd->rw.opcode = op;
775 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
776 cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
777 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
778
779 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
780 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
781
782 if (ns->ms) {
783 /*
784 * If formatted with metadata, the block layer always provides a
785 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
786 * we enable the PRACT bit for protection information or set the
787 * namespace capacity to zero to prevent any I/O.
788 */
789 if (!blk_integrity_rq(req)) {
790 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
791 return BLK_STS_NOTSUPP;
792 control |= NVME_RW_PRINFO_PRACT;
793 }
794
795 switch (ns->pi_type) {
796 case NVME_NS_DPS_PI_TYPE3:
797 control |= NVME_RW_PRINFO_PRCHK_GUARD;
798 break;
799 case NVME_NS_DPS_PI_TYPE1:
800 case NVME_NS_DPS_PI_TYPE2:
801 control |= NVME_RW_PRINFO_PRCHK_GUARD |
802 NVME_RW_PRINFO_PRCHK_REF;
803 if (op == nvme_cmd_zone_append)
804 control |= NVME_RW_APPEND_PIREMAP;
805 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
806 break;
807 }
808 }
809
810 cmnd->rw.control = cpu_to_le16(control);
811 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
812 return 0;
813 }
814
815 void nvme_cleanup_cmd(struct request *req)
816 {
817 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
818 struct nvme_ns *ns = req->rq_disk->private_data;
819 struct page *page = req->special_vec.bv_page;
820
821 if (page == ns->ctrl->discard_page)
822 clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
823 else
824 kfree(page_address(page) + req->special_vec.bv_offset);
825 }
826 }
827 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
828
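/*
 * Build the NVMe command for a block layer request, dispatching on the
 * request operation (passthrough, flush, zone management, write zeroes,
 * discard, read/write/zone append).
 */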
829 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
830 struct nvme_command *cmd)
831 {
832 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
833 blk_status_t ret = BLK_STS_OK;
834
835 if (!(req->rq_flags & RQF_DONTPREP))
836 nvme_clear_nvme_request(req);
837
838 memset(cmd, 0, sizeof(*cmd));
839 switch (req_op(req)) {
840 case REQ_OP_DRV_IN:
841 case REQ_OP_DRV_OUT:
842 nvme_setup_passthrough(req, cmd);
843 break;
844 case REQ_OP_FLUSH:
845 nvme_setup_flush(ns, cmd);
846 break;
847 case REQ_OP_ZONE_RESET_ALL:
848 case REQ_OP_ZONE_RESET:
849 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
850 break;
851 case REQ_OP_ZONE_OPEN:
852 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
853 break;
854 case REQ_OP_ZONE_CLOSE:
855 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
856 break;
857 case REQ_OP_ZONE_FINISH:
858 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
859 break;
860 case REQ_OP_WRITE_ZEROES:
861 ret = nvme_setup_write_zeroes(ns, req, cmd);
862 break;
863 case REQ_OP_DISCARD:
864 ret = nvme_setup_discard(ns, req, cmd);
865 break;
866 case REQ_OP_READ:
867 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
868 break;
869 case REQ_OP_WRITE:
870 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
871 break;
872 case REQ_OP_ZONE_APPEND:
873 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
874 break;
875 default:
876 WARN_ON_ONCE(1);
877 return BLK_STS_IOERR;
878 }
879
880 if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN))
881 nvme_req(req)->genctr++;
882 cmd->common.command_id = nvme_cid(req);
883 trace_nvme_setup_cmd(req, cmd);
884 return ret;
885 }
886 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
887
888 static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
889 {
890 struct completion *waiting = rq->end_io_data;
891
892 rq->end_io_data = NULL;
893 complete(waiting);
894 }
895
896 static void nvme_execute_rq_polled(struct request_queue *q,
897 struct gendisk *bd_disk, struct request *rq, int at_head)
898 {
899 DECLARE_COMPLETION_ONSTACK(wait);
900
901 WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
902
903 rq->cmd_flags |= REQ_HIPRI;
904 rq->end_io_data = &wait;
905 blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
906
907 while (!completion_done(&wait)) {
908 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
909 cond_resched();
910 }
911 }
912
913 /*
914 * Returns 0 on success. If the result is negative, it's a Linux error code;
915 * if the result is positive, it's an NVM Express status code.
916 */
917 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
918 union nvme_result *result, void *buffer, unsigned bufflen,
919 unsigned timeout, int qid, int at_head,
920 blk_mq_req_flags_t flags, bool poll)
921 {
922 struct request *req;
923 int ret;
924
925 if (qid == NVME_QID_ANY)
926 req = nvme_alloc_request(q, cmd, flags);
927 else
928 req = nvme_alloc_request_qid(q, cmd, flags, qid);
929 if (IS_ERR(req))
930 return PTR_ERR(req);
931
932 if (timeout)
933 req->timeout = timeout;
934
935 if (buffer && bufflen) {
936 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
937 if (ret)
938 goto out;
939 }
940
941 if (poll)
942 nvme_execute_rq_polled(req->q, NULL, req, at_head);
943 else
944 blk_execute_rq(req->q, NULL, req, at_head);
945 if (result)
946 *result = nvme_req(req)->result;
947 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
948 ret = -EINTR;
949 else
950 ret = nvme_req(req)->status;
951 out:
952 blk_mq_free_request(req);
953 return ret;
954 }
955 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
956
957 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
958 void *buffer, unsigned bufflen)
959 {
960 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
961 NVME_QID_ANY, 0, 0, false);
962 }
963 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
964
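/*
 * Copy user-space metadata into a kernel buffer and attach it to the bio
 * as an integrity payload. For reads the caller copies the buffer back
 * to user space once the command completes.
 */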
965 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
966 unsigned len, u32 seed, bool write)
967 {
968 struct bio_integrity_payload *bip;
969 int ret = -ENOMEM;
970 void *buf;
971
972 buf = kmalloc(len, GFP_KERNEL);
973 if (!buf)
974 goto out;
975
976 ret = -EFAULT;
977 if (write && copy_from_user(buf, ubuf, len))
978 goto out_free_meta;
979
980 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
981 if (IS_ERR(bip)) {
982 ret = PTR_ERR(bip);
983 goto out_free_meta;
984 }
985
986 bip->bip_iter.bi_size = len;
987 bip->bip_iter.bi_sector = seed;
988 ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
989 offset_in_page(buf));
990 if (ret == len)
991 return buf;
992 ret = -ENOMEM;
993 out_free_meta:
994 kfree(buf);
995 out:
996 return ERR_PTR(ret);
997 }
998
999 static u32 nvme_known_admin_effects(u8 opcode)
1000 {
1001 switch (opcode) {
1002 case nvme_admin_format_nvm:
1003 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1004 NVME_CMD_EFFECTS_CSE_MASK;
1005 case nvme_admin_sanitize_nvm:
1006 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1007 default:
1008 break;
1009 }
1010 return 0;
1011 }
1012
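/*
 * Return the effects of an I/O or admin command as reported by the
 * Commands Supported and Effects log, augmented with the effects we
 * know certain admin commands (Format NVM, Sanitize) to have.
 */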
1013 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1014 {
1015 u32 effects = 0;
1016
1017 if (ns) {
1018 if (ns->head->effects)
1019 effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1020 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1021 dev_warn(ctrl->device,
1022 "IO command:%02x has unhandled effects:%08x\n",
1023 opcode, effects);
1024 return 0;
1025 }
1026
1027 if (ctrl->effects)
1028 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1029 effects |= nvme_known_admin_effects(opcode);
1030
1031 return effects;
1032 }
1033 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1034
1035 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1036 u8 opcode)
1037 {
1038 u32 effects = nvme_command_effects(ctrl, ns, opcode);
1039
1040 /*
1041 * For simplicity, IO to all namespaces is quiesced even if the command
1042 * effects say only one namespace is affected.
1043 */
1044 if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1045 mutex_lock(&ctrl->scan_lock);
1046 mutex_lock(&ctrl->subsys->lock);
1047 nvme_mpath_start_freeze(ctrl->subsys);
1048 nvme_mpath_wait_freeze(ctrl->subsys);
1049 nvme_start_freeze(ctrl);
1050 nvme_wait_freeze(ctrl);
1051 }
1052 return effects;
1053 }
1054
1055 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1056 {
1057 if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1058 nvme_unfreeze(ctrl);
1059 nvme_mpath_unfreeze(ctrl->subsys);
1060 mutex_unlock(&ctrl->subsys->lock);
1061 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1062 mutex_unlock(&ctrl->scan_lock);
1063 }
1064 if (effects & NVME_CMD_EFFECTS_CCC)
1065 nvme_init_identify(ctrl);
1066 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1067 nvme_queue_scan(ctrl);
1068 flush_work(&ctrl->scan_work);
1069 }
1070 }
1071
1072 void nvme_execute_passthru_rq(struct request *rq)
1073 {
1074 struct nvme_command *cmd = nvme_req(rq)->cmd;
1075 struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1076 struct nvme_ns *ns = rq->q->queuedata;
1077 struct gendisk *disk = ns ? ns->disk : NULL;
1078 u32 effects;
1079
1080 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1081 blk_execute_rq(rq->q, disk, rq, 0);
1082 nvme_passthru_end(ctrl, effects);
1083 }
1084 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1085
1086 static int nvme_submit_user_cmd(struct request_queue *q,
1087 struct nvme_command *cmd, void __user *ubuffer,
1088 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
1089 u32 meta_seed, u64 *result, unsigned timeout)
1090 {
1091 bool write = nvme_is_write(cmd);
1092 struct nvme_ns *ns = q->queuedata;
1093 struct gendisk *disk = ns ? ns->disk : NULL;
1094 struct request *req;
1095 struct bio *bio = NULL;
1096 void *meta = NULL;
1097 int ret;
1098
1099 req = nvme_alloc_request(q, cmd, 0);
1100 if (IS_ERR(req))
1101 return PTR_ERR(req);
1102
1103 if (timeout)
1104 req->timeout = timeout;
1105 nvme_req(req)->flags |= NVME_REQ_USERCMD;
1106
1107 if (ubuffer && bufflen) {
1108 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
1109 GFP_KERNEL);
1110 if (ret)
1111 goto out;
1112 bio = req->bio;
1113 bio->bi_disk = disk;
1114 if (disk && meta_buffer && meta_len) {
1115 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
1116 meta_seed, write);
1117 if (IS_ERR(meta)) {
1118 ret = PTR_ERR(meta);
1119 goto out_unmap;
1120 }
1121 req->cmd_flags |= REQ_INTEGRITY;
1122 }
1123 }
1124
1125 nvme_execute_passthru_rq(req);
1126 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
1127 ret = -EINTR;
1128 else
1129 ret = nvme_req(req)->status;
1130 if (result)
1131 *result = le64_to_cpu(nvme_req(req)->result.u64);
1132 if (meta && !ret && !write) {
1133 if (copy_to_user(meta_buffer, meta, meta_len))
1134 ret = -EFAULT;
1135 }
1136 kfree(meta);
1137 out_unmap:
1138 if (bio)
1139 blk_rq_unmap_user(bio);
1140 out:
1141 blk_mq_free_request(req);
1142 return ret;
1143 }
1144
1145 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
1146 {
1147 struct nvme_ctrl *ctrl = rq->end_io_data;
1148 unsigned long flags;
1149 bool startka = false;
1150
1151 blk_mq_free_request(rq);
1152
1153 if (status) {
1154 dev_err(ctrl->device,
1155 "failed nvme_keep_alive_end_io error=%d\n",
1156 status);
1157 return;
1158 }
1159
1160 ctrl->comp_seen = false;
1161 spin_lock_irqsave(&ctrl->lock, flags);
1162 if (ctrl->state == NVME_CTRL_LIVE ||
1163 ctrl->state == NVME_CTRL_CONNECTING)
1164 startka = true;
1165 spin_unlock_irqrestore(&ctrl->lock, flags);
1166 if (startka)
1167 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1168 }
1169
1170 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
1171 {
1172 struct request *rq;
1173
1174 rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
1175 BLK_MQ_REQ_RESERVED);
1176 if (IS_ERR(rq))
1177 return PTR_ERR(rq);
1178
1179 rq->timeout = ctrl->kato * HZ;
1180 rq->end_io_data = ctrl;
1181
1182 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
1183
1184 return 0;
1185 }
1186
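/*
 * Periodic keep-alive worker. With Traffic Based Keep Alive (TBKAS) the
 * timer is simply re-armed when command completions were seen during
 * the last interval, avoiding an explicit Keep Alive command.
 */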
1187 static void nvme_keep_alive_work(struct work_struct *work)
1188 {
1189 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1190 struct nvme_ctrl, ka_work);
1191 bool comp_seen = ctrl->comp_seen;
1192
1193 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1194 dev_dbg(ctrl->device,
1195 "reschedule traffic based keep-alive timer\n");
1196 ctrl->comp_seen = false;
1197 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1198 return;
1199 }
1200
1201 if (nvme_keep_alive(ctrl)) {
1202 /* allocation failure, reset the controller */
1203 dev_err(ctrl->device, "keep-alive failed\n");
1204 nvme_reset_ctrl(ctrl);
1205 return;
1206 }
1207 }
1208
1209 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1210 {
1211 if (unlikely(ctrl->kato == 0))
1212 return;
1213
1214 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1215 }
1216
1217 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1218 {
1219 if (unlikely(ctrl->kato == 0))
1220 return;
1221
1222 cancel_delayed_work_sync(&ctrl->ka_work);
1223 }
1224 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1225
1226 /*
1227 * In NVMe 1.0 the CNS field was just a binary controller or namespace
1228 * flag, so sending any newer CNS values has a good chance of not working.
1229 * QEMU unfortunately had that bug after reporting 1.1 version compliance
1230 * (but not for any later version).
1231 */
1232 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1233 {
1234 if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1235 return ctrl->vs < NVME_VS(1, 2, 0);
1236 return ctrl->vs < NVME_VS(1, 1, 0);
1237 }
1238
1239 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1240 {
1241 struct nvme_command c = { };
1242 int error;
1243
1244 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1245 c.identify.opcode = nvme_admin_identify;
1246 c.identify.cns = NVME_ID_CNS_CTRL;
1247
1248 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1249 if (!*id)
1250 return -ENOMEM;
1251
1252 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1253 sizeof(struct nvme_id_ctrl));
1254 if (error)
1255 kfree(*id);
1256 return error;
1257 }
1258
1259 static bool nvme_multi_css(struct nvme_ctrl *ctrl)
1260 {
1261 return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
1262 }
1263
1264 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1265 struct nvme_ns_id_desc *cur, bool *csi_seen)
1266 {
1267 const char *warn_str = "ctrl returned bogus length:";
1268 void *data = cur;
1269
1270 switch (cur->nidt) {
1271 case NVME_NIDT_EUI64:
1272 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1273 dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1274 warn_str, cur->nidl);
1275 return -1;
1276 }
1277 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1278 return NVME_NIDT_EUI64_LEN;
1279 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1280 return NVME_NIDT_EUI64_LEN;
1281 case NVME_NIDT_NGUID:
1282 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1283 dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1284 warn_str, cur->nidl);
1285 return -1;
1286 }
1287 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1288 return NVME_NIDT_NGUID_LEN;
1289 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1290 return NVME_NIDT_NGUID_LEN;
1291 case NVME_NIDT_UUID:
1292 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1293 dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1294 warn_str, cur->nidl);
1295 return -1;
1296 }
1297 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1298 return NVME_NIDT_UUID_LEN;
1299 uuid_copy(&ids->uuid, data + sizeof(*cur));
1300 return NVME_NIDT_UUID_LEN;
1301 case NVME_NIDT_CSI:
1302 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1303 dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1304 warn_str, cur->nidl);
1305 return -1;
1306 }
1307 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1308 *csi_seen = true;
1309 return NVME_NIDT_CSI_LEN;
1310 default:
1311 /* Skip unknown types */
1312 return cur->nidl;
1313 }
1314 }
1315
1316 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
1317 struct nvme_ns_ids *ids)
1318 {
1319 struct nvme_command c = { };
1320 bool csi_seen = false;
1321 int status, pos, len;
1322 void *data;
1323
1324 if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1325 return 0;
1326 if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1327 return 0;
1328
1329 c.identify.opcode = nvme_admin_identify;
1330 c.identify.nsid = cpu_to_le32(nsid);
1331 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1332
1333 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1334 if (!data)
1335 return -ENOMEM;
1336
1337 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1338 NVME_IDENTIFY_DATA_SIZE);
1339 if (status) {
1340 dev_warn(ctrl->device,
1341 "Identify Descriptors failed (%d)\n", status);
1342 goto free_data;
1343 }
1344
1345 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1346 struct nvme_ns_id_desc *cur = data + pos;
1347
1348 if (cur->nidl == 0)
1349 break;
1350
1351 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1352 if (len < 0)
1353 break;
1354
1355 len += sizeof(*cur);
1356 }
1357
1358 if (nvme_multi_css(ctrl) && !csi_seen) {
1359 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1360 nsid);
1361 status = -EINVAL;
1362 }
1363
1364 free_data:
1365 kfree(data);
1366 return status;
1367 }
1368
1369 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1370 struct nvme_ns_ids *ids, struct nvme_id_ns **id)
1371 {
1372 struct nvme_command c = { };
1373 int error;
1374
1375 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1376 c.identify.opcode = nvme_admin_identify;
1377 c.identify.nsid = cpu_to_le32(nsid);
1378 c.identify.cns = NVME_ID_CNS_NS;
1379
1380 *id = kmalloc(sizeof(**id), GFP_KERNEL);
1381 if (!*id)
1382 return -ENOMEM;
1383
1384 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1385 if (error) {
1386 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1387 goto out_free_id;
1388 }
1389
1390 error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1391 if ((*id)->ncap == 0) /* namespace not allocated or attached */
1392 goto out_free_id;
1393
1394
1395 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1396 dev_info(ctrl->device,
1397 "Ignoring bogus Namespace Identifiers\n");
1398 } else {
1399 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1400 !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1401 memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
1402 if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1403 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1404 memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
1405 }
1406
1407 return 0;
1408
1409 out_free_id:
1410 kfree(*id);
1411 return error;
1412 }
1413
1414 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1415 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1416 {
1417 union nvme_result res = { 0 };
1418 struct nvme_command c;
1419 int ret;
1420
1421 memset(&c, 0, sizeof(c));
1422 c.features.opcode = op;
1423 c.features.fid = cpu_to_le32(fid);
1424 c.features.dword11 = cpu_to_le32(dword11);
1425
1426 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1427 buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
1428 if (ret >= 0 && result)
1429 *result = le32_to_cpu(res.u32);
1430 return ret;
1431 }
1432
1433 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1434 unsigned int dword11, void *buffer, size_t buflen,
1435 u32 *result)
1436 {
1437 return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1438 buflen, result);
1439 }
1440 EXPORT_SYMBOL_GPL(nvme_set_features);
1441
1442 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1443 unsigned int dword11, void *buffer, size_t buflen,
1444 u32 *result)
1445 {
1446 return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1447 buflen, result);
1448 }
1449 EXPORT_SYMBOL_GPL(nvme_get_features);
1450
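/*
 * Request the desired number of I/O queues via the Number of Queues
 * feature. Both the requested and the returned values are zero-based,
 * hence the "*count - 1" encoding and the "+ 1" when reading the result.
 */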
1451 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1452 {
1453 u32 q_count = (*count - 1) | ((*count - 1) << 16);
1454 u32 result;
1455 int status, nr_io_queues;
1456
1457 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1458 &result);
1459 if (status < 0)
1460 return status;
1461
1462 /*
1463 * Degraded controllers might return an error when setting the queue
1464 * count. We still want to be able to bring them online and offer
1465 * access to the admin queue, as that might be the only way to fix them up.
1466 */
1467 if (status > 0) {
1468 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1469 *count = 0;
1470 } else {
1471 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1472 *count = min(*count, nr_io_queues);
1473 }
1474
1475 return 0;
1476 }
1477 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1478
1479 #define NVME_AEN_SUPPORTED \
1480 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1481 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1482
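/*
 * Enable all asynchronous event notifications that the controller
 * advertises and the driver can handle, then kick the async event work.
 */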
1483 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1484 {
1485 u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1486 int status;
1487
1488 if (!supported_aens)
1489 return;
1490
1491 status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1492 NULL, 0, &result);
1493 if (status)
1494 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1495 supported_aens);
1496
1497 queue_work(nvme_wq, &ctrl->async_event_work);
1498 }
1499
1500 /*
1501 * Convert integer values from ioctl structures to user pointers, silently
1502 * ignoring the upper bits in the compat case to match behaviour of 32-bit
1503 * kernels.
1504 */
1505 static void __user *nvme_to_user_ptr(uintptr_t ptrval)
1506 {
1507 if (in_compat_syscall())
1508 ptrval = (compat_uptr_t)ptrval;
1509 return (void __user *)ptrval;
1510 }
1511
1512 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1513 {
1514 struct nvme_user_io io;
1515 struct nvme_command c;
1516 unsigned length, meta_len;
1517 void __user *metadata;
1518
1519 if (copy_from_user(&io, uio, sizeof(io)))
1520 return -EFAULT;
1521 if (io.flags)
1522 return -EINVAL;
1523
1524 switch (io.opcode) {
1525 case nvme_cmd_write:
1526 case nvme_cmd_read:
1527 case nvme_cmd_compare:
1528 break;
1529 default:
1530 return -EINVAL;
1531 }
1532
1533 length = (io.nblocks + 1) << ns->lba_shift;
1534
1535 if ((io.control & NVME_RW_PRINFO_PRACT) &&
1536 ns->ms == sizeof(struct t10_pi_tuple)) {
1537 /*
1538 * Protection information is stripped/inserted by the
1539 * controller.
1540 */
1541 if (nvme_to_user_ptr(io.metadata))
1542 return -EINVAL;
1543 meta_len = 0;
1544 metadata = NULL;
1545 } else {
1546 meta_len = (io.nblocks + 1) * ns->ms;
1547 metadata = nvme_to_user_ptr(io.metadata);
1548 }
1549
1550 if (ns->features & NVME_NS_EXT_LBAS) {
1551 length += meta_len;
1552 meta_len = 0;
1553 } else if (meta_len) {
1554 if ((io.metadata & 3) || !io.metadata)
1555 return -EINVAL;
1556 }
1557
1558 memset(&c, 0, sizeof(c));
1559 c.rw.opcode = io.opcode;
1560 c.rw.flags = io.flags;
1561 c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1562 c.rw.slba = cpu_to_le64(io.slba);
1563 c.rw.length = cpu_to_le16(io.nblocks);
1564 c.rw.control = cpu_to_le16(io.control);
1565 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1566 c.rw.reftag = cpu_to_le32(io.reftag);
1567 c.rw.apptag = cpu_to_le16(io.apptag);
1568 c.rw.appmask = cpu_to_le16(io.appmask);
1569
1570 return nvme_submit_user_cmd(ns->queue, &c,
1571 nvme_to_user_ptr(io.addr), length,
1572 metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
1573 }
1574
1575 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1576 struct nvme_passthru_cmd __user *ucmd)
1577 {
1578 struct nvme_passthru_cmd cmd;
1579 struct nvme_command c;
1580 unsigned timeout = 0;
1581 u64 result;
1582 int status;
1583
1584 if (!capable(CAP_SYS_ADMIN))
1585 return -EACCES;
1586 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1587 return -EFAULT;
1588 if (cmd.flags)
1589 return -EINVAL;
1590
1591 memset(&c, 0, sizeof(c));
1592 c.common.opcode = cmd.opcode;
1593 c.common.flags = cmd.flags;
1594 c.common.nsid = cpu_to_le32(cmd.nsid);
1595 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1596 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1597 c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1598 c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1599 c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1600 c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1601 c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1602 c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1603
1604 if (cmd.timeout_ms)
1605 timeout = msecs_to_jiffies(cmd.timeout_ms);
1606
1607 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1608 nvme_to_user_ptr(cmd.addr), cmd.data_len,
1609 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1610 0, &result, timeout);
1611
1612 if (status >= 0) {
1613 if (put_user(result, &ucmd->result))
1614 return -EFAULT;
1615 }
1616
1617 return status;
1618 }
1619
1620 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1621 struct nvme_passthru_cmd64 __user *ucmd)
1622 {
1623 struct nvme_passthru_cmd64 cmd;
1624 struct nvme_command c;
1625 unsigned timeout = 0;
1626 int status;
1627
1628 if (!capable(CAP_SYS_ADMIN))
1629 return -EACCES;
1630 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1631 return -EFAULT;
1632 if (cmd.flags)
1633 return -EINVAL;
1634
1635 memset(&c, 0, sizeof(c));
1636 c.common.opcode = cmd.opcode;
1637 c.common.flags = cmd.flags;
1638 c.common.nsid = cpu_to_le32(cmd.nsid);
1639 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1640 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1641 c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1642 c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1643 c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1644 c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1645 c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1646 c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1647
1648 if (cmd.timeout_ms)
1649 timeout = msecs_to_jiffies(cmd.timeout_ms);
1650
1651 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1652 nvme_to_user_ptr(cmd.addr), cmd.data_len,
1653 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1654 0, &cmd.result, timeout);
1655
1656 if (status >= 0) {
1657 if (put_user(cmd.result, &ucmd->result))
1658 return -EFAULT;
1659 }
1660
1661 return status;
1662 }
1663
1664 /*
1665 * Issue ioctl requests on the first available path. Note that unlike normal
1666 * block layer requests we will not retry a failed request on another controller.
1667 */
1668 struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1669 struct nvme_ns_head **head, int *srcu_idx)
1670 {
1671 #ifdef CONFIG_NVME_MULTIPATH
1672 if (disk->fops == &nvme_ns_head_ops) {
1673 struct nvme_ns *ns;
1674
1675 *head = disk->private_data;
1676 *srcu_idx = srcu_read_lock(&(*head)->srcu);
1677 ns = nvme_find_path(*head);
1678 if (!ns)
1679 srcu_read_unlock(&(*head)->srcu, *srcu_idx);
1680 return ns;
1681 }
1682 #endif
1683 *head = NULL;
1684 *srcu_idx = -1;
1685 return disk->private_data;
1686 }
1687
1688 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1689 {
1690 if (head)
1691 srcu_read_unlock(&head->srcu, idx);
1692 }
1693
1694 static bool is_ctrl_ioctl(unsigned int cmd)
1695 {
1696 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
1697 return true;
1698 if (is_sed_ioctl(cmd))
1699 return true;
1700 return false;
1701 }
1702
1703 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
1704 void __user *argp,
1705 struct nvme_ns_head *head,
1706 int srcu_idx)
1707 {
1708 struct nvme_ctrl *ctrl = ns->ctrl;
1709 int ret;
1710
1711 nvme_get_ctrl(ns->ctrl);
1712 nvme_put_ns_from_disk(head, srcu_idx);
1713
1714 switch (cmd) {
1715 case NVME_IOCTL_ADMIN_CMD:
1716 ret = nvme_user_cmd(ctrl, NULL, argp);
1717 break;
1718 case NVME_IOCTL_ADMIN64_CMD:
1719 ret = nvme_user_cmd64(ctrl, NULL, argp);
1720 break;
1721 default:
1722 ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1723 break;
1724 }
1725 nvme_put_ctrl(ctrl);
1726 return ret;
1727 }
1728
1729 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1730 unsigned int cmd, unsigned long arg)
1731 {
1732 struct nvme_ns_head *head = NULL;
1733 void __user *argp = (void __user *)arg;
1734 struct nvme_ns *ns;
1735 int srcu_idx, ret;
1736
1737 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1738 if (unlikely(!ns))
1739 return -EWOULDBLOCK;
1740
1741 /*
1742 * Handle ioctls that apply to the controller instead of the namespace
1743 * separately and drop the ns SRCU reference early. This avoids a
1744 * deadlock when deleting namespaces using the passthrough interface.
1745 */
1746 if (is_ctrl_ioctl(cmd))
1747 return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
1748
1749 switch (cmd) {
1750 case NVME_IOCTL_ID:
1751 force_successful_syscall_return();
1752 ret = ns->head->ns_id;
1753 break;
1754 case NVME_IOCTL_IO_CMD:
1755 ret = nvme_user_cmd(ns->ctrl, ns, argp);
1756 break;
1757 case NVME_IOCTL_SUBMIT_IO:
1758 ret = nvme_submit_io(ns, argp);
1759 break;
1760 case NVME_IOCTL_IO64_CMD:
1761 ret = nvme_user_cmd64(ns->ctrl, ns, argp);
1762 break;
1763 default:
1764 if (ns->ndev)
1765 ret = nvme_nvm_ioctl(ns, cmd, arg);
1766 else
1767 ret = -ENOTTY;
1768 }
1769
1770 nvme_put_ns_from_disk(head, srcu_idx);
1771 return ret;
1772 }
1773
1774 #ifdef CONFIG_COMPAT
1775 struct nvme_user_io32 {
1776 __u8 opcode;
1777 __u8 flags;
1778 __u16 control;
1779 __u16 nblocks;
1780 __u16 rsvd;
1781 __u64 metadata;
1782 __u64 addr;
1783 __u64 slba;
1784 __u32 dsmgmt;
1785 __u32 reftag;
1786 __u16 apptag;
1787 __u16 appmask;
1788 } __attribute__((__packed__));
1789
1790 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
1791
1792 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1793 unsigned int cmd, unsigned long arg)
1794 {
1795 /*
1796 * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
1797 * between 32 bit programs and 64 bit kernel.
1798 * The cause is that the results of sizeof(struct nvme_user_io),
1799 * which is used to define NVME_IOCTL_SUBMIT_IO,
1800 * are not same between 32 bit compiler and 64 bit compiler.
1801 * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
1802 * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
1803 * Other IOCTL numbers are the same between 32 bit and 64 bit,
1804 * so there is nothing to do regarding the other IOCTL numbers.
1805 */
1806 if (cmd == NVME_IOCTL_SUBMIT_IO32)
1807 return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
1808
1809 return nvme_ioctl(bdev, mode, cmd, arg);
1810 }
1811 #else
1812 #define nvme_compat_ioctl NULL
1813 #endif /* CONFIG_COMPAT */
1814
1815 static int nvme_open(struct block_device *bdev, fmode_t mode)
1816 {
1817 struct nvme_ns *ns = bdev->bd_disk->private_data;
1818
1819 #ifdef CONFIG_NVME_MULTIPATH
1820 /* should never be called due to GENHD_FL_HIDDEN */
1821 if (WARN_ON_ONCE(ns->head->disk))
1822 goto fail;
1823 #endif
1824 if (!kref_get_unless_zero(&ns->kref))
1825 goto fail;
1826 if (!try_module_get(ns->ctrl->ops->module))
1827 goto fail_put_ns;
1828
1829 return 0;
1830
1831 fail_put_ns:
1832 nvme_put_ns(ns);
1833 fail:
1834 return -ENXIO;
1835 }
1836
1837 static void nvme_release(struct gendisk *disk, fmode_t mode)
1838 {
1839 struct nvme_ns *ns = disk->private_data;
1840
1841 module_put(ns->ctrl->ops->module);
1842 nvme_put_ns(ns);
1843 }
1844
1845 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1846 {
1847 /* some standard values */
1848 geo->heads = 1 << 6;
1849 geo->sectors = 1 << 5;
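/* 64 heads * 32 sectors/track = 2048 sectors per cylinder, hence the shift by 11 */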
1850 geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1851 return 0;
1852 }
1853
1854 #ifdef CONFIG_BLK_DEV_INTEGRITY
1855 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1856 u32 max_integrity_segments)
1857 {
1858 struct blk_integrity integrity;
1859
1860 memset(&integrity, 0, sizeof(integrity));
1861 switch (pi_type) {
1862 case NVME_NS_DPS_PI_TYPE3:
1863 integrity.profile = &t10_pi_type3_crc;
1864 integrity.tag_size = sizeof(u16) + sizeof(u32);
1865 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1866 break;
1867 case NVME_NS_DPS_PI_TYPE1:
1868 case NVME_NS_DPS_PI_TYPE2:
1869 integrity.profile = &t10_pi_type1_crc;
1870 integrity.tag_size = sizeof(u16);
1871 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1872 break;
1873 default:
1874 integrity.profile = NULL;
1875 break;
1876 }
1877 integrity.tuple_size = ms;
1878 blk_integrity_register(disk, &integrity);
1879 blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1880 }
1881 #else
1882 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1883 u32 max_integrity_segments)
1884 {
1885 }
1886 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1887
1888 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1889 {
1890 struct nvme_ctrl *ctrl = ns->ctrl;
1891 struct request_queue *queue = disk->queue;
1892 u32 size = queue_logical_block_size(queue);
1893
1894 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
1895 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
1896 return;
1897 }
1898
1899 if (ctrl->nr_streams && ns->sws && ns->sgs)
1900 size *= ns->sws * ns->sgs;
1901
1902 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1903 NVME_DSM_MAX_RANGES);
1904
1905 queue->limits.discard_alignment = 0;
1906 queue->limits.discard_granularity = size;
1907
1908 /* If discard is already enabled, don't reset queue limits */
1909 if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
1910 return;
1911
1912 blk_queue_max_discard_sectors(queue, UINT_MAX);
1913 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1914
1915 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1916 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1917 }
1918
1919 /*
1920 * Even though the NVMe spec explicitly states that MDTS is not applicable to
1921 * write-zeroes, we are cautious and limit the size to the controller's
1922 * max_hw_sectors value, which is based on the MDTS field and possibly other
1923 * limiting factors.
1924 */
1925 static void nvme_config_write_zeroes(struct request_queue *q,
1926 struct nvme_ctrl *ctrl)
1927 {
1928 if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
1929 !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
1930 blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
1931 }
1932
1933 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1934 {
1935 return !uuid_is_null(&ids->uuid) ||
1936 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1937 memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1938 }
1939
1940 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1941 {
1942 return uuid_equal(&a->uuid, &b->uuid) &&
1943 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1944 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1945 a->csi == b->csi;
1946 }
1947
1948 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1949 u32 *phys_bs, u32 *io_opt)
1950 {
1951 struct streams_directive_params s;
1952 int ret;
1953
1954 if (!ctrl->nr_streams)
1955 return 0;
1956
1957 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1958 if (ret)
1959 return ret;
1960
1961 ns->sws = le32_to_cpu(s.sws);
1962 ns->sgs = le16_to_cpu(s.sgs);
1963
1964 if (ns->sws) {
1965 *phys_bs = ns->sws * (1 << ns->lba_shift);
1966 if (ns->sgs)
1967 *io_opt = *phys_bs * ns->sgs;
1968 }
1969
1970 return 0;
1971 }
1972
1973 static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1974 {
1975 struct nvme_ctrl *ctrl = ns->ctrl;
1976
1977 /*
1978 * The PI implementation requires the metadata size to be equal to the
1979 * t10 pi tuple size.
1980 */
1981 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1982 if (ns->ms == sizeof(struct t10_pi_tuple))
1983 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1984 else
1985 ns->pi_type = 0;
1986
1987 ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1988 if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1989 return 0;
1990 if (ctrl->ops->flags & NVME_F_FABRICS) {
1991 /*
1992 * The NVMe over Fabrics specification only supports metadata as
1993 * part of the extended data LBA. We rely on HCA/HBA support to
1994 * remap the separate metadata buffer from the block layer.
1995 */
1996 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1997 return -EINVAL;
1998 if (ctrl->max_integrity_segments)
1999 ns->features |=
2000 (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
2001 } else {
2002 /*
2003 * For PCIe controllers, we can't easily remap the separate
2004 * metadata buffer from the block layer and thus require a
2005 * separate metadata buffer for block layer metadata/PI support.
2006 * We allow extended LBAs for the passthrough interface, though.
2007 */
2008 if (id->flbas & NVME_NS_FLBAS_META_EXT)
2009 ns->features |= NVME_NS_EXT_LBAS;
2010 else
2011 ns->features |= NVME_NS_METADATA_SUPPORTED;
2012 }
2013
2014 return 0;
2015 }
2016
2017 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
2018 struct request_queue *q)
2019 {
2020 bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
2021
2022 if (ctrl->max_hw_sectors) {
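/*
 * Worst case: a max_hw_sectors sized transfer that is not aligned to
 * the controller page size spills into one extra page, hence the "+ 1".
 */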
2023 u32 max_segments =
2024 (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
2025
2026 max_segments = min_not_zero(max_segments, ctrl->max_segments);
2027 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
2028 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
2029 }
2030 blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
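/* NVMe PRPs require dword (4-byte) aligned data buffers, i.e. an alignment mask of 3 */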
2031 blk_queue_dma_alignment(q, 3);
2032 blk_queue_write_cache(q, vwc, vwc);
2033 }
2034
2035 static void nvme_update_disk_info(struct gendisk *disk,
2036 struct nvme_ns *ns, struct nvme_id_ns *id)
2037 {
2038 sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
2039 unsigned short bs = 1 << ns->lba_shift;
2040 u32 atomic_bs, phys_bs, io_opt = 0;
2041
2042 /*
2043 * The block layer can't support LBA sizes larger than the page size
2044 * yet, so catch this early and don't allow block I/O.
2045 */
2046 if (ns->lba_shift > PAGE_SHIFT) {
2047 capacity = 0;
2048 bs = (1 << 9);
2049 }
2050
2051 blk_integrity_unregister(disk);
2052
2053 atomic_bs = phys_bs = bs;
2054 nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
2055 if (id->nabo == 0) {
2056 /*
2057 * Bit 1 indicates whether NAWUPF is defined for this namespace
2058 * and whether it should be used instead of AWUPF. If NAWUPF ==
2059 * 0 then AWUPF must be used instead.
2060 */
2061 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2062 atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2063 else
2064 atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2065 }
2066
2067 if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2068 /* NPWG = Namespace Preferred Write Granularity */
2069 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2070 /* NOWS = Namespace Optimal Write Size */
2071 io_opt = bs * (1 + le16_to_cpu(id->nows));
2072 }
2073
2074 blk_queue_logical_block_size(disk->queue, bs);
2075 /*
2076 * Linux filesystems assume writing a single physical block is
2077 * an atomic operation. Hence limit the physical block size to the
2078 * value of the Atomic Write Unit Power Fail parameter.
2079 */
2080 blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
2081 blk_queue_io_min(disk->queue, phys_bs);
2082 blk_queue_io_opt(disk->queue, io_opt);
2083
2084 /*
2085 * Register a metadata profile for PI, or for the plain non-integrity NVMe
2086 * metadata masquerading as Type 0 if supported. Otherwise reject block
2087 * I/O to namespaces with metadata, except when the namespace supports
2088 * PI, as the controller can strip/insert it in that case.
2089 */
2090 if (ns->ms) {
2091 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2092 (ns->features & NVME_NS_METADATA_SUPPORTED))
2093 nvme_init_integrity(disk, ns->ms, ns->pi_type,
2094 ns->ctrl->max_integrity_segments);
2095 else if (!nvme_ns_has_pi(ns))
2096 capacity = 0;
2097 }
2098
2099 set_capacity_revalidate_and_notify(disk, capacity, false);
2100
2101 nvme_config_discard(disk, ns);
2102 nvme_config_write_zeroes(disk->queue, ns->ctrl);
2103
2104 if (id->nsattr & NVME_NS_ATTR_RO)
2105 set_disk_ro(disk, true);
2106 }
2107
2108 static inline bool nvme_first_scan(struct gendisk *disk)
2109 {
2110 /* nvme_alloc_ns() scans the disk prior to adding it */
2111 return !(disk->flags & GENHD_FL_UP);
2112 }
2113
2114 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
2115 {
2116 struct nvme_ctrl *ctrl = ns->ctrl;
2117 u32 iob;
2118
2119 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2120 is_power_of_2(ctrl->max_hw_sectors))
2121 iob = ctrl->max_hw_sectors;
2122 else
2123 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
2124
2125 if (!iob)
2126 return;
2127
2128 if (!is_power_of_2(iob)) {
2129 if (nvme_first_scan(ns->disk))
2130 pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2131 ns->disk->disk_name, iob);
2132 return;
2133 }
2134
2135 if (blk_queue_is_zoned(ns->disk->queue)) {
2136 if (nvme_first_scan(ns->disk))
2137 pr_warn("%s: ignoring zoned namespace IO boundary\n",
2138 ns->disk->disk_name);
2139 return;
2140 }
2141
2142 blk_queue_chunk_sectors(ns->queue, iob);
2143 }
2144
2145 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
2146 {
2147 unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2148 int ret;
2149
2150 blk_mq_freeze_queue(ns->disk->queue);
2151 ns->lba_shift = id->lbaf[lbaf].ds;
2152 nvme_set_queue_limits(ns->ctrl, ns->queue);
2153
2154 if (ns->head->ids.csi == NVME_CSI_ZNS) {
2155 ret = nvme_update_zone_info(ns, lbaf);
2156 if (ret)
2157 goto out_unfreeze;
2158 }
2159
2160 ret = nvme_configure_metadata(ns, id);
2161 if (ret)
2162 goto out_unfreeze;
2163 nvme_set_chunk_sectors(ns, id);
2164 nvme_update_disk_info(ns->disk, ns, id);
2165 blk_mq_unfreeze_queue(ns->disk->queue);
2166
2167 if (blk_queue_is_zoned(ns->queue)) {
2168 ret = nvme_revalidate_zones(ns);
2169 if (ret && !nvme_first_scan(ns->disk))
2170 return ret;
2171 }
2172
2173 #ifdef CONFIG_NVME_MULTIPATH
2174 if (ns->head->disk) {
2175 blk_mq_freeze_queue(ns->head->disk->queue);
2176 nvme_update_disk_info(ns->head->disk, ns, id);
2177 blk_stack_limits(&ns->head->disk->queue->limits,
2178 &ns->queue->limits, 0);
2179 blk_queue_update_readahead(ns->head->disk->queue);
2180 nvme_update_bdev_size(ns->head->disk);
2181 blk_mq_unfreeze_queue(ns->head->disk->queue);
2182 }
2183 #endif
2184 return 0;
2185
2186 out_unfreeze:
2187 blk_mq_unfreeze_queue(ns->disk->queue);
2188 return ret;
2189 }
2190
2191 static char nvme_pr_type(enum pr_type type)
2192 {
2193 switch (type) {
2194 case PR_WRITE_EXCLUSIVE:
2195 return 1;
2196 case PR_EXCLUSIVE_ACCESS:
2197 return 2;
2198 case PR_WRITE_EXCLUSIVE_REG_ONLY:
2199 return 3;
2200 case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2201 return 4;
2202 case PR_WRITE_EXCLUSIVE_ALL_REGS:
2203 return 5;
2204 case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2205 return 6;
2206 default:
2207 return 0;
2208 }
2209 }
2210
2211 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2212 u64 key, u64 sa_key, u8 op)
2213 {
2214 struct nvme_ns_head *head = NULL;
2215 struct nvme_ns *ns;
2216 struct nvme_command c;
2217 int srcu_idx, ret;
2218 u8 data[16] = { 0, };
2219
2220 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
2221 if (unlikely(!ns))
2222 return -EWOULDBLOCK;
2223
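/*
 * All reservation commands here use a 16-byte payload: the current
 * reservation key in bytes 0-7 and the new/preempt key in bytes 8-15
 * (commands that only take one key ignore the second).
 */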
2224 put_unaligned_le64(key, &data[0]);
2225 put_unaligned_le64(sa_key, &data[8]);
2226
2227 memset(&c, 0, sizeof(c));
2228 c.common.opcode = op;
2229 c.common.nsid = cpu_to_le32(ns->head->ns_id);
2230 c.common.cdw10 = cpu_to_le32(cdw10);
2231
2232 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
2233 nvme_put_ns_from_disk(head, srcu_idx);
2234 return ret;
2235 }
2236
2237 static int nvme_pr_register(struct block_device *bdev, u64 old,
2238 u64 new, unsigned flags)
2239 {
2240 u32 cdw10;
2241
2242 if (flags & ~PR_FL_IGNORE_KEY)
2243 return -EOPNOTSUPP;
2244
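/*
 * cdw10 layout for Reservation Register per the NVMe spec: bits 2:0
 * RREGA (0 = register, 2 = replace), bit 3 IEKEY (ignore existing key),
 * bits 31:30 CPTPL (11b = persist through power loss).
 */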
2245 cdw10 = old ? 2 : 0;
2246 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2247 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2248 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2249 }
2250
2251 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2252 enum pr_type type, unsigned flags)
2253 {
2254 u32 cdw10;
2255
2256 if (flags & ~PR_FL_IGNORE_KEY)
2257 return -EOPNOTSUPP;
2258
2259 cdw10 = nvme_pr_type(type) << 8;
2260 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2261 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2262 }
2263
2264 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2265 enum pr_type type, bool abort)
2266 {
2267 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2268
2269 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2270 }
2271
2272 static int nvme_pr_clear(struct block_device *bdev, u64 key)
2273 {
2274 u32 cdw10 = 1 | (key ? 0 : 1 << 3);
2275
2276 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2277 }
2278
2279 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2280 {
2281 u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
2282
2283 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2284 }
2285
2286 static const struct pr_ops nvme_pr_ops = {
2287 .pr_register = nvme_pr_register,
2288 .pr_reserve = nvme_pr_reserve,
2289 .pr_release = nvme_pr_release,
2290 .pr_preempt = nvme_pr_preempt,
2291 .pr_clear = nvme_pr_clear,
2292 };
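/*
 * These ops back the generic block layer persistent reservation ioctls
 * (IOC_PR_REGISTER, IOC_PR_RESERVE, ...). A hypothetical userspace
 * sketch, not part of this driver, might look like:
 *
 *	struct pr_registration reg = { .old_key = 0, .new_key = 0xbeef };
 *	ioctl(fd, IOC_PR_REGISTER, &reg);
 *
 *	struct pr_reservation rsv = { .key = 0xbeef,
 *				      .type = PR_WRITE_EXCLUSIVE };
 *	ioctl(fd, IOC_PR_RESERVE, &rsv);
 */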
2293
2294 #ifdef CONFIG_BLK_SED_OPAL
2295 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2296 bool send)
2297 {
2298 struct nvme_ctrl *ctrl = data;
2299 struct nvme_command cmd;
2300
2301 memset(&cmd, 0, sizeof(cmd));
2302 if (send)
2303 cmd.common.opcode = nvme_admin_security_send;
2304 else
2305 cmd.common.opcode = nvme_admin_security_recv;
2306 cmd.common.nsid = 0;
2307 cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2308 cmd.common.cdw11 = cpu_to_le32(len);
2309
2310 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2311 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
2312 }
2313 EXPORT_SYMBOL_GPL(nvme_sec_submit);
2314 #endif /* CONFIG_BLK_SED_OPAL */
2315
2316 static const struct block_device_operations nvme_fops = {
2317 .owner = THIS_MODULE,
2318 .ioctl = nvme_ioctl,
2319 .compat_ioctl = nvme_compat_ioctl,
2320 .open = nvme_open,
2321 .release = nvme_release,
2322 .getgeo = nvme_getgeo,
2323 .report_zones = nvme_report_zones,
2324 .pr_ops = &nvme_pr_ops,
2325 };
2326
2327 #ifdef CONFIG_NVME_MULTIPATH
2328 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
2329 {
2330 struct nvme_ns_head *head = bdev->bd_disk->private_data;
2331
2332 if (!kref_get_unless_zero(&head->ref))
2333 return -ENXIO;
2334 return 0;
2335 }
2336
2337 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
2338 {
2339 nvme_put_ns_head(disk->private_data);
2340 }
2341
2342 const struct block_device_operations nvme_ns_head_ops = {
2343 .owner = THIS_MODULE,
2344 .submit_bio = nvme_ns_head_submit_bio,
2345 .open = nvme_ns_head_open,
2346 .release = nvme_ns_head_release,
2347 .ioctl = nvme_ioctl,
2348 .compat_ioctl = nvme_compat_ioctl,
2349 .getgeo = nvme_getgeo,
2350 .report_zones = nvme_report_zones,
2351 .pr_ops = &nvme_pr_ops,
2352 };
2353 #endif /* CONFIG_NVME_MULTIPATH */
2354
2355 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
2356 {
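/* CAP.TO is in units of 500 ms, so wait at most (TO + 1) / 2 seconds */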
2357 unsigned long timeout =
2358 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
2359 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
2360 int ret;
2361
2362 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2363 if (csts == ~0)
2364 return -ENODEV;
2365 if ((csts & NVME_CSTS_RDY) == bit)
2366 break;
2367
2368 usleep_range(1000, 2000);
2369 if (fatal_signal_pending(current))
2370 return -EINTR;
2371 if (time_after(jiffies, timeout)) {
2372 dev_err(ctrl->device,
2373 "Device not ready; aborting %s, CSTS=0x%x\n",
2374 enabled ? "initialisation" : "reset", csts);
2375 return -ENODEV;
2376 }
2377 }
2378
2379 return ret;
2380 }
2381
2382 /*
2383 * If the device has been passed off to us in an enabled state, just clear
2384 * the enabled bit. The spec says we should set the 'shutdown notification
2385 * bits', but doing so may cause the device to complete commands to the
2386 * admin queue ... and we don't know what memory that might be pointing at!
2387 */
2388 int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
2389 {
2390 int ret;
2391
2392 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2393 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2394
2395 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2396 if (ret)
2397 return ret;
2398
2399 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2400 msleep(NVME_QUIRK_DELAY_AMOUNT);
2401
2402 return nvme_wait_ready(ctrl, ctrl->cap, false);
2403 }
2404 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2405
2406 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2407 {
2408 unsigned dev_page_min;
2409 int ret;
2410
2411 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2412 if (ret) {
2413 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2414 return ret;
2415 }
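/* CAP.MPSMIN encodes the minimum device page size as 2 ^ (12 + MPSMIN) */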
2416 dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2417
2418 if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2419 dev_err(ctrl->device,
2420 "Minimum device page size %u too large for host (%u)\n",
2421 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2422 return -ENODEV;
2423 }
2424
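/*
 * Build up CC: select the command set (prefer all supported command
 * sets via CSI when available), program the host page size (MPS),
 * round-robin arbitration, no shutdown notification, the default I/O
 * queue entry sizes, and finally the Enable bit.
 */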
2425 if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2426 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2427 else
2428 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2429 ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2430 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2431 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2432 ctrl->ctrl_config |= NVME_CC_ENABLE;
2433
2434 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2435 if (ret)
2436 return ret;
2437 return nvme_wait_ready(ctrl, ctrl->cap, true);
2438 }
2439 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2440
2441 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
2442 {
2443 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
2444 u32 csts;
2445 int ret;
2446
2447 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2448 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2449
2450 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2451 if (ret)
2452 return ret;
2453
2454 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2455 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
2456 break;
2457
2458 msleep(100);
2459 if (fatal_signal_pending(current))
2460 return -EINTR;
2461 if (time_after(jiffies, timeout)) {
2462 dev_err(ctrl->device,
2463 "Device shutdown incomplete; abort shutdown\n");
2464 return -ENODEV;
2465 }
2466 }
2467
2468 return ret;
2469 }
2470 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
2471
2472 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2473 {
2474 __le64 ts;
2475 int ret;
2476
2477 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2478 return 0;
2479
2480 ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2481 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2482 NULL);
2483 if (ret)
2484 dev_warn_once(ctrl->device,
2485 "could not set timestamp (%d)\n", ret);
2486 return ret;
2487 }
2488
2489 static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2490 {
2491 struct nvme_feat_host_behavior *host;
2492 int ret;
2493
2494 /* Don't bother enabling the feature if retry delay is not reported */
2495 if (!ctrl->crdt[0])
2496 return 0;
2497
2498 host = kzalloc(sizeof(*host), GFP_KERNEL);
2499 if (!host)
2500 return 0;
2501
2502 host->acre = NVME_ENABLE_ACRE;
2503 ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2504 host, sizeof(*host), NULL);
2505 kfree(host);
2506 return ret;
2507 }
2508
2509 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2510 {
2511 /*
2512 * APST (Autonomous Power State Transition) lets us program a
2513 * table of power state transitions that the controller will
2514 * perform automatically. We configure it with a simple
2515 * heuristic: we are willing to spend at most 2% of the time
2516 * transitioning between power states. Therefore, when running
2517 * in any given state, we will enter the next lower-power
2518 * non-operational state after waiting 50 * (enlat + exlat)
2519 * microseconds, as long as that state's exit latency is under
2520 * the requested maximum latency.
2521 *
2522 * We will not autonomously enter any non-operational state for
2523 * which the total latency exceeds ps_max_latency_us. Users
2524 * can set ps_max_latency_us to zero to turn off APST.
2525 */
2526
2527 unsigned apste;
2528 struct nvme_feat_auto_pst *table;
2529 u64 max_lat_us = 0;
2530 int max_ps = -1;
2531 int ret;
2532
2533 /*
2534 * If APST isn't supported or if we haven't been initialized yet,
2535 * then don't do anything.
2536 */
2537 if (!ctrl->apsta)
2538 return 0;
2539
2540 if (ctrl->npss > 31) {
2541 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2542 return 0;
2543 }
2544
2545 table = kzalloc(sizeof(*table), GFP_KERNEL);
2546 if (!table)
2547 return 0;
2548
2549 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2550 /* Turn off APST. */
2551 apste = 0;
2552 dev_dbg(ctrl->device, "APST disabled\n");
2553 } else {
2554 __le64 target = cpu_to_le64(0);
2555 int state;
2556
2557 /*
2558 * Walk through all states from lowest- to highest-power.
2559 * According to the spec, lower-numbered states use more
2560 * power. NPSS, despite the name, is the index of the
2561 * lowest-power state, not the number of states.
2562 */
2563 for (state = (int)ctrl->npss; state >= 0; state--) {
2564 u64 total_latency_us, exit_latency_us, transition_ms;
2565
2566 if (target)
2567 table->entries[state] = target;
2568
2569 /*
2570 * Don't allow transitions to the deepest state
2571 * if it's quirked off.
2572 */
2573 if (state == ctrl->npss &&
2574 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2575 continue;
2576
2577 /*
2578 * Is this state a useful non-operational state for
2579 * higher-power states to autonomously transition to?
2580 */
2581 if (!(ctrl->psd[state].flags &
2582 NVME_PS_FLAGS_NON_OP_STATE))
2583 continue;
2584
2585 exit_latency_us =
2586 (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2587 if (exit_latency_us > ctrl->ps_max_latency_us)
2588 continue;
2589
2590 total_latency_us =
2591 exit_latency_us +
2592 le32_to_cpu(ctrl->psd[state].entry_lat);
2593
2594 /*
2595 * This state is good. Use it as the APST idle
2596 * target for higher power states.
2597 */
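/*
 * The idle time before transitioning is 50 * (enlat + exlat) us,
 * i.e. total_latency_us / 20 rounded up, expressed in milliseconds.
 * Each table entry packs the target state in bits 7:3 (ITPS) and the
 * idle time in bits 31:8 (ITPT), which is why the value is clamped to
 * 24 bits.
 */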
2598 transition_ms = total_latency_us + 19;
2599 do_div(transition_ms, 20);
2600 if (transition_ms > (1 << 24) - 1)
2601 transition_ms = (1 << 24) - 1;
2602
2603 target = cpu_to_le64((state << 3) |
2604 (transition_ms << 8));
2605
2606 if (max_ps == -1)
2607 max_ps = state;
2608
2609 if (total_latency_us > max_lat_us)
2610 max_lat_us = total_latency_us;
2611 }
2612
2613 apste = 1;
2614
2615 if (max_ps == -1) {
2616 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2617 } else {
2618 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2619 max_ps, max_lat_us, (int)sizeof(*table), table);
2620 }
2621 }
2622
2623 ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2624 table, sizeof(*table), NULL);
2625 if (ret)
2626 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2627
2628 kfree(table);
2629 return ret;
2630 }
2631
2632 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2633 {
2634 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2635 u64 latency;
2636
2637 switch (val) {
2638 case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2639 case PM_QOS_LATENCY_ANY:
2640 latency = U64_MAX;
2641 break;
2642
2643 default:
2644 latency = val;
2645 }
2646
2647 if (ctrl->ps_max_latency_us != latency) {
2648 ctrl->ps_max_latency_us = latency;
2649 if (ctrl->state == NVME_CTRL_LIVE)
2650 nvme_configure_apst(ctrl);
2651 }
2652 }
2653
2654 struct nvme_core_quirk_entry {
2655 /*
2656 * NVMe model and firmware strings are padded with spaces. For
2657 * simplicity, strings in the quirk table are padded with NULLs
2658 * instead.
2659 */
2660 u16 vid;
2661 const char *mn;
2662 const char *fr;
2663 unsigned long quirks;
2664 };
2665
2666 static const struct nvme_core_quirk_entry core_quirks[] = {
2667 {
2668 /*
2669 * This Toshiba device seems to die using any APST states. See:
2670 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2671 */
2672 .vid = 0x1179,
2673 .mn = "THNSF5256GPUK TOSHIBA",
2674 .quirks = NVME_QUIRK_NO_APST,
2675 },
2676 {
2677 /*
2678 * This LiteON CL1-3D*-Q11 firmware version has a race
2679 * condition associated with actions related to suspend to idle.
2680 * LiteON has resolved the problem in future firmware.
2681 */
2682 .vid = 0x14a4,
2683 .fr = "22301111",
2684 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2685 },
2686 {
2687 /*
2688 * This Kioxia CD6-V Series / HPE PE8030 device times out and
2689 * aborts I/O during any load, but more easily reproducible
2690 * with discards (fstrim).
2691 *
2692 * The device is left in a state where it is also not possible
2693 * to use "nvme set-feature" to disable APST, but booting with
2694 * nvme_core.default_ps_max_latency=0 works.
2695 */
2696 .vid = 0x1e0f,
2697 .mn = "KCD6XVUL6T40",
2698 .quirks = NVME_QUIRK_NO_APST,
2699 },
2700 {
2701 /*
2702 * The external Samsung X5 SSD fails initialization without a
2703 * delay before checking if it is ready and has a whole set of
2704 * other problems. To make this even more interesting, it
2705 * shares the PCI ID with the internal Samsung 970 Evo Plus, which
2706 * does not need or want these quirks.
2707 */
2708 .vid = 0x144d,
2709 .mn = "Samsung Portable SSD X5",
2710 .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2711 NVME_QUIRK_NO_DEEPEST_PS |
2712 NVME_QUIRK_IGNORE_DEV_SUBNQN,
2713 }
2714 };
2715
2716 /* match is null-terminated but idstr is space-padded. */
2717 static bool string_matches(const char *idstr, const char *match, size_t len)
2718 {
2719 size_t matchlen;
2720
2721 if (!match)
2722 return true;
2723
2724 matchlen = strlen(match);
2725 WARN_ON_ONCE(matchlen > len);
2726
2727 if (memcmp(idstr, match, matchlen))
2728 return false;
2729
2730 for (; matchlen < len; matchlen++)
2731 if (idstr[matchlen] != ' ')
2732 return false;
2733
2734 return true;
2735 }
2736
2737 static bool quirk_matches(const struct nvme_id_ctrl *id,
2738 const struct nvme_core_quirk_entry *q)
2739 {
2740 return q->vid == le16_to_cpu(id->vid) &&
2741 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2742 string_matches(id->fr, q->fr, sizeof(id->fr));
2743 }
2744
2745 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2746 struct nvme_id_ctrl *id)
2747 {
2748 size_t nqnlen;
2749 int off;
2750
2751 if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2752 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2753 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2754 strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2755 return;
2756 }
2757
2758 if (ctrl->vs >= NVME_VS(1, 2, 1))
2759 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2760 }
2761
2762 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
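/*
 * The result has the form
 * "nqn.2014.08.org.nvmexpress:<vid><ssvid><serial><model>", NUL-padded
 * out to NVMF_NQN_SIZE.
 */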
2763 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2764 "nqn.2014.08.org.nvmexpress:%04x%04x",
2765 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2766 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2767 off += sizeof(id->sn);
2768 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2769 off += sizeof(id->mn);
2770 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2771 }
2772
2773 static void nvme_release_subsystem(struct device *dev)
2774 {
2775 struct nvme_subsystem *subsys =
2776 container_of(dev, struct nvme_subsystem, dev);
2777
2778 if (subsys->instance >= 0)
2779 ida_simple_remove(&nvme_instance_ida, subsys->instance);
2780 kfree(subsys);
2781 }
2782
2783 static void nvme_destroy_subsystem(struct kref *ref)
2784 {
2785 struct nvme_subsystem *subsys =
2786 container_of(ref, struct nvme_subsystem, ref);
2787
2788 mutex_lock(&nvme_subsystems_lock);
2789 list_del(&subsys->entry);
2790 mutex_unlock(&nvme_subsystems_lock);
2791
2792 ida_destroy(&subsys->ns_ida);
2793 device_del(&subsys->dev);
2794 put_device(&subsys->dev);
2795 }
2796
2797 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2798 {
2799 kref_put(&subsys->ref, nvme_destroy_subsystem);
2800 }
2801
2802 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2803 {
2804 struct nvme_subsystem *subsys;
2805
2806 lockdep_assert_held(&nvme_subsystems_lock);
2807
2808 /*
2809 * Fail matches for discovery subsystems. This results
2810 * in each discovery controller being bound to a unique subsystem.
2811 * This avoids issues with validating controller values
2812 * that can only be true when there is a single unique subsystem.
2813 * There may be multiple and completely independent entities
2814 * that provide discovery controllers.
2815 */
2816 if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2817 return NULL;
2818
2819 list_for_each_entry(subsys, &nvme_subsystems, entry) {
2820 if (strcmp(subsys->subnqn, subsysnqn))
2821 continue;
2822 if (!kref_get_unless_zero(&subsys->ref))
2823 continue;
2824 return subsys;
2825 }
2826
2827 return NULL;
2828 }
2829
2830 #define SUBSYS_ATTR_RO(_name, _mode, _show) \
2831 struct device_attribute subsys_attr_##_name = \
2832 __ATTR(_name, _mode, _show, NULL)
2833
2834 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2835 struct device_attribute *attr,
2836 char *buf)
2837 {
2838 struct nvme_subsystem *subsys =
2839 container_of(dev, struct nvme_subsystem, dev);
2840
2841 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2842 }
2843 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2844
2845 #define nvme_subsys_show_str_function(field) \
2846 static ssize_t subsys_##field##_show(struct device *dev, \
2847 struct device_attribute *attr, char *buf) \
2848 { \
2849 struct nvme_subsystem *subsys = \
2850 container_of(dev, struct nvme_subsystem, dev); \
2851 return sysfs_emit(buf, "%.*s\n", \
2852 (int)sizeof(subsys->field), subsys->field); \
2853 } \
2854 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2855
2856 nvme_subsys_show_str_function(model);
2857 nvme_subsys_show_str_function(serial);
2858 nvme_subsys_show_str_function(firmware_rev);
2859
2860 static struct attribute *nvme_subsys_attrs[] = {
2861 &subsys_attr_model.attr,
2862 &subsys_attr_serial.attr,
2863 &subsys_attr_firmware_rev.attr,
2864 &subsys_attr_subsysnqn.attr,
2865 #ifdef CONFIG_NVME_MULTIPATH
2866 &subsys_attr_iopolicy.attr,
2867 #endif
2868 NULL,
2869 };
2870
2871 static struct attribute_group nvme_subsys_attrs_group = {
2872 .attrs = nvme_subsys_attrs,
2873 };
2874
2875 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2876 &nvme_subsys_attrs_group,
2877 NULL,
2878 };
2879
2880 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2881 {
2882 return ctrl->opts && ctrl->opts->discovery_nqn;
2883 }
2884
2885 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2886 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2887 {
2888 struct nvme_ctrl *tmp;
2889
2890 lockdep_assert_held(&nvme_subsystems_lock);
2891
2892 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2893 if (nvme_state_terminal(tmp))
2894 continue;
2895
2896 if (tmp->cntlid == ctrl->cntlid) {
2897 dev_err(ctrl->device,
2898 "Duplicate cntlid %u with %s, rejecting\n",
2899 ctrl->cntlid, dev_name(tmp->device));
2900 return false;
2901 }
2902
2903 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2904 nvme_discovery_ctrl(ctrl))
2905 continue;
2906
2907 dev_err(ctrl->device,
2908 "Subsystem does not support multiple controllers\n");
2909 return false;
2910 }
2911
2912 return true;
2913 }
2914
2915 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2916 {
2917 struct nvme_subsystem *subsys, *found;
2918 int ret;
2919
2920 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2921 if (!subsys)
2922 return -ENOMEM;
2923
2924 subsys->instance = -1;
2925 mutex_init(&subsys->lock);
2926 kref_init(&subsys->ref);
2927 INIT_LIST_HEAD(&subsys->ctrls);
2928 INIT_LIST_HEAD(&subsys->nsheads);
2929 nvme_init_subnqn(subsys, ctrl, id);
2930 memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2931 memcpy(subsys->model, id->mn, sizeof(subsys->model));
2932 subsys->vendor_id = le16_to_cpu(id->vid);
2933 subsys->cmic = id->cmic;
2934 subsys->awupf = le16_to_cpu(id->awupf);
2935 #ifdef CONFIG_NVME_MULTIPATH
2936 subsys->iopolicy = NVME_IOPOLICY_NUMA;
2937 #endif
2938
2939 subsys->dev.class = nvme_subsys_class;
2940 subsys->dev.release = nvme_release_subsystem;
2941 subsys->dev.groups = nvme_subsys_attrs_groups;
2942 dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2943 device_initialize(&subsys->dev);
2944
2945 mutex_lock(&nvme_subsystems_lock);
2946 found = __nvme_find_get_subsystem(subsys->subnqn);
2947 if (found) {
2948 put_device(&subsys->dev);
2949 subsys = found;
2950
2951 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2952 ret = -EINVAL;
2953 goto out_put_subsystem;
2954 }
2955 } else {
2956 ret = device_add(&subsys->dev);
2957 if (ret) {
2958 dev_err(ctrl->device,
2959 "failed to register subsystem device.\n");
2960 put_device(&subsys->dev);
2961 goto out_unlock;
2962 }
2963 ida_init(&subsys->ns_ida);
2964 list_add_tail(&subsys->entry, &nvme_subsystems);
2965 }
2966
2967 ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2968 dev_name(ctrl->device));
2969 if (ret) {
2970 dev_err(ctrl->device,
2971 "failed to create sysfs link from subsystem.\n");
2972 goto out_put_subsystem;
2973 }
2974
2975 if (!found)
2976 subsys->instance = ctrl->instance;
2977 ctrl->subsys = subsys;
2978 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2979 mutex_unlock(&nvme_subsystems_lock);
2980 return 0;
2981
2982 out_put_subsystem:
2983 nvme_put_subsystem(subsys);
2984 out_unlock:
2985 mutex_unlock(&nvme_subsystems_lock);
2986 return ret;
2987 }
2988
2989 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
2990 void *log, size_t size, u64 offset)
2991 {
2992 struct nvme_command c = { };
2993 u32 dwlen = nvme_bytes_to_numd(size);
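/*
 * NUMD is a 0's based dword count split across two fields: NUMDL takes
 * the low 16 bits and NUMDU the high 16 bits.
 */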
2994
2995 c.get_log_page.opcode = nvme_admin_get_log_page;
2996 c.get_log_page.nsid = cpu_to_le32(nsid);
2997 c.get_log_page.lid = log_page;
2998 c.get_log_page.lsp = lsp;
2999 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3000 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3001 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3002 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3003 c.get_log_page.csi = csi;
3004
3005 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3006 }
3007
3008 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3009 struct nvme_effects_log **log)
3010 {
3011 struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
3012 int ret;
3013
3014 if (cel)
3015 goto out;
3016
3017 cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3018 if (!cel)
3019 return -ENOMEM;
3020
3021 ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3022 cel, sizeof(*cel), 0);
3023 if (ret) {
3024 kfree(cel);
3025 return ret;
3026 }
3027
3028 xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3029 out:
3030 *log = cel;
3031 return 0;
3032 }
3033
3034 /*
3035 * Initialize the cached copies of the Identify data and various controller
3036 * registers in our nvme_ctrl structure. This should be called as soon as
3037 * the admin queue is fully up and running.
3038 */
3039 int nvme_init_identify(struct nvme_ctrl *ctrl)
3040 {
3041 struct nvme_id_ctrl *id;
3042 int ret, page_shift;
3043 u32 max_hw_sectors;
3044 bool prev_apst_enabled;
3045
3046 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3047 if (ret) {
3048 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3049 return ret;
3050 }
3051 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
3052 ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3053
3054 if (ctrl->vs >= NVME_VS(1, 1, 0))
3055 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3056
3057 ret = nvme_identify_ctrl(ctrl, &id);
3058 if (ret) {
3059 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3060 return -EIO;
3061 }
3062
3063 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3064 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3065 if (ret < 0)
3066 goto out_free;
3067 }
3068
3069 if (!(ctrl->ops->flags & NVME_F_FABRICS))
3070 ctrl->cntlid = le16_to_cpu(id->cntlid);
3071
3072 if (!ctrl->identified) {
3073 int i;
3074
3075 /*
3076 * Check for quirks. Quirks can depend on the firmware version,
3077 * so, in principle, the set of quirks present can change
3078 * across a reset. As a possible future enhancement, we
3079 * could re-scan for quirks every time we reinitialize
3080 * the device, but we'd have to make sure that the driver
3081 * behaves intelligently if the quirks change.
3082 */
3083 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3084 if (quirk_matches(id, &core_quirks[i]))
3085 ctrl->quirks |= core_quirks[i].quirks;
3086 }
3087
3088 ret = nvme_init_subsystem(ctrl, id);
3089 if (ret)
3090 goto out_free;
3091 }
3092 memcpy(ctrl->subsys->firmware_rev, id->fr,
3093 sizeof(ctrl->subsys->firmware_rev));
3094
3095 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3096 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3097 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3098 }
3099
3100 ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3101 ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3102 ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3103
3104 ctrl->oacs = le16_to_cpu(id->oacs);
3105 ctrl->oncs = le16_to_cpu(id->oncs);
3106 ctrl->mtfa = le16_to_cpu(id->mtfa);
3107 ctrl->oaes = le32_to_cpu(id->oaes);
3108 ctrl->wctemp = le16_to_cpu(id->wctemp);
3109 ctrl->cctemp = le16_to_cpu(id->cctemp);
3110
3111 atomic_set(&ctrl->abort_limit, id->acl + 1);
3112 ctrl->vwc = id->vwc;
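/*
 * MDTS is a power of two in units of the minimum memory page size
 * (CAP.MPSMIN); convert it to 512-byte sectors. MDTS == 0 means no
 * transfer size limit.
 */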
3113 if (id->mdts)
3114 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
3115 else
3116 max_hw_sectors = UINT_MAX;
3117 ctrl->max_hw_sectors =
3118 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3119
3120 nvme_set_queue_limits(ctrl, ctrl->admin_q);
3121 ctrl->sgls = le32_to_cpu(id->sgls);
3122 ctrl->kas = le16_to_cpu(id->kas);
3123 ctrl->max_namespaces = le32_to_cpu(id->mnan);
3124 ctrl->ctratt = le32_to_cpu(id->ctratt);
3125
3126 if (id->rtd3e) {
3127 /* us -> s */
3128 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3129
3130 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3131 shutdown_timeout, 60);
3132
3133 if (ctrl->shutdown_timeout != shutdown_timeout)
3134 dev_info(ctrl->device,
3135 "Shutdown timeout set to %u seconds\n",
3136 ctrl->shutdown_timeout);
3137 } else
3138 ctrl->shutdown_timeout = shutdown_timeout;
3139
3140 ctrl->npss = id->npss;
3141 ctrl->apsta = id->apsta;
3142 prev_apst_enabled = ctrl->apst_enabled;
3143 if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3144 if (force_apst && id->apsta) {
3145 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3146 ctrl->apst_enabled = true;
3147 } else {
3148 ctrl->apst_enabled = false;
3149 }
3150 } else {
3151 ctrl->apst_enabled = id->apsta;
3152 }
3153 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3154
3155 if (ctrl->ops->flags & NVME_F_FABRICS) {
3156 ctrl->icdoff = le16_to_cpu(id->icdoff);
3157 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3158 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3159 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3160
3161 /*
3162 * In fabrics we need to verify the cntlid matches the
3163 * admin connect
3164 */
3165 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3166 dev_err(ctrl->device,
3167 "Mismatching cntlid: Connect %u vs Identify "
3168 "%u, rejecting\n",
3169 ctrl->cntlid, le16_to_cpu(id->cntlid));
3170 ret = -EINVAL;
3171 goto out_free;
3172 }
3173
3174 if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3175 dev_err(ctrl->device,
3176 "keep-alive support is mandatory for fabrics\n");
3177 ret = -EINVAL;
3178 goto out_free;
3179 }
3180 } else {
3181 ctrl->hmpre = le32_to_cpu(id->hmpre);
3182 ctrl->hmmin = le32_to_cpu(id->hmmin);
3183 ctrl->hmminds = le32_to_cpu(id->hmminds);
3184 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3185 }
3186
3187 ret = nvme_mpath_init_identify(ctrl, id);
3188 kfree(id);
3189
3190 if (ret < 0)
3191 return ret;
3192
3193 if (ctrl->apst_enabled && !prev_apst_enabled)
3194 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3195 else if (!ctrl->apst_enabled && prev_apst_enabled)
3196 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3197
3198 ret = nvme_configure_apst(ctrl);
3199 if (ret < 0)
3200 return ret;
3201
3202 ret = nvme_configure_timestamp(ctrl);
3203 if (ret < 0)
3204 return ret;
3205
3206 ret = nvme_configure_directives(ctrl);
3207 if (ret < 0)
3208 return ret;
3209
3210 ret = nvme_configure_acre(ctrl);
3211 if (ret < 0)
3212 return ret;
3213
3214 if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3215 /*
3216 * Do not return errors unless we are in a controller reset;
3217 * the controller works perfectly fine without hwmon.
3218 */
3219 ret = nvme_hwmon_init(ctrl);
3220 if (ret == -EINTR)
3221 return ret;
3222 }
3223
3224 ctrl->identified = true;
3225
3226 return 0;
3227
3228 out_free:
3229 kfree(id);
3230 return ret;
3231 }
3232 EXPORT_SYMBOL_GPL(nvme_init_identify);
3233
3234 static int nvme_dev_open(struct inode *inode, struct file *file)
3235 {
3236 struct nvme_ctrl *ctrl =
3237 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3238
3239 switch (ctrl->state) {
3240 case NVME_CTRL_LIVE:
3241 break;
3242 default:
3243 return -EWOULDBLOCK;
3244 }
3245
3246 nvme_get_ctrl(ctrl);
3247 if (!try_module_get(ctrl->ops->module)) {
3248 nvme_put_ctrl(ctrl);
3249 return -EINVAL;
3250 }
3251
3252 file->private_data = ctrl;
3253 return 0;
3254 }
3255
3256 static int nvme_dev_release(struct inode *inode, struct file *file)
3257 {
3258 struct nvme_ctrl *ctrl =
3259 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3260
3261 module_put(ctrl->ops->module);
3262 nvme_put_ctrl(ctrl);
3263 return 0;
3264 }
3265
3266 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
3267 {
3268 struct nvme_ns *ns;
3269 int ret;
3270
3271 down_read(&ctrl->namespaces_rwsem);
3272 if (list_empty(&ctrl->namespaces)) {
3273 ret = -ENOTTY;
3274 goto out_unlock;
3275 }
3276
3277 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
3278 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
3279 dev_warn(ctrl->device,
3280 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
3281 ret = -EINVAL;
3282 goto out_unlock;
3283 }
3284
3285 dev_warn(ctrl->device,
3286 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
3287 kref_get(&ns->kref);
3288 up_read(&ctrl->namespaces_rwsem);
3289
3290 ret = nvme_user_cmd(ctrl, ns, argp);
3291 nvme_put_ns(ns);
3292 return ret;
3293
3294 out_unlock:
3295 up_read(&ctrl->namespaces_rwsem);
3296 return ret;
3297 }
3298
3299 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
3300 unsigned long arg)
3301 {
3302 struct nvme_ctrl *ctrl = file->private_data;
3303 void __user *argp = (void __user *)arg;
3304
3305 switch (cmd) {
3306 case NVME_IOCTL_ADMIN_CMD:
3307 return nvme_user_cmd(ctrl, NULL, argp);
3308 case NVME_IOCTL_ADMIN64_CMD:
3309 return nvme_user_cmd64(ctrl, NULL, argp);
3310 case NVME_IOCTL_IO_CMD:
3311 return nvme_dev_user_cmd(ctrl, argp);
3312 case NVME_IOCTL_RESET:
3313 if (!capable(CAP_SYS_ADMIN))
3314 return -EACCES;
3315 dev_warn(ctrl->device, "resetting controller\n");
3316 return nvme_reset_ctrl_sync(ctrl);
3317 case NVME_IOCTL_SUBSYS_RESET:
3318 if (!capable(CAP_SYS_ADMIN))
3319 return -EACCES;
3320 return nvme_reset_subsystem(ctrl);
3321 case NVME_IOCTL_RESCAN:
3322 if (!capable(CAP_SYS_ADMIN))
3323 return -EACCES;
3324 nvme_queue_scan(ctrl);
3325 return 0;
3326 default:
3327 return -ENOTTY;
3328 }
3329 }
3330
3331 static const struct file_operations nvme_dev_fops = {
3332 .owner = THIS_MODULE,
3333 .open = nvme_dev_open,
3334 .release = nvme_dev_release,
3335 .unlocked_ioctl = nvme_dev_ioctl,
3336 .compat_ioctl = compat_ptr_ioctl,
3337 };
3338
3339 static ssize_t nvme_sysfs_reset(struct device *dev,
3340 struct device_attribute *attr, const char *buf,
3341 size_t count)
3342 {
3343 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3344 int ret;
3345
3346 ret = nvme_reset_ctrl_sync(ctrl);
3347 if (ret < 0)
3348 return ret;
3349 return count;
3350 }
3351 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3352
3353 static ssize_t nvme_sysfs_rescan(struct device *dev,
3354 struct device_attribute *attr, const char *buf,
3355 size_t count)
3356 {
3357 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3358
3359 nvme_queue_scan(ctrl);
3360 return count;
3361 }
3362 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3363
3364 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3365 {
3366 struct gendisk *disk = dev_to_disk(dev);
3367
3368 if (disk->fops == &nvme_fops)
3369 return nvme_get_ns_from_dev(dev)->head;
3370 else
3371 return disk->private_data;
3372 }
3373
3374 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3375 char *buf)
3376 {
3377 struct nvme_ns_head *head = dev_to_ns_head(dev);
3378 struct nvme_ns_ids *ids = &head->ids;
3379 struct nvme_subsystem *subsys = head->subsys;
3380 int serial_len = sizeof(subsys->serial);
3381 int model_len = sizeof(subsys->model);
3382
3383 if (!uuid_is_null(&ids->uuid))
3384 return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
3385
3386 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3387 return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
3388
3389 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3390 return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
3391
3392 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3393 subsys->serial[serial_len - 1] == '\0'))
3394 serial_len--;
3395 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3396 subsys->model[model_len - 1] == '\0'))
3397 model_len--;
3398
3399 return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3400 serial_len, subsys->serial, model_len, subsys->model,
3401 head->ns_id);
3402 }
3403 static DEVICE_ATTR_RO(wwid);
3404
3405 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3406 char *buf)
3407 {
3408 return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3409 }
3410 static DEVICE_ATTR_RO(nguid);
3411
3412 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3413 char *buf)
3414 {
3415 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3416
3417 /* For backward compatibility expose the NGUID to userspace if
3418 * we have no UUID set
3419 */
3420 if (uuid_is_null(&ids->uuid)) {
3421 dev_warn_ratelimited(dev,
3422 "No UUID available providing old NGUID\n");
3423 return sysfs_emit(buf, "%pU\n", ids->nguid);
3424 }
3425 return sysfs_emit(buf, "%pU\n", &ids->uuid);
3426 }
3427 static DEVICE_ATTR_RO(uuid);
3428
3429 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3430 char *buf)
3431 {
3432 return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3433 }
3434 static DEVICE_ATTR_RO(eui);
3435
3436 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3437 char *buf)
3438 {
3439 return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3440 }
3441 static DEVICE_ATTR_RO(nsid);
3442
3443 static struct attribute *nvme_ns_id_attrs[] = {
3444 &dev_attr_wwid.attr,
3445 &dev_attr_uuid.attr,
3446 &dev_attr_nguid.attr,
3447 &dev_attr_eui.attr,
3448 &dev_attr_nsid.attr,
3449 #ifdef CONFIG_NVME_MULTIPATH
3450 &dev_attr_ana_grpid.attr,
3451 &dev_attr_ana_state.attr,
3452 #endif
3453 NULL,
3454 };
3455
3456 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3457 struct attribute *a, int n)
3458 {
3459 struct device *dev = container_of(kobj, struct device, kobj);
3460 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3461
3462 if (a == &dev_attr_uuid.attr) {
3463 if (uuid_is_null(&ids->uuid) &&
3464 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3465 return 0;
3466 }
3467 if (a == &dev_attr_nguid.attr) {
3468 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3469 return 0;
3470 }
3471 if (a == &dev_attr_eui.attr) {
3472 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3473 return 0;
3474 }
3475 #ifdef CONFIG_NVME_MULTIPATH
3476 if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3477 if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
3478 return 0;
3479 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3480 return 0;
3481 }
3482 #endif
3483 return a->mode;
3484 }
3485
3486 static const struct attribute_group nvme_ns_id_attr_group = {
3487 .attrs = nvme_ns_id_attrs,
3488 .is_visible = nvme_ns_id_attrs_are_visible,
3489 };
3490
3491 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3492 &nvme_ns_id_attr_group,
3493 #ifdef CONFIG_NVM
3494 &nvme_nvm_attr_group,
3495 #endif
3496 NULL,
3497 };
3498
3499 #define nvme_show_str_function(field) \
3500 static ssize_t field##_show(struct device *dev, \
3501 struct device_attribute *attr, char *buf) \
3502 { \
3503 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
3504 return sysfs_emit(buf, "%.*s\n", \
3505 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
3506 } \
3507 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3508
3509 nvme_show_str_function(model);
3510 nvme_show_str_function(serial);
3511 nvme_show_str_function(firmware_rev);
3512
3513 #define nvme_show_int_function(field) \
3514 static ssize_t field##_show(struct device *dev, \
3515 struct device_attribute *attr, char *buf) \
3516 { \
3517 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
3518 return sysfs_emit(buf, "%d\n", ctrl->field); \
3519 } \
3520 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3521
3522 nvme_show_int_function(cntlid);
3523 nvme_show_int_function(numa_node);
3524 nvme_show_int_function(queue_count);
3525 nvme_show_int_function(sqsize);
3526
3527 static ssize_t nvme_sysfs_delete(struct device *dev,
3528 struct device_attribute *attr, const char *buf,
3529 size_t count)
3530 {
3531 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3532
3533 if (device_remove_file_self(dev, attr))
3534 nvme_delete_ctrl_sync(ctrl);
3535 return count;
3536 }
3537 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3538
3539 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3540 struct device_attribute *attr,
3541 char *buf)
3542 {
3543 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3544
3545 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
3546 }
3547 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3548
3549 static ssize_t nvme_sysfs_show_state(struct device *dev,
3550 struct device_attribute *attr,
3551 char *buf)
3552 {
3553 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3554 static const char *const state_name[] = {
3555 [NVME_CTRL_NEW] = "new",
3556 [NVME_CTRL_LIVE] = "live",
3557 [NVME_CTRL_RESETTING] = "resetting",
3558 [NVME_CTRL_CONNECTING] = "connecting",
3559 [NVME_CTRL_DELETING] = "deleting",
3560 [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
3561 [NVME_CTRL_DEAD] = "dead",
3562 };
3563
3564 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3565 state_name[ctrl->state])
3566 return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
3567
3568 return sysfs_emit(buf, "unknown state\n");
3569 }
3570
3571 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3572
3573 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3574 struct device_attribute *attr,
3575 char *buf)
3576 {
3577 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3578
3579 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
3580 }
3581 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3582
3583 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3584 struct device_attribute *attr,
3585 char *buf)
3586 {
3587 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3588
3589 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
3590 }
3591 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3592
3593 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3594 struct device_attribute *attr,
3595 char *buf)
3596 {
3597 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3598
3599 return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
3600 }
3601 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3602
3603 static ssize_t nvme_sysfs_show_address(struct device *dev,
3604 struct device_attribute *attr,
3605 char *buf)
3606 {
3607 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3608
3609 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3610 }
3611 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3612
3613 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3614 struct device_attribute *attr, char *buf)
3615 {
3616 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3617 struct nvmf_ctrl_options *opts = ctrl->opts;
3618
3619 if (ctrl->opts->max_reconnects == -1)
3620 return sysfs_emit(buf, "off\n");
3621 return sysfs_emit(buf, "%d\n",
3622 opts->max_reconnects * opts->reconnect_delay);
3623 }
3624
3625 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3626 struct device_attribute *attr, const char *buf, size_t count)
3627 {
3628 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3629 struct nvmf_ctrl_options *opts = ctrl->opts;
3630 int ctrl_loss_tmo, err;
3631
3632 err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3633 if (err)
3634 return -EINVAL;
3635
3636 if (ctrl_loss_tmo < 0)
3637 opts->max_reconnects = -1;
3638 else
3639 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3640 opts->reconnect_delay);
3641 return count;
3642 }
3643 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3644 nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
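/*
 * Worked example (illustrative values): with opts->reconnect_delay == 10,
 * writing 600 to ctrl_loss_tmo stores max_reconnects = DIV_ROUND_UP(600, 10)
 * == 60, and reading the attribute back reports 60 * 10 == 600 seconds.
 * Writing a negative value sets max_reconnects to -1, shown as "off".
 */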
3645
3646 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3647 struct device_attribute *attr, char *buf)
3648 {
3649 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3650
3651 if (ctrl->opts->reconnect_delay == -1)
3652 return sysfs_emit(buf, "off\n");
3653 return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3654 }
3655
3656 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3657 struct device_attribute *attr, const char *buf, size_t count)
3658 {
3659 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3660 unsigned int v;
3661 int err;
3662
3663 err = kstrtou32(buf, 10, &v);
3664 if (err)
3665 return err;
3666
3667 ctrl->opts->reconnect_delay = v;
3668 return count;
3669 }
3670 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3671 nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3672
3673 static struct attribute *nvme_dev_attrs[] = {
3674 &dev_attr_reset_controller.attr,
3675 &dev_attr_rescan_controller.attr,
3676 &dev_attr_model.attr,
3677 &dev_attr_serial.attr,
3678 &dev_attr_firmware_rev.attr,
3679 &dev_attr_cntlid.attr,
3680 &dev_attr_delete_controller.attr,
3681 &dev_attr_transport.attr,
3682 &dev_attr_subsysnqn.attr,
3683 &dev_attr_address.attr,
3684 &dev_attr_state.attr,
3685 &dev_attr_numa_node.attr,
3686 &dev_attr_queue_count.attr,
3687 &dev_attr_sqsize.attr,
3688 &dev_attr_hostnqn.attr,
3689 &dev_attr_hostid.attr,
3690 &dev_attr_ctrl_loss_tmo.attr,
3691 &dev_attr_reconnect_delay.attr,
3692 NULL
3693 };
3694
3695 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3696 struct attribute *a, int n)
3697 {
3698 struct device *dev = container_of(kobj, struct device, kobj);
3699 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3700
3701 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3702 return 0;
3703 if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3704 return 0;
3705 if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3706 return 0;
3707 if (a == &dev_attr_hostid.attr && !ctrl->opts)
3708 return 0;
3709 if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3710 return 0;
3711 if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
3712 return 0;
3713
3714 return a->mode;
3715 }
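/*
 * For example, as implied by the checks above: a PCIe controller has no
 * ctrl->opts, so hostnqn, hostid, ctrl_loss_tmo and reconnect_delay are
 * hidden for it, while fabrics controllers expose them alongside the common
 * attributes.
 */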
3716
3717 static struct attribute_group nvme_dev_attrs_group = {
3718 .attrs = nvme_dev_attrs,
3719 .is_visible = nvme_dev_attrs_are_visible,
3720 };
3721
3722 static const struct attribute_group *nvme_dev_attr_groups[] = {
3723 &nvme_dev_attrs_group,
3724 NULL,
3725 };
3726
3727 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
3728 unsigned nsid)
3729 {
3730 struct nvme_ns_head *h;
3731
3732 lockdep_assert_held(&subsys->lock);
3733
3734 list_for_each_entry(h, &subsys->nsheads, entry) {
3735 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
3736 return h;
3737 }
3738
3739 return NULL;
3740 }
3741
3742 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3743 struct nvme_ns_ids *ids)
3744 {
3745 struct nvme_ns_head *h;
3746
3747 lockdep_assert_held(&subsys->lock);
3748
3749 list_for_each_entry(h, &subsys->nsheads, entry) {
3750 if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids))
3751 return -EINVAL;
3752 }
3753
3754 return 0;
3755 }
3756
3757 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3758 unsigned nsid, struct nvme_ns_ids *ids)
3759 {
3760 struct nvme_ns_head *head;
3761 size_t size = sizeof(*head);
3762 int ret = -ENOMEM;
3763
3764 #ifdef CONFIG_NVME_MULTIPATH
3765 size += num_possible_nodes() * sizeof(struct nvme_ns *);
3766 #endif
3767
3768 head = kzalloc(size, GFP_KERNEL);
3769 if (!head)
3770 goto out;
3771 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
3772 if (ret < 0)
3773 goto out_free_head;
3774 head->instance = ret;
3775 INIT_LIST_HEAD(&head->list);
3776 ret = init_srcu_struct(&head->srcu);
3777 if (ret)
3778 goto out_ida_remove;
3779 head->subsys = ctrl->subsys;
3780 head->ns_id = nsid;
3781 head->ids = *ids;
3782 kref_init(&head->ref);
3783
3784 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids);
3785 if (ret) {
3786 dev_err(ctrl->device,
3787 "duplicate IDs for nsid %d\n", nsid);
3788 goto out_cleanup_srcu;
3789 }
3790
3791 if (head->ids.csi) {
3792 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3793 if (ret)
3794 goto out_cleanup_srcu;
3795 } else
3796 head->effects = ctrl->effects;
3797
3798 ret = nvme_mpath_alloc_disk(ctrl, head);
3799 if (ret)
3800 goto out_cleanup_srcu;
3801
3802 list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3803
3804 kref_get(&ctrl->subsys->ref);
3805
3806 return head;
3807 out_cleanup_srcu:
3808 cleanup_srcu_struct(&head->srcu);
3809 out_ida_remove:
3810 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
3811 out_free_head:
3812 kfree(head);
3813 out:
3814 if (ret > 0)
3815 ret = blk_status_to_errno(nvme_error_status(ret));
3816 return ERR_PTR(ret);
3817 }
3818
3819 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3820 struct nvme_ns_ids *ids, bool is_shared)
3821 {
3822 struct nvme_ctrl *ctrl = ns->ctrl;
3823 struct nvme_ns_head *head = NULL;
3824 int ret = 0;
3825
3826 mutex_lock(&ctrl->subsys->lock);
3827 head = nvme_find_ns_head(ctrl->subsys, nsid);
3828 if (!head) {
3829 head = nvme_alloc_ns_head(ctrl, nsid, ids);
3830 if (IS_ERR(head)) {
3831 ret = PTR_ERR(head);
3832 goto out_unlock;
3833 }
3834 head->shared = is_shared;
3835 } else {
3836 ret = -EINVAL;
3837 if (!is_shared || !head->shared) {
3838 dev_err(ctrl->device,
3839 "Duplicate unshared namespace %d\n", nsid);
3840 goto out_put_ns_head;
3841 }
3842 if (!nvme_ns_ids_equal(&head->ids, ids)) {
3843 dev_err(ctrl->device,
3844 "IDs don't match for shared namespace %d\n",
3845 nsid);
3846 goto out_put_ns_head;
3847 }
3848 }
3849
3850 list_add_tail(&ns->siblings, &head->list);
3851 ns->head = head;
3852 mutex_unlock(&ctrl->subsys->lock);
3853 return 0;
3854
3855 out_put_ns_head:
3856 nvme_put_ns_head(head);
3857 out_unlock:
3858 mutex_unlock(&ctrl->subsys->lock);
3859 return ret;
3860 }
3861
3862 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3863 {
3864 struct nvme_ns *ns, *ret = NULL;
3865
3866 down_read(&ctrl->namespaces_rwsem);
3867 list_for_each_entry(ns, &ctrl->namespaces, list) {
3868 if (ns->head->ns_id == nsid) {
3869 if (!kref_get_unless_zero(&ns->kref))
3870 continue;
3871 ret = ns;
3872 break;
3873 }
3874 if (ns->head->ns_id > nsid)
3875 break;
3876 }
3877 up_read(&ctrl->namespaces_rwsem);
3878 return ret;
3879 }
3880 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3881
3882 /*
3883 * Add the namespace to the controller list while keeping the list ordered.
3884 */
3885 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3886 {
3887 struct nvme_ns *tmp;
3888
3889 list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3890 if (tmp->head->ns_id < ns->head->ns_id) {
3891 list_add(&ns->list, &tmp->list);
3892 return;
3893 }
3894 }
3895 list_add(&ns->list, &ns->ctrl->namespaces);
3896 }
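/*
 * Example (illustrative): with nsids 1, 2 and 5 already on the list, adding
 * nsid 3 walks the list backwards, stops at nsid 2 and inserts after it.
 * Keeping the list sorted lets nvme_find_get_ns() above stop early once it
 * sees a ns_id greater than the one it is looking for.
 */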
3897
3898 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
3899 struct nvme_ns_ids *ids)
3900 {
3901 struct nvme_ns *ns;
3902 struct gendisk *disk;
3903 struct nvme_id_ns *id;
3904 char disk_name[DISK_NAME_LEN];
3905 int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
3906
3907 if (nvme_identify_ns(ctrl, nsid, ids, &id))
3908 return;
3909
3910 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3911 if (!ns)
3912 goto out_free_id;
3913
3914 ns->queue = blk_mq_init_queue(ctrl->tagset);
3915 if (IS_ERR(ns->queue))
3916 goto out_free_ns;
3917
3918 if (ctrl->opts && ctrl->opts->data_digest)
3919 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3920
3921 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3922 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3923 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3924
3925 ns->queue->queuedata = ns;
3926 ns->ctrl = ctrl;
3927 kref_init(&ns->kref);
3928
3929 ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
3930 if (ret)
3931 goto out_free_queue;
3932 nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3933
3934 disk = alloc_disk_node(0, node);
3935 if (!disk)
3936 goto out_unlink_ns;
3937
3938 disk->fops = &nvme_fops;
3939 disk->private_data = ns;
3940 disk->queue = ns->queue;
3941 disk->flags = flags;
3942 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
3943 ns->disk = disk;
3944
3945 if (nvme_update_ns_info(ns, id))
3946 goto out_put_disk;
3947
3948 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3949 ret = nvme_nvm_register(ns, disk_name, node);
3950 if (ret) {
3951 dev_warn(ctrl->device, "LightNVM init failure\n");
3952 goto out_put_disk;
3953 }
3954 }
3955
3956 down_write(&ctrl->namespaces_rwsem);
3957 nvme_ns_add_to_ctrl_list(ns);
3958 up_write(&ctrl->namespaces_rwsem);
3959 nvme_get_ctrl(ctrl);
3960
3961 device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
3962
3963 nvme_mpath_add_disk(ns, id);
3964 nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3965 kfree(id);
3966
3967 return;
3968 out_put_disk:
3969 /* prevent double queue cleanup */
3970 ns->disk->queue = NULL;
3971 put_disk(ns->disk);
3972 out_unlink_ns:
3973 mutex_lock(&ctrl->subsys->lock);
3974 list_del_rcu(&ns->siblings);
3975 if (list_empty(&ns->head->list))
3976 list_del_init(&ns->head->entry);
3977 mutex_unlock(&ctrl->subsys->lock);
3978 nvme_put_ns_head(ns->head);
3979 out_free_queue:
3980 blk_cleanup_queue(ns->queue);
3981 out_free_ns:
3982 kfree(ns);
3983 out_free_id:
3984 kfree(id);
3985 }
3986
3987 static void nvme_ns_remove(struct nvme_ns *ns)
3988 {
3989 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3990 return;
3991
3992 set_capacity(ns->disk, 0);
3993 nvme_fault_inject_fini(&ns->fault_inject);
3994
3995 mutex_lock(&ns->ctrl->subsys->lock);
3996 list_del_rcu(&ns->siblings);
3997 if (list_empty(&ns->head->list))
3998 list_del_init(&ns->head->entry);
3999 mutex_unlock(&ns->ctrl->subsys->lock);
4000
4001 synchronize_rcu(); /* guarantee not available in head->list */
4002 nvme_mpath_clear_current_path(ns);
4003 synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
4004
4005 if (ns->disk->flags & GENHD_FL_UP) {
4006 del_gendisk(ns->disk);
4007 blk_cleanup_queue(ns->queue);
4008 if (blk_get_integrity(ns->disk))
4009 blk_integrity_unregister(ns->disk);
4010 }
4011
4012 down_write(&ns->ctrl->namespaces_rwsem);
4013 list_del_init(&ns->list);
4014 up_write(&ns->ctrl->namespaces_rwsem);
4015
4016 nvme_mpath_check_last_path(ns);
4017 nvme_put_ns(ns);
4018 }
4019
4020 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
4021 {
4022 struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4023
4024 if (ns) {
4025 nvme_ns_remove(ns);
4026 nvme_put_ns(ns);
4027 }
4028 }
4029
4030 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
4031 {
4032 struct nvme_id_ns *id;
4033 int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4034
4035 if (test_bit(NVME_NS_DEAD, &ns->flags))
4036 goto out;
4037
4038 ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
4039 if (ret)
4040 goto out;
4041
4042 ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4043 if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
4044 dev_err(ns->ctrl->device,
4045 "identifiers changed for nsid %d\n", ns->head->ns_id);
4046 goto out_free_id;
4047 }
4048
4049 ret = nvme_update_ns_info(ns, id);
4050
4051 out_free_id:
4052 kfree(id);
4053 out:
4054 /*
4055 * Only remove the namespace if we got a fatal error back from the
4056 * device, otherwise ignore the error and just move on.
4057 *
4058 * TODO: we should probably schedule a delayed retry here.
4059 */
4060 if (ret > 0 && (ret & NVME_SC_DNR))
4061 nvme_ns_remove(ns);
4062 else
4063 revalidate_disk_size(ns->disk, true);
4064 }
4065
4066 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4067 {
4068 struct nvme_ns_ids ids = { };
4069 struct nvme_ns *ns;
4070
4071 if (nvme_identify_ns_descs(ctrl, nsid, &ids))
4072 return;
4073
4074 ns = nvme_find_get_ns(ctrl, nsid);
4075 if (ns) {
4076 nvme_validate_ns(ns, &ids);
4077 nvme_put_ns(ns);
4078 return;
4079 }
4080
4081 switch (ids.csi) {
4082 case NVME_CSI_NVM:
4083 nvme_alloc_ns(ctrl, nsid, &ids);
4084 break;
4085 case NVME_CSI_ZNS:
4086 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
4087 dev_warn(ctrl->device,
4088 "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
4089 nsid);
4090 break;
4091 }
4092 if (!nvme_multi_css(ctrl)) {
4093 dev_warn(ctrl->device,
4094 "command set not reported for nsid: %d\n",
4095 nsid);
4096 break;
4097 }
4098 nvme_alloc_ns(ctrl, nsid, &ids);
4099 break;
4100 default:
4101 dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
4102 ids.csi, nsid);
4103 break;
4104 }
4105 }
4106
4107 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4108 unsigned nsid)
4109 {
4110 struct nvme_ns *ns, *next;
4111 LIST_HEAD(rm_list);
4112
4113 down_write(&ctrl->namespaces_rwsem);
4114 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4115 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
4116 list_move_tail(&ns->list, &rm_list);
4117 }
4118 up_write(&ctrl->namespaces_rwsem);
4119
4120 list_for_each_entry_safe(ns, next, &rm_list, list)
4121 nvme_ns_remove(ns);
4122
4123 }
4124
4125 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4126 {
4127 const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4128 __le32 *ns_list;
4129 u32 prev = 0;
4130 int ret = 0, i;
4131
4132 if (nvme_ctrl_limited_cns(ctrl))
4133 return -EOPNOTSUPP;
4134
4135 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4136 if (!ns_list)
4137 return -ENOMEM;
4138
4139 for (;;) {
4140 struct nvme_command cmd = {
4141 .identify.opcode = nvme_admin_identify,
4142 .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
4143 .identify.nsid = cpu_to_le32(prev),
4144 };
4145
4146 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4147 NVME_IDENTIFY_DATA_SIZE);
4148 if (ret)
4149 goto free;
4150
4151 for (i = 0; i < nr_entries; i++) {
4152 u32 nsid = le32_to_cpu(ns_list[i]);
4153
4154 if (!nsid) /* end of the list? */
4155 goto out;
4156 nvme_validate_or_alloc_ns(ctrl, nsid);
4157 while (++prev < nsid)
4158 nvme_ns_remove_by_nsid(ctrl, prev);
4159 }
4160 }
4161 out:
4162 nvme_remove_invalid_namespaces(ctrl, prev);
4163 free:
4164 kfree(ns_list);
4165 return ret;
4166 }
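/*
 * Walk-through (illustrative): each Identify Active Namespace ID list returns
 * up to 1024 nsids greater than "prev".  For a controller reporting nsids 1,
 * 2 and 7, the loop validates or allocates those three, removes stale nsids
 * 3-6 via nvme_ns_remove_by_nsid(), stops at the first zero entry and finally
 * drops anything above nsid 7 in nvme_remove_invalid_namespaces().
 */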
4167
4168 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4169 {
4170 struct nvme_id_ctrl *id;
4171 u32 nn, i;
4172
4173 if (nvme_identify_ctrl(ctrl, &id))
4174 return;
4175 nn = le32_to_cpu(id->nn);
4176 kfree(id);
4177
4178 for (i = 1; i <= nn; i++)
4179 nvme_validate_or_alloc_ns(ctrl, i);
4180
4181 nvme_remove_invalid_namespaces(ctrl, nn);
4182 }
4183
4184 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4185 {
4186 size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4187 __le32 *log;
4188 int error;
4189
4190 log = kzalloc(log_size, GFP_KERNEL);
4191 if (!log)
4192 return;
4193
4194 /*
4195 * We need to read the log to clear the AEN, but we don't want to rely
4196 * on it for the changed namespace information as userspace could have
4197 * raced with us in reading the log page, which could cause us to miss
4198 * updates.
4199 */
4200 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4201 NVME_CSI_NVM, log, log_size, 0);
4202 if (error)
4203 dev_warn(ctrl->device,
4204 "reading changed ns log failed: %d\n", error);
4205
4206 kfree(log);
4207 }
4208
4209 static void nvme_scan_work(struct work_struct *work)
4210 {
4211 struct nvme_ctrl *ctrl =
4212 container_of(work, struct nvme_ctrl, scan_work);
4213
4214 /* No tagset on a live ctrl means IO queues could not be created */
4215 if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4216 return;
4217
4218 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4219 dev_info(ctrl->device, "rescanning namespaces.\n");
4220 nvme_clear_changed_ns_log(ctrl);
4221 }
4222
4223 mutex_lock(&ctrl->scan_lock);
4224 if (nvme_scan_ns_list(ctrl) != 0)
4225 nvme_scan_ns_sequential(ctrl);
4226 mutex_unlock(&ctrl->scan_lock);
4227 }
4228
4229 /*
4230 * This function iterates the namespace list unlocked to allow recovery from
4231 * controller failure. It is up to the caller to ensure the namespace list is
4232 * not modified by scan work while this function is executing.
4233 */
4234 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4235 {
4236 struct nvme_ns *ns, *next;
4237 LIST_HEAD(ns_list);
4238
4239 /*
4240 * Make sure to requeue I/O to all namespaces: outstanding requests may
4241 * have been issued by the scan itself and must complete for scan_work
4242 * to make progress.
4243 */
4244 nvme_mpath_clear_ctrl_paths(ctrl);
4245
4246 /* prevent racing with ns scanning */
4247 flush_work(&ctrl->scan_work);
4248
4249 /*
4250 * The dead state indicates that the controller was not gracefully
4251 * disconnected. In that case, we won't be able to flush any data while
4252 * removing the namespaces' disks; fail all the queues now to avoid
4253 * potentially having to clean up the failed sync later.
4254 */
4255 if (ctrl->state == NVME_CTRL_DEAD)
4256 nvme_kill_queues(ctrl);
4257
4258 /* this is a no-op when called from the controller reset handler */
4259 nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4260
4261 down_write(&ctrl->namespaces_rwsem);
4262 list_splice_init(&ctrl->namespaces, &ns_list);
4263 up_write(&ctrl->namespaces_rwsem);
4264
4265 list_for_each_entry_safe(ns, next, &ns_list, list)
4266 nvme_ns_remove(ns);
4267 }
4268 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4269
4270 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4271 {
4272 struct nvme_ctrl *ctrl =
4273 container_of(dev, struct nvme_ctrl, ctrl_device);
4274 struct nvmf_ctrl_options *opts = ctrl->opts;
4275 int ret;
4276
4277 ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4278 if (ret)
4279 return ret;
4280
4281 if (opts) {
4282 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4283 if (ret)
4284 return ret;
4285
4286 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4287 opts->trsvcid ?: "none");
4288 if (ret)
4289 return ret;
4290
4291 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4292 opts->host_traddr ?: "none");
4293 }
4294 return ret;
4295 }
4296
4297 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4298 {
4299 char *envp[2] = { NULL, NULL };
4300 u32 aen_result = ctrl->aen_result;
4301
4302 ctrl->aen_result = 0;
4303 if (!aen_result)
4304 return;
4305
4306 envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4307 if (!envp[0])
4308 return;
4309 kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4310 kfree(envp[0]);
4311 }
4312
4313 static void nvme_async_event_work(struct work_struct *work)
4314 {
4315 struct nvme_ctrl *ctrl =
4316 container_of(work, struct nvme_ctrl, async_event_work);
4317
4318 nvme_aen_uevent(ctrl);
4319
4320 /*
4321 * The transport drivers must guarantee AER submission here is safe by
4322 * flushing ctrl async_event_work after changing the controller state
4323 * from LIVE and before freeing the admin queue.
4324 */
4325 if (ctrl->state == NVME_CTRL_LIVE)
4326 ctrl->ops->submit_async_event(ctrl);
4327 }
4328
4329 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4330 {
4331
4332 u32 csts;
4333
4334 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4335 return false;
4336
4337 if (csts == ~0)
4338 return false;
4339
4340 return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4341 }
4342
4343 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4344 {
4345 struct nvme_fw_slot_info_log *log;
4346
4347 log = kmalloc(sizeof(*log), GFP_KERNEL);
4348 if (!log)
4349 return;
4350
4351 if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4352 log, sizeof(*log), 0))
4353 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4354 kfree(log);
4355 }
4356
4357 static void nvme_fw_act_work(struct work_struct *work)
4358 {
4359 struct nvme_ctrl *ctrl = container_of(work,
4360 struct nvme_ctrl, fw_act_work);
4361 unsigned long fw_act_timeout;
4362
4363 if (ctrl->mtfa)
4364 fw_act_timeout = jiffies +
4365 msecs_to_jiffies(ctrl->mtfa * 100);
4366 else
4367 fw_act_timeout = jiffies +
4368 msecs_to_jiffies(admin_timeout * 1000);
4369
4370 nvme_stop_queues(ctrl);
4371 while (nvme_ctrl_pp_status(ctrl)) {
4372 if (time_after(jiffies, fw_act_timeout)) {
4373 dev_warn(ctrl->device,
4374 "Fw activation timeout, reset controller\n");
4375 nvme_try_sched_reset(ctrl);
4376 return;
4377 }
4378 msleep(100);
4379 }
4380
4381 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4382 return;
4383
4384 nvme_start_queues(ctrl);
4385 /* read FW slot information to clear the AER */
4386 nvme_get_fw_slot_info(ctrl);
4387 }
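/*
 * Timeout example (illustrative): MTFA is reported in units of 100 ms, so a
 * controller advertising mtfa == 20 gets a 2 second activation window above,
 * while a controller that does not report MTFA falls back to the
 * admin_timeout module parameter converted to milliseconds.
 */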
4388
4389 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4390 {
4391 u32 aer_notice_type = (result & 0xff00) >> 8;
4392
4393 trace_nvme_async_event(ctrl, aer_notice_type);
4394
4395 switch (aer_notice_type) {
4396 case NVME_AER_NOTICE_NS_CHANGED:
4397 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4398 nvme_queue_scan(ctrl);
4399 break;
4400 case NVME_AER_NOTICE_FW_ACT_STARTING:
4401 /*
4402 * We are (ab)using the RESETTING state to prevent subsequent
4403 * recovery actions from interfering with the controller's
4404 * firmware activation.
4405 */
4406 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4407 queue_work(nvme_wq, &ctrl->fw_act_work);
4408 break;
4409 #ifdef CONFIG_NVME_MULTIPATH
4410 case NVME_AER_NOTICE_ANA:
4411 if (!ctrl->ana_log_buf)
4412 break;
4413 queue_work(nvme_wq, &ctrl->ana_work);
4414 break;
4415 #endif
4416 case NVME_AER_NOTICE_DISC_CHANGED:
4417 ctrl->aen_result = result;
4418 break;
4419 default:
4420 dev_warn(ctrl->device, "async event result %08x\n", result);
4421 }
4422 }
4423
4424 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4425 volatile union nvme_result *res)
4426 {
4427 u32 result = le32_to_cpu(res->u32);
4428 u32 aer_type = result & 0x07;
4429
4430 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4431 return;
4432
4433 switch (aer_type) {
4434 case NVME_AER_NOTICE:
4435 nvme_handle_aen_notice(ctrl, result);
4436 break;
4437 case NVME_AER_ERROR:
4438 case NVME_AER_SMART:
4439 case NVME_AER_CSS:
4440 case NVME_AER_VS:
4441 trace_nvme_async_event(ctrl, aer_type);
4442 ctrl->aen_result = result;
4443 break;
4444 default:
4445 break;
4446 }
4447 queue_work(nvme_wq, &ctrl->async_event_work);
4448 }
4449 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
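/*
 * Decoding reminder (per the NVMe specification): completion dword 0 carries
 * the event type in bits 2:0 and the notice type in bits 15:8, so a
 * "namespace attribute changed" event decodes to aer_type == NVME_AER_NOTICE
 * and aer_notice_type == NVME_AER_NOTICE_NS_CHANGED in the handlers above.
 */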
4450
4451 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4452 {
4453 nvme_mpath_stop(ctrl);
4454 nvme_stop_keep_alive(ctrl);
4455 flush_work(&ctrl->async_event_work);
4456 cancel_work_sync(&ctrl->fw_act_work);
4457 if (ctrl->ops->stop_ctrl)
4458 ctrl->ops->stop_ctrl(ctrl);
4459 }
4460 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4461
4462 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4463 {
4464 nvme_start_keep_alive(ctrl);
4465
4466 nvme_enable_aen(ctrl);
4467
4468 if (ctrl->queue_count > 1) {
4469 nvme_queue_scan(ctrl);
4470 nvme_start_queues(ctrl);
4471 nvme_mpath_update(ctrl);
4472 }
4473 }
4474 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
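/*
 * Typical transport usage (a sketch, not mandated by this file): a transport
 * calls nvme_init_ctrl() once at probe time, sets up the admin and I/O queues
 * on each (re)connect and then calls nvme_start_ctrl(); teardown paths call
 * nvme_stop_ctrl() before freeing the queues, and final removal ends with
 * nvme_uninit_ctrl() followed by nvme_put_ctrl().
 */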
4475
4476 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4477 {
4478 nvme_hwmon_exit(ctrl);
4479 nvme_fault_inject_fini(&ctrl->fault_inject);
4480 dev_pm_qos_hide_latency_tolerance(ctrl->device);
4481 cdev_device_del(&ctrl->cdev, ctrl->device);
4482 nvme_put_ctrl(ctrl);
4483 }
4484 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4485
4486 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4487 {
4488 struct nvme_effects_log *cel;
4489 unsigned long i;
4490
4491 xa_for_each (&ctrl->cels, i, cel) {
4492 xa_erase(&ctrl->cels, i);
4493 kfree(cel);
4494 }
4495
4496 xa_destroy(&ctrl->cels);
4497 }
4498
4499 static void nvme_free_ctrl(struct device *dev)
4500 {
4501 struct nvme_ctrl *ctrl =
4502 container_of(dev, struct nvme_ctrl, ctrl_device);
4503 struct nvme_subsystem *subsys = ctrl->subsys;
4504
4505 if (!subsys || ctrl->instance != subsys->instance)
4506 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4507
4508 nvme_free_cels(ctrl);
4509 nvme_mpath_uninit(ctrl);
4510 __free_page(ctrl->discard_page);
4511
4512 if (subsys) {
4513 mutex_lock(&nvme_subsystems_lock);
4514 list_del(&ctrl->subsys_entry);
4515 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4516 mutex_unlock(&nvme_subsystems_lock);
4517 }
4518
4519 ctrl->ops->free_ctrl(ctrl);
4520
4521 if (subsys)
4522 nvme_put_subsystem(subsys);
4523 }
4524
4525 /*
4526 * Initialize an NVMe controller's structures. This needs to be called during
4527 * the earliest initialization so that we have the initialized structures
4528 * around during probing.
4529 */
4530 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4531 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4532 {
4533 int ret;
4534
4535 ctrl->state = NVME_CTRL_NEW;
4536 spin_lock_init(&ctrl->lock);
4537 mutex_init(&ctrl->scan_lock);
4538 INIT_LIST_HEAD(&ctrl->namespaces);
4539 xa_init(&ctrl->cels);
4540 init_rwsem(&ctrl->namespaces_rwsem);
4541 ctrl->dev = dev;
4542 ctrl->ops = ops;
4543 ctrl->quirks = quirks;
4544 ctrl->numa_node = NUMA_NO_NODE;
4545 INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4546 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4547 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4548 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4549 init_waitqueue_head(&ctrl->state_wq);
4550
4551 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4552 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4553 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4554
4555 BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4556 PAGE_SIZE);
4557 ctrl->discard_page = alloc_page(GFP_KERNEL);
4558 if (!ctrl->discard_page) {
4559 ret = -ENOMEM;
4560 goto out;
4561 }
4562
4563 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
4564 if (ret < 0)
4565 goto out;
4566 ctrl->instance = ret;
4567
4568 device_initialize(&ctrl->ctrl_device);
4569 ctrl->device = &ctrl->ctrl_device;
4570 ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
4571 ctrl->device->class = nvme_class;
4572 ctrl->device->parent = ctrl->dev;
4573 ctrl->device->groups = nvme_dev_attr_groups;
4574 ctrl->device->release = nvme_free_ctrl;
4575 dev_set_drvdata(ctrl->device, ctrl);
4576 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4577 if (ret)
4578 goto out_release_instance;
4579
4580 nvme_get_ctrl(ctrl);
4581 cdev_init(&ctrl->cdev, &nvme_dev_fops);
4582 ctrl->cdev.owner = ops->module;
4583 ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4584 if (ret)
4585 goto out_free_name;
4586
4587 /*
4588 * Initialize latency tolerance controls. The sysfs files won't
4589 * be visible to userspace unless the device actually supports APST.
4590 */
4591 ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4592 dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4593 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4594
4595 nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4596 nvme_mpath_init_ctrl(ctrl);
4597
4598 return 0;
4599 out_free_name:
4600 nvme_put_ctrl(ctrl);
4601 kfree_const(ctrl->device->kobj.name);
4602 out_release_instance:
4603 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4604 out:
4605 if (ctrl->discard_page)
4606 __free_page(ctrl->discard_page);
4607 return ret;
4608 }
4609 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4610
4611 static void nvme_start_ns_queue(struct nvme_ns *ns)
4612 {
4613 if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
4614 blk_mq_unquiesce_queue(ns->queue);
4615 }
4616
4617 static void nvme_stop_ns_queue(struct nvme_ns *ns)
4618 {
4619 if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
4620 blk_mq_quiesce_queue(ns->queue);
4621 }
4622
4623 /*
4624 * Prepare a queue for teardown.
4625 *
4626 * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
4627 * the capacity to 0 after that to avoid blocking dispatchers that may be
4628 * holding bd_mutex. This will end buffered writers dirtying pages that can't
4629 * be synced.
4630 */
4631 static void nvme_set_queue_dying(struct nvme_ns *ns)
4632 {
4633 if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
4634 return;
4635
4636 blk_set_queue_dying(ns->queue);
4637 nvme_start_ns_queue(ns);
4638
4639 set_capacity(ns->disk, 0);
4640 nvme_update_bdev_size(ns->disk);
4641 }
4642
4643 /**
4644 * nvme_kill_queues(): Ends all namespace queues
4645 * @ctrl: the dead controller that needs to end
4646 *
4647 * Call this function when the driver determines it is unable to get the
4648 * controller in a state capable of servicing IO.
4649 */
4650 void nvme_kill_queues(struct nvme_ctrl *ctrl)
4651 {
4652 struct nvme_ns *ns;
4653
4654 down_read(&ctrl->namespaces_rwsem);
4655
4656 /* Forcibly unquiesce queues to avoid blocking dispatch */
4657 if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
4658 nvme_start_admin_queue(ctrl);
4659
4660 list_for_each_entry(ns, &ctrl->namespaces, list)
4661 nvme_set_queue_dying(ns);
4662
4663 up_read(&ctrl->namespaces_rwsem);
4664 }
4665 EXPORT_SYMBOL_GPL(nvme_kill_queues);
4666
4667 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4668 {
4669 struct nvme_ns *ns;
4670
4671 down_read(&ctrl->namespaces_rwsem);
4672 list_for_each_entry(ns, &ctrl->namespaces, list)
4673 blk_mq_unfreeze_queue(ns->queue);
4674 up_read(&ctrl->namespaces_rwsem);
4675 }
4676 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4677
4678 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4679 {
4680 struct nvme_ns *ns;
4681
4682 down_read(&ctrl->namespaces_rwsem);
4683 list_for_each_entry(ns, &ctrl->namespaces, list) {
4684 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4685 if (timeout <= 0)
4686 break;
4687 }
4688 up_read(&ctrl->namespaces_rwsem);
4689 return timeout;
4690 }
4691 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4692
4693 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4694 {
4695 struct nvme_ns *ns;
4696
4697 down_read(&ctrl->namespaces_rwsem);
4698 list_for_each_entry(ns, &ctrl->namespaces, list)
4699 blk_mq_freeze_queue_wait(ns->queue);
4700 up_read(&ctrl->namespaces_rwsem);
4701 }
4702 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4703
4704 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4705 {
4706 struct nvme_ns *ns;
4707
4708 down_read(&ctrl->namespaces_rwsem);
4709 list_for_each_entry(ns, &ctrl->namespaces, list)
4710 blk_freeze_queue_start(ns->queue);
4711 up_read(&ctrl->namespaces_rwsem);
4712 }
4713 EXPORT_SYMBOL_GPL(nvme_start_freeze);
4714
4715 void nvme_stop_queues(struct nvme_ctrl *ctrl)
4716 {
4717 struct nvme_ns *ns;
4718
4719 down_read(&ctrl->namespaces_rwsem);
4720 list_for_each_entry(ns, &ctrl->namespaces, list)
4721 nvme_stop_ns_queue(ns);
4722 up_read(&ctrl->namespaces_rwsem);
4723 }
4724 EXPORT_SYMBOL_GPL(nvme_stop_queues);
4725
4726 void nvme_start_queues(struct nvme_ctrl *ctrl)
4727 {
4728 struct nvme_ns *ns;
4729
4730 down_read(&ctrl->namespaces_rwsem);
4731 list_for_each_entry(ns, &ctrl->namespaces, list)
4732 nvme_start_ns_queue(ns);
4733 up_read(&ctrl->namespaces_rwsem);
4734 }
4735 EXPORT_SYMBOL_GPL(nvme_start_queues);
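/*
 * Usage sketch (assumed, based on how transports typically pair the freeze
 * helpers): a controller reset commonly does
 *
 *	nvme_start_freeze(ctrl);
 *	nvme_stop_queues(ctrl);
 *	... tear down and re-create the I/O queues ...
 *	nvme_start_queues(ctrl);
 *	nvme_wait_freeze(ctrl);       (or nvme_wait_freeze_timeout())
 *	nvme_unfreeze(ctrl);
 *
 * so that in-flight requests drain before queue counts or block limits
 * change.
 */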
4736
4737 void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
4738 {
4739 if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4740 blk_mq_quiesce_queue(ctrl->admin_q);
4741 }
4742 EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
4743
4744 void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
4745 {
4746 if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4747 blk_mq_unquiesce_queue(ctrl->admin_q);
4748 }
4749 EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
4750
4751 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4752 {
4753 struct nvme_ns *ns;
4754
4755 down_read(&ctrl->namespaces_rwsem);
4756 list_for_each_entry(ns, &ctrl->namespaces, list)
4757 blk_sync_queue(ns->queue);
4758 up_read(&ctrl->namespaces_rwsem);
4759 }
4760 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
4761
4762 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4763 {
4764 nvme_sync_io_queues(ctrl);
4765 if (ctrl->admin_q)
4766 blk_sync_queue(ctrl->admin_q);
4767 }
4768 EXPORT_SYMBOL_GPL(nvme_sync_queues);
4769
4770 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4771 {
4772 if (file->f_op != &nvme_dev_fops)
4773 return NULL;
4774 return file->private_data;
4775 }
4776 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
4777
4778 /*
4779 * Check we didn't inadvertently grow the command structure sizes:
4780 */
4781 static inline void _nvme_check_size(void)
4782 {
4783 BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4784 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4785 BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4786 BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4787 BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4788 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4789 BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4790 BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4791 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4792 BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4793 BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4794 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4795 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4796 BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4797 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4798 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4799 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4800 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4801 BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4802 }
4803
4804
4805 static int __init nvme_core_init(void)
4806 {
4807 int result = -ENOMEM;
4808
4809 _nvme_check_size();
4810
4811 nvme_wq = alloc_workqueue("nvme-wq",
4812 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4813 if (!nvme_wq)
4814 goto out;
4815
4816 nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
4817 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4818 if (!nvme_reset_wq)
4819 goto destroy_wq;
4820
4821 nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
4822 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4823 if (!nvme_delete_wq)
4824 goto destroy_reset_wq;
4825
4826 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
4827 if (result < 0)
4828 goto destroy_delete_wq;
4829
4830 nvme_class = class_create(THIS_MODULE, "nvme");
4831 if (IS_ERR(nvme_class)) {
4832 result = PTR_ERR(nvme_class);
4833 goto unregister_chrdev;
4834 }
4835 nvme_class->dev_uevent = nvme_class_uevent;
4836
4837 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
4838 if (IS_ERR(nvme_subsys_class)) {
4839 result = PTR_ERR(nvme_subsys_class);
4840 goto destroy_class;
4841 }
4842 return 0;
4843
4844 destroy_class:
4845 class_destroy(nvme_class);
4846 unregister_chrdev:
4847 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4848 destroy_delete_wq:
4849 destroy_workqueue(nvme_delete_wq);
4850 destroy_reset_wq:
4851 destroy_workqueue(nvme_reset_wq);
4852 destroy_wq:
4853 destroy_workqueue(nvme_wq);
4854 out:
4855 return result;
4856 }
4857
4858 static void __exit nvme_core_exit(void)
4859 {
4860 class_destroy(nvme_subsys_class);
4861 class_destroy(nvme_class);
4862 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4863 destroy_workqueue(nvme_delete_wq);
4864 destroy_workqueue(nvme_reset_wq);
4865 destroy_workqueue(nvme_wq);
4866 ida_destroy(&nvme_instance_ida);
4867 }
4868
4869 MODULE_LICENSE("GPL");
4870 MODULE_VERSION("1.0");
4871 module_init(nvme_core_init);
4872 module_exit(nvme_core_exit);
4873