1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <linux/pm_qos.h>
30 #include <asm/unaligned.h>
31 
32 #include "nvme.h"
33 #include "fabrics.h"
34 
35 #define NVME_MINORS		(1U << MINORBITS)
36 
37 unsigned char admin_timeout = 60;
38 module_param(admin_timeout, byte, 0644);
39 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
40 EXPORT_SYMBOL_GPL(admin_timeout);
41 
42 unsigned char nvme_io_timeout = 30;
43 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
44 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
45 EXPORT_SYMBOL_GPL(nvme_io_timeout);
46 
47 static unsigned char shutdown_timeout = 5;
48 module_param(shutdown_timeout, byte, 0644);
49 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
50 
51 static u8 nvme_max_retries = 5;
52 module_param_named(max_retries, nvme_max_retries, byte, 0644);
53 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
54 
55 static int nvme_char_major;
56 module_param(nvme_char_major, int, 0);
57 
58 static unsigned long default_ps_max_latency_us = 100000;
59 module_param(default_ps_max_latency_us, ulong, 0644);
60 MODULE_PARM_DESC(default_ps_max_latency_us,
61 		 "max power saving latency for new devices; use PM QOS to change per device");
62 
63 static bool force_apst;
64 module_param(force_apst, bool, 0644);
65 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
66 
67 static bool streams;
68 module_param(streams, bool, 0644);
69 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
70 
71 struct workqueue_struct *nvme_wq;
72 EXPORT_SYMBOL_GPL(nvme_wq);
73 
74 static LIST_HEAD(nvme_ctrl_list);
75 static DEFINE_SPINLOCK(dev_list_lock);
76 
77 static struct class *nvme_class;
78 
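/*
 * Build dword 10 of a Get Log Page command: the log page identifier lives
 * in bits 7:0 and the 0's-based number of dwords to transfer (NUMD) in the
 * upper half.  Illustrative example: a 512-byte SMART/Health log (lid 0x02)
 * gives (512 / 4 - 1) << 16 | 0x02 == 0x007f0002.
 */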
79 static __le32 nvme_get_log_dw10(u8 lid, size_t size)
80 {
81 	return cpu_to_le32((((size / 4) - 1) << 16) | lid);
82 }
83 
84 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
85 {
86 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
87 		return -EBUSY;
88 	if (!queue_work(nvme_wq, &ctrl->reset_work))
89 		return -EBUSY;
90 	return 0;
91 }
92 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
93 
94 static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
95 {
96 	int ret;
97 
98 	ret = nvme_reset_ctrl(ctrl);
99 	if (!ret)
100 		flush_work(&ctrl->reset_work);
101 	return ret;
102 }
103 
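/*
 * Map the NVMe completion status to a block layer status.  The 0x7ff mask
 * keeps the Status Code (bits 7:0) and Status Code Type (bits 10:8) while
 * dropping the More and Do Not Retry bits, which are handled separately in
 * nvme_req_needs_retry().
 */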
104 static blk_status_t nvme_error_status(struct request *req)
105 {
106 	switch (nvme_req(req)->status & 0x7ff) {
107 	case NVME_SC_SUCCESS:
108 		return BLK_STS_OK;
109 	case NVME_SC_CAP_EXCEEDED:
110 		return BLK_STS_NOSPC;
111 	case NVME_SC_ONCS_NOT_SUPPORTED:
112 		return BLK_STS_NOTSUPP;
113 	case NVME_SC_WRITE_FAULT:
114 	case NVME_SC_READ_ERROR:
115 	case NVME_SC_UNWRITTEN_BLOCK:
116 	case NVME_SC_ACCESS_DENIED:
117 	case NVME_SC_READ_ONLY:
118 		return BLK_STS_MEDIUM;
119 	case NVME_SC_GUARD_CHECK:
120 	case NVME_SC_APPTAG_CHECK:
121 	case NVME_SC_REFTAG_CHECK:
122 	case NVME_SC_INVALID_PI:
123 		return BLK_STS_PROTECTION;
124 	case NVME_SC_RESERVATION_CONFLICT:
125 		return BLK_STS_NEXUS;
126 	default:
127 		return BLK_STS_IOERR;
128 	}
129 }
130 
131 static inline bool nvme_req_needs_retry(struct request *req)
132 {
133 	if (blk_noretry_request(req))
134 		return false;
135 	if (nvme_req(req)->status & NVME_SC_DNR)
136 		return false;
137 	if (nvme_req(req)->retries >= nvme_max_retries)
138 		return false;
139 	return true;
140 }
141 
142 void nvme_complete_rq(struct request *req)
143 {
144 	if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
145 		nvme_req(req)->retries++;
146 		blk_mq_requeue_request(req, true);
147 		return;
148 	}
149 
150 	blk_mq_end_request(req, nvme_error_status(req));
151 }
152 EXPORT_SYMBOL_GPL(nvme_complete_rq);
153 
154 void nvme_cancel_request(struct request *req, void *data, bool reserved)
155 {
156 	int status;
157 
158 	if (!blk_mq_request_started(req))
159 		return;
160 
161 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
162 				"Cancelling I/O %d", req->tag);
163 
164 	status = NVME_SC_ABORT_REQ;
165 	if (blk_queue_dying(req->q))
166 		status |= NVME_SC_DNR;
167 	nvme_req(req)->status = status;
168 	blk_mq_complete_request(req);
169 
170 }
171 EXPORT_SYMBOL_GPL(nvme_cancel_request);
172 
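/*
 * Controller state machine.  The transitions accepted below are:
 *
 *	NEW          -> LIVE, RESETTING
 *	LIVE         -> RESETTING, RECONNECTING, DELETING
 *	RESETTING    -> LIVE, DELETING
 *	RECONNECTING -> LIVE, DELETING
 *	DELETING     -> DEAD
 *
 * Any other requested transition leaves the state unchanged and returns
 * false.
 */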
173 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
174 		enum nvme_ctrl_state new_state)
175 {
176 	enum nvme_ctrl_state old_state;
177 	unsigned long flags;
178 	bool changed = false;
179 
180 	spin_lock_irqsave(&ctrl->lock, flags);
181 
182 	old_state = ctrl->state;
183 	switch (new_state) {
184 	case NVME_CTRL_LIVE:
185 		switch (old_state) {
186 		case NVME_CTRL_NEW:
187 		case NVME_CTRL_RESETTING:
188 		case NVME_CTRL_RECONNECTING:
189 			changed = true;
190 			/* FALLTHRU */
191 		default:
192 			break;
193 		}
194 		break;
195 	case NVME_CTRL_RESETTING:
196 		switch (old_state) {
197 		case NVME_CTRL_NEW:
198 		case NVME_CTRL_LIVE:
199 			changed = true;
200 			/* FALLTHRU */
201 		default:
202 			break;
203 		}
204 		break;
205 	case NVME_CTRL_RECONNECTING:
206 		switch (old_state) {
207 		case NVME_CTRL_LIVE:
208 			changed = true;
209 			/* FALLTHRU */
210 		default:
211 			break;
212 		}
213 		break;
214 	case NVME_CTRL_DELETING:
215 		switch (old_state) {
216 		case NVME_CTRL_LIVE:
217 		case NVME_CTRL_RESETTING:
218 		case NVME_CTRL_RECONNECTING:
219 			changed = true;
220 			/* FALLTHRU */
221 		default:
222 			break;
223 		}
224 		break;
225 	case NVME_CTRL_DEAD:
226 		switch (old_state) {
227 		case NVME_CTRL_DELETING:
228 			changed = true;
229 			/* FALLTHRU */
230 		default:
231 			break;
232 		}
233 		break;
234 	default:
235 		break;
236 	}
237 
238 	if (changed)
239 		ctrl->state = new_state;
240 
241 	spin_unlock_irqrestore(&ctrl->lock, flags);
242 
243 	return changed;
244 }
245 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
246 
247 static void nvme_free_ns(struct kref *kref)
248 {
249 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
250 
251 	if (ns->ndev)
252 		nvme_nvm_unregister(ns);
253 
254 	if (ns->disk) {
255 		spin_lock(&dev_list_lock);
256 		ns->disk->private_data = NULL;
257 		spin_unlock(&dev_list_lock);
258 	}
259 
260 	put_disk(ns->disk);
261 	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
262 	nvme_put_ctrl(ns->ctrl);
263 	kfree(ns);
264 }
265 
266 static void nvme_put_ns(struct nvme_ns *ns)
267 {
268 	kref_put(&ns->kref, nvme_free_ns);
269 }
270 
271 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
272 {
273 	struct nvme_ns *ns;
274 
275 	spin_lock(&dev_list_lock);
276 	ns = disk->private_data;
277 	if (ns) {
278 		if (!kref_get_unless_zero(&ns->kref))
279 			goto fail;
280 		if (!try_module_get(ns->ctrl->ops->module))
281 			goto fail_put_ns;
282 	}
283 	spin_unlock(&dev_list_lock);
284 
285 	return ns;
286 
287 fail_put_ns:
288 	kref_put(&ns->kref, nvme_free_ns);
289 fail:
290 	spin_unlock(&dev_list_lock);
291 	return NULL;
292 }
293 
294 struct request *nvme_alloc_request(struct request_queue *q,
295 		struct nvme_command *cmd, unsigned int flags, int qid)
296 {
297 	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
298 	struct request *req;
299 
300 	if (qid == NVME_QID_ANY) {
301 		req = blk_mq_alloc_request(q, op, flags);
302 	} else {
303 		req = blk_mq_alloc_request_hctx(q, op, flags,
304 				qid ? qid - 1 : 0);
305 	}
306 	if (IS_ERR(req))
307 		return req;
308 
309 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
310 	nvme_req(req)->cmd = cmd;
311 
312 	return req;
313 }
314 EXPORT_SYMBOL_GPL(nvme_alloc_request);
315 
316 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
317 {
318 	struct nvme_command c;
319 
320 	memset(&c, 0, sizeof(c));
321 
322 	c.directive.opcode = nvme_admin_directive_send;
323 	c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
324 	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
325 	c.directive.dtype = NVME_DIR_IDENTIFY;
326 	c.directive.tdtype = NVME_DIR_STREAMS;
327 	c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
328 
329 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
330 }
331 
332 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
333 {
334 	return nvme_toggle_streams(ctrl, false);
335 }
336 
337 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
338 {
339 	return nvme_toggle_streams(ctrl, true);
340 }
341 
342 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
343 				  struct streams_directive_params *s, u32 nsid)
344 {
345 	struct nvme_command c;
346 
347 	memset(&c, 0, sizeof(c));
348 	memset(s, 0, sizeof(*s));
349 
350 	c.directive.opcode = nvme_admin_directive_recv;
351 	c.directive.nsid = cpu_to_le32(nsid);
352 	c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
353 	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
354 	c.directive.dtype = NVME_DIR_STREAMS;
355 
356 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
357 }
358 
359 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
360 {
361 	struct streams_directive_params s;
362 	int ret;
363 
364 	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
365 		return 0;
366 	if (!streams)
367 		return 0;
368 
369 	ret = nvme_enable_streams(ctrl);
370 	if (ret)
371 		return ret;
372 
373 	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
374 	if (ret)
375 		return ret;
376 
377 	ctrl->nssa = le16_to_cpu(s.nssa);
378 	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
379 		dev_info(ctrl->device, "too few streams (%u) available\n",
380 					ctrl->nssa);
381 		nvme_disable_streams(ctrl);
382 		return 0;
383 	}
384 
385 	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
386 	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
387 	return 0;
388 }
389 
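/*
 * Stream IDs are derived from the block layer write hints: with the
 * standard rw_hint values, WRITE_LIFE_SHORT maps to stream 1 and
 * WRITE_LIFE_EXTREME to stream 4, while WRITE_LIFE_NOT_SET/NONE mean "no
 * stream".  The selected stream is placed in the directive specific
 * (DSPEC) field, i.e. bits 31:16 of the dsmgmt dword, as shown below.
 */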
390 /*
391  * Check if 'req' has a write hint associated with it. If it does, assign
392  * a valid namespace stream to the write.
393  */
394 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
395 				     struct request *req, u16 *control,
396 				     u32 *dsmgmt)
397 {
398 	enum rw_hint streamid = req->write_hint;
399 
400 	if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
401 		streamid = 0;
402 	else {
403 		streamid--;
404 		if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
405 			return;
406 
407 		*control |= NVME_RW_DTYPE_STREAMS;
408 		*dsmgmt |= streamid << 16;
409 	}
410 
411 	if (streamid < ARRAY_SIZE(req->q->write_hints))
412 		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
413 }
414 
415 static inline void nvme_setup_flush(struct nvme_ns *ns,
416 		struct nvme_command *cmnd)
417 {
418 	memset(cmnd, 0, sizeof(*cmnd));
419 	cmnd->common.opcode = nvme_cmd_flush;
420 	cmnd->common.nsid = cpu_to_le32(ns->ns_id);
421 }
422 
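/*
 * Translate a discard request into a DSM/Deallocate command.  Each bio in
 * the (possibly merged) request becomes one struct nvme_dsm_range holding
 * the starting LBA and a length in logical blocks; the command's NR field
 * is 0's based.  For example, a 1 MiB discard on a 512-byte-block namespace
 * yields a single range with nlb == 2048.  The range array is attached to
 * the request as a special payload.
 */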
423 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
424 		struct nvme_command *cmnd)
425 {
426 	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
427 	struct nvme_dsm_range *range;
428 	struct bio *bio;
429 
430 	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
431 	if (!range)
432 		return BLK_STS_RESOURCE;
433 
434 	__rq_for_each_bio(bio, req) {
435 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
436 		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
437 
438 		range[n].cattr = cpu_to_le32(0);
439 		range[n].nlb = cpu_to_le32(nlb);
440 		range[n].slba = cpu_to_le64(slba);
441 		n++;
442 	}
443 
444 	if (WARN_ON_ONCE(n != segments)) {
445 		kfree(range);
446 		return BLK_STS_IOERR;
447 	}
448 
449 	memset(cmnd, 0, sizeof(*cmnd));
450 	cmnd->dsm.opcode = nvme_cmd_dsm;
451 	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
452 	cmnd->dsm.nr = cpu_to_le32(segments - 1);
453 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
454 
455 	req->special_vec.bv_page = virt_to_page(range);
456 	req->special_vec.bv_offset = offset_in_page(range);
457 	req->special_vec.bv_len = sizeof(*range) * segments;
458 	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
459 
460 	return BLK_STS_OK;
461 }
462 
463 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
464 		struct request *req, struct nvme_command *cmnd)
465 {
466 	struct nvme_ctrl *ctrl = ns->ctrl;
467 	u16 control = 0;
468 	u32 dsmgmt = 0;
469 
470 	/*
471 	 * If formatted with metadata, require the block layer to provide a buffer
472 	 * unless this namespace is formatted such that the metadata can be
473 	 * stripped/generated by the controller with PRACT=1.
474 	 */
475 	if (ns && ns->ms &&
476 	    (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
477 	    !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
478 		return BLK_STS_NOTSUPP;
479 
480 	if (req->cmd_flags & REQ_FUA)
481 		control |= NVME_RW_FUA;
482 	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
483 		control |= NVME_RW_LR;
484 
485 	if (req->cmd_flags & REQ_RAHEAD)
486 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
487 
488 	memset(cmnd, 0, sizeof(*cmnd));
489 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
490 	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
491 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
492 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
493 
494 	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
495 		nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
496 
497 	if (ns->ms) {
498 		switch (ns->pi_type) {
499 		case NVME_NS_DPS_PI_TYPE3:
500 			control |= NVME_RW_PRINFO_PRCHK_GUARD;
501 			break;
502 		case NVME_NS_DPS_PI_TYPE1:
503 		case NVME_NS_DPS_PI_TYPE2:
504 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
505 					NVME_RW_PRINFO_PRCHK_REF;
506 			cmnd->rw.reftag = cpu_to_le32(
507 					nvme_block_nr(ns, blk_rq_pos(req)));
508 			break;
509 		}
510 		if (!blk_integrity_rq(req))
511 			control |= NVME_RW_PRINFO_PRACT;
512 	}
513 
514 	cmnd->rw.control = cpu_to_le16(control);
515 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
516 	return 0;
517 }
518 
519 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
520 		struct nvme_command *cmd)
521 {
522 	blk_status_t ret = BLK_STS_OK;
523 
524 	if (!(req->rq_flags & RQF_DONTPREP)) {
525 		nvme_req(req)->retries = 0;
526 		nvme_req(req)->flags = 0;
527 		req->rq_flags |= RQF_DONTPREP;
528 	}
529 
530 	switch (req_op(req)) {
531 	case REQ_OP_DRV_IN:
532 	case REQ_OP_DRV_OUT:
533 		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
534 		break;
535 	case REQ_OP_FLUSH:
536 		nvme_setup_flush(ns, cmd);
537 		break;
538 	case REQ_OP_WRITE_ZEROES:
539 		/* currently only aliased to deallocate for a few ctrls: */
540 	case REQ_OP_DISCARD:
541 		ret = nvme_setup_discard(ns, req, cmd);
542 		break;
543 	case REQ_OP_READ:
544 	case REQ_OP_WRITE:
545 		ret = nvme_setup_rw(ns, req, cmd);
546 		break;
547 	default:
548 		WARN_ON_ONCE(1);
549 		return BLK_STS_IOERR;
550 	}
551 
552 	cmd->common.command_id = req->tag;
553 	return ret;
554 }
555 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
556 
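/*
 * Illustrative (hypothetical) caller, shown only to document the calling
 * convention of the helper below:
 *
 *	struct nvme_command c = { };
 *	union nvme_result res;
 *	int ret;
 *
 *	c.features.opcode = nvme_admin_get_features;
 *	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
 *	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res, NULL, 0, 0,
 *				     NVME_QID_ANY, 0, 0);
 *	if (ret < 0)		transport or allocation error
 *	else if (ret > 0)	NVMe status code, see below
 *	else			success, completion dword 0 is in res.u32
 */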
557 /*
558  * Returns 0 on success.  If the result is negative, it's a Linux error code;
559  * if the result is positive, it's an NVM Express status code
560  */
561 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
562 		union nvme_result *result, void *buffer, unsigned bufflen,
563 		unsigned timeout, int qid, int at_head, int flags)
564 {
565 	struct request *req;
566 	int ret;
567 
568 	req = nvme_alloc_request(q, cmd, flags, qid);
569 	if (IS_ERR(req))
570 		return PTR_ERR(req);
571 
572 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
573 
574 	if (buffer && bufflen) {
575 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
576 		if (ret)
577 			goto out;
578 	}
579 
580 	blk_execute_rq(req->q, NULL, req, at_head);
581 	if (result)
582 		*result = nvme_req(req)->result;
583 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
584 		ret = -EINTR;
585 	else
586 		ret = nvme_req(req)->status;
587  out:
588 	blk_mq_free_request(req);
589 	return ret;
590 }
591 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
592 
593 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
594 		void *buffer, unsigned bufflen)
595 {
596 	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
597 			NVME_QID_ANY, 0, 0);
598 }
599 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
600 
601 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
602 		unsigned len, u32 seed, bool write)
603 {
604 	struct bio_integrity_payload *bip;
605 	int ret = -ENOMEM;
606 	void *buf;
607 
608 	buf = kmalloc(len, GFP_KERNEL);
609 	if (!buf)
610 		goto out;
611 
612 	ret = -EFAULT;
613 	if (write && copy_from_user(buf, ubuf, len))
614 		goto out_free_meta;
615 
616 	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
617 	if (IS_ERR(bip)) {
618 		ret = PTR_ERR(bip);
619 		goto out_free_meta;
620 	}
621 
622 	bip->bip_iter.bi_size = len;
623 	bip->bip_iter.bi_sector = seed;
624 	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
625 			offset_in_page(buf));
626 	if (ret == len)
627 		return buf;
628 	ret = -ENOMEM;
629 out_free_meta:
630 	kfree(buf);
631 out:
632 	return ERR_PTR(ret);
633 }
634 
635 static int nvme_submit_user_cmd(struct request_queue *q,
636 		struct nvme_command *cmd, void __user *ubuffer,
637 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
638 		u32 meta_seed, u32 *result, unsigned timeout)
639 {
640 	bool write = nvme_is_write(cmd);
641 	struct nvme_ns *ns = q->queuedata;
642 	struct gendisk *disk = ns ? ns->disk : NULL;
643 	struct request *req;
644 	struct bio *bio = NULL;
645 	void *meta = NULL;
646 	int ret;
647 
648 	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
649 	if (IS_ERR(req))
650 		return PTR_ERR(req);
651 
652 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
653 
654 	if (ubuffer && bufflen) {
655 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
656 				GFP_KERNEL);
657 		if (ret)
658 			goto out;
659 		bio = req->bio;
660 		bio->bi_disk = disk;
661 		if (disk && meta_buffer && meta_len) {
662 			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
663 					meta_seed, write);
664 			if (IS_ERR(meta)) {
665 				ret = PTR_ERR(meta);
666 				goto out_unmap;
667 			}
668 			req->cmd_flags |= REQ_INTEGRITY;
669 		}
670 	}
671 
672 	blk_execute_rq(req->q, disk, req, 0);
673 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
674 		ret = -EINTR;
675 	else
676 		ret = nvme_req(req)->status;
677 	if (result)
678 		*result = le32_to_cpu(nvme_req(req)->result.u32);
679 	if (meta && !ret && !write) {
680 		if (copy_to_user(meta_buffer, meta, meta_len))
681 			ret = -EFAULT;
682 	}
683 	kfree(meta);
684  out_unmap:
685 	if (bio)
686 		blk_rq_unmap_user(bio);
687  out:
688 	blk_mq_free_request(req);
689 	return ret;
690 }
691 
692 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
693 {
694 	struct nvme_ctrl *ctrl = rq->end_io_data;
695 
696 	blk_mq_free_request(rq);
697 
698 	if (status) {
699 		dev_err(ctrl->device,
700 			"failed nvme_keep_alive_end_io error=%d\n",
701 				status);
702 		return;
703 	}
704 
705 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
706 }
707 
708 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
709 {
710 	struct nvme_command c;
711 	struct request *rq;
712 
713 	memset(&c, 0, sizeof(c));
714 	c.common.opcode = nvme_admin_keep_alive;
715 
716 	rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
717 			NVME_QID_ANY);
718 	if (IS_ERR(rq))
719 		return PTR_ERR(rq);
720 
721 	rq->timeout = ctrl->kato * HZ;
722 	rq->end_io_data = ctrl;
723 
724 	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
725 
726 	return 0;
727 }
728 
729 static void nvme_keep_alive_work(struct work_struct *work)
730 {
731 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
732 			struct nvme_ctrl, ka_work);
733 
734 	if (nvme_keep_alive(ctrl)) {
735 		/* allocation failure, reset the controller */
736 		dev_err(ctrl->device, "keep-alive failed\n");
737 		nvme_reset_ctrl(ctrl);
738 		return;
739 	}
740 }
741 
742 void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
743 {
744 	if (unlikely(ctrl->kato == 0))
745 		return;
746 
747 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
748 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
749 }
750 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
751 
752 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
753 {
754 	if (unlikely(ctrl->kato == 0))
755 		return;
756 
757 	cancel_delayed_work_sync(&ctrl->ka_work);
758 }
759 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
760 
761 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
762 {
763 	struct nvme_command c = { };
764 	int error;
765 
766 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
767 	c.identify.opcode = nvme_admin_identify;
768 	c.identify.cns = NVME_ID_CNS_CTRL;
769 
770 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
771 	if (!*id)
772 		return -ENOMEM;
773 
774 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
775 			sizeof(struct nvme_id_ctrl));
776 	if (error)
777 		kfree(*id);
778 	return error;
779 }
780 
781 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
782 		u8 *eui64, u8 *nguid, uuid_t *uuid)
783 {
784 	struct nvme_command c = { };
785 	int status;
786 	void *data;
787 	int pos;
788 	int len;
789 
790 	c.identify.opcode = nvme_admin_identify;
791 	c.identify.nsid = cpu_to_le32(nsid);
792 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
793 
794 	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
795 	if (!data)
796 		return -ENOMEM;
797 
798 	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
799 				      NVME_IDENTIFY_DATA_SIZE);
800 	if (status)
801 		goto free_data;
802 
803 	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
804 		struct nvme_ns_id_desc *cur = data + pos;
805 
806 		if (cur->nidl == 0)
807 			break;
808 
809 		switch (cur->nidt) {
810 		case NVME_NIDT_EUI64:
811 			if (cur->nidl != NVME_NIDT_EUI64_LEN) {
812 				dev_warn(ctrl->device,
813 					 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
814 					 cur->nidl);
815 				goto free_data;
816 			}
817 			len = NVME_NIDT_EUI64_LEN;
818 			memcpy(eui64, data + pos + sizeof(*cur), len);
819 			break;
820 		case NVME_NIDT_NGUID:
821 			if (cur->nidl != NVME_NIDT_NGUID_LEN) {
822 				dev_warn(ctrl->device,
823 					 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
824 					 cur->nidl);
825 				goto free_data;
826 			}
827 			len = NVME_NIDT_NGUID_LEN;
828 			memcpy(nguid, data + pos + sizeof(*cur), len);
829 			break;
830 		case NVME_NIDT_UUID:
831 			if (cur->nidl != NVME_NIDT_UUID_LEN) {
832 				dev_warn(ctrl->device,
833 					 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
834 					 cur->nidl);
835 				goto free_data;
836 			}
837 			len = NVME_NIDT_UUID_LEN;
838 			uuid_copy(uuid, data + pos + sizeof(*cur));
839 			break;
840 		default:
841 			/* Skip unknown types */
842 			len = cur->nidl;
843 			break;
844 		}
845 
846 		len += sizeof(*cur);
847 	}
848 free_data:
849 	kfree(data);
850 	return status;
851 }
852 
853 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
854 {
855 	struct nvme_command c = { };
856 
857 	c.identify.opcode = nvme_admin_identify;
858 	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
859 	c.identify.nsid = cpu_to_le32(nsid);
860 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
861 }
862 
863 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
864 		unsigned nsid)
865 {
866 	struct nvme_id_ns *id;
867 	struct nvme_command c = { };
868 	int error;
869 
870 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
871 	c.identify.opcode = nvme_admin_identify;
872 	c.identify.nsid = cpu_to_le32(nsid);
873 	c.identify.cns = NVME_ID_CNS_NS;
874 
875 	id = kmalloc(sizeof(*id), GFP_KERNEL);
876 	if (!id)
877 		return NULL;
878 
879 	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
880 	if (error) {
881 		dev_warn(ctrl->device, "Identify namespace failed\n");
882 		kfree(id);
883 		return NULL;
884 	}
885 
886 	return id;
887 }
888 
889 static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
890 		      void *buffer, size_t buflen, u32 *result)
891 {
892 	union nvme_result res = { 0 };
893 	struct nvme_command c;
894 	int ret;
895 
896 	memset(&c, 0, sizeof(c));
897 	c.features.opcode = nvme_admin_set_features;
898 	c.features.fid = cpu_to_le32(fid);
899 	c.features.dword11 = cpu_to_le32(dword11);
900 
901 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
902 			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
903 	if (ret >= 0 && result)
904 		*result = le32_to_cpu(res.u32);
905 	return ret;
906 }
907 
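/*
 * Set Features / Number of Queues packs the 0's-based submission and
 * completion queue counts into dword 11, and the controller returns the
 * allocated counts the same way in completion dword 0.  For example,
 * asking for 8 I/O queues sends q_count == 0x00070007; if only 4 are
 * granted, result == 0x00030003 and *count ends up as 4.
 */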
908 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
909 {
910 	u32 q_count = (*count - 1) | ((*count - 1) << 16);
911 	u32 result;
912 	int status, nr_io_queues;
913 
914 	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
915 			&result);
916 	if (status < 0)
917 		return status;
918 
919 	/*
920 	 * Degraded controllers might return an error when setting the queue
921 	 * count.  We still want to be able to bring them online and offer
922 	 * access to the admin queue, as that might be the only way to fix them up.
923 	 */
924 	if (status > 0) {
925 		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
926 		*count = 0;
927 	} else {
928 		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
929 		*count = min(*count, nr_io_queues);
930 	}
931 
932 	return 0;
933 }
934 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
935 
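/*
 * NVME_IOCTL_SUBMIT_IO: note that io.nblocks is 0's based, matching the
 * NVMe NLB field, so a single-block transfer has nblocks == 0 and the data
 * length is (nblocks + 1) << lba_shift.  For extended-LBA formats (ns->ext)
 * the per-block metadata is carried inline and added to that length.
 */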
936 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
937 {
938 	struct nvme_user_io io;
939 	struct nvme_command c;
940 	unsigned length, meta_len;
941 	void __user *metadata;
942 
943 	if (copy_from_user(&io, uio, sizeof(io)))
944 		return -EFAULT;
945 	if (io.flags)
946 		return -EINVAL;
947 
948 	switch (io.opcode) {
949 	case nvme_cmd_write:
950 	case nvme_cmd_read:
951 	case nvme_cmd_compare:
952 		break;
953 	default:
954 		return -EINVAL;
955 	}
956 
957 	length = (io.nblocks + 1) << ns->lba_shift;
958 	meta_len = (io.nblocks + 1) * ns->ms;
959 	metadata = (void __user *)(uintptr_t)io.metadata;
960 
961 	if (ns->ext) {
962 		length += meta_len;
963 		meta_len = 0;
964 	} else if (meta_len) {
965 		if ((io.metadata & 3) || !io.metadata)
966 			return -EINVAL;
967 	}
968 
969 	memset(&c, 0, sizeof(c));
970 	c.rw.opcode = io.opcode;
971 	c.rw.flags = io.flags;
972 	c.rw.nsid = cpu_to_le32(ns->ns_id);
973 	c.rw.slba = cpu_to_le64(io.slba);
974 	c.rw.length = cpu_to_le16(io.nblocks);
975 	c.rw.control = cpu_to_le16(io.control);
976 	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
977 	c.rw.reftag = cpu_to_le32(io.reftag);
978 	c.rw.apptag = cpu_to_le16(io.apptag);
979 	c.rw.appmask = cpu_to_le16(io.appmask);
980 
981 	return nvme_submit_user_cmd(ns->queue, &c,
982 			(void __user *)(uintptr_t)io.addr, length,
983 			metadata, meta_len, io.slba, NULL, 0);
984 }
985 
986 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
987 			struct nvme_passthru_cmd __user *ucmd)
988 {
989 	struct nvme_passthru_cmd cmd;
990 	struct nvme_command c;
991 	unsigned timeout = 0;
992 	int status;
993 
994 	if (!capable(CAP_SYS_ADMIN))
995 		return -EACCES;
996 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
997 		return -EFAULT;
998 	if (cmd.flags)
999 		return -EINVAL;
1000 
1001 	memset(&c, 0, sizeof(c));
1002 	c.common.opcode = cmd.opcode;
1003 	c.common.flags = cmd.flags;
1004 	c.common.nsid = cpu_to_le32(cmd.nsid);
1005 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1006 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1007 	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1008 	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1009 	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1010 	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1011 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1012 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1013 
1014 	if (cmd.timeout_ms)
1015 		timeout = msecs_to_jiffies(cmd.timeout_ms);
1016 
1017 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1018 			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1019 			(void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1020 			0, &cmd.result, timeout);
1021 	if (status >= 0) {
1022 		if (put_user(cmd.result, &ucmd->result))
1023 			return -EFAULT;
1024 	}
1025 
1026 	return status;
1027 }
1028 
1029 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1030 		unsigned int cmd, unsigned long arg)
1031 {
1032 	struct nvme_ns *ns = bdev->bd_disk->private_data;
1033 
1034 	switch (cmd) {
1035 	case NVME_IOCTL_ID:
1036 		force_successful_syscall_return();
1037 		return ns->ns_id;
1038 	case NVME_IOCTL_ADMIN_CMD:
1039 		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1040 	case NVME_IOCTL_IO_CMD:
1041 		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
1042 	case NVME_IOCTL_SUBMIT_IO:
1043 		return nvme_submit_io(ns, (void __user *)arg);
1044 	default:
1045 		if (ns->ndev)
1046 			return nvme_nvm_ioctl(ns, cmd, arg);
1047 		if (is_sed_ioctl(cmd))
1048 			return sed_ioctl(ns->ctrl->opal_dev, cmd,
1049 					 (void __user *) arg);
1050 		return -ENOTTY;
1051 	}
1052 }
1053 
1054 #ifdef CONFIG_COMPAT
1055 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1056 			unsigned int cmd, unsigned long arg)
1057 {
1058 	return nvme_ioctl(bdev, mode, cmd, arg);
1059 }
1060 #else
1061 #define nvme_compat_ioctl	NULL
1062 #endif
1063 
1064 static int nvme_open(struct block_device *bdev, fmode_t mode)
1065 {
1066 	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
1067 }
1068 
1069 static void nvme_release(struct gendisk *disk, fmode_t mode)
1070 {
1071 	struct nvme_ns *ns = disk->private_data;
1072 
1073 	module_put(ns->ctrl->ops->module);
1074 	nvme_put_ns(ns);
1075 }
1076 
1077 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1078 {
1079 	/* some standard values */
1080 	geo->heads = 1 << 6;
1081 	geo->sectors = 1 << 5;
1082 	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1083 	return 0;
1084 }
1085 
1086 #ifdef CONFIG_BLK_DEV_INTEGRITY
1087 static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
1088 		u16 bs)
1089 {
1090 	struct nvme_ns *ns = disk->private_data;
1091 	u16 old_ms = ns->ms;
1092 	u8 pi_type = 0;
1093 
1094 	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1095 	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1096 
1097 	/* PI implementation requires metadata equal to the T10 PI tuple size */
1098 	if (ns->ms == sizeof(struct t10_pi_tuple))
1099 		pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1100 
1101 	if (blk_get_integrity(disk) &&
1102 	    (ns->pi_type != pi_type || ns->ms != old_ms ||
1103 	     bs != queue_logical_block_size(disk->queue) ||
1104 	     (ns->ms && ns->ext)))
1105 		blk_integrity_unregister(disk);
1106 
1107 	ns->pi_type = pi_type;
1108 }
1109 
1110 static void nvme_init_integrity(struct nvme_ns *ns)
1111 {
1112 	struct blk_integrity integrity;
1113 
1114 	memset(&integrity, 0, sizeof(integrity));
1115 	switch (ns->pi_type) {
1116 	case NVME_NS_DPS_PI_TYPE3:
1117 		integrity.profile = &t10_pi_type3_crc;
1118 		integrity.tag_size = sizeof(u16) + sizeof(u32);
1119 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1120 		break;
1121 	case NVME_NS_DPS_PI_TYPE1:
1122 	case NVME_NS_DPS_PI_TYPE2:
1123 		integrity.profile = &t10_pi_type1_crc;
1124 		integrity.tag_size = sizeof(u16);
1125 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1126 		break;
1127 	default:
1128 		integrity.profile = NULL;
1129 		break;
1130 	}
1131 	integrity.tuple_size = ns->ms;
1132 	blk_integrity_register(ns->disk, &integrity);
1133 	blk_queue_max_integrity_segments(ns->queue, 1);
1134 }
1135 #else
1136 static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
1137 		u16 bs)
1138 {
1139 }
1140 static void nvme_init_integrity(struct nvme_ns *ns)
1141 {
1142 }
1143 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1144 
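/*
 * noiob (namespace optimal I/O boundary) is reported in logical blocks,
 * while chunk_sectors is in 512-byte sectors, hence the shift by
 * (lba_shift - 9).  Example: noiob == 256 on a 4 KiB-block namespace gives
 * 256 << 3 == 2048 sectors, i.e. a 1 MiB boundary.
 */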
1145 static void nvme_set_chunk_size(struct nvme_ns *ns)
1146 {
1147 	u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1148 	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1149 }
1150 
1151 static void nvme_config_discard(struct nvme_ns *ns)
1152 {
1153 	struct nvme_ctrl *ctrl = ns->ctrl;
1154 	u32 logical_block_size = queue_logical_block_size(ns->queue);
1155 
1156 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1157 			NVME_DSM_MAX_RANGES);
1158 
1159 	if (ctrl->nr_streams && ns->sws && ns->sgs) {
1160 		unsigned int sz = logical_block_size * ns->sws * ns->sgs;
1161 
1162 		ns->queue->limits.discard_alignment = sz;
1163 		ns->queue->limits.discard_granularity = sz;
1164 	} else {
1165 		ns->queue->limits.discard_alignment = logical_block_size;
1166 		ns->queue->limits.discard_granularity = logical_block_size;
1167 	}
1168 	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
1169 	blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
1170 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
1171 
1172 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1173 		blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
1174 }
1175 
1176 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1177 		struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
1178 {
1179 	if (ctrl->vs >= NVME_VS(1, 1, 0))
1180 		memcpy(eui64, id->eui64, sizeof(id->eui64));
1181 	if (ctrl->vs >= NVME_VS(1, 2, 0))
1182 		memcpy(nguid, id->nguid, sizeof(id->nguid));
1183 	if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1184 		 /* Don't treat an error as fatal; we potentially
1185 		  * already have an NGUID or EUI-64.
1186 		  */
1187 		if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
1188 			dev_warn(ctrl->device,
1189 				 "%s: Identify Descriptors failed\n", __func__);
1190 	}
1191 }
1192 
1193 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1194 {
1195 	struct nvme_ns *ns = disk->private_data;
1196 	struct nvme_ctrl *ctrl = ns->ctrl;
1197 	u16 bs;
1198 
1199 	/*
1200 	 * If identify namespace failed, use a default 512-byte block size so
1201 	 * the block layer can use it before failing read/write for 0 capacity.
1202 	 */
1203 	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1204 	if (ns->lba_shift == 0)
1205 		ns->lba_shift = 9;
1206 	bs = 1 << ns->lba_shift;
1207 	ns->noiob = le16_to_cpu(id->noiob);
1208 
1209 	blk_mq_freeze_queue(disk->queue);
1210 
1211 	if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
1212 		nvme_prep_integrity(disk, id, bs);
1213 	blk_queue_logical_block_size(ns->queue, bs);
1214 	if (ns->noiob)
1215 		nvme_set_chunk_size(ns);
1216 	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
1217 		nvme_init_integrity(ns);
1218 	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
1219 		set_capacity(disk, 0);
1220 	else
1221 		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1222 
1223 	if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1224 		nvme_config_discard(ns);
1225 	blk_mq_unfreeze_queue(disk->queue);
1226 }
1227 
1228 static int nvme_revalidate_disk(struct gendisk *disk)
1229 {
1230 	struct nvme_ns *ns = disk->private_data;
1231 	struct nvme_ctrl *ctrl = ns->ctrl;
1232 	struct nvme_id_ns *id;
1233 	u8 eui64[8] = { 0 }, nguid[16] = { 0 };
1234 	uuid_t uuid = uuid_null;
1235 	int ret = 0;
1236 
1237 	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1238 		set_capacity(disk, 0);
1239 		return -ENODEV;
1240 	}
1241 
1242 	id = nvme_identify_ns(ctrl, ns->ns_id);
1243 	if (!id)
1244 		return -ENODEV;
1245 
1246 	if (id->ncap == 0) {
1247 		ret = -ENODEV;
1248 		goto out;
1249 	}
1250 
1251 	__nvme_revalidate_disk(disk, id);
1252 	nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
1253 	if (!uuid_equal(&ns->uuid, &uuid) ||
1254 	    memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
1255 	    memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
1256 		dev_err(ctrl->device,
1257 			"identifiers changed for nsid %d\n", ns->ns_id);
1258 		ret = -ENODEV;
1259 	}
1260 
1261 out:
1262 	kfree(id);
1263 	return ret;
1264 }
1265 
1266 static char nvme_pr_type(enum pr_type type)
1267 {
1268 	switch (type) {
1269 	case PR_WRITE_EXCLUSIVE:
1270 		return 1;
1271 	case PR_EXCLUSIVE_ACCESS:
1272 		return 2;
1273 	case PR_WRITE_EXCLUSIVE_REG_ONLY:
1274 		return 3;
1275 	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1276 		return 4;
1277 	case PR_WRITE_EXCLUSIVE_ALL_REGS:
1278 		return 5;
1279 	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1280 		return 6;
1281 	default:
1282 		return 0;
1283 	}
1284 };
1285 
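/*
 * All persistent reservation commands share a 16-byte payload: the current
 * reservation key in bytes 0-7 and the new/service-action key in bytes
 * 8-15, both little endian.  cdw10 carries the reservation action in bits
 * 2:0, the "ignore existing key" flag in bit 3 and, where applicable, the
 * reservation type in bits 15:8, as built by the callers below.
 */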
1286 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1287 				u64 key, u64 sa_key, u8 op)
1288 {
1289 	struct nvme_ns *ns = bdev->bd_disk->private_data;
1290 	struct nvme_command c;
1291 	u8 data[16] = { 0, };
1292 
1293 	put_unaligned_le64(key, &data[0]);
1294 	put_unaligned_le64(sa_key, &data[8]);
1295 
1296 	memset(&c, 0, sizeof(c));
1297 	c.common.opcode = op;
1298 	c.common.nsid = cpu_to_le32(ns->ns_id);
1299 	c.common.cdw10[0] = cpu_to_le32(cdw10);
1300 
1301 	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1302 }
1303 
1304 static int nvme_pr_register(struct block_device *bdev, u64 old,
1305 		u64 new, unsigned flags)
1306 {
1307 	u32 cdw10;
1308 
1309 	if (flags & ~PR_FL_IGNORE_KEY)
1310 		return -EOPNOTSUPP;
1311 
1312 	cdw10 = old ? 2 : 0;
1313 	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1314 	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1315 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1316 }
1317 
1318 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1319 		enum pr_type type, unsigned flags)
1320 {
1321 	u32 cdw10;
1322 
1323 	if (flags & ~PR_FL_IGNORE_KEY)
1324 		return -EOPNOTSUPP;
1325 
1326 	cdw10 = nvme_pr_type(type) << 8;
1327 	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1328 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1329 }
1330 
1331 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1332 		enum pr_type type, bool abort)
1333 {
1334 	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1335 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1336 }
1337 
1338 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1339 {
1340 	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1341 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1342 }
1343 
1344 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1345 {
1346 	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1347 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1348 }
1349 
1350 static const struct pr_ops nvme_pr_ops = {
1351 	.pr_register	= nvme_pr_register,
1352 	.pr_reserve	= nvme_pr_reserve,
1353 	.pr_release	= nvme_pr_release,
1354 	.pr_preempt	= nvme_pr_preempt,
1355 	.pr_clear	= nvme_pr_clear,
1356 };
1357 
1358 #ifdef CONFIG_BLK_SED_OPAL
1359 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1360 		bool send)
1361 {
1362 	struct nvme_ctrl *ctrl = data;
1363 	struct nvme_command cmd;
1364 
1365 	memset(&cmd, 0, sizeof(cmd));
1366 	if (send)
1367 		cmd.common.opcode = nvme_admin_security_send;
1368 	else
1369 		cmd.common.opcode = nvme_admin_security_recv;
1370 	cmd.common.nsid = 0;
1371 	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1372 	cmd.common.cdw10[1] = cpu_to_le32(len);
1373 
1374 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1375 				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
1376 }
1377 EXPORT_SYMBOL_GPL(nvme_sec_submit);
1378 #endif /* CONFIG_BLK_SED_OPAL */
1379 
1380 static const struct block_device_operations nvme_fops = {
1381 	.owner		= THIS_MODULE,
1382 	.ioctl		= nvme_ioctl,
1383 	.compat_ioctl	= nvme_compat_ioctl,
1384 	.open		= nvme_open,
1385 	.release	= nvme_release,
1386 	.getgeo		= nvme_getgeo,
1387 	.revalidate_disk= nvme_revalidate_disk,
1388 	.pr_ops		= &nvme_pr_ops,
1389 };
1390 
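/*
 * CAP.TO is the worst-case time the controller may take to become ready,
 * in 500 ms units, hence the (CAP_TIMEOUT + 1) * HZ / 2 deadline below.
 * A CSTS value of all ones means register reads are failing, i.e. the
 * device has most likely gone away.
 */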
1391 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1392 {
1393 	unsigned long timeout =
1394 		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1395 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1396 	int ret;
1397 
1398 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1399 		if (csts == ~0)
1400 			return -ENODEV;
1401 		if ((csts & NVME_CSTS_RDY) == bit)
1402 			break;
1403 
1404 		msleep(100);
1405 		if (fatal_signal_pending(current))
1406 			return -EINTR;
1407 		if (time_after(jiffies, timeout)) {
1408 			dev_err(ctrl->device,
1409 				"Device not ready; aborting %s\n", enabled ?
1410 						"initialisation" : "reset");
1411 			return -ENODEV;
1412 		}
1413 	}
1414 
1415 	return ret;
1416 }
1417 
1418 /*
1419  * If the device has been passed off to us in an enabled state, just clear
1420  * the enabled bit.  The spec says we should set the 'shutdown notification
1421  * bits', but doing so may cause the device to complete commands to the
1422  * admin queue ... and we don't know what memory that might be pointing at!
1423  */
1424 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1425 {
1426 	int ret;
1427 
1428 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1429 	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1430 
1431 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1432 	if (ret)
1433 		return ret;
1434 
1435 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
1436 		msleep(NVME_QUIRK_DELAY_AMOUNT);
1437 
1438 	return nvme_wait_ready(ctrl, cap, false);
1439 }
1440 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1441 
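/*
 * A rough sketch of what CC ends up containing for the default 4 KiB page
 * size assumed below: EN = 1, CSS = 0 (NVM command set), MPS =
 * page_shift - 12 = 0 (i.e. 2^12-byte pages), AMS = 0 (round robin),
 * SHN = 0, IOSQES = 6 (64-byte SQ entries) and IOCQES = 4 (16-byte CQ
 * entries).
 */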
1442 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1443 {
1444 	/*
1445 	 * Default to a 4K page size, with the intention to update this
1446 	 * path in the future to accommodate architectures with differing
1447 	 * kernel and IO page sizes.
1448 	 */
1449 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1450 	int ret;
1451 
1452 	if (page_shift < dev_page_min) {
1453 		dev_err(ctrl->device,
1454 			"Minimum device page size %u too large for host (%u)\n",
1455 			1 << dev_page_min, 1 << page_shift);
1456 		return -ENODEV;
1457 	}
1458 
1459 	ctrl->page_size = 1 << page_shift;
1460 
1461 	ctrl->ctrl_config = NVME_CC_CSS_NVM;
1462 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1463 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
1464 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1465 	ctrl->ctrl_config |= NVME_CC_ENABLE;
1466 
1467 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1468 	if (ret)
1469 		return ret;
1470 	return nvme_wait_ready(ctrl, cap, true);
1471 }
1472 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1473 
1474 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1475 {
1476 	unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
1477 	u32 csts;
1478 	int ret;
1479 
1480 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1481 	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1482 
1483 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1484 	if (ret)
1485 		return ret;
1486 
1487 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1488 		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1489 			break;
1490 
1491 		msleep(100);
1492 		if (fatal_signal_pending(current))
1493 			return -EINTR;
1494 		if (time_after(jiffies, timeout)) {
1495 			dev_err(ctrl->device,
1496 				"Device shutdown incomplete; abort shutdown\n");
1497 			return -ENODEV;
1498 		}
1499 	}
1500 
1501 	return ret;
1502 }
1503 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1504 
1505 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1506 		struct request_queue *q)
1507 {
1508 	bool vwc = false;
1509 
1510 	if (ctrl->max_hw_sectors) {
1511 		u32 max_segments =
1512 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1513 
1514 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1515 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1516 	}
1517 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1518 	    is_power_of_2(ctrl->max_hw_sectors))
1519 		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1520 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
1521 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1522 		vwc = true;
1523 	blk_queue_write_cache(q, vwc, vwc);
1524 }
1525 
1526 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1527 {
1528 	__le64 ts;
1529 	int ret;
1530 
1531 	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
1532 		return 0;
1533 
1534 	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
1535 	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
1536 			NULL);
1537 	if (ret)
1538 		dev_warn_once(ctrl->device,
1539 			"could not set timestamp (%d)\n", ret);
1540 	return ret;
1541 }
1542 
1543 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1544 {
1545 	/*
1546 	 * APST (Autonomous Power State Transition) lets us program a
1547 	 * table of power state transitions that the controller will
1548 	 * perform automatically.  We configure it with a simple
1549 	 * heuristic: we are willing to spend at most 2% of the time
1550 	 * transitioning between power states.  Therefore, when running
1551 	 * in any given state, we will enter the next lower-power
1552 	 * non-operational state after waiting 50 * (enlat + exlat)
1553 	 * microseconds, as long as that state's exit latency is under
1554 	 * the requested maximum latency.
1555 	 *
1556 	 * We will not autonomously enter any non-operational state for
1557 	 * which the total latency exceeds ps_max_latency_us.  Users
1558 	 * can set ps_max_latency_us to zero to turn off APST.
1559 	 */
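	/*
	 * Worked example with illustrative numbers: a non-operational state
	 * with enlat = 5,000 us and exlat = 20,000 us has a total latency of
	 * 25,000 us, under the default 100,000 us limit, so it is usable;
	 * the idle timer becomes 50 * 25,000 us = 1,250 ms, which the loop
	 * below computes as (25,000 + 19) / 20.
	 */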
1560 
1561 	unsigned apste;
1562 	struct nvme_feat_auto_pst *table;
1563 	u64 max_lat_us = 0;
1564 	int max_ps = -1;
1565 	int ret;
1566 
1567 	/*
1568 	 * If APST isn't supported or if we haven't been initialized yet,
1569 	 * then don't do anything.
1570 	 */
1571 	if (!ctrl->apsta)
1572 		return 0;
1573 
1574 	if (ctrl->npss > 31) {
1575 		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1576 		return 0;
1577 	}
1578 
1579 	table = kzalloc(sizeof(*table), GFP_KERNEL);
1580 	if (!table)
1581 		return 0;
1582 
1583 	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
1584 		/* Turn off APST. */
1585 		apste = 0;
1586 		dev_dbg(ctrl->device, "APST disabled\n");
1587 	} else {
1588 		__le64 target = cpu_to_le64(0);
1589 		int state;
1590 
1591 		/*
1592 		 * Walk through all states from lowest- to highest-power.
1593 		 * According to the spec, lower-numbered states use more
1594 		 * power.  NPSS, despite the name, is the index of the
1595 		 * lowest-power state, not the number of states.
1596 		 */
1597 		for (state = (int)ctrl->npss; state >= 0; state--) {
1598 			u64 total_latency_us, exit_latency_us, transition_ms;
1599 
1600 			if (target)
1601 				table->entries[state] = target;
1602 
1603 			/*
1604 			 * Don't allow transitions to the deepest state
1605 			 * if it's quirked off.
1606 			 */
1607 			if (state == ctrl->npss &&
1608 			    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
1609 				continue;
1610 
1611 			/*
1612 			 * Is this state a useful non-operational state for
1613 			 * higher-power states to autonomously transition to?
1614 			 */
1615 			if (!(ctrl->psd[state].flags &
1616 			      NVME_PS_FLAGS_NON_OP_STATE))
1617 				continue;
1618 
1619 			exit_latency_us =
1620 				(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
1621 			if (exit_latency_us > ctrl->ps_max_latency_us)
1622 				continue;
1623 
1624 			total_latency_us =
1625 				exit_latency_us +
1626 				le32_to_cpu(ctrl->psd[state].entry_lat);
1627 
1628 			/*
1629 			 * This state is good.  Use it as the APST idle
1630 			 * target for higher power states.
1631 			 */
1632 			transition_ms = total_latency_us + 19;
1633 			do_div(transition_ms, 20);
1634 			if (transition_ms > (1 << 24) - 1)
1635 				transition_ms = (1 << 24) - 1;
1636 
1637 			target = cpu_to_le64((state << 3) |
1638 					     (transition_ms << 8));
1639 
1640 			if (max_ps == -1)
1641 				max_ps = state;
1642 
1643 			if (total_latency_us > max_lat_us)
1644 				max_lat_us = total_latency_us;
1645 		}
1646 
1647 		apste = 1;
1648 
1649 		if (max_ps == -1) {
1650 			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
1651 		} else {
1652 			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
1653 				max_ps, max_lat_us, (int)sizeof(*table), table);
1654 		}
1655 	}
1656 
1657 	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1658 				table, sizeof(*table), NULL);
1659 	if (ret)
1660 		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1661 
1662 	kfree(table);
1663 	return ret;
1664 }
1665 
1666 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
1667 {
1668 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1669 	u64 latency;
1670 
1671 	switch (val) {
1672 	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
1673 	case PM_QOS_LATENCY_ANY:
1674 		latency = U64_MAX;
1675 		break;
1676 
1677 	default:
1678 		latency = val;
1679 	}
1680 
1681 	if (ctrl->ps_max_latency_us != latency) {
1682 		ctrl->ps_max_latency_us = latency;
1683 		nvme_configure_apst(ctrl);
1684 	}
1685 }
1686 
1687 struct nvme_core_quirk_entry {
1688 	/*
1689 	 * NVMe model and firmware strings are padded with spaces.  For
1690 	 * simplicity, strings in the quirk table are padded with NULLs
1691 	 * instead.
1692 	 */
1693 	u16 vid;
1694 	const char *mn;
1695 	const char *fr;
1696 	unsigned long quirks;
1697 };
1698 
1699 static const struct nvme_core_quirk_entry core_quirks[] = {
1700 	{
1701 		/*
1702 		 * This Toshiba device seems to die using any APST states.  See:
1703 		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
1704 		 */
1705 		.vid = 0x1179,
1706 		.mn = "THNSF5256GPUK TOSHIBA",
1707 		.quirks = NVME_QUIRK_NO_APST,
1708 	}
1709 };
1710 
1711 /* match is null-terminated but idstr is space-padded. */
1712 static bool string_matches(const char *idstr, const char *match, size_t len)
1713 {
1714 	size_t matchlen;
1715 
1716 	if (!match)
1717 		return true;
1718 
1719 	matchlen = strlen(match);
1720 	WARN_ON_ONCE(matchlen > len);
1721 
1722 	if (memcmp(idstr, match, matchlen))
1723 		return false;
1724 
1725 	for (; matchlen < len; matchlen++)
1726 		if (idstr[matchlen] != ' ')
1727 			return false;
1728 
1729 	return true;
1730 }
1731 
1732 static bool quirk_matches(const struct nvme_id_ctrl *id,
1733 			  const struct nvme_core_quirk_entry *q)
1734 {
1735 	return q->vid == le16_to_cpu(id->vid) &&
1736 		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
1737 		string_matches(id->fr, q->fr, sizeof(id->fr));
1738 }
1739 
1740 static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1741 {
1742 	size_t nqnlen;
1743 	int off;
1744 
1745 	nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
1746 	if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
1747 		strcpy(ctrl->subnqn, id->subnqn);
1748 		return;
1749 	}
1750 
1751 	if (ctrl->vs >= NVME_VS(1, 2, 1))
1752 		dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
1753 
1754 	/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
1755 	off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE,
1756 			"nqn.2014.08.org.nvmexpress:%04x%04x",
1757 			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
1758 	memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn));
1759 	off += sizeof(id->sn);
1760 	memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn));
1761 	off += sizeof(id->mn);
1762 	memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off);
1763 }
1764 
1765 /*
1766  * Initialize the cached copies of the Identify data and various controller
1767  * registers in our nvme_ctrl structure.  This should be called as soon as
1768  * the admin queue is fully up and running.
1769  */
1770 int nvme_init_identify(struct nvme_ctrl *ctrl)
1771 {
1772 	struct nvme_id_ctrl *id;
1773 	u64 cap;
1774 	int ret, page_shift;
1775 	u32 max_hw_sectors;
1776 	bool prev_apst_enabled;
1777 
1778 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
1779 	if (ret) {
1780 		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
1781 		return ret;
1782 	}
1783 
1784 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
1785 	if (ret) {
1786 		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
1787 		return ret;
1788 	}
1789 	page_shift = NVME_CAP_MPSMIN(cap) + 12;
1790 
1791 	if (ctrl->vs >= NVME_VS(1, 1, 0))
1792 		ctrl->subsystem = NVME_CAP_NSSRC(cap);
1793 
1794 	ret = nvme_identify_ctrl(ctrl, &id);
1795 	if (ret) {
1796 		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
1797 		return -EIO;
1798 	}
1799 
1800 	nvme_init_subnqn(ctrl, id);
1801 
1802 	if (!ctrl->identified) {
1803 		/*
1804 		 * Check for quirks.  Quirk can depend on firmware version,
1805 		 * so, in principle, the set of quirks present can change
1806 		 * across a reset.  As a possible future enhancement, we
1807 		 * could re-scan for quirks every time we reinitialize
1808 		 * the device, but we'd have to make sure that the driver
1809 		 * behaves intelligently if the quirks change.
1810 		 */
1811 
1812 		int i;
1813 
1814 		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
1815 			if (quirk_matches(id, &core_quirks[i]))
1816 				ctrl->quirks |= core_quirks[i].quirks;
1817 		}
1818 	}
1819 
1820 	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
1821 		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
1822 		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
1823 	}
1824 
1825 	ctrl->oacs = le16_to_cpu(id->oacs);
1826 	ctrl->vid = le16_to_cpu(id->vid);
1827 	ctrl->oncs = le16_to_cpup(&id->oncs);
1828 	atomic_set(&ctrl->abort_limit, id->acl + 1);
1829 	ctrl->vwc = id->vwc;
1830 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
1831 	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
1832 	memcpy(ctrl->model, id->mn, sizeof(id->mn));
1833 	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
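	/*
	 * MDTS is a power of two in units of the minimum page size
	 * (2 ^ (12 + CAP.MPSMIN) bytes), so the limit in 512-byte sectors
	 * is 1 << (mdts + page_shift - 9); 0 means no limit.
	 */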
1834 	if (id->mdts)
1835 		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
1836 	else
1837 		max_hw_sectors = UINT_MAX;
1838 	ctrl->max_hw_sectors =
1839 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
1840 
1841 	nvme_set_queue_limits(ctrl, ctrl->admin_q);
1842 	ctrl->sgls = le32_to_cpu(id->sgls);
1843 	ctrl->kas = le16_to_cpu(id->kas);
1844 
1845 	if (id->rtd3e) {
1846 		/* us -> s */
1847 		u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
1848 
1849 		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
1850 						 shutdown_timeout, 60);
1851 
1852 		if (ctrl->shutdown_timeout != shutdown_timeout)
1853 			dev_warn(ctrl->device,
1854 				 "Shutdown timeout set to %u seconds\n",
1855 				 ctrl->shutdown_timeout);
1856 	} else
1857 		ctrl->shutdown_timeout = shutdown_timeout;
1858 
1859 	ctrl->npss = id->npss;
1860 	ctrl->apsta = id->apsta;
1861 	prev_apst_enabled = ctrl->apst_enabled;
1862 	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
1863 		if (force_apst && id->apsta) {
1864 			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
1865 			ctrl->apst_enabled = true;
1866 		} else {
1867 			ctrl->apst_enabled = false;
1868 		}
1869 	} else {
1870 		ctrl->apst_enabled = id->apsta;
1871 	}
1872 	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
1873 
1874 	if (ctrl->ops->flags & NVME_F_FABRICS) {
1875 		ctrl->icdoff = le16_to_cpu(id->icdoff);
1876 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
1877 		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
1878 		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
1879 
1880 		/*
1881 		 * In fabrics we need to verify that the cntlid matches the
1882 		 * value returned by the admin connect.
1883 		 */
1884 		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
1885 			ret = -EINVAL;
1886 			goto out_free;
1887 		}
1888 
1889 		if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
1890 			dev_err(ctrl->device,
1891 				"keep-alive support is mandatory for fabrics\n");
1892 			ret = -EINVAL;
1893 			goto out_free;
1894 		}
1895 	} else {
1896 		ctrl->cntlid = le16_to_cpu(id->cntlid);
1897 		ctrl->hmpre = le32_to_cpu(id->hmpre);
1898 		ctrl->hmmin = le32_to_cpu(id->hmmin);
1899 		ctrl->hmminds = le32_to_cpu(id->hmminds);
1900 		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
1901 	}
1902 
1903 	kfree(id);
1904 
1905 	if (ctrl->apst_enabled && !prev_apst_enabled)
1906 		dev_pm_qos_expose_latency_tolerance(ctrl->device);
1907 	else if (!ctrl->apst_enabled && prev_apst_enabled)
1908 		dev_pm_qos_hide_latency_tolerance(ctrl->device);
1909 
1910 	ret = nvme_configure_apst(ctrl);
1911 	if (ret < 0)
1912 		return ret;
1913 
1914 	ret = nvme_configure_timestamp(ctrl);
1915 	if (ret < 0)
1916 		return ret;
1917 
1918 	ret = nvme_configure_directives(ctrl);
1919 	if (ret < 0)
1920 		return ret;
1921 
1922 	ctrl->identified = true;
1923 
1924 	return 0;
1925 
1926 out_free:
1927 	kfree(id);
1928 	return ret;
1929 }
1930 EXPORT_SYMBOL_GPL(nvme_init_identify);
1931 
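/*
 * Char device open: look up the controller that owns this minor number and
 * take a reference on it, failing with -EWOULDBLOCK if its admin queue is
 * not (yet) available.
 */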
1932 static int nvme_dev_open(struct inode *inode, struct file *file)
1933 {
1934 	struct nvme_ctrl *ctrl;
1935 	int instance = iminor(inode);
1936 	int ret = -ENODEV;
1937 
1938 	spin_lock(&dev_list_lock);
1939 	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
1940 		if (ctrl->instance != instance)
1941 			continue;
1942 
1943 		if (!ctrl->admin_q) {
1944 			ret = -EWOULDBLOCK;
1945 			break;
1946 		}
1947 		if (!kref_get_unless_zero(&ctrl->kref))
1948 			break;
1949 		file->private_data = ctrl;
1950 		ret = 0;
1951 		break;
1952 	}
1953 	spin_unlock(&dev_list_lock);
1954 
1955 	return ret;
1956 }
1957 
1958 static int nvme_dev_release(struct inode *inode, struct file *file)
1959 {
1960 	nvme_put_ctrl(file->private_data);
1961 	return 0;
1962 }
1963 
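/*
 * Handle the deprecated NVME_IOCTL_IO_CMD on the controller char device:
 * only allowed when exactly one namespace is present, in which case the
 * command is forwarded to that namespace.
 */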
1964 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
1965 {
1966 	struct nvme_ns *ns;
1967 	int ret;
1968 
1969 	mutex_lock(&ctrl->namespaces_mutex);
1970 	if (list_empty(&ctrl->namespaces)) {
1971 		ret = -ENOTTY;
1972 		goto out_unlock;
1973 	}
1974 
1975 	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
1976 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
1977 		dev_warn(ctrl->device,
1978 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
1979 		ret = -EINVAL;
1980 		goto out_unlock;
1981 	}
1982 
1983 	dev_warn(ctrl->device,
1984 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
1985 	kref_get(&ns->kref);
1986 	mutex_unlock(&ctrl->namespaces_mutex);
1987 
1988 	ret = nvme_user_cmd(ctrl, ns, argp);
1989 	nvme_put_ns(ns);
1990 	return ret;
1991 
1992 out_unlock:
1993 	mutex_unlock(&ctrl->namespaces_mutex);
1994 	return ret;
1995 }
1996 
1997 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
1998 		unsigned long arg)
1999 {
2000 	struct nvme_ctrl *ctrl = file->private_data;
2001 	void __user *argp = (void __user *)arg;
2002 
2003 	switch (cmd) {
2004 	case NVME_IOCTL_ADMIN_CMD:
2005 		return nvme_user_cmd(ctrl, NULL, argp);
2006 	case NVME_IOCTL_IO_CMD:
2007 		return nvme_dev_user_cmd(ctrl, argp);
2008 	case NVME_IOCTL_RESET:
2009 		dev_warn(ctrl->device, "resetting controller\n");
2010 		return nvme_reset_ctrl_sync(ctrl);
2011 	case NVME_IOCTL_SUBSYS_RESET:
2012 		return nvme_reset_subsystem(ctrl);
2013 	case NVME_IOCTL_RESCAN:
2014 		nvme_queue_scan(ctrl);
2015 		return 0;
2016 	default:
2017 		return -ENOTTY;
2018 	}
2019 }
2020 
2021 static const struct file_operations nvme_dev_fops = {
2022 	.owner		= THIS_MODULE,
2023 	.open		= nvme_dev_open,
2024 	.release	= nvme_dev_release,
2025 	.unlocked_ioctl	= nvme_dev_ioctl,
2026 	.compat_ioctl	= nvme_dev_ioctl,
2027 };
2028 
2029 static ssize_t nvme_sysfs_reset(struct device *dev,
2030 				struct device_attribute *attr, const char *buf,
2031 				size_t count)
2032 {
2033 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2034 	int ret;
2035 
2036 	ret = nvme_reset_ctrl_sync(ctrl);
2037 	if (ret < 0)
2038 		return ret;
2039 	return count;
2040 }
2041 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
2042 
2043 static ssize_t nvme_sysfs_rescan(struct device *dev,
2044 				struct device_attribute *attr, const char *buf,
2045 				size_t count)
2046 {
2047 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2048 
2049 	nvme_queue_scan(ctrl);
2050 	return count;
2051 }
2052 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
2053 
2054 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2055 								char *buf)
2056 {
2057 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2058 	struct nvme_ctrl *ctrl = ns->ctrl;
2059 	int serial_len = sizeof(ctrl->serial);
2060 	int model_len = sizeof(ctrl->model);
2061 
2062 	if (!uuid_is_null(&ns->uuid))
2063 		return sprintf(buf, "uuid.%pU\n", &ns->uuid);
2064 
2065 	if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2066 		return sprintf(buf, "eui.%16phN\n", ns->nguid);
2067 
2068 	if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
2069 		return sprintf(buf, "eui.%8phN\n", ns->eui);
2070 
2071 	while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
2072 				  ctrl->serial[serial_len - 1] == '\0'))
2073 		serial_len--;
2074 	while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
2075 				 ctrl->model[model_len - 1] == '\0'))
2076 		model_len--;
2077 
2078 	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
2079 		serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
2080 }
2081 static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
2082 
2083 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2084 			  char *buf)
2085 {
2086 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2087 	return sprintf(buf, "%pU\n", ns->nguid);
2088 }
2089 static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
2090 
2091 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2092 								char *buf)
2093 {
2094 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2095 
2096 	/* For backward compatibility, expose the NGUID to userspace if
2097 	 * we have no UUID set.
2098 	 */
2099 	if (uuid_is_null(&ns->uuid)) {
2100 		printk_ratelimited(KERN_WARNING
2101 				   "No UUID available, providing old NGUID\n");
2102 		return sprintf(buf, "%pU\n", ns->nguid);
2103 	}
2104 	return sprintf(buf, "%pU\n", &ns->uuid);
2105 }
2106 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
2107 
2108 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2109 								char *buf)
2110 {
2111 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2112 	return sprintf(buf, "%8phd\n", ns->eui);
2113 }
2114 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
2115 
2116 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2117 								char *buf)
2118 {
2119 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2120 	return sprintf(buf, "%d\n", ns->ns_id);
2121 }
2122 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
2123 
2124 static struct attribute *nvme_ns_attrs[] = {
2125 	&dev_attr_wwid.attr,
2126 	&dev_attr_uuid.attr,
2127 	&dev_attr_nguid.attr,
2128 	&dev_attr_eui.attr,
2129 	&dev_attr_nsid.attr,
2130 	NULL,
2131 };
2132 
2133 static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
2134 		struct attribute *a, int n)
2135 {
2136 	struct device *dev = container_of(kobj, struct device, kobj);
2137 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2138 
2139 	if (a == &dev_attr_uuid.attr) {
2140 		if (uuid_is_null(&ns->uuid) &&
2141 		    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2142 			return 0;
2143 	}
2144 	if (a == &dev_attr_nguid.attr) {
2145 		if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2146 			return 0;
2147 	}
2148 	if (a == &dev_attr_eui.attr) {
2149 		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
2150 			return 0;
2151 	}
2152 	return a->mode;
2153 }
2154 
2155 static const struct attribute_group nvme_ns_attr_group = {
2156 	.attrs		= nvme_ns_attrs,
2157 	.is_visible	= nvme_ns_attrs_are_visible,
2158 };
2159 
2160 #define nvme_show_str_function(field)						\
2161 static ssize_t  field##_show(struct device *dev,				\
2162 			    struct device_attribute *attr, char *buf)		\
2163 {										\
2164         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2165         return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);	\
2166 }										\
2167 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2168 
2169 #define nvme_show_int_function(field)						\
2170 static ssize_t  field##_show(struct device *dev,				\
2171 			    struct device_attribute *attr, char *buf)		\
2172 {										\
2173         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2174         return sprintf(buf, "%d\n", ctrl->field);	\
2175 }										\
2176 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2177 
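/*
 * For example, nvme_show_str_function(model) expands to a model_show()
 * that prints ctrl->model and a matching read-only dev_attr_model.
 */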
2178 nvme_show_str_function(model);
2179 nvme_show_str_function(serial);
2180 nvme_show_str_function(firmware_rev);
2181 nvme_show_int_function(cntlid);
2182 
2183 static ssize_t nvme_sysfs_delete(struct device *dev,
2184 				struct device_attribute *attr, const char *buf,
2185 				size_t count)
2186 {
2187 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2188 
2189 	if (device_remove_file_self(dev, attr))
2190 		ctrl->ops->delete_ctrl(ctrl);
2191 	return count;
2192 }
2193 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
2194 
2195 static ssize_t nvme_sysfs_show_transport(struct device *dev,
2196 					 struct device_attribute *attr,
2197 					 char *buf)
2198 {
2199 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2200 
2201 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
2202 }
2203 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
2204 
2205 static ssize_t nvme_sysfs_show_state(struct device *dev,
2206 				     struct device_attribute *attr,
2207 				     char *buf)
2208 {
2209 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2210 	static const char *const state_name[] = {
2211 		[NVME_CTRL_NEW]		= "new",
2212 		[NVME_CTRL_LIVE]	= "live",
2213 		[NVME_CTRL_RESETTING]	= "resetting",
2214 		[NVME_CTRL_RECONNECTING]= "reconnecting",
2215 		[NVME_CTRL_DELETING]	= "deleting",
2216 		[NVME_CTRL_DEAD]	= "dead",
2217 	};
2218 
2219 	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
2220 	    state_name[ctrl->state])
2221 		return sprintf(buf, "%s\n", state_name[ctrl->state]);
2222 
2223 	return sprintf(buf, "unknown state\n");
2224 }
2225 
2226 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
2227 
2228 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2229 					 struct device_attribute *attr,
2230 					 char *buf)
2231 {
2232 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2233 
2234 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn);
2235 }
2236 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2237 
2238 static ssize_t nvme_sysfs_show_address(struct device *dev,
2239 					 struct device_attribute *attr,
2240 					 char *buf)
2241 {
2242 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2243 
2244 	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
2245 }
2246 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
2247 
2248 static struct attribute *nvme_dev_attrs[] = {
2249 	&dev_attr_reset_controller.attr,
2250 	&dev_attr_rescan_controller.attr,
2251 	&dev_attr_model.attr,
2252 	&dev_attr_serial.attr,
2253 	&dev_attr_firmware_rev.attr,
2254 	&dev_attr_cntlid.attr,
2255 	&dev_attr_delete_controller.attr,
2256 	&dev_attr_transport.attr,
2257 	&dev_attr_subsysnqn.attr,
2258 	&dev_attr_address.attr,
2259 	&dev_attr_state.attr,
2260 	NULL
2261 };
2262 
2263 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
2264 		struct attribute *a, int n)
2265 {
2266 	struct device *dev = container_of(kobj, struct device, kobj);
2267 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2268 
2269 	if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
2270 		return 0;
2271 	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
2272 		return 0;
2273 
2274 	return a->mode;
2275 }
2276 
2277 static struct attribute_group nvme_dev_attrs_group = {
2278 	.attrs		= nvme_dev_attrs,
2279 	.is_visible	= nvme_dev_attrs_are_visible,
2280 };
2281 
2282 static const struct attribute_group *nvme_dev_attr_groups[] = {
2283 	&nvme_dev_attrs_group,
2284 	NULL,
2285 };
2286 
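/* list_sort() comparator: order namespaces by ascending ns_id. */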
2287 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2288 {
2289 	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2290 	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2291 
2292 	return nsa->ns_id - nsb->ns_id;
2293 }
2294 
2295 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2296 {
2297 	struct nvme_ns *ns, *ret = NULL;
2298 
2299 	mutex_lock(&ctrl->namespaces_mutex);
2300 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2301 		if (ns->ns_id == nsid) {
2302 			if (!kref_get_unless_zero(&ns->kref))
2303 				continue;
2304 			ret = ns;
2305 			break;
2306 		}
2307 		if (ns->ns_id > nsid)
2308 			break;
2309 	}
2310 	mutex_unlock(&ctrl->namespaces_mutex);
2311 	return ret;
2312 }
2313 
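/*
 * If the controller has streams enabled, fetch the per-namespace stream
 * parameters and use them to advertise I/O hints: io_min is the stream
 * write size (SWS) and io_opt is SWS * stream granularity size (SGS),
 * both converted from LBAs to bytes.
 */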
2314 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2315 {
2316 	struct streams_directive_params s;
2317 	int ret;
2318 
2319 	if (!ctrl->nr_streams)
2320 		return 0;
2321 
2322 	ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
2323 	if (ret)
2324 		return ret;
2325 
2326 	ns->sws = le32_to_cpu(s.sws);
2327 	ns->sgs = le16_to_cpu(s.sgs);
2328 
2329 	if (ns->sws) {
2330 		unsigned int bs = 1 << ns->lba_shift;
2331 
2332 		blk_queue_io_min(ns->queue, bs * ns->sws);
2333 		if (ns->sgs)
2334 			blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
2335 	}
2336 
2337 	return 0;
2338 }
2339 
2340 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2341 {
2342 	struct nvme_ns *ns;
2343 	struct gendisk *disk;
2344 	struct nvme_id_ns *id;
2345 	char disk_name[DISK_NAME_LEN];
2346 	int node = dev_to_node(ctrl->dev);
2347 
2348 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2349 	if (!ns)
2350 		return;
2351 
2352 	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
2353 	if (ns->instance < 0)
2354 		goto out_free_ns;
2355 
2356 	ns->queue = blk_mq_init_queue(ctrl->tagset);
2357 	if (IS_ERR(ns->queue))
2358 		goto out_release_instance;
2359 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2360 	ns->queue->queuedata = ns;
2361 	ns->ctrl = ctrl;
2362 
2363 	kref_init(&ns->kref);
2364 	ns->ns_id = nsid;
2365 	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2366 
2367 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2368 	nvme_set_queue_limits(ctrl, ns->queue);
2369 	nvme_setup_streams_ns(ctrl, ns);
2370 
2371 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
2372 
2373 	id = nvme_identify_ns(ctrl, nsid);
2374 	if (!id)
2375 		goto out_free_queue;
2376 
2377 	if (id->ncap == 0)
2378 		goto out_free_id;
2379 
2380 	nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
2381 
2382 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
2383 		if (nvme_nvm_register(ns, disk_name, node)) {
2384 			dev_warn(ctrl->device, "LightNVM init failure\n");
2385 			goto out_free_id;
2386 		}
2387 	}
2388 
2389 	disk = alloc_disk_node(0, node);
2390 	if (!disk)
2391 		goto out_free_id;
2392 
2393 	disk->fops = &nvme_fops;
2394 	disk->private_data = ns;
2395 	disk->queue = ns->queue;
2396 	disk->flags = GENHD_FL_EXT_DEVT;
2397 	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
2398 	ns->disk = disk;
2399 
2400 	__nvme_revalidate_disk(disk, id);
2401 
2402 	mutex_lock(&ctrl->namespaces_mutex);
2403 	list_add_tail(&ns->list, &ctrl->namespaces);
2404 	mutex_unlock(&ctrl->namespaces_mutex);
2405 
2406 	kref_get(&ctrl->kref);
2407 
2408 	kfree(id);
2409 
2410 	device_add_disk(ctrl->device, ns->disk);
2411 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
2412 					&nvme_ns_attr_group))
2413 		pr_warn("%s: failed to create sysfs group for identification\n",
2414 			ns->disk->disk_name);
2415 	if (ns->ndev && nvme_nvm_register_sysfs(ns))
2416 		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
2417 			ns->disk->disk_name);
2418 	return;
2419  out_free_id:
2420 	kfree(id);
2421  out_free_queue:
2422 	blk_cleanup_queue(ns->queue);
2423  out_release_instance:
2424 	ida_simple_remove(&ctrl->ns_ida, ns->instance);
2425  out_free_ns:
2426 	kfree(ns);
2427 }
2428 
2429 static void nvme_ns_remove(struct nvme_ns *ns)
2430 {
2431 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
2432 		return;
2433 
2434 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
2435 		if (blk_get_integrity(ns->disk))
2436 			blk_integrity_unregister(ns->disk);
2437 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
2438 					&nvme_ns_attr_group);
2439 		if (ns->ndev)
2440 			nvme_nvm_unregister_sysfs(ns);
2441 		del_gendisk(ns->disk);
2442 		blk_cleanup_queue(ns->queue);
2443 	}
2444 
2445 	mutex_lock(&ns->ctrl->namespaces_mutex);
2446 	list_del_init(&ns->list);
2447 	mutex_unlock(&ns->ctrl->namespaces_mutex);
2448 
2449 	nvme_put_ns(ns);
2450 }
2451 
2452 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2453 {
2454 	struct nvme_ns *ns;
2455 
2456 	ns = nvme_find_get_ns(ctrl, nsid);
2457 	if (ns) {
2458 		if (ns->disk && revalidate_disk(ns->disk))
2459 			nvme_ns_remove(ns);
2460 		nvme_put_ns(ns);
2461 	} else
2462 		nvme_alloc_ns(ctrl, nsid);
2463 }
2464 
2465 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
2466 					unsigned nsid)
2467 {
2468 	struct nvme_ns *ns, *next;
2469 
2470 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
2471 		if (ns->ns_id > nsid)
2472 			nvme_ns_remove(ns);
2473 	}
2474 }
2475 
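/*
 * Scan namespaces using the Identify Namespace List (1024 NSIDs per 4k
 * page): validate every active NSID that is reported and remove any
 * namespaces that fall into the gaps or beyond the last reported NSID.
 */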
2476 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
2477 {
2478 	struct nvme_ns *ns;
2479 	__le32 *ns_list;
2480 	unsigned i, j, nsid, prev = 0;
2481 	unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
2482 	int ret = 0;
2483 
2484 	ns_list = kzalloc(0x1000, GFP_KERNEL);
2485 	if (!ns_list)
2486 		return -ENOMEM;
2487 
2488 	for (i = 0; i < num_lists; i++) {
2489 		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
2490 		if (ret)
2491 			goto free;
2492 
2493 		for (j = 0; j < min(nn, 1024U); j++) {
2494 			nsid = le32_to_cpu(ns_list[j]);
2495 			if (!nsid)
2496 				goto out;
2497 
2498 			nvme_validate_ns(ctrl, nsid);
2499 
2500 			while (++prev < nsid) {
2501 				ns = nvme_find_get_ns(ctrl, prev);
2502 				if (ns) {
2503 					nvme_ns_remove(ns);
2504 					nvme_put_ns(ns);
2505 				}
2506 			}
2507 		}
2508 		nn -= j;
2509 	}
2510  out:
2511 	nvme_remove_invalid_namespaces(ctrl, prev);
2512  free:
2513 	kfree(ns_list);
2514 	return ret;
2515 }
2516 
2517 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
2518 {
2519 	unsigned i;
2520 
2521 	for (i = 1; i <= nn; i++)
2522 		nvme_validate_ns(ctrl, i);
2523 
2524 	nvme_remove_invalid_namespaces(ctrl, nn);
2525 }
2526 
2527 static void nvme_scan_work(struct work_struct *work)
2528 {
2529 	struct nvme_ctrl *ctrl =
2530 		container_of(work, struct nvme_ctrl, scan_work);
2531 	struct nvme_id_ctrl *id;
2532 	unsigned nn;
2533 
2534 	if (ctrl->state != NVME_CTRL_LIVE)
2535 		return;
2536 
2537 	if (nvme_identify_ctrl(ctrl, &id))
2538 		return;
2539 
2540 	nn = le32_to_cpu(id->nn);
2541 	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
2542 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
2543 		if (!nvme_scan_ns_list(ctrl, nn))
2544 			goto done;
2545 	}
2546 	nvme_scan_ns_sequential(ctrl, nn);
2547  done:
2548 	mutex_lock(&ctrl->namespaces_mutex);
2549 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
2550 	mutex_unlock(&ctrl->namespaces_mutex);
2551 	kfree(id);
2552 }
2553 
2554 void nvme_queue_scan(struct nvme_ctrl *ctrl)
2555 {
2556 	/*
2557 	 * Do not queue new scan work when a controller is reset during
2558 	 * removal.
2559 	 */
2560 	if (ctrl->state == NVME_CTRL_LIVE)
2561 		queue_work(nvme_wq, &ctrl->scan_work);
2562 }
2563 EXPORT_SYMBOL_GPL(nvme_queue_scan);
2564 
2565 /*
2566  * This function iterates the namespace list unlocked to allow recovery from
2567  * controller failure. It is up to the caller to ensure the namespace list is
2568  * not modified by scan work while this function is executing.
2569  */
2570 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
2571 {
2572 	struct nvme_ns *ns, *next;
2573 
2574 	/* prevent racing with ns scanning */
2575 	flush_work(&ctrl->scan_work);
2576 
2577 	/*
2578 	 * The dead state indicates the controller was not gracefully
2579 	 * disconnected. In that case, we won't be able to flush any data while
2580 	 * removing the namespaces' disks; fail all the queues now to avoid
2581 	 * potentially having to clean up the failed sync later.
2582 	 */
2583 	if (ctrl->state == NVME_CTRL_DEAD)
2584 		nvme_kill_queues(ctrl);
2585 
2586 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
2587 		nvme_ns_remove(ns);
2588 }
2589 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
2590 
2591 static void nvme_async_event_work(struct work_struct *work)
2592 {
2593 	struct nvme_ctrl *ctrl =
2594 		container_of(work, struct nvme_ctrl, async_event_work);
2595 
2596 	spin_lock_irq(&ctrl->lock);
2597 	while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
2598 		int aer_idx = --ctrl->event_limit;
2599 
2600 		spin_unlock_irq(&ctrl->lock);
2601 		ctrl->ops->submit_async_event(ctrl, aer_idx);
2602 		spin_lock_irq(&ctrl->lock);
2603 	}
2604 	spin_unlock_irq(&ctrl->lock);
2605 }
2606 
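/*
 * Returns true while the controller is enabled and reports CSTS.PP
 * (Processing Paused), i.e. while a firmware activation is in progress.
 */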
2607 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
2608 {
2609 
2610 	u32 csts;
2611 
2612 	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
2613 		return false;
2614 
2615 	if (csts == ~0)
2616 		return false;
2617 
2618 	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
2619 }
2620 
2621 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
2622 {
2623 	struct nvme_command c = { };
2624 	struct nvme_fw_slot_info_log *log;
2625 
2626 	log = kmalloc(sizeof(*log), GFP_KERNEL);
2627 	if (!log)
2628 		return;
2629 
2630 	c.common.opcode = nvme_admin_get_log_page;
2631 	c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2632 	c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
2633 
2634 	if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
2635 		dev_warn(ctrl->device,
2636 				"Get FW SLOT INFO log error\n");
2637 	kfree(log);
2638 }
2639 
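/*
 * Firmware activation handling: quiesce I/O and poll CSTS.PP until the
 * controller resumes processing, bounded by MTFA (in 100ms units) when
 * reported or by the admin timeout otherwise; reset the controller if the
 * deadline passes.
 */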
2640 static void nvme_fw_act_work(struct work_struct *work)
2641 {
2642 	struct nvme_ctrl *ctrl = container_of(work,
2643 				struct nvme_ctrl, fw_act_work);
2644 	unsigned long fw_act_timeout;
2645 
2646 	if (ctrl->mtfa)
2647 		fw_act_timeout = jiffies +
2648 				msecs_to_jiffies(ctrl->mtfa * 100);
2649 	else
2650 		fw_act_timeout = jiffies +
2651 				msecs_to_jiffies(admin_timeout * 1000);
2652 
2653 	nvme_stop_queues(ctrl);
2654 	while (nvme_ctrl_pp_status(ctrl)) {
2655 		if (time_after(jiffies, fw_act_timeout)) {
2656 			dev_warn(ctrl->device,
2657 				"Fw activation timeout, reset controller\n");
2658 			nvme_reset_ctrl(ctrl);
2659 			break;
2660 		}
2661 		msleep(100);
2662 	}
2663 
2664 	if (ctrl->state != NVME_CTRL_LIVE)
2665 		return;
2666 
2667 	nvme_start_queues(ctrl);
2668 	/* read FW slot information to clear the AER */
2669 	nvme_get_fw_slot_info(ctrl);
2670 }
2671 
2672 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
2673 		union nvme_result *res)
2674 {
2675 	u32 result = le32_to_cpu(res->u32);
2676 	bool done = true;
2677 
2678 	switch (le16_to_cpu(status) >> 1) {
2679 	case NVME_SC_SUCCESS:
2680 		done = false;
2681 		/*FALLTHRU*/
2682 	case NVME_SC_ABORT_REQ:
2683 		++ctrl->event_limit;
2684 		if (ctrl->state == NVME_CTRL_LIVE)
2685 			queue_work(nvme_wq, &ctrl->async_event_work);
2686 		break;
2687 	default:
2688 		break;
2689 	}
2690 
2691 	if (done)
2692 		return;
2693 
2694 	switch (result & 0xff07) {
2695 	case NVME_AER_NOTICE_NS_CHANGED:
2696 		dev_info(ctrl->device, "rescanning\n");
2697 		nvme_queue_scan(ctrl);
2698 		break;
2699 	case NVME_AER_NOTICE_FW_ACT_STARTING:
2700 		queue_work(nvme_wq, &ctrl->fw_act_work);
2701 		break;
2702 	default:
2703 		dev_warn(ctrl->device, "async event result %08x\n", result);
2704 	}
2705 }
2706 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
2707 
2708 void nvme_queue_async_events(struct nvme_ctrl *ctrl)
2709 {
2710 	ctrl->event_limit = NVME_NR_AERS;
2711 	queue_work(nvme_wq, &ctrl->async_event_work);
2712 }
2713 EXPORT_SYMBOL_GPL(nvme_queue_async_events);
2714 
2715 static DEFINE_IDA(nvme_instance_ida);
2716 
2717 static int nvme_set_instance(struct nvme_ctrl *ctrl)
2718 {
2719 	int instance, error;
2720 
2721 	do {
2722 		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
2723 			return -ENODEV;
2724 
2725 		spin_lock(&dev_list_lock);
2726 		error = ida_get_new(&nvme_instance_ida, &instance);
2727 		spin_unlock(&dev_list_lock);
2728 	} while (error == -EAGAIN);
2729 
2730 	if (error)
2731 		return -ENODEV;
2732 
2733 	ctrl->instance = instance;
2734 	return 0;
2735 }
2736 
2737 static void nvme_release_instance(struct nvme_ctrl *ctrl)
2738 {
2739 	spin_lock(&dev_list_lock);
2740 	ida_remove(&nvme_instance_ida, ctrl->instance);
2741 	spin_unlock(&dev_list_lock);
2742 }
2743 
2744 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
2745 {
2746 	nvme_stop_keep_alive(ctrl);
2747 	flush_work(&ctrl->async_event_work);
2748 	cancel_work_sync(&ctrl->fw_act_work);
2749 }
2750 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
2751 
2752 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
2753 {
2754 	if (ctrl->kato)
2755 		nvme_start_keep_alive(ctrl);
2756 
2757 	if (ctrl->queue_count > 1) {
2758 		nvme_queue_scan(ctrl);
2759 		nvme_queue_async_events(ctrl);
2760 		nvme_start_queues(ctrl);
2761 	}
2762 }
2763 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
2764 
2765 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
2766 {
2767 	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
2768 
2769 	spin_lock(&dev_list_lock);
2770 	list_del(&ctrl->node);
2771 	spin_unlock(&dev_list_lock);
2772 }
2773 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
2774 
2775 static void nvme_free_ctrl(struct kref *kref)
2776 {
2777 	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
2778 
2779 	put_device(ctrl->device);
2780 	nvme_release_instance(ctrl);
2781 	ida_destroy(&ctrl->ns_ida);
2782 
2783 	ctrl->ops->free_ctrl(ctrl);
2784 }
2785 
2786 void nvme_put_ctrl(struct nvme_ctrl *ctrl)
2787 {
2788 	kref_put(&ctrl->kref, nvme_free_ctrl);
2789 }
2790 EXPORT_SYMBOL_GPL(nvme_put_ctrl);
2791 
2792 /*
2793  * Initialize an NVMe controller structure.  This needs to be called during
2794  * the earliest initialization so that we have the initialized structure around
2795  * during probing.
2796  */
2797 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
2798 		const struct nvme_ctrl_ops *ops, unsigned long quirks)
2799 {
2800 	int ret;
2801 
2802 	ctrl->state = NVME_CTRL_NEW;
2803 	spin_lock_init(&ctrl->lock);
2804 	INIT_LIST_HEAD(&ctrl->namespaces);
2805 	mutex_init(&ctrl->namespaces_mutex);
2806 	kref_init(&ctrl->kref);
2807 	ctrl->dev = dev;
2808 	ctrl->ops = ops;
2809 	ctrl->quirks = quirks;
2810 	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
2811 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
2812 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
2813 
2814 	ret = nvme_set_instance(ctrl);
2815 	if (ret)
2816 		goto out;
2817 
2818 	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
2819 				MKDEV(nvme_char_major, ctrl->instance),
2820 				ctrl, nvme_dev_attr_groups,
2821 				"nvme%d", ctrl->instance);
2822 	if (IS_ERR(ctrl->device)) {
2823 		ret = PTR_ERR(ctrl->device);
2824 		goto out_release_instance;
2825 	}
2826 	get_device(ctrl->device);
2827 	ida_init(&ctrl->ns_ida);
2828 
2829 	spin_lock(&dev_list_lock);
2830 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
2831 	spin_unlock(&dev_list_lock);
2832 
2833 	/*
2834 	 * Initialize latency tolerance controls.  The sysfs files won't
2835 	 * be visible to userspace unless the device actually supports APST.
2836 	 */
2837 	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
2838 	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
2839 		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
2840 
2841 	return 0;
2842 out_release_instance:
2843 	nvme_release_instance(ctrl);
2844 out:
2845 	return ret;
2846 }
2847 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
2848 
2849 /**
2850  * nvme_kill_queues(): Ends all namespace queues
2851  * @ctrl: the dead controller whose queues need to be ended
2852  *
2853  * Call this function when the driver determines it is unable to get the
2854  * controller in a state capable of servicing IO.
2855  */
2856 void nvme_kill_queues(struct nvme_ctrl *ctrl)
2857 {
2858 	struct nvme_ns *ns;
2859 
2860 	mutex_lock(&ctrl->namespaces_mutex);
2861 
2862 	/* Forcibly unquiesce queues to avoid blocking dispatch */
2863 	if (ctrl->admin_q)
2864 		blk_mq_unquiesce_queue(ctrl->admin_q);
2865 
2866 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2867 		/*
2868 		 * Revalidating a dead namespace sets capacity to 0. This will
2869 		 * end buffered writers dirtying pages that can't be synced.
2870 		 */
2871 		if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
2872 			continue;
2873 		revalidate_disk(ns->disk);
2874 		blk_set_queue_dying(ns->queue);
2875 
2876 		/* Forcibly unquiesce queues to avoid blocking dispatch */
2877 		blk_mq_unquiesce_queue(ns->queue);
2878 	}
2879 	mutex_unlock(&ctrl->namespaces_mutex);
2880 }
2881 EXPORT_SYMBOL_GPL(nvme_kill_queues);
2882 
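/*
 * Queue freeze helpers: nvme_start_freeze() begins freezing all namespace
 * queues, nvme_wait_freeze()/nvme_wait_freeze_timeout() wait for entered
 * requests to drain, and nvme_unfreeze() releases the freeze again.
 */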
2883 void nvme_unfreeze(struct nvme_ctrl *ctrl)
2884 {
2885 	struct nvme_ns *ns;
2886 
2887 	mutex_lock(&ctrl->namespaces_mutex);
2888 	list_for_each_entry(ns, &ctrl->namespaces, list)
2889 		blk_mq_unfreeze_queue(ns->queue);
2890 	mutex_unlock(&ctrl->namespaces_mutex);
2891 }
2892 EXPORT_SYMBOL_GPL(nvme_unfreeze);
2893 
2894 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
2895 {
2896 	struct nvme_ns *ns;
2897 
2898 	mutex_lock(&ctrl->namespaces_mutex);
2899 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2900 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
2901 		if (timeout <= 0)
2902 			break;
2903 	}
2904 	mutex_unlock(&ctrl->namespaces_mutex);
2905 }
2906 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
2907 
2908 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
2909 {
2910 	struct nvme_ns *ns;
2911 
2912 	mutex_lock(&ctrl->namespaces_mutex);
2913 	list_for_each_entry(ns, &ctrl->namespaces, list)
2914 		blk_mq_freeze_queue_wait(ns->queue);
2915 	mutex_unlock(&ctrl->namespaces_mutex);
2916 }
2917 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
2918 
2919 void nvme_start_freeze(struct nvme_ctrl *ctrl)
2920 {
2921 	struct nvme_ns *ns;
2922 
2923 	mutex_lock(&ctrl->namespaces_mutex);
2924 	list_for_each_entry(ns, &ctrl->namespaces, list)
2925 		blk_freeze_queue_start(ns->queue);
2926 	mutex_unlock(&ctrl->namespaces_mutex);
2927 }
2928 EXPORT_SYMBOL_GPL(nvme_start_freeze);
2929 
2930 void nvme_stop_queues(struct nvme_ctrl *ctrl)
2931 {
2932 	struct nvme_ns *ns;
2933 
2934 	mutex_lock(&ctrl->namespaces_mutex);
2935 	list_for_each_entry(ns, &ctrl->namespaces, list)
2936 		blk_mq_quiesce_queue(ns->queue);
2937 	mutex_unlock(&ctrl->namespaces_mutex);
2938 }
2939 EXPORT_SYMBOL_GPL(nvme_stop_queues);
2940 
2941 void nvme_start_queues(struct nvme_ctrl *ctrl)
2942 {
2943 	struct nvme_ns *ns;
2944 
2945 	mutex_lock(&ctrl->namespaces_mutex);
2946 	list_for_each_entry(ns, &ctrl->namespaces, list)
2947 		blk_mq_unquiesce_queue(ns->queue);
2948 	mutex_unlock(&ctrl->namespaces_mutex);
2949 }
2950 EXPORT_SYMBOL_GPL(nvme_start_queues);
2951 
2952 int __init nvme_core_init(void)
2953 {
2954 	int result;
2955 
2956 	nvme_wq = alloc_workqueue("nvme-wq",
2957 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
2958 	if (!nvme_wq)
2959 		return -ENOMEM;
2960 
2961 	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
2962 							&nvme_dev_fops);
2963 	if (result < 0)
2964 		goto destroy_wq;
2965 	else if (result > 0)
2966 		nvme_char_major = result;
2967 
2968 	nvme_class = class_create(THIS_MODULE, "nvme");
2969 	if (IS_ERR(nvme_class)) {
2970 		result = PTR_ERR(nvme_class);
2971 		goto unregister_chrdev;
2972 	}
2973 
2974 	return 0;
2975 
2976 unregister_chrdev:
2977 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2978 destroy_wq:
2979 	destroy_workqueue(nvme_wq);
2980 	return result;
2981 }
2982 
2983 void nvme_core_exit(void)
2984 {
2985 	class_destroy(nvme_class);
2986 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2987 	destroy_workqueue(nvme_wq);
2988 }
2989 
2990 MODULE_LICENSE("GPL");
2991 MODULE_VERSION("1.0");
2992 module_init(nvme_core_init);
2993 module_exit(nvme_core_exit);
2994