// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>

#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define NVME_MINORS		(1U << MINORBITS)

struct nvme_ns_info {
	struct nvme_ns_ids ids;
	u32 nsid;
	__le32 anagrpid;
	bool is_shared;
	bool is_readonly;
	bool is_ready;
	bool is_removed;
};

unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned int nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");

static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
		 "max power saving latency for new devices; use PM QOS to change per device");

static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");

static unsigned long apst_primary_timeout_ms = 100;
module_param(apst_primary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_primary_timeout_ms,
	"primary APST timeout in ms");

static unsigned long apst_secondary_timeout_ms = 2000;
module_param(apst_secondary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_timeout_ms,
	"secondary APST timeout in ms");

static unsigned long apst_primary_latency_tol_us = 15000;
module_param(apst_primary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_primary_latency_tol_us,
	"primary APST latency tolerance in us");

static unsigned long apst_secondary_latency_tol_us = 100000;
module_param(apst_secondary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_latency_tol_us,
	"secondary APST latency tolerance in us");

/*
 * nvme_wq - hosts nvme related works that are not reset or delete
 * nvme_reset_wq - hosts nvme reset works
 * nvme_delete_wq - hosts nvme delete works
 *
 * nvme_wq will host works such as scan, aen handling, fw activation,
 * keep-alive, periodic reconnects etc. nvme_reset_wq
 * runs reset works which also flush works hosted on nvme_wq for
 * serialization purposes. nvme_delete_wq hosts controller deletion
 * works which flush reset works for serialization.
 */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);

struct workqueue_struct *nvme_reset_wq;
EXPORT_SYMBOL_GPL(nvme_reset_wq);

struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);

static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);

static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;

static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
static struct class *nvme_ns_chr_class;

static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					   unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
				   struct nvme_command *cmd);

void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	/*
	 * Only queue new scan work when admin and IO queues are both alive
	 */
	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
		queue_work(nvme_wq, &ctrl->scan_work);
}

/*
 * Use this function to proceed with scheduling reset_work for a controller
 * that had previously been set to the resetting state. This is intended for
 * code paths that can't be interrupted by other reset attempts. A hot removal
 * may prevent this from succeeding.
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	if (ctrl->state != NVME_CTRL_RESETTING)
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);

static void nvme_failfast_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, failfast_work);

	if (ctrl->state != NVME_CTRL_CONNECTING)
		return;

	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	dev_info(ctrl->device, "failfast expired\n");
	nvme_kick_requeue_lists(ctrl);
}

static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
{
	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
		return;

	schedule_delayed_work(&ctrl->failfast_work,
			      ctrl->opts->fast_io_fail_tmo * HZ);
}

static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
{
	if (!ctrl->opts)
		return;

	cancel_delayed_work_sync(&ctrl->failfast_work);
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
}


int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);

int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE)
			ret = -ENETRESET;
	}

	return ret;
}

static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
	dev_info(ctrl->device,
		 "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));

	flush_work(&ctrl->reset_work);
	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);
	nvme_uninit_ctrl(ctrl);
}

static void nvme_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_do_delete_ctrl(ctrl);
}

int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		return -EBUSY;
	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);

void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
	/*
	 * Keep a reference until nvme_do_delete_ctrl() completes, since
	 * ->delete_ctrl can free the controller.
	 */
	nvme_get_ctrl(ctrl);
	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		nvme_do_delete_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

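/*
 * Translate an NVMe completion status code into a block layer status for
 * completing the request.
 */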
static blk_status_t nvme_error_status(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_SUCCESS:
		return BLK_STS_OK;
	case NVME_SC_CAP_EXCEEDED:
		return BLK_STS_NOSPC;
	case NVME_SC_LBA_RANGE:
	case NVME_SC_CMD_INTERRUPTED:
	case NVME_SC_NS_NOT_READY:
		return BLK_STS_TARGET;
	case NVME_SC_BAD_ATTRIBUTES:
	case NVME_SC_ONCS_NOT_SUPPORTED:
	case NVME_SC_INVALID_OPCODE:
	case NVME_SC_INVALID_FIELD:
	case NVME_SC_INVALID_NS:
		return BLK_STS_NOTSUPP;
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_UNWRITTEN_BLOCK:
	case NVME_SC_ACCESS_DENIED:
	case NVME_SC_READ_ONLY:
	case NVME_SC_COMPARE_FAILED:
		return BLK_STS_MEDIUM;
	case NVME_SC_GUARD_CHECK:
	case NVME_SC_APPTAG_CHECK:
	case NVME_SC_REFTAG_CHECK:
	case NVME_SC_INVALID_PI:
		return BLK_STS_PROTECTION;
	case NVME_SC_RESERVATION_CONFLICT:
		return BLK_STS_RESV_CONFLICT;
	case NVME_SC_HOST_PATH_ERROR:
		return BLK_STS_TRANSPORT;
	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
		return BLK_STS_ZONE_ACTIVE_RESOURCE;
	case NVME_SC_ZONE_TOO_MANY_OPEN:
		return BLK_STS_ZONE_OPEN_RESOURCE;
	default:
		return BLK_STS_IOERR;
	}
}

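/*
 * Requeue a failed command.  A non-zero Command Retry Delay (CRD) field in
 * the status selects one of the controller's CRDT values, which are in units
 * of 100 milliseconds.
 */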
static void nvme_retry_req(struct request *req)
{
	unsigned long delay = 0;
	u16 crd;

	/* The mask and shift result must be <= 3 */
	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
	if (crd)
		delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;

	nvme_req(req)->retries++;
	blk_mq_requeue_request(req, false);
	blk_mq_delay_kick_requeue_list(req->q, delay);
}

static void nvme_log_error(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	struct nvme_request *nr = nvme_req(req);

	if (ns) {
		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
		       ns->disk ? ns->disk->disk_name : "?",
		       nvme_get_opcode_str(nr->cmd->common.opcode),
		       nr->cmd->common.opcode,
		       nvme_sect_to_lba(ns, blk_rq_pos(req)),
		       blk_rq_bytes(req) >> ns->head->lba_shift,
		       nvme_get_error_status_str(nr->status),
		       nr->status >> 8 & 7,	/* Status Code Type */
		       nr->status & 0xff,	/* Status Code */
		       nr->status & NVME_SC_MORE ? "MORE " : "",
		       nr->status & NVME_SC_DNR  ? "DNR "  : "");
		return;
	}

	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
			   dev_name(nr->ctrl->device),
			   nvme_get_admin_opcode_str(nr->cmd->common.opcode),
			   nr->cmd->common.opcode,
			   nvme_get_error_status_str(nr->status),
			   nr->status >> 8 & 7,	/* Status Code Type */
			   nr->status & 0xff,	/* Status Code */
			   nr->status & NVME_SC_MORE ? "MORE " : "",
			   nr->status & NVME_SC_DNR  ? "DNR "  : "");
}

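/*
 * Disposition of a completed request: finish it, retry it on the same path,
 * fail it over to another path (multipath), or re-authenticate the queue
 * before retrying.
 */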
enum nvme_disposition {
	COMPLETE,
	RETRY,
	FAILOVER,
	AUTHENTICATE,
};

static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
	if (likely(nvme_req(req)->status == 0))
		return COMPLETE;

	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
		return AUTHENTICATE;

	if (blk_noretry_request(req) ||
	    (nvme_req(req)->status & NVME_SC_DNR) ||
	    nvme_req(req)->retries >= nvme_max_retries)
		return COMPLETE;

	if (req->cmd_flags & REQ_NVME_MPATH) {
		if (nvme_is_path_error(nvme_req(req)->status) ||
		    blk_queue_dying(req->q))
			return FAILOVER;
	} else {
		if (blk_queue_dying(req->q))
			return COMPLETE;
	}

	return RETRY;
}

static inline void nvme_end_req_zoned(struct request *req)
{
	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = nvme_lba_to_sect(req->q->queuedata,
			le64_to_cpu(nvme_req(req)->result.u64));
}

static inline void nvme_end_req(struct request *req)
{
	blk_status_t status = nvme_error_status(nvme_req(req)->status);

	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET)))
		nvme_log_error(req);
	nvme_end_req_zoned(req);
	nvme_trace_bio_complete(req);
	if (req->cmd_flags & REQ_NVME_MPATH)
		nvme_mpath_end_request(req);
	blk_mq_end_request(req, status);
}

void nvme_complete_rq(struct request *req)
{
	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);

	/*
	 * Completions of long-running commands should not be able to
	 * defer sending of periodic keep alives, since the controller
	 * may have completed processing such commands a long time ago
	 * (arbitrarily close to command submission time).
	 * req->deadline - req->timeout is the command submission time
	 * in jiffies.
	 */
	if (ctrl->kas &&
	    req->deadline - req->timeout >= ctrl->ka_last_check_time)
		ctrl->comp_seen = true;

	switch (nvme_decide_disposition(req)) {
	case COMPLETE:
		nvme_end_req(req);
		return;
	case RETRY:
		nvme_retry_req(req);
		return;
	case FAILOVER:
		nvme_failover_req(req);
		return;
	case AUTHENTICATE:
#ifdef CONFIG_NVME_HOST_AUTH
		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
		nvme_retry_req(req);
#else
		nvme_end_req(req);
#endif
		return;
	}
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);

void nvme_complete_batch_req(struct request *req)
{
	trace_nvme_complete_rq(req);
	nvme_cleanup_cmd(req);
	nvme_end_req_zoned(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);

/*
 * Called to unwind from ->queue_rq on a failed command submission so that the
 * multipathing code gets called to potentially failover to another path.
 * The caller needs to unwind all transport specific resource allocations and
 * must propagate the return value.
 */
blk_status_t nvme_host_path_error(struct request *req)
{
	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
	blk_mq_set_request_complete(req);
	nvme_complete_rq(req);
	return BLK_STS_OK;
}
EXPORT_SYMBOL_GPL(nvme_host_path_error);

bool nvme_cancel_request(struct request *req, void *data)
{
	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
				"Cancelling I/O %d", req->tag);

	/* don't abort one completed or idle request */
	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
		return true;

	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
	blk_mq_complete_request(req);
	return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);

void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
	if (ctrl->tagset) {
		blk_mq_tagset_busy_iter(ctrl->tagset,
				nvme_cancel_request, ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->tagset);
	}
}
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);

void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
{
	if (ctrl->admin_tagset) {
		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
				nvme_cancel_request, ctrl);
		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
	}
}
EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);

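/*
 * Controller state machine: each new state lists the old states it may be
 * entered from; any other transition is rejected and the state is left
 * unchanged.
 */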
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	unsigned long flags;
	bool changed = false;

	spin_lock_irqsave(&ctrl->lock, flags);

	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_CONNECTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING_NOIO:
		switch (old_state) {
		case NVME_CTRL_DELETING:
		case NVME_CTRL_DEAD:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			fallthrough;
		default:
			break;
		}
		break;
	default:
		break;
	}

	if (changed) {
		ctrl->state = new_state;
		wake_up_all(&ctrl->state_wq);
	}

	spin_unlock_irqrestore(&ctrl->lock, flags);
	if (!changed)
		return false;

	if (ctrl->state == NVME_CTRL_LIVE) {
		if (old_state == NVME_CTRL_CONNECTING)
			nvme_stop_failfast_work(ctrl);
		nvme_kick_requeue_lists(ctrl);
	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
		old_state == NVME_CTRL_RESETTING) {
		nvme_start_failfast_work(ctrl);
	}
	return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

/*
 * Returns true for sink states that can't ever transition back to live.
 */
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
	switch (ctrl->state) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	case NVME_CTRL_DELETING:
	case NVME_CTRL_DELETING_NOIO:
	case NVME_CTRL_DEAD:
		return true;
	default:
		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
		return true;
	}
}

/*
 * Waits for the controller state to be resetting, or returns false if it is
 * not possible to ever transition to that state.
 */
bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
	wait_event(ctrl->state_wq,
		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
		   nvme_state_terminal(ctrl));
	return ctrl->state == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);

static void nvme_free_ns_head(struct kref *ref)
{
	struct nvme_ns_head *head =
		container_of(ref, struct nvme_ns_head, ref);

	nvme_mpath_remove_disk(head);
	ida_free(&head->subsys->ns_ida, head->instance);
	cleanup_srcu_struct(&head->srcu);
	nvme_put_subsystem(head->subsys);
	kfree(head);
}

bool nvme_tryget_ns_head(struct nvme_ns_head *head)
{
	return kref_get_unless_zero(&head->ref);
}

void nvme_put_ns_head(struct nvme_ns_head *head)
{
	kref_put(&head->ref, nvme_free_ns_head);
}

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	put_disk(ns->disk);
	nvme_put_ns_head(ns->head);
	nvme_put_ctrl(ns->ctrl);
	kfree(ns);
}

static inline bool nvme_get_ns(struct nvme_ns *ns)
{
	return kref_get_unless_zero(&ns->kref);
}

void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);

static inline void nvme_clear_nvme_request(struct request *req)
{
	nvme_req(req)->status = 0;
	nvme_req(req)->retries = 0;
	nvme_req(req)->flags = 0;
	req->rq_flags |= RQF_DONTPREP;
}

/* initialize a passthrough request */
void nvme_init_request(struct request *req, struct nvme_command *cmd)
{
	if (req->q->queuedata)
		req->timeout = NVME_IO_TIMEOUT;
	else /* no queuedata implies admin queue */
		req->timeout = NVME_ADMIN_TIMEOUT;

	/* passthru commands should let the driver set the SGL flags */
	cmd->common.flags &= ~NVME_CMD_SGL_ALL;

	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	if (req->mq_hctx->type == HCTX_TYPE_POLL)
		req->cmd_flags |= REQ_POLLED;
	nvme_clear_nvme_request(req);
	req->rq_flags |= RQF_QUIET;
	memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
}
EXPORT_SYMBOL_GPL(nvme_init_request);

/*
 * For something we're not in a state to send to the device the default action
 * is to busy it and retry it after the controller state is recovered.  However,
 * if the controller is deleting or if anything is marked for failfast or
 * nvme multipath it is immediately failed.
 *
 * Note: commands used to initialize the controller will be marked for failfast.
 * Note: nvme cli/ioctl commands are marked for failfast.
 */
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
		struct request *rq)
{
	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
	    ctrl->state != NVME_CTRL_DELETING &&
	    ctrl->state != NVME_CTRL_DEAD &&
	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
		return BLK_STS_RESOURCE;
	return nvme_host_path_error(rq);
}
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);

bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		bool queue_live)
{
	struct nvme_request *req = nvme_req(rq);

	/*
	 * Currently we have a problem sending passthru commands
	 * on the admin_q if the controller is not LIVE because we can't
	 * make sure that they are going out after the admin connect,
	 * controller enable and/or other commands in the initialization
	 * sequence. Until the controller is LIVE, fail with
	 * BLK_STS_RESOURCE so that they will be rescheduled.
	 */
	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
		return false;

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		/*
		 * Only allow commands on a live queue, except for the connect
		 * command, which is required to set the queue live in the
		 * appropriate states.
		 */
		switch (ctrl->state) {
		case NVME_CTRL_CONNECTING:
			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
				return true;
			break;
		default:
			break;
		case NVME_CTRL_DEAD:
			return false;
		}
	}

	return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);

static inline void nvme_setup_flush(struct nvme_ns *ns,
		struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}

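/*
 * Build a Dataset Management (Deallocate) command from the request's discard
 * bio(s), packing one LBA range per bio into the payload buffer.
 */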
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmnd)
{
	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
	struct nvme_dsm_range *range;
	struct bio *bio;

	/*
	 * Some devices do not consider the DSM 'Number of Ranges' field when
	 * determining how much data to DMA. Always allocate memory for maximum
	 * number of segments to prevent device reading beyond end of buffer.
	 */
	static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;

	range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!range) {
		/*
		 * If we fail to allocate our range, fall back to the controller
		 * discard page. If that's also busy, it's safe to return
		 * busy, as we know we can make progress once that's freed.
		 */
		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
			return BLK_STS_RESOURCE;

		range = page_address(ns->ctrl->discard_page);
	}

	if (queue_max_discard_segments(req->q) == 1) {
		u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
		u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);

		range[0].cattr = cpu_to_le32(0);
		range[0].nlb = cpu_to_le32(nlb);
		range[0].slba = cpu_to_le64(slba);
		n = 1;
	} else {
		__rq_for_each_bio(bio, req) {
			u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
			u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;

			if (n < segments) {
				range[n].cattr = cpu_to_le32(0);
				range[n].nlb = cpu_to_le32(nlb);
				range[n].slba = cpu_to_le64(slba);
			}
			n++;
		}
	}

	if (WARN_ON_ONCE(n != segments)) {
		if (virt_to_page(range) == ns->ctrl->discard_page)
			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
		else
			kfree(range);
		return BLK_STS_IOERR;
	}

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->dsm.nr = cpu_to_le32(segments - 1);
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	bvec_set_virt(&req->special_vec, range, alloc_size);
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return BLK_STS_OK;
}

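/*
 * Set the initial reference tag for end-to-end protection: 16B guard formats
 * take a 32-bit reftag, while 64B guard formats split a 48-bit reftag between
 * the reftag field (lower 32 bits) and cdw3 (upper bits).
 */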
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
			      struct request *req)
{
	u32 upper, lower;
	u64 ref48;

	/* both rw and write zeroes share the same reftag format */
	switch (ns->head->guard_type) {
	case NVME_NVM_NS_16B_GUARD:
		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
		break;
	case NVME_NVM_NS_64B_GUARD:
		ref48 = ext_pi_ref_tag(req);
		lower = lower_32_bits(ref48);
		upper = upper_32_bits(ref48);

		cmnd->rw.reftag = cpu_to_le32(lower);
		cmnd->rw.cdw3 = cpu_to_le32(upper);
		break;
	default:
		break;
	}
}

static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));

	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
		return nvme_setup_discard(ns, req, cmnd);

	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->write_zeroes.slba =
		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
	cmnd->write_zeroes.length =
		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);

	if (!(req->cmd_flags & REQ_NOUNMAP) &&
	    (ns->head->features & NVME_NS_DEAC))
		cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);

	if (nvme_ns_has_pi(ns)) {
		cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);

		switch (ns->head->pi_type) {
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			nvme_set_ref_tag(ns, cmnd, req);
			break;
		}
	}

	return BLK_STS_OK;
}

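/*
 * Build a read, write or zone-append command: translate request flags into
 * FUA/limited-retry/prefetch hints and enable the protection information
 * checks that match the namespace's PI type.
 */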
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd,
		enum nvme_opcode op)
{
	u16 control = 0;
	u32 dsmgmt = 0;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd->rw.opcode = op;
	cmnd->rw.flags = 0;
	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->rw.cdw2 = 0;
	cmnd->rw.cdw3 = 0;
	cmnd->rw.metadata = 0;
	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
	cmnd->rw.length =
		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
	cmnd->rw.reftag = 0;
	cmnd->rw.apptag = 0;
	cmnd->rw.appmask = 0;

	if (ns->head->ms) {
		/*
		 * If formatted with metadata, the block layer always provides a
		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
		 * we enable the PRACT bit for protection information or set the
		 * namespace capacity to zero to prevent any I/O.
		 */
		if (!blk_integrity_rq(req)) {
			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
				return BLK_STS_NOTSUPP;
			control |= NVME_RW_PRINFO_PRACT;
		}

		switch (ns->head->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			if (op == nvme_cmd_zone_append)
				control |= NVME_RW_APPEND_PIREMAP;
			nvme_set_ref_tag(ns, cmnd, req);
			break;
		}
	}

	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
	return 0;
}

void nvme_cleanup_cmd(struct request *req)
{
	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

		if (req->special_vec.bv_page == ctrl->discard_page)
			clear_bit_unlock(0, &ctrl->discard_page_busy);
		else
			kfree(bvec_virt(&req->special_vec));
	}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);

blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
{
	struct nvme_command *cmd = nvme_req(req)->cmd;
	blk_status_t ret = BLK_STS_OK;

	if (!(req->rq_flags & RQF_DONTPREP))
		nvme_clear_nvme_request(req);

	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		/* these are setup prior to execution in nvme_init_request() */
		break;
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_RESET:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
		break;
	case REQ_OP_WRITE_ZEROES:
		ret = nvme_setup_write_zeroes(ns, req, cmd);
		break;
	case REQ_OP_DISCARD:
		ret = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_READ:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
		break;
	case REQ_OP_WRITE:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
		break;
	case REQ_OP_ZONE_APPEND:
		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
		break;
	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = nvme_cid(req);
	trace_nvme_setup_cmd(req, cmd);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);

/*
 * Return values:
 * 0:  success
 * >0: nvme controller's cqe status response
 * <0: kernel error in lieu of controller response
 */
int nvme_execute_rq(struct request *rq, bool at_head)
{
	blk_status_t status;

	status = blk_execute_rq(rq, at_head);
	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		return -EINTR;
	if (nvme_req(rq)->status)
		return nvme_req(rq)->status;
	return blk_status_to_errno(status);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		union nvme_result *result, void *buffer, unsigned bufflen,
		int qid, int at_head, blk_mq_req_flags_t flags)
{
	struct request *req;
	int ret;

	if (qid == NVME_QID_ANY)
		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
	else
		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
						qid - 1);

	if (IS_ERR(req))
		return PTR_ERR(req);
	nvme_init_request(req, cmd);

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}

	ret = nvme_execute_rq(req, at_head);
	if (result && ret >= 0)
		*result = nvme_req(req)->result;
 out:
	blk_mq_free_request(req);
	return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
			NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);

u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
	u32 effects = 0;

	if (ns) {
		effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
			dev_warn_once(ctrl->device,
				"IO command:%02x has unusual effects:%08x\n",
				opcode, effects);

		/*
		 * NVME_CMD_EFFECTS_CSE_MASK causes a freeze of all I/O queues,
		 * which would deadlock when done on an I/O command.  Note that
		 * we already warn about an unusual effect above.
		 */
		effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
	} else {
		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
	}

	return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);

u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
	u32 effects = nvme_command_effects(ctrl, ns, opcode);

	/*
	 * For simplicity, IO to all namespaces is quiesced even if the command
	 * effects say only one namespace is affected.
	 */
	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
		mutex_lock(&ctrl->scan_lock);
		mutex_lock(&ctrl->subsys->lock);
		nvme_mpath_start_freeze(ctrl->subsys);
		nvme_mpath_wait_freeze(ctrl->subsys);
		nvme_start_freeze(ctrl);
		nvme_wait_freeze(ctrl);
	}
	return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);

void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
		       struct nvme_command *cmd, int status)
{
	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
		nvme_unfreeze(ctrl);
		nvme_mpath_unfreeze(ctrl->subsys);
		mutex_unlock(&ctrl->subsys->lock);
		mutex_unlock(&ctrl->scan_lock);
	}
	if (effects & NVME_CMD_EFFECTS_CCC) {
		if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
				      &ctrl->flags)) {
			dev_info(ctrl->device,
"controller capabilities changed, reset may be required to take effect.\n");
		}
	}
	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
		nvme_queue_scan(ctrl);
		flush_work(&ctrl->scan_work);
	}
	if (ns)
		return;

	switch (cmd->common.opcode) {
	case nvme_admin_set_features:
		switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
		case NVME_FEAT_KATO:
			/*
			 * The keep alive command interval on the host should
			 * be updated when KATO is modified by Set Features
			 * commands.
			 */
			if (!status)
				nvme_update_keep_alive(ctrl, cmd);
			break;
		default:
			break;
		}
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);

/*
 * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
 *
 *   The host should send Keep Alive commands at half of the Keep Alive Timeout
 *   accounting for transport roundtrip times [..].
 */
static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
{
	unsigned long delay = ctrl->kato * HZ / 2;

	/*
	 * When using Traffic Based Keep Alive, we need to run
	 * nvme_keep_alive_work at twice the normal frequency, as one
	 * command completion can postpone sending a keep alive command
	 * by up to twice the delay between runs.
	 */
	if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
		delay /= 2;
	return delay;
}

static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
{
	queue_delayed_work(nvme_wq, &ctrl->ka_work,
			   nvme_keep_alive_work_period(ctrl));
}

static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
						 blk_status_t status)
{
	struct nvme_ctrl *ctrl = rq->end_io_data;
	unsigned long flags;
	bool startka = false;
	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
	unsigned long delay = nvme_keep_alive_work_period(ctrl);

	/*
	 * Subtract off the keepalive RTT so nvme_keep_alive_work runs
	 * at the desired frequency.
	 */
	if (rtt <= delay) {
		delay -= rtt;
	} else {
		dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
			 jiffies_to_msecs(rtt));
		delay = 0;
	}

	blk_mq_free_request(rq);

	if (status) {
		dev_err(ctrl->device,
			"failed nvme_keep_alive_end_io error=%d\n",
				status);
		return RQ_END_IO_NONE;
	}

	ctrl->ka_last_check_time = jiffies;
	ctrl->comp_seen = false;
	spin_lock_irqsave(&ctrl->lock, flags);
	if (ctrl->state == NVME_CTRL_LIVE ||
	    ctrl->state == NVME_CTRL_CONNECTING)
		startka = true;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	if (startka)
		queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
	return RQ_END_IO_NONE;
}

static void nvme_keep_alive_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
			struct nvme_ctrl, ka_work);
	bool comp_seen = ctrl->comp_seen;
	struct request *rq;

	ctrl->ka_last_check_time = jiffies;

	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
		dev_dbg(ctrl->device,
			"reschedule traffic based keep-alive timer\n");
		ctrl->comp_seen = false;
		nvme_queue_keep_alive_work(ctrl);
		return;
	}

	rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
				  BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(rq)) {
		/* allocation failure, reset the controller */
		dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
		nvme_reset_ctrl(ctrl);
		return;
	}
	nvme_init_request(rq, &ctrl->ka_cmd);

	rq->timeout = ctrl->kato * HZ;
	rq->end_io = nvme_keep_alive_end_io;
	rq->end_io_data = ctrl;
	blk_execute_rq_nowait(rq, false);
}

static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	nvme_queue_keep_alive_work(ctrl);
}

void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
	if (unlikely(ctrl->kato == 0))
		return;

	cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);

static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
				   struct nvme_command *cmd)
{
	unsigned int new_kato =
		DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);

	dev_info(ctrl->device,
		 "keep alive interval updated from %u ms to %u ms\n",
		 ctrl->kato * 1000 / 2, new_kato * 1000 / 2);

	nvme_stop_keep_alive(ctrl);
	ctrl->kato = new_kato;
	nvme_start_keep_alive(ctrl);
}

/*
 * In NVMe 1.0 the CNS field was just a binary controller or namespace
 * flag, thus sending any new CNS opcodes has a big chance of not working.
 * Qemu unfortunately had that bug after reporting a 1.1 version compliance
 * (but not for any later version).
 */
static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
{
	if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
		return ctrl->vs < NVME_VS(1, 2, 0);
	return ctrl->vs < NVME_VS(1, 1, 0);
}

static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = NVME_ID_CNS_CTRL;

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

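/*
 * Parse a single namespace identification descriptor (EUI-64, NGUID, UUID or
 * CSI), validating its length and copying the value into @ids.  Returns the
 * descriptor payload length consumed, or -1 on a malformed entry.
 */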
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
		struct nvme_ns_id_desc *cur, bool *csi_seen)
{
	const char *warn_str = "ctrl returned bogus length:";
	void *data = cur;

	switch (cur->nidt) {
	case NVME_NIDT_EUI64:
		if (cur->nidl != NVME_NIDT_EUI64_LEN) {
			dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
				 warn_str, cur->nidl);
			return -1;
		}
		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
			return NVME_NIDT_EUI64_LEN;
		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
		return NVME_NIDT_EUI64_LEN;
	case NVME_NIDT_NGUID:
		if (cur->nidl != NVME_NIDT_NGUID_LEN) {
			dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
				 warn_str, cur->nidl);
			return -1;
		}
		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
			return NVME_NIDT_NGUID_LEN;
		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
		return NVME_NIDT_NGUID_LEN;
	case NVME_NIDT_UUID:
		if (cur->nidl != NVME_NIDT_UUID_LEN) {
			dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
				 warn_str, cur->nidl);
			return -1;
		}
		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
			return NVME_NIDT_UUID_LEN;
		uuid_copy(&ids->uuid, data + sizeof(*cur));
		return NVME_NIDT_UUID_LEN;
	case NVME_NIDT_CSI:
		if (cur->nidl != NVME_NIDT_CSI_LEN) {
			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
				 warn_str, cur->nidl);
			return -1;
		}
		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
		*csi_seen = true;
		return NVME_NIDT_CSI_LEN;
	default:
		/* Skip unknown types */
		return cur->nidl;
	}
}

static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
		struct nvme_ns_info *info)
{
	struct nvme_command c = { };
	bool csi_seen = false;
	int status, pos, len;
	void *data;

	if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
		return 0;
	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
		return 0;

	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(info->nsid);
	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;

	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
				      NVME_IDENTIFY_DATA_SIZE);
	if (status) {
		dev_warn(ctrl->device,
			"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
			info->nsid, status);
		goto free_data;
	}

	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
		struct nvme_ns_id_desc *cur = data + pos;

		if (cur->nidl == 0)
			break;

		len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
		if (len < 0)
			break;

		len += sizeof(*cur);
	}

	if (nvme_multi_css(ctrl) && !csi_seen) {
		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
			 info->nsid);
		status = -EINVAL;
	}

free_data:
	kfree(data);
	return status;
}

static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
			struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.cns = NVME_ID_CNS_NS;

	*id = kmalloc(sizeof(**id), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
	if (error) {
		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
		kfree(*id);
	}
	return error;
}

static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
		struct nvme_ns_info *info)
{
	struct nvme_ns_ids *ids = &info->ids;
	struct nvme_id_ns *id;
	int ret;

	ret = nvme_identify_ns(ctrl, info->nsid, &id);
	if (ret)
		return ret;

	if (id->ncap == 0) {
		/* namespace not allocated or attached */
		info->is_removed = true;
		return -ENODEV;
	}

	info->anagrpid = id->anagrpid;
	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
	info->is_ready = true;
	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
		dev_info(ctrl->device,
			 "Ignoring bogus Namespace Identifiers\n");
	} else {
		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
			memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
	}
	kfree(id);
	return 0;
}

static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
		struct nvme_ns_info *info)
{
	struct nvme_id_ns_cs_indep *id;
	struct nvme_command c = {
		.identify.opcode	= nvme_admin_identify,
		.identify.nsid		= cpu_to_le32(info->nsid),
		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
	};
	int ret;

	id = kmalloc(sizeof(*id), GFP_KERNEL);
	if (!id)
		return -ENOMEM;

	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
	if (!ret) {
		info->anagrpid = id->anagrpid;
		info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
		info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
		info->is_ready = id->nstat & NVME_NSTAT_NRDY;
	}
	kfree(id);
	return ret;
}

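/*
 * Common helper for Get Features / Set Features: build the admin command,
 * issue it synchronously and, on success, return completion dword 0 through
 * *result.
 */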
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
	union nvme_result res = { 0 };
	struct nvme_command c = { };
	int ret;

	c.features.opcode = op;
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}

int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
		      unsigned int dword11, void *buffer, size_t buflen,
		      u32 *result)
{
	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
			     buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_set_features);

int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
		      unsigned int dword11, void *buffer, size_t buflen,
		      u32 *result)
{
	return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
			     buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_get_features);

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be the only way to fix
	 * them up.
	 */
	if (status > 0) {
		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);

#define NVME_AEN_SUPPORTED \
	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
	 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)

static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
	int status;

	if (!supported_aens)
		return;

	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
			NULL, 0, &result);
	if (status)
		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
			 supported_aens);

	queue_work(nvme_wq, &ctrl->async_event_work);
}

static int nvme_ns_open(struct nvme_ns *ns)
{

	/* should never be called due to GENHD_FL_HIDDEN */
	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
		goto fail;
	if (!nvme_get_ns(ns))
		goto fail;
	if (!try_module_get(ns->ctrl->ops->module))
		goto fail_put_ns;

	return 0;

fail_put_ns:
	nvme_put_ns(ns);
fail:
	return -ENXIO;
}

static void nvme_ns_release(struct nvme_ns *ns)
{

	module_put(ns->ctrl->ops->module);
	nvme_put_ns(ns);
}

static int nvme_open(struct gendisk *disk, blk_mode_t mode)
{
	return nvme_ns_open(disk->private_data);
}

static void nvme_release(struct gendisk *disk)
{
	nvme_ns_release(disk->private_data);
}

int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
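/*
 * Register a blk-integrity profile that matches the namespace's PI type and
 * guard size so the block layer generates and verifies protection
 * information on behalf of the driver.
 */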
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
				u32 max_integrity_segments)
{
	struct blk_integrity integrity = { };

	switch (ns->head->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		switch (ns->head->guard_type) {
		case NVME_NVM_NS_16B_GUARD:
			integrity.profile = &t10_pi_type3_crc;
			integrity.tag_size = sizeof(u16) + sizeof(u32);
			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
			break;
		case NVME_NVM_NS_64B_GUARD:
			integrity.profile = &ext_pi_type3_crc64;
			integrity.tag_size = sizeof(u16) + 6;
			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
			break;
		default:
			integrity.profile = NULL;
			break;
		}
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		switch (ns->head->guard_type) {
		case NVME_NVM_NS_16B_GUARD:
			integrity.profile = &t10_pi_type1_crc;
			integrity.tag_size = sizeof(u16);
			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
			break;
		case NVME_NVM_NS_64B_GUARD:
			integrity.profile = &ext_pi_type1_crc64;
			integrity.tag_size = sizeof(u16);
			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
			break;
		default:
			integrity.profile = NULL;
			break;
		}
		break;
	default:
		integrity.profile = NULL;
		break;
	}

	integrity.tuple_size = ns->head->ms;
	blk_integrity_register(disk, &integrity);
	blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
#else
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
				u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

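/*
 * Derive the queue's discard limits from the controller's DMRSL and discard
 * capabilities.  Controllers with the DEALLOCATE_ZEROES quirk also advertise
 * Write Zeroes, which nvme_setup_write_zeroes() then implements via
 * Deallocate.
 */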
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct request_queue *queue = disk->queue;
	u32 size = queue_logical_block_size(queue);

	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
		ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);

	if (ctrl->max_discard_sectors == 0) {
		blk_queue_max_discard_sectors(queue, 0);
		return;
	}

	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
			NVME_DSM_MAX_RANGES);

	queue->limits.discard_granularity = size;

	/* If discard is already enabled, don't reset queue limits */
	if (queue->limits.max_discard_sectors)
		return;

	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);

	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}

static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
	return uuid_equal(&a->uuid, &b->uuid) &&
		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
		a->csi == b->csi;
}

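/*
 * Determine the metadata size, guard type and PI type for this namespace.
 * For controllers that support extended LBA formats this issues an NVM
 * command set specific Identify Namespace command to read the extended LBA
 * format descriptors.
 */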
static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	bool first = id->dps & NVME_NS_DPS_PI_FIRST;
	unsigned lbaf = nvme_lbaf_index(id->flbas);
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_command c = { };
	struct nvme_id_ns_nvm *nvm;
	int ret = 0;
	u32 elbaf;

	ns->head->pi_size = 0;
	ns->head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
		ns->head->pi_size = sizeof(struct t10_pi_tuple);
		ns->head->guard_type = NVME_NVM_NS_16B_GUARD;
		goto set_pi;
	}

	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
	if (!nvm)
		return -ENOMEM;

	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
	c.identify.cns = NVME_ID_CNS_CS_NS;
	c.identify.csi = NVME_CSI_NVM;

	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
	if (ret)
		goto free_data;

	elbaf = le32_to_cpu(nvm->elbaf[lbaf]);

	/* no support for storage tag formats right now */
	if (nvme_elbaf_sts(elbaf))
		goto free_data;

	ns->head->guard_type = nvme_elbaf_guard_type(elbaf);
	switch (ns->head->guard_type) {
	case NVME_NVM_NS_64B_GUARD:
		ns->head->pi_size = sizeof(struct crc64_pi_tuple);
		break;
	case NVME_NVM_NS_16B_GUARD:
		ns->head->pi_size = sizeof(struct t10_pi_tuple);
		break;
	default:
		break;
	}

free_data:
	kfree(nvm);
set_pi:
	if (ns->head->pi_size && (first || ns->head->ms == ns->head->pi_size))
		ns->head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
	else
		ns->head->pi_type = 0;

	return ret;
}

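/*
 * Decide how metadata is exposed to the block layer: interleaved with the
 * data as extended LBAs, or through a separate metadata buffer, depending on
 * the transport's capabilities and the namespace format.
 */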
static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
1819 1820
{
	struct nvme_ctrl *ctrl = ns->ctrl;
1821
	int ret;

	ret = nvme_init_ms(ns, id);
	if (ret)
		return ret;

	ns->head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
	if (!ns->head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
		return 0;

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		/*
		 * The NVMe over Fabrics specification only supports metadata as
		 * part of the extended data LBA.  We rely on HCA/HBA support to
		 * remap the separate metadata buffer from the block layer.
		 */
		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
			return 0;

		ns->head->features |= NVME_NS_EXT_LBAS;

		/*
		 * The current fabrics transport drivers support namespace
		 * metadata formats only if nvme_ns_has_pi() returns true.
		 * Suppress support for all other formats so the namespace will
		 * have a 0 capacity and not be usable through the block stack.
		 *
		 * Note, this check will need to be modified if any drivers
		 * gain the ability to use other metadata formats.
		 */
		if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
			ns->head->features |= NVME_NS_METADATA_SUPPORTED;
	} else {
		/*
		 * For PCIe controllers, we can't easily remap the separate
		 * metadata buffer from the block layer and thus require a
		 * separate metadata buffer for block layer metadata/PI support.
		 * We allow extended LBAs for the passthrough interface, though.
		 */
		if (id->flbas & NVME_NS_FLBAS_META_EXT)
			ns->head->features |= NVME_NS_EXT_LBAS;
		else
			ns->head->features |= NVME_NS_METADATA_SUPPORTED;
	}
	return 0;
}

static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
		struct request_queue *q)
{
	bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;

	if (ctrl->max_hw_sectors) {
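		/*
		 * Worst case every controller page in the transfer is a
		 * separate segment, plus one extra segment for a transfer
		 * that is not page aligned.
		 */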
		u32 max_segments =
			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;

		max_segments = min_not_zero(max_segments, ctrl->max_segments);
		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
	}
	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
	blk_queue_dma_alignment(q, 3);
	blk_queue_write_cache(q, vwc, vwc);
}

static void nvme_update_disk_info(struct gendisk *disk,
		struct nvme_ns *ns, struct nvme_id_ns *id)
{
1889
	sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
1890
	u32 bs = 1U << ns->head->lba_shift;
1891
	u32 atomic_bs, phys_bs, io_opt = 0;
1892

1893 1894 1895 1896
	/*
	 * The block layer can't support LBA sizes larger than the page size
	 * yet, so catch this early and don't allow block I/O.
	 */
1897
	if (ns->head->lba_shift > PAGE_SHIFT) {
1898
		capacity = 0;
1899 1900
		bs = (1 << 9);
	}
1901

1902 1903
	blk_integrity_unregister(disk);

1904
	atomic_bs = phys_bs = bs;
1905 1906 1907 1908 1909 1910
	if (id->nabo == 0) {
		/*
		 * Bit 1 indicates whether NAWUPF is defined for this namespace
		 * and whether it should be used instead of AWUPF. If NAWUPF ==
		 * 0 then AWUPF must be used instead.
		 */
1911
		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
1912 1913 1914 1915
			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
		else
			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
	}
1916

1917
	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
1918
		/* NPWG = Namespace Preferred Write Granularity */
1919
		phys_bs = bs * (1 + le16_to_cpu(id->npwg));
1920
		/* NOWS = Namespace Optimal Write Size */
1921
		io_opt = bs * (1 + le16_to_cpu(id->nows));
1922 1923
	}

1924
	blk_queue_logical_block_size(disk->queue, bs);
1925 1926 1927 1928 1929 1930 1931 1932
	/*
	 * Linux filesystems assume writing a single physical block is
	 * an atomic operation. Hence limit the physical block size to the
	 * value of the Atomic Write Unit Power Fail parameter.
	 */
	blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
	blk_queue_io_min(disk->queue, phys_bs);
	blk_queue_io_opt(disk->queue, io_opt);
1933

1934 1935 1936 1937 1938 1939
	/*
	 * Register a metadata profile for PI, or the plain non-integrity NVMe
	 * metadata masquerading as Type 0 if supported, otherwise reject block
	 * I/O to namespaces with metadata except when the namespace supports
	 * PI, as it can strip/insert in that case.
	 */
1940
	if (ns->head->ms) {
1941
		if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
1942
		    (ns->head->features & NVME_NS_METADATA_SUPPORTED))
1943
			nvme_init_integrity(disk, ns,
1944
					    ns->ctrl->max_integrity_segments);
1945 1946 1947 1948
		else if (!nvme_ns_has_pi(ns))
			capacity = 0;
	}

1949
	set_capacity_and_notify(disk, capacity);
1950

1951
	nvme_config_discard(disk, ns);
1952 1953
	blk_queue_max_write_zeroes_sectors(disk->queue,
					   ns->ctrl->max_zeroes_sectors);
1954 1955
}

1956 1957 1958 1959 1960
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
}

1961 1962 1963
static inline bool nvme_first_scan(struct gendisk *disk)
{
	/* nvme_alloc_ns() scans the disk prior to adding it */
1964
	return !disk_live(disk);
1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
}

static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	u32 iob;

	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
	    is_power_of_2(ctrl->max_hw_sectors))
		iob = ctrl->max_hw_sectors;
	else
		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));

	if (!iob)
		return;

	if (!is_power_of_2(iob)) {
		if (nvme_first_scan(ns->disk))
			pr_warn("%s: ignoring unaligned IO boundary:%u\n",
				ns->disk->disk_name, iob);
		return;
	}

	if (blk_queue_is_zoned(ns->disk->queue)) {
		if (nvme_first_scan(ns->disk))
			pr_warn("%s: ignoring zoned namespace IO boundary\n",
				ns->disk->disk_name);
		return;
	}

	blk_queue_chunk_sectors(ns->queue, iob);
}

1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
		struct nvme_ns_info *info)
{
	blk_mq_freeze_queue(ns->disk->queue);
	nvme_set_queue_limits(ns->ctrl, ns->queue);
	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
	blk_mq_unfreeze_queue(ns->disk->queue);

	if (nvme_ns_head_multipath(ns->head)) {
		blk_mq_freeze_queue(ns->head->disk->queue);
		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
		nvme_mpath_revalidate_paths(ns);
		blk_stack_limits(&ns->head->disk->queue->limits,
				 &ns->queue->limits, 0);
		ns->head->disk->flags |= GENHD_FL_HIDDEN;
		blk_mq_unfreeze_queue(ns->head->disk->queue);
	}

	/* Hide the block-interface for these devices */
	ns->disk->flags |= GENHD_FL_HIDDEN;
	set_bit(NVME_NS_READY, &ns->flags);

	return 0;
}

2023 2024
static int nvme_update_ns_info_block(struct nvme_ns *ns,
		struct nvme_ns_info *info)
2025
{
2026 2027
	struct nvme_id_ns *id;
	unsigned lbaf;
2028
	int ret;
2029

2030 2031 2032 2033
	ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
	if (ret)
		return ret;

2034
	blk_mq_freeze_queue(ns->disk->queue);
2035
	lbaf = nvme_lbaf_index(id->flbas);
2036
	ns->head->lba_shift = id->lbaf[lbaf].ds;
2037
	nvme_set_queue_limits(ns->ctrl, ns->queue);
2038

2039 2040 2041 2042 2043
	ret = nvme_configure_metadata(ns, id);
	if (ret < 0) {
		blk_mq_unfreeze_queue(ns->disk->queue);
		goto out;
	}
2044 2045 2046
	nvme_set_chunk_sectors(ns, id);
	nvme_update_disk_info(ns->disk, ns, id);

2047
	if (ns->head->ids.csi == NVME_CSI_ZNS) {
2048
		ret = nvme_update_zone_info(ns, lbaf);
2049 2050 2051 2052
		if (ret) {
			blk_mq_unfreeze_queue(ns->disk->queue);
			goto out;
		}
2053 2054
	}

2055 2056 2057 2058 2059 2060 2061
	/*
	 * Only set the DEAC bit if the device guarantees that reads from
	 * deallocated data return zeroes.  While the DEAC bit does not
	 * require that, it must be a no-op if reads from deallocated data
	 * do not return zeroes.
	 */
	if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
2062
		ns->head->features |= NVME_NS_DEAC;
2063
	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2064
	set_bit(NVME_NS_READY, &ns->flags);
2065
	blk_mq_unfreeze_queue(ns->disk->queue);
2066

2067 2068
	if (blk_queue_is_zoned(ns->queue)) {
		ret = nvme_revalidate_zones(ns);
2069
		if (ret && !nvme_first_scan(ns->disk))
2070
			goto out;
2071 2072
	}

2073
	if (nvme_ns_head_multipath(ns->head)) {
2074
		blk_mq_freeze_queue(ns->head->disk->queue);
2075
		nvme_update_disk_info(ns->head->disk, ns, id);
2076
		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
2077
		nvme_mpath_revalidate_paths(ns);
2078 2079
		blk_stack_limits(&ns->head->disk->queue->limits,
				 &ns->queue->limits, 0);
2080
		disk_update_readahead(ns->head->disk);
2081
		blk_mq_unfreeze_queue(ns->head->disk->queue);
2082
	}
2083

2084 2085
	ret = 0;
out:
	/*
	 * If probing fails due to an unsupported feature, hide the block
	 * device, but still allow other access.
	 */
	if (ret == -ENODEV) {
		ns->disk->flags |= GENHD_FL_HIDDEN;
		set_bit(NVME_NS_READY, &ns->flags);
		ret = 0;
	}
	kfree(id);
	return ret;
}

2099 2100 2101 2102 2103
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	switch (info->ids.csi) {
	case NVME_CSI_ZNS:
		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2104 2105
			dev_info(ns->ctrl->device,
	"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2106
				info->nsid);
2107
			return nvme_update_ns_info_generic(ns, info);
2108 2109 2110 2111 2112
		}
		return nvme_update_ns_info_block(ns, info);
	case NVME_CSI_NVM:
		return nvme_update_ns_info_block(ns, info);
	default:
2113 2114 2115 2116
		dev_info(ns->ctrl->device,
			"block device for nsid %u not supported (csi %u)\n",
			info->nsid, info->ids.csi);
		return nvme_update_ns_info_generic(ns, info);
2117 2118 2119
	}
}

#ifdef CONFIG_BLK_SED_OPAL
static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
		bool send)
{
	struct nvme_ctrl *ctrl = data;
	struct nvme_command cmd = { };

	if (send)
		cmd.common.opcode = nvme_admin_security_send;
	else
		cmd.common.opcode = nvme_admin_security_recv;
	cmd.common.nsid = 0;
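	/* SECP lives in bits 31:24 of CDW10, SPSP in bits 23:08. */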
	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
	cmd.common.cdw11 = cpu_to_le32(len);

	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
			NVME_QID_ANY, 1, 0);
}

static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
	if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!ctrl->opal_dev)
			ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
		else if (was_suspended)
			opal_unlock_from_suspend(ctrl->opal_dev);
	} else {
		free_opal_dev(ctrl->opal_dev);
		ctrl->opal_dev = NULL;
	}
}
#else
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
}
2155 2156
#endif /* CONFIG_BLK_SED_OPAL */

2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
			data);
}
#else
#define nvme_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

2168
const struct block_device_operations nvme_bdev_ops = {
2169 2170
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
2171
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
2172 2173 2174
	.open		= nvme_open,
	.release	= nvme_release,
	.getgeo		= nvme_getgeo,
2175
	.report_zones	= nvme_report_zones,
2176 2177 2178
	.pr_ops		= &nvme_pr_ops,
};

static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
		u32 timeout, const char *op)
{
	unsigned long timeout_jiffies = jiffies + timeout * HZ;
	u32 csts;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if (csts == ~0)
			return -ENODEV;
		if ((csts & mask) == val)
			break;

		usleep_range(1000, 2000);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout_jiffies)) {
			dev_err(ctrl->device,
				"Device not ready; aborting %s, CSTS=0x%x\n",
				op, csts);
			return -ENODEV;
		}
	}

	return ret;
}

int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	if (shutdown)
		ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
	else
		ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	if (shutdown) {
		return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
				       NVME_CSTS_SHST_CMPLT,
				       ctrl->shutdown_timeout, "shutdown");
	}
	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
		msleep(NVME_QUIRK_DELAY_AMOUNT);
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
			       (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);

int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned dev_page_min;
	u32 timeout;
	int ret;

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (ret) {
		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;

	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
		dev_err(ctrl->device,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
		return -ENODEV;
	}

	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
		ctrl->ctrl_config = NVME_CC_CSS_CSI;
	else
		ctrl->ctrl_config = NVME_CC_CSS_NVM;

	if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS)
		ctrl->ctrl_config |= NVME_CC_CRIME;

	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	/* Flush write to device (required if transport is PCI) */
	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
	if (ret)
		return ret;

	/* CAP value may change after initial CC write */
	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (ret)
		return ret;

	timeout = NVME_CAP_TIMEOUT(ctrl->cap);
	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
		u32 crto, ready_timeout;

		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
		if (ret) {
			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
				ret);
			return ret;
		}

		/*
		 * CRTO should always be greater than or equal to CAP.TO, but some
		 * devices are known to get this wrong. Use the larger of the
		 * two values.
		 */
		if (ctrl->ctrl_config & NVME_CC_CRIME)
			ready_timeout = NVME_CRTO_CRIMT(crto);
		else
			ready_timeout = NVME_CRTO_CRWMT(crto);

		if (ready_timeout < timeout)
			dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
				      crto, ctrl->cap);
		else
			timeout = ready_timeout;
	}

	ctrl->ctrl_config |= NVME_CC_ENABLE;
	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
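	/*
	 * CAP.TO and CRTO are specified in 500ms units; (timeout + 1) / 2
	 * rounds that up to the whole seconds nvme_wait_ready() expects.
	 */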
	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
			       (timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
	__le64 ts;
	int ret;

	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
		return 0;

	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
			NULL);
	if (ret)
		dev_warn_once(ctrl->device,
			"could not set timestamp (%d)\n", ret);
	return ret;
}

static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
{
	struct nvme_feat_host_behavior *host;
	u8 acre = 0, lbafee = 0;
	int ret;

	/* Don't bother enabling the feature if retry delay is not reported */
	if (ctrl->crdt[0])
		acre = NVME_ENABLE_ACRE;
	if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
		lbafee = NVME_ENABLE_LBAFEE;

	if (!acre && !lbafee)
		return 0;

	host = kzalloc(sizeof(*host), GFP_KERNEL);
	if (!host)
		return 0;

	host->acre = acre;
	host->lbafee = lbafee;
	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
				host, sizeof(*host), NULL);
	kfree(host);
	return ret;
}

/*
 * The function checks whether the given total (exlat + enlat) latency of
 * a power state allows that state to be used as an APST transition target.
 * It does so by comparing the latency to the primary and secondary latency
 * tolerances defined by module params. If there's a match, the corresponding
 * timeout value is returned and the matching tolerance index (1 or 2) is
 * reported.
 */
static bool nvme_apst_get_transition_time(u64 total_latency,
		u64 *transition_time, unsigned *last_index)
{
	if (total_latency <= apst_primary_latency_tol_us) {
		if (*last_index == 1)
			return false;
		*last_index = 1;
		*transition_time = apst_primary_timeout_ms;
		return true;
	}
	if (apst_secondary_timeout_ms &&
		total_latency <= apst_secondary_latency_tol_us) {
		if (*last_index <= 2)
			return false;
		*last_index = 2;
		*transition_time = apst_secondary_timeout_ms;
		return true;
	}
	return false;
}

/*
 * APST (Autonomous Power State Transition) lets us program a table of power
 * state transitions that the controller will perform automatically.
 *
 * Depending on module params, one of the two supported techniques will be used:
 *
 * - If the parameters provide explicit timeouts and tolerances, they will be
 *   used to build a table with up to 2 non-operational states to transition to.
 *   The default parameter values were selected based on the values used by
 *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
 *   regeneration of the APST table in the event of switching between external
 *   and battery power, the timeouts and tolerances reflect a compromise
 *   between values used by Microsoft for AC and battery scenarios.
 * - If not, we'll configure the table with a simple heuristic: we are willing
 *   to spend at most 2% of the time transitioning between power states.
 *   Therefore, when running in any given state, we will enter the next
 *   lower-power non-operational state after waiting 50 * (enlat + exlat)
 *   microseconds, as long as that state's exit latency is under the requested
 *   maximum latency.
 *
 * We will not autonomously enter any non-operational state for which the total
 * latency exceeds ps_max_latency_us.
 *
 * Users can set ps_max_latency_us to zero to turn off APST.
 */
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
	struct nvme_feat_auto_pst *table;
	unsigned apste = 0;
	u64 max_lat_us = 0;
	__le64 target = 0;
	int max_ps = -1;
	int state;
	int ret;
	unsigned last_lt_index = UINT_MAX;

	/*
	 * If APST isn't supported or if we haven't been initialized yet,
	 * then don't do anything.
	 */
	if (!ctrl->apsta)
		return 0;

	if (ctrl->npss > 31) {
		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
		return 0;
	}

	table = kzalloc(sizeof(*table), GFP_KERNEL);
	if (!table)
		return 0;

	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
		/* Turn off APST. */
		dev_dbg(ctrl->device, "APST disabled\n");
		goto done;
	}

	/*
	 * Walk through all states from lowest- to highest-power.
	 * According to the spec, lower-numbered states use more power.  NPSS,
	 * despite the name, is the index of the lowest-power state, not the
	 * number of states.
	 */
	for (state = (int)ctrl->npss; state >= 0; state--) {
		u64 total_latency_us, exit_latency_us, transition_ms;

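		/*
		 * Each APST entry selects the target power state in bits 07:03
		 * and the idle time prior to transition (in ms) in bits 31:08.
		 */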
		if (target)
			table->entries[state] = target;

		/*
		 * Don't allow transitions to the deepest state if it's quirked
		 * off.
		 */
		if (state == ctrl->npss &&
		    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
			continue;

		/*
		 * Is this state a useful non-operational state for higher-power
		 * states to autonomously transition to?
		 */
		if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
			continue;

		exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
		if (exit_latency_us > ctrl->ps_max_latency_us)
			continue;

		total_latency_us = exit_latency_us +
			le32_to_cpu(ctrl->psd[state].entry_lat);

		/*
		 * This state is good. It can be used as the APST idle target
		 * for higher power states.
		 */
		if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
			if (!nvme_apst_get_transition_time(total_latency_us,
					&transition_ms, &last_lt_index))
				continue;
		} else {
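			/*
			 * 2% heuristic: idle for 50 * (enlat + exlat), i.e.
			 * total_latency_us / 20 ms rounded up, capped to the
			 * 24-bit idle time field.
			 */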
			transition_ms = total_latency_us + 19;
			do_div(transition_ms, 20);
			if (transition_ms > (1 << 24) - 1)
				transition_ms = (1 << 24) - 1;
		}

		target = cpu_to_le64((state << 3) | (transition_ms << 8));
		if (max_ps == -1)
			max_ps = state;
		if (total_latency_us > max_lat_us)
			max_lat_us = total_latency_us;
	}

	if (max_ps == -1)
		dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
	else
		dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
			max_ps, max_lat_us, (int)sizeof(*table), table);
	apste = 1;

done:
	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
				table, sizeof(*table), NULL);
	if (ret)
		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
	kfree(table);
	return ret;
}

static void nvme_set_latency_tolerance(struct device *dev, s32 val)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	u64 latency;

	switch (val) {
	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
	case PM_QOS_LATENCY_ANY:
		latency = U64_MAX;
		break;

	default:
		latency = val;
	}

	if (ctrl->ps_max_latency_us != latency) {
		ctrl->ps_max_latency_us = latency;
		if (ctrl->state == NVME_CTRL_LIVE)
			nvme_configure_apst(ctrl);
	}
}

struct nvme_core_quirk_entry {
	/*
	 * NVMe model and firmware strings are padded with spaces.  For
	 * simplicity, strings in the quirk table are padded with NULLs
	 * instead.
	 */
	u16 vid;
	const char *mn;
	const char *fr;
	unsigned long quirks;
};

static const struct nvme_core_quirk_entry core_quirks[] = {
	{
		/*
		 * This Toshiba device seems to die using any APST states.  See:
		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
		 */
		.vid = 0x1179,
		.mn = "THNSF5256GPUK TOSHIBA",
		.quirks = NVME_QUIRK_NO_APST,
	},
	{
		/*
		 * This LiteON CL1-3D*-Q11 firmware version has a race
		 * condition associated with actions related to suspend to idle.
		 * LiteON has resolved the problem in future firmware.
		 */
		.vid = 0x14a4,
		.fr = "22301111",
		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
	},
	{
		/*
		 * This Kioxia CD6-V Series / HPE PE8030 device times out and
		 * aborts I/O during any load, but more easily reproducible
		 * with discards (fstrim).
		 *
		 * The device is left in a state where it is also not possible
		 * to use "nvme set-feature" to disable APST, but booting with
		 * nvme_core.default_ps_max_latency=0 works.
		 */
		.vid = 0x1e0f,
		.mn = "KCD6XVUL6T40",
		.quirks = NVME_QUIRK_NO_APST,
	},
	{
		/*
		 * The external Samsung X5 SSD fails initialization without a
		 * delay before checking if it is ready and has a whole set of
		 * other problems.  To make this even more interesting, it
		 * shares the PCI ID with internal Samsung 970 Evo Plus that
		 * does not need or want these quirks.
		 */
		.vid = 0x144d,
		.mn = "Samsung Portable SSD X5",
		.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
			  NVME_QUIRK_NO_DEEPEST_PS |
			  NVME_QUIRK_IGNORE_DEV_SUBNQN,
	}
};

/* match is null-terminated but idstr is space-padded. */
static bool string_matches(const char *idstr, const char *match, size_t len)
{
	size_t matchlen;

	if (!match)
		return true;

	matchlen = strlen(match);
	WARN_ON_ONCE(matchlen > len);

	if (memcmp(idstr, match, matchlen))
		return false;

	for (; matchlen < len; matchlen++)
		if (idstr[matchlen] != ' ')
			return false;

	return true;
}

static bool quirk_matches(const struct nvme_id_ctrl *id,
			  const struct nvme_core_quirk_entry *q)
{
	return q->vid == le16_to_cpu(id->vid) &&
		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
		string_matches(id->fr, q->fr, sizeof(id->fr));
}

static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
		struct nvme_id_ctrl *id)
{
	size_t nqnlen;
	int off;

	if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
		nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
		if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
			strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
			return;
		}

		if (ctrl->vs >= NVME_VS(1, 2, 1))
			dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
	}

	/*
	 * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
	 * Base Specification 2.0.  It is slightly different from the format
	 * specified there due to historic reasons, and we can't change it now.
	 */
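	/*
	 * Resulting layout: "nqn.2014.08.org.nvmexpress:<vid><ssvid>" followed
	 * by the raw, space-padded SN and MN fields.
	 */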
	off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
			"nqn.2014.08.org.nvmexpress:%04x%04x",
			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
	off += sizeof(id->sn);
	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
	off += sizeof(id->mn);
	memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
}

static void nvme_release_subsystem(struct device *dev)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	if (subsys->instance >= 0)
		ida_free(&nvme_instance_ida, subsys->instance);
	kfree(subsys);
}

static void nvme_destroy_subsystem(struct kref *ref)
{
	struct nvme_subsystem *subsys =
			container_of(ref, struct nvme_subsystem, ref);

	mutex_lock(&nvme_subsystems_lock);
	list_del(&subsys->entry);
	mutex_unlock(&nvme_subsystems_lock);

	ida_destroy(&subsys->ns_ida);
	device_del(&subsys->dev);
	put_device(&subsys->dev);
}

static void nvme_put_subsystem(struct nvme_subsystem *subsys)
{
	kref_put(&subsys->ref, nvme_destroy_subsystem);
}

static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
{
	struct nvme_subsystem *subsys;

	lockdep_assert_held(&nvme_subsystems_lock);

	/*
	 * Fail matches for discovery subsystems. This results
	 * in each discovery controller bound to a unique subsystem.
	 * This avoids issues with validating controller values
	 * that can only be true when there is a single unique subsystem.
	 * There may be multiple and completely independent entities
	 * that provide discovery controllers.
	 */
	if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
		return NULL;

	list_for_each_entry(subsys, &nvme_subsystems, entry) {
		if (strcmp(subsys->subnqn, subsysnqn))
			continue;
		if (!kref_get_unless_zero(&subsys->ref))
			continue;
		return subsys;
	}

	return NULL;
}

static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
{
	return ctrl->opts && ctrl->opts->discovery_nqn;
}

static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
		struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	struct nvme_ctrl *tmp;

	lockdep_assert_held(&nvme_subsystems_lock);

	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
		if (nvme_state_terminal(tmp))
			continue;

		if (tmp->cntlid == ctrl->cntlid) {
			dev_err(ctrl->device,
				"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
				ctrl->cntlid, dev_name(tmp->device),
				subsys->subnqn);
			return false;
		}

		if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
		    nvme_discovery_ctrl(ctrl))
			continue;

		dev_err(ctrl->device,
			"Subsystem does not support multiple controllers\n");
		return false;
	}

	return true;
}

static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	struct nvme_subsystem *subsys, *found;
	int ret;

	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
	if (!subsys)
		return -ENOMEM;

	subsys->instance = -1;
	mutex_init(&subsys->lock);
	kref_init(&subsys->ref);
	INIT_LIST_HEAD(&subsys->ctrls);
	INIT_LIST_HEAD(&subsys->nsheads);
	nvme_init_subnqn(subsys, ctrl, id);
	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
	memcpy(subsys->model, id->mn, sizeof(subsys->model));
	subsys->vendor_id = le16_to_cpu(id->vid);
	subsys->cmic = id->cmic;

	/* Versions prior to 1.4 don't necessarily report a valid type */
	if (id->cntrltype == NVME_CTRL_DISC ||
	    !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
		subsys->subtype = NVME_NQN_DISC;
	else
		subsys->subtype = NVME_NQN_NVME;

	if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
		dev_err(ctrl->device,
			"Subsystem %s is not a discovery controller",
			subsys->subnqn);
		kfree(subsys);
		return -EINVAL;
	}
	subsys->awupf = le16_to_cpu(id->awupf);
	nvme_mpath_default_iopolicy(subsys);

	subsys->dev.class = nvme_subsys_class;
	subsys->dev.release = nvme_release_subsystem;
	subsys->dev.groups = nvme_subsys_attrs_groups;
	dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
	device_initialize(&subsys->dev);

	mutex_lock(&nvme_subsystems_lock);
	found = __nvme_find_get_subsystem(subsys->subnqn);
	if (found) {
		put_device(&subsys->dev);
		subsys = found;

		if (!nvme_validate_cntlid(subsys, ctrl, id)) {
			ret = -EINVAL;
			goto out_put_subsystem;
		}
	} else {
		ret = device_add(&subsys->dev);
		if (ret) {
			dev_err(ctrl->device,
				"failed to register subsystem device.\n");
			put_device(&subsys->dev);
			goto out_unlock;
		}
		ida_init(&subsys->ns_ida);
		list_add_tail(&subsys->entry, &nvme_subsystems);
	}

	ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
				dev_name(ctrl->device));
	if (ret) {
		dev_err(ctrl->device,
			"failed to create sysfs link from subsystem.\n");
		goto out_put_subsystem;
	}

	if (!found)
		subsys->instance = ctrl->instance;
	ctrl->subsys = subsys;
	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
	mutex_unlock(&nvme_subsystems_lock);
	return 0;

out_put_subsystem:
	nvme_put_subsystem(subsys);
out_unlock:
	mutex_unlock(&nvme_subsystems_lock);
	return ret;
}

int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
		void *log, size_t size, u64 offset)
{
	struct nvme_command c = { };
	u32 dwlen = nvme_bytes_to_numd(size);

	c.get_log_page.opcode = nvme_admin_get_log_page;
	c.get_log_page.nsid = cpu_to_le32(nsid);
	c.get_log_page.lid = log_page;
	c.get_log_page.lsp = lsp;
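	/* NUMD is a 0's based dword count, split across NUMDL/NUMDU below. */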
	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
	c.get_log_page.csi = csi;

	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}

static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
				struct nvme_effects_log **log)
{
	struct nvme_effects_log	*cel = xa_load(&ctrl->cels, csi);
	int ret;

	if (cel)
		goto out;

	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
	if (!cel)
		return -ENOMEM;

	ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
			cel, sizeof(*cel), 0);
	if (ret) {
		kfree(cel);
		return ret;
	}

	xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
out:
	*log = cel;
	return 0;
}

static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
{
	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
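	/*
	 * units is a power-of-two exponent in multiples of the minimum page
	 * size; convert to 512-byte sectors, e.g. units = 5 with a 4KiB
	 * MPSMIN gives 1 << (5 + 12 - 9) = 256 sectors (128KiB).
	 */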

	if (check_shl_overflow(1U, units + page_shift - 9, &val))
		return UINT_MAX;
	return val;
}

static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
{
	struct nvme_command c = { };
	struct nvme_id_ctrl_nvm *id;
	int ret;

	if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
		ctrl->max_discard_sectors = UINT_MAX;
		ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
	} else {
		ctrl->max_discard_sectors = 0;
		ctrl->max_discard_segments = 0;
	}

	/*
	 * Even though the NVMe spec explicitly states that MDTS is not
	 * applicable to write-zeroes, we are cautious and limit the size to
	 * the controller's max_hw_sectors value, which is based on the MDTS
	 * field and possibly other limiting factors.
	 */
	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
		ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
	else
		ctrl->max_zeroes_sectors = 0;

	if (ctrl->subsys->subtype != NVME_NQN_NVME ||
	    nvme_ctrl_limited_cns(ctrl) ||
	    test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
		return 0;

	id = kzalloc(sizeof(*id), GFP_KERNEL);
	if (!id)
		return -ENOMEM;

	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = NVME_ID_CNS_CS_CTRL;
	c.identify.csi = NVME_CSI_NVM;

	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
	if (ret)
		goto free_data;

	if (id->dmrl)
		ctrl->max_discard_segments = id->dmrl;
	ctrl->dmrsl = le32_to_cpu(id->dmrsl);
	if (id->wzsl)
		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);

free_data:
	if (ret > 0)
		set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags);
	kfree(id);
	return ret;
}

static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
{
	struct nvme_effects_log	*log = ctrl->effects;

	log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
						NVME_CMD_EFFECTS_NCC |
						NVME_CMD_EFFECTS_CSE_MASK);
	log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
						NVME_CMD_EFFECTS_CSE_MASK);

	/*
	 * The spec says the result of a security receive command depends on
	 * the previous security send command. As such, many vendors log this
	 * command as one to be submitted only when no other commands to the same
	 * namespace are outstanding. The intention is to tell the host to
	 * prevent mixing security send and receive.
	 *
	 * This driver can only enforce such exclusive access against IO
	 * queues, though. We are not readily able to enforce such a rule for
	 * two commands to the admin queue, which is the only queue that
	 * matters for this command.
	 *
	 * Rather than blindly freezing the IO queues for this effect that
	 * doesn't even apply to IO, mask it off.
	 */
	log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);

	log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
	log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
	log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
}

static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	int ret = 0;

	if (ctrl->effects)
		return 0;

	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
		if (ret < 0)
			return ret;
	}

	if (!ctrl->effects) {
		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
		if (!ctrl->effects)
			return -ENOMEM;
		xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
	}

	nvme_init_known_nvm_effects(ctrl);
	return 0;
}

static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	/*
	 * In fabrics we need to verify the cntlid matches the
	 * admin connect
	 */
	if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
		dev_err(ctrl->device,
			"Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
			ctrl->cntlid, le16_to_cpu(id->cntlid));
		return -EINVAL;
	}

	if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
		dev_err(ctrl->device,
			"keep-alive support is mandatory for fabrics\n");
		return -EINVAL;
	}

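	/*
	 * IOCCSZ and IORCSZ are in 16-byte units: 4 is the bare 64-byte SQE,
	 * 1 the bare 16-byte CQE.
	 */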
	if (ctrl->ioccsz < 4) {
		dev_err(ctrl->device,
			"I/O queue command capsule supported size %d < 4\n",
			ctrl->ioccsz);
		return -EINVAL;
	}

	if (ctrl->iorcsz < 1) {
		dev_err(ctrl->device,
			"I/O queue response capsule supported size %d < 1\n",
			ctrl->iorcsz);
		return -EINVAL;
	}

	return 0;
}

static int nvme_init_identify(struct nvme_ctrl *ctrl)
3045 3046
{
	struct nvme_id_ctrl *id;
3047
	u32 max_hw_sectors;
3048
	bool prev_apst_enabled;
3049
	int ret;
3050

3051 3052
	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
3053
		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3054 3055 3056
		return -EIO;
	}

3057 3058 3059
	if (!(ctrl->ops->flags & NVME_F_FABRICS))
		ctrl->cntlid = le16_to_cpu(id->cntlid);

3060
	if (!ctrl->identified) {
3061
		unsigned int i;

3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074
		/*
		 * Check for quirks.  Quirk can depend on firmware version,
		 * so, in principle, the set of quirks present can change
		 * across a reset.  As a possible future enhancement, we
		 * could re-scan for quirks every time we reinitialize
		 * the device, but we'd have to make sure that the driver
		 * behaves intelligently if the quirks change.
		 */
		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
			if (quirk_matches(id, &core_quirks[i]))
				ctrl->quirks |= core_quirks[i].quirks;
		}
3075 3076 3077 3078

		ret = nvme_init_subsystem(ctrl, id);
		if (ret)
			goto out_free;
3079 3080 3081 3082

		ret = nvme_init_effects(ctrl, id);
		if (ret)
			goto out_free;
3083
	}
3084 3085
	memcpy(ctrl->subsys->firmware_rev, id->fr,
	       sizeof(ctrl->subsys->firmware_rev));
3086

3087
	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3088
		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3089 3090 3091
		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
	}

3092 3093 3094 3095
	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
	ctrl->crdt[2] = le16_to_cpu(id->crdt3);

3096
	ctrl->oacs = le16_to_cpu(id->oacs);
3097
	ctrl->oncs = le16_to_cpu(id->oncs);
3098
	ctrl->mtfa = le16_to_cpu(id->mtfa);
3099
	ctrl->oaes = le32_to_cpu(id->oaes);
3100 3101 3102
	ctrl->wctemp = le16_to_cpu(id->wctemp);
	ctrl->cctemp = le16_to_cpu(id->cctemp);

3103
	atomic_set(&ctrl->abort_limit, id->acl + 1);
3104 3105
	ctrl->vwc = id->vwc;
	if (id->mdts)
3106
		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
3107
	else
3108 3109 3110
		max_hw_sectors = UINT_MAX;
	ctrl->max_hw_sectors =
		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3111

3112
	nvme_set_queue_limits(ctrl, ctrl->admin_q);
3113
	ctrl->sgls = le32_to_cpu(id->sgls);
3114
	ctrl->kas = le16_to_cpu(id->kas);
	ctrl->max_namespaces = le32_to_cpu(id->mnan);
3116
	ctrl->ctratt = le32_to_cpu(id->ctratt);
3117

3118 3119 3120
	ctrl->cntrltype = id->cntrltype;
	ctrl->dctype = id->dctype;

3121 3122
	if (id->rtd3e) {
		/* us -> s */
3123
		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3124 3125 3126 3127 3128

		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
						 shutdown_timeout, 60);

		if (ctrl->shutdown_timeout != shutdown_timeout)
3129
			dev_info(ctrl->device,
3130 3131 3132 3133 3134
				 "Shutdown timeout set to %u seconds\n",
				 ctrl->shutdown_timeout);
	} else
		ctrl->shutdown_timeout = shutdown_timeout;

3135
	ctrl->npss = id->npss;
3136 3137
	ctrl->apsta = id->apsta;
	prev_apst_enabled = ctrl->apst_enabled;
3138 3139
	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
		if (force_apst && id->apsta) {
3140
			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3141
			ctrl->apst_enabled = true;
3142
		} else {
3143
			ctrl->apst_enabled = false;
3144 3145
		}
	} else {
3146
		ctrl->apst_enabled = id->apsta;
3147
	}
3148 3149
	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));

3150
	if (ctrl->ops->flags & NVME_F_FABRICS) {
3151 3152 3153 3154 3155
		ctrl->icdoff = le16_to_cpu(id->icdoff);
		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
		ctrl->maxcmd = le16_to_cpu(id->maxcmd);

3156 3157
		ret = nvme_check_ctrl_fabric_info(ctrl, id);
		if (ret)
3158
			goto out_free;
3159
	} else {
3160 3161
		ctrl->hmpre = le32_to_cpu(id->hmpre);
		ctrl->hmmin = le32_to_cpu(id->hmmin);
3162 3163
		ctrl->hmminds = le32_to_cpu(id->hmminds);
		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3164
	}
3165

3166
	ret = nvme_mpath_init_identify(ctrl, id);
	if (ret < 0)
3168
		goto out_free;

3170
	if (ctrl->apst_enabled && !prev_apst_enabled)
3171
		dev_pm_qos_expose_latency_tolerance(ctrl->device);
3172
	else if (!ctrl->apst_enabled && prev_apst_enabled)
3173 3174
		dev_pm_qos_hide_latency_tolerance(ctrl->device);

3175 3176 3177 3178 3179 3180 3181 3182 3183 3184
out_free:
	kfree(id);
	return ret;
}

/*
 * Initialize the cached copies of the Identify data and various controller
 * register in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
3185
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203
{
	int ret;

	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);

	if (ctrl->vs >= NVME_VS(1, 1, 0))
		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);

	ret = nvme_init_identify(ctrl);
	if (ret)
		return ret;

3204 3205 3206
	ret = nvme_configure_apst(ctrl);
	if (ret < 0)
		return ret;
3207

3208 3209 3210
	ret = nvme_configure_timestamp(ctrl);
	if (ret < 0)
		return ret;
3211

3212
	ret = nvme_configure_host_options(ctrl);
3213 3214 3215
	if (ret < 0)
		return ret;

3216 3217
	nvme_configure_opal(ctrl, was_suspended);

3218
	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3219 3220 3221 3222
		/*
		 * Do not return errors unless we are in a controller reset;
		 * the controller works perfectly fine without hwmon.
		 */
3223
		ret = nvme_hwmon_init(ctrl);
3224
		if (ret == -EINTR)
3225 3226
			return ret;
	}
3227

3228
	clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
3229
	ctrl->identified = true;
3230

3231 3232
	nvme_start_keep_alive(ctrl);

3233
	return 0;
3234
}
3235
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3236

3237
static int nvme_dev_open(struct inode *inode, struct file *file)
3238
{
3239 3240
	struct nvme_ctrl *ctrl =
		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3241

3242 3243 3244 3245
	switch (ctrl->state) {
	case NVME_CTRL_LIVE:
		break;
	default:
3246
		return -EWOULDBLOCK;
3247 3248
	}

3249
	nvme_get_ctrl(ctrl);
3250 3251
	if (!try_module_get(ctrl->ops->module)) {
		nvme_put_ctrl(ctrl);
3252
		return -EINVAL;
3253
	}
3254

3255
	file->private_data = ctrl;
3256 3257 3258
	return 0;
}

3259 3260 3261 3262 3263 3264 3265 3266 3267 3268
static int nvme_dev_release(struct inode *inode, struct file *file)
{
	struct nvme_ctrl *ctrl =
		container_of(inode->i_cdev, struct nvme_ctrl, cdev);

	module_put(ctrl->ops->module);
	nvme_put_ctrl(ctrl);
	return 0;
}

3269 3270 3271
static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
3272
	.release	= nvme_dev_release,
3273
	.unlocked_ioctl	= nvme_dev_ioctl,
3274
	.compat_ioctl	= compat_ptr_ioctl,
3275
	.uring_cmd	= nvme_dev_uring_cmd,
3276 3277
};

3278
static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3279 3280 3281 3282
		unsigned nsid)
{
	struct nvme_ns_head *h;

3283
	lockdep_assert_held(&ctrl->subsys->lock);
3284

3285 3286 3287 3288 3289 3290 3291
	list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
		/*
		 * Private namespaces can share NSIDs under some conditions.
		 * In that case we can't use the same ns_head for namespaces
		 * with the same NSID.
		 */
		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3292 3293
			continue;
		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3294 3295 3296 3297 3298 3299
			return h;
	}

	return NULL;
}

3300 3301
static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
		struct nvme_ns_ids *ids)
3302
{
3303 3304 3305
	bool has_uuid = !uuid_is_null(&ids->uuid);
	bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
	bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3306 3307 3308 3309 3310
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);

	list_for_each_entry(h, &subsys->nsheads, entry) {
3311 3312 3313 3314 3315 3316 3317
		if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
			return -EINVAL;
		if (has_nguid &&
		    memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
			return -EINVAL;
		if (has_eui64 &&
		    memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3318 3319 3320 3321 3322 3323
			return -EINVAL;
	}

	return 0;
}

3324 3325
static void nvme_cdev_rel(struct device *dev)
{
3326
	ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3327 3328
}

3329 3330 3331
void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
{
	cdev_device_del(cdev, cdev_device);
3332
	put_device(cdev_device);
3333 3334 3335 3336 3337 3338 3339
}

int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
		const struct file_operations *fops, struct module *owner)
{
	int minor, ret;

3340
	minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3341 3342 3343 3344
	if (minor < 0)
		return minor;
	cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
	cdev_device->class = nvme_ns_chr_class;
3345
	cdev_device->release = nvme_cdev_rel;
3346 3347 3348 3349
	device_initialize(cdev_device);
	cdev_init(cdev, fops);
	cdev->owner = owner;
	ret = cdev_device_add(cdev, cdev_device);
3350
	if (ret)
3351
		put_device(cdev_device);
3352

3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372
	return ret;
}

static int nvme_ns_chr_open(struct inode *inode, struct file *file)
{
	return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
}

static int nvme_ns_chr_release(struct inode *inode, struct file *file)
{
	nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
	return 0;
}

static const struct file_operations nvme_ns_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_chr_open,
	.release	= nvme_ns_chr_release,
	.unlocked_ioctl	= nvme_ns_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
3373
	.uring_cmd	= nvme_ns_chr_uring_cmd,
3374
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385
};

static int nvme_add_ns_cdev(struct nvme_ns *ns)
{
	int ret;

	ns->cdev_device.parent = ns->ctrl->device;
	ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
			   ns->ctrl->instance, ns->head->instance);
	if (ret)
		return ret;
3386 3387 3388

	return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
			     ns->ctrl->ops->module);
3389 3390
}

3391
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3392
		struct nvme_ns_info *info)
3393 3394
{
	struct nvme_ns_head *head;
3395
	size_t size = sizeof(*head);
3396 3397
	int ret = -ENOMEM;

3398 3399 3400 3401 3402
#ifdef CONFIG_NVME_MULTIPATH
	size += num_possible_nodes() * sizeof(struct nvme_ns *);
#endif

	head = kzalloc(size, GFP_KERNEL);
3403 3404
	if (!head)
		goto out;
3405
	ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3406 3407 3408 3409
	if (ret < 0)
		goto out_free_head;
	head->instance = ret;
	INIT_LIST_HEAD(&head->list);
3410 3411 3412
	ret = init_srcu_struct(&head->srcu);
	if (ret)
		goto out_ida_remove;
3413
	head->subsys = ctrl->subsys;
3414 3415 3416
	head->ns_id = info->nsid;
	head->ids = info->ids;
	head->shared = info->is_shared;
3417 3418
	kref_init(&head->ref);

3419 3420 3421 3422 3423 3424 3425
	if (head->ids.csi) {
		ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
		if (ret)
			goto out_cleanup_srcu;
	} else
		head->effects = ctrl->effects;

3426 3427 3428 3429
	ret = nvme_mpath_alloc_disk(ctrl, head);
	if (ret)
		goto out_cleanup_srcu;

3430
	list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3431 3432 3433

	kref_get(&ctrl->subsys->ref);

3434 3435 3436
	return head;
out_cleanup_srcu:
	cleanup_srcu_struct(&head->srcu);
3437
out_ida_remove:
3438
	ida_free(&ctrl->subsys->ns_ida, head->instance);
3439 3440 3441
out_free_head:
	kfree(head);
out:
3442 3443
	if (ret > 0)
		ret = blk_status_to_errno(nvme_error_status(ret));
3444 3445 3446
	return ERR_PTR(ret);
}

3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472
static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
		struct nvme_ns_ids *ids)
{
	struct nvme_subsystem *s;
	int ret = 0;

	/*
	 * Note that this check is racy as we try to avoid holding the global
	 * lock over the whole ns_head creation.  But it is only intended as
	 * a sanity check anyway.
	 */
	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(s, &nvme_subsystems, entry) {
		if (s == this)
			continue;
		mutex_lock(&s->lock);
		ret = nvme_subsys_check_duplicate_ids(s, ids);
		mutex_unlock(&s->lock);
		if (ret)
			break;
	}
	mutex_unlock(&nvme_subsystems_lock);

	return ret;
}

3473
static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
3474 3475 3476
{
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_ns_head *head = NULL;
3477 3478
	int ret;

3479
	ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
3480
	if (ret) {
3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496
		/*
		 * We've found two different namespaces on two different
		 * subsystems that report the same ID.  This is pretty nasty
		 * for anything that actually requires unique device
		 * identification.  In the kernel we need this for multipathing,
		 * and in user space the /dev/disk/by-id/ links rely on it.
		 *
		 * If the device also claims to be multi-path capable back off
		 * here now and refuse the probe the second device as this is a
		 * recipe for data corruption.  If not this is probably a
		 * cheap consumer device if on the PCIe bus, so let the user
		 * proceed and use the shiny toy, but warn that with changing
		 * probing order (which due to our async probing could just be
		 * device taking longer to startup) the other device could show
		 * up at any time.
		 */
3497
		nvme_print_device_info(ctrl);
3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514
		if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
		    ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
		     info->is_shared)) {
			dev_err(ctrl->device,
				"ignoring nsid %d because of duplicate IDs\n",
				info->nsid);
			return ret;
		}

		dev_err(ctrl->device,
			"clearing duplicate IDs for nsid %d\n", info->nsid);
		dev_err(ctrl->device,
			"use of /dev/disk/by-id/ may cause data corruption\n");
		memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
		memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
		memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
		ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
3515
	}
3516 3517

	mutex_lock(&ctrl->subsys->lock);
3518
	head = nvme_find_ns_head(ctrl, info->nsid);
3519
	if (!head) {
3520
		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
3521 3522
		if (ret) {
			dev_err(ctrl->device,
3523
				"duplicate IDs in subsystem for nsid %d\n",
3524
				info->nsid);
3525 3526
			goto out_unlock;
		}
3527
		head = nvme_alloc_ns_head(ctrl, info);
3528 3529 3530 3531 3532
		if (IS_ERR(head)) {
			ret = PTR_ERR(head);
			goto out_unlock;
		}
	} else {
3533
		ret = -EINVAL;
3534
		if (!info->is_shared || !head->shared) {
3535
			dev_err(ctrl->device,
3536 3537
				"Duplicate unshared namespace %d\n",
				info->nsid);
3538
			goto out_put_ns_head;
3539
		}
3540
		if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
3541 3542
			dev_err(ctrl->device,
				"IDs don't match for shared namespace %d\n",
3543
					info->nsid);
3544
			goto out_put_ns_head;
3545
		}
3546

3547
		if (!multipath) {
3548 3549
			dev_warn(ctrl->device,
				"Found shared namespace %d, but multipathing not supported.\n",
3550
				info->nsid);
3551 3552 3553
			dev_warn_once(ctrl->device,
				"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
		}
3554 3555
	}

3556
	list_add_tail_rcu(&ns->siblings, &head->list);
3557
	ns->head = head;
3558 3559
	mutex_unlock(&ctrl->subsys->lock);
	return 0;
3560

3561 3562
out_put_ns_head:
	nvme_put_ns_head(head);
3563 3564 3565 3566 3567
out_unlock:
	mutex_unlock(&ctrl->subsys->lock);
	return ret;
}

struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns, *ret = NULL;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id == nsid) {
			if (!nvme_get_ns(ns))
				continue;
			ret = ns;
			break;
		}
		if (ns->head->ns_id > nsid)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
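
/*
 * Note for callers: nvme_find_get_ns() returns the namespace with an elevated
 * reference count, which must be dropped with nvme_put_ns() once the caller
 * is done, as nvme_ns_remove_by_nsid() and nvme_scan_ns() below do:
 *
 *	ns = nvme_find_get_ns(ctrl, nsid);
 *	if (ns) {
 *		... use ns ...
 *		nvme_put_ns(ns);
 *	}
 */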

/*
 * Add the namespace to the controller list while keeping the list ordered.
 */
static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
{
	struct nvme_ns *tmp;

	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
		if (tmp->head->ns_id < ns->head->ns_id) {
			list_add(&ns->list, &tmp->list);
			return;
		}
	}
	list_add(&ns->list, &ns->ctrl->namespaces);
}

static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int node = ctrl->numa_node;

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	disk = blk_mq_alloc_disk(ctrl->tagset, ns);
	if (IS_ERR(disk))
		goto out_free_ns;
	disk->fops = &nvme_bdev_ops;
	disk->private_data = ns;

	ns->disk = disk;
	ns->queue = disk->queue;

	if (ctrl->opts && ctrl->opts->data_digest)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
	if (ctrl->ops->supports_pci_p2pdma &&
	    ctrl->ops->supports_pci_p2pdma(ctrl))
		blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);

	ns->ctrl = ctrl;
	kref_init(&ns->kref);

	if (nvme_init_ns_head(ns, info))
		goto out_cleanup_disk;

	/*
	 * If multipathing is enabled, the device name for all disks, and not
	 * just those that represent shared namespaces, needs to be based on
	 * the subsystem instance.  Using the controller instance for private
	 * namespaces could lead to naming collisions between shared and
	 * private namespaces if they don't use a common numbering scheme.
	 *
	 * If multipathing is not enabled, disk names must use the controller
	 * instance as shared namespaces will show up as multiple block
	 * devices.
	 */
	if (nvme_ns_head_multipath(ns->head)) {
		sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
			ctrl->instance, ns->head->instance);
		disk->flags |= GENHD_FL_HIDDEN;
	} else if (multipath) {
		sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
			ns->head->instance);
	} else {
		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
			ns->head->instance);
	}
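	/*
	 * For example, with subsystem instance 0, controller instance 1 and
	 * namespace head instance 2, the three branches above produce the
	 * hidden per-path node nvme0c1n2, the multipath node nvme0n2, and
	 * the plain per-controller node nvme1n2, respectively.
	 */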

	if (nvme_update_ns_info(ns, info))
		goto out_unlink_ns;

	down_write(&ctrl->namespaces_rwsem);
	nvme_ns_add_to_ctrl_list(ns);
	up_write(&ctrl->namespaces_rwsem);
	nvme_get_ctrl(ctrl);

	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
		goto out_cleanup_ns_from_list;

	if (!nvme_ns_head_multipath(ns->head))
		nvme_add_ns_cdev(ns);

	nvme_mpath_add_disk(ns, info->anagrpid);
	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);

	return;

 out_cleanup_ns_from_list:
	nvme_put_ctrl(ctrl);
	down_write(&ctrl->namespaces_rwsem);
	list_del_init(&ns->list);
	up_write(&ctrl->namespaces_rwsem);
 out_unlink_ns:
	mutex_lock(&ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list))
		list_del_init(&ns->head->entry);
	mutex_unlock(&ctrl->subsys->lock);
	nvme_put_ns_head(ns->head);
 out_cleanup_disk:
	put_disk(disk);
 out_free_ns:
	kfree(ns);
}

static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool last_path = false;

	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
		return;

	clear_bit(NVME_NS_READY, &ns->flags);
	set_capacity(ns->disk, 0);
	nvme_fault_inject_fini(&ns->fault_inject);

	/*
	 * Ensure that !NVME_NS_READY is seen by other threads to prevent
	 * this ns from going back into current_path.
	 */
	synchronize_srcu(&ns->head->srcu);

	/* wait for concurrent submissions */
	if (nvme_mpath_clear_current_path(ns))
		synchronize_srcu(&ns->head->srcu);

	mutex_lock(&ns->ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list)) {
		list_del_init(&ns->head->entry);
		last_path = true;
	}
	mutex_unlock(&ns->ctrl->subsys->lock);

	/* guarantee that the ns is no longer visible in head->list */
	synchronize_srcu(&ns->head->srcu);

	if (!nvme_ns_head_multipath(ns->head))
		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
	del_gendisk(ns->disk);

	down_write(&ns->ctrl->namespaces_rwsem);
	list_del_init(&ns->list);
	up_write(&ns->ctrl->namespaces_rwsem);

	if (last_path)
		nvme_mpath_shutdown_disk(ns->head);
	nvme_put_ns(ns);
}

static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
{
	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);

	if (ns) {
		nvme_ns_remove(ns);
		nvme_put_ns(ns);
	}
}

static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;

	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
		dev_err(ns->ctrl->device,
			"identifiers changed for nsid %d\n", ns->head->ns_id);
		goto out;
	}

	ret = nvme_update_ns_info(ns, info);
out:
	/*
	 * Only remove the namespace if we got a fatal error back from the
	 * device, otherwise ignore the error and just move on.
	 *
	 * TODO: we should probably schedule a delayed retry here.
	 */
	if (ret > 0 && (ret & NVME_SC_DNR))
		nvme_ns_remove(ns);
}

static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns_info info = { .nsid = nsid };
	struct nvme_ns *ns;
	int ret;

	if (nvme_identify_ns_descs(ctrl, &info))
		return;

	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
		dev_warn(ctrl->device,
			"command set not reported for nsid: %d\n", nsid);
		return;
	}

	/*
	 * If available, try to use the Command Set Independent Identify
	 * Namespace data structure to find all the generic information that
	 * is needed to set up a namespace.  If not, fall back to the legacy
	 * version.
	 */
	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
	    (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
		ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
	else
		ret = nvme_ns_info_from_identify(ctrl, &info);

	if (info.is_removed)
		nvme_ns_remove_by_nsid(ctrl, nsid);

	/*
	 * Ignore the namespace if it is not ready. We will get an AEN once it
	 * becomes ready and restart the scan.
	 */
	if (ret || !info.is_ready)
		return;

	ns = nvme_find_get_ns(ctrl, nsid);
	if (ns) {
		nvme_validate_ns(ns, &info);
		nvme_put_ns(ns);
	} else {
		nvme_alloc_ns(ctrl, &info);
	}
}

static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					unsigned nsid)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(rm_list);

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->head->ns_id > nsid)
			list_move_tail(&ns->list, &rm_list);
	}
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &rm_list, list)
		nvme_ns_remove(ns);
}

static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
	__le32 *ns_list;
	u32 prev = 0;
	int ret = 0, i;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (;;) {
		struct nvme_command cmd = {
			.identify.opcode	= nvme_admin_identify,
			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
			.identify.nsid		= cpu_to_le32(prev),
		};

		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
					    NVME_IDENTIFY_DATA_SIZE);
		if (ret) {
			dev_warn(ctrl->device,
				"Identify NS List failed (status=0x%x)\n", ret);
			goto free;
		}

		for (i = 0; i < nr_entries; i++) {
			u32 nsid = le32_to_cpu(ns_list[i]);

			if (!nsid)	/* end of the list? */
				goto out;
			nvme_scan_ns(ctrl, nsid);
			while (++prev < nsid)
				nvme_ns_remove_by_nsid(ctrl, prev);
		}
	}
 out:
	nvme_remove_invalid_namespaces(ctrl, prev);
 free:
	kfree(ns_list);
	return ret;
}
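
/*
 * Worked example: if the controller reports active NSIDs {1, 3}, the loop
 * above scans nsid 1, then scans nsid 3 and removes the now-inactive nsid 2
 * while catching "prev" up, and the final nvme_remove_invalid_namespaces()
 * call prunes any leftover namespaces above the last active NSID.
 */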

static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u32 nn, i;

	if (nvme_identify_ctrl(ctrl, &id))
		return;
	nn = le32_to_cpu(id->nn);
	kfree(id);

	for (i = 1; i <= nn; i++)
		nvme_scan_ns(ctrl, i);

	nvme_remove_invalid_namespaces(ctrl, nn);
}

static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
{
	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
	__le32 *log;
	int error;

	log = kzalloc(log_size, GFP_KERNEL);
	if (!log)
		return;

	/*
	 * We need to read the log to clear the AEN, but we don't want to rely
	 * on it for the changed namespace information as userspace could have
	 * raced with us in reading the log page, which could cause us to miss
	 * updates.
	 */
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
			NVME_CSI_NVM, log, log_size, 0);
	if (error)
		dev_warn(ctrl->device,
			"reading changed ns log failed: %d\n", error);

	kfree(log);
}

static void nvme_scan_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, scan_work);
	int ret;

	/* No tagset on a live ctrl means IO queues could not be created */
	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
		return;

	/*
	 * Identify controller limits can change at controller reset due to
	 * new firmware download, and even though that is not common we cannot
	 * ignore such a scenario. Controller's non-mdts limits are reported in
	 * the unit of logical blocks that is dependent on the format of the
	 * attached namespace. Hence re-read the limits at the time of ns
	 * allocation.
	 */
	ret = nvme_init_non_mdts_limits(ctrl);
	if (ret < 0) {
		dev_warn(ctrl->device,
			"reading non-mdts-limits failed: %d\n", ret);
		return;
	}

	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
		dev_info(ctrl->device, "rescanning namespaces.\n");
		nvme_clear_changed_ns_log(ctrl);
	}

	mutex_lock(&ctrl->scan_lock);
	if (nvme_ctrl_limited_cns(ctrl)) {
		nvme_scan_ns_sequential(ctrl);
	} else {
		/*
		 * Fall back to sequential scan if DNR is set to handle broken
		 * devices which should support Identify NS List (as per the VS
		 * they report) but don't actually support it.
		 */
		ret = nvme_scan_ns_list(ctrl);
		if (ret > 0 && ret & NVME_SC_DNR)
			nvme_scan_ns_sequential(ctrl);
	}
	mutex_unlock(&ctrl->scan_lock);
}

/*
 * This function iterates the namespace list unlocked to allow recovery from
 * controller failure. It is up to the caller to ensure the namespace list is
 * not modified by scan work while this function is executing.
 */
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(ns_list);

	/*
	 * Make sure to requeue I/O to all namespaces, as these requests
	 * might result from the scan itself and must complete for the
	 * scan_work to make progress.
	 */
	nvme_mpath_clear_ctrl_paths(ctrl);

	/*
	 * Unquiesce io queues so any pending IO won't hang, especially
	 * those submitted from scan work
	 */
	nvme_unquiesce_io_queues(ctrl);

	/* prevent racing with ns scanning */
	flush_work(&ctrl->scan_work);

	/*
	 * The dead state indicates the controller was not gracefully
	 * disconnected. In that case, we won't be able to flush any data while
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (ctrl->state == NVME_CTRL_DEAD)
		nvme_mark_namespaces_dead(ctrl);

	/* this is a no-op when called from the controller reset handler */
	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);

	down_write(&ctrl->namespaces_rwsem);
	list_splice_init(&ctrl->namespaces, &ns_list);
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);

static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
	const struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
	if (ret)
		return ret;

	if (opts) {
		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
				opts->trsvcid ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
				opts->host_traddr ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
				opts->host_iface ?: "none");
	}
	return ret;
}
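
/*
 * For a fabrics controller the resulting uevent environment looks roughly
 * like the following (the values are illustrative only):
 *
 *	NVME_TRTYPE=tcp
 *	NVME_TRADDR=192.168.1.10
 *	NVME_TRSVCID=4420
 *	NVME_HOST_TRADDR=none
 *	NVME_HOST_IFACE=none
 */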

static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
{
	char *envp[2] = { envdata, NULL };

	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
}

static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
	char *envp[2] = { NULL, NULL };
	u32 aen_result = ctrl->aen_result;

	ctrl->aen_result = 0;
	if (!aen_result)
		return;

	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
	if (!envp[0])
		return;
	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
	kfree(envp[0]);
}

static void nvme_async_event_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, async_event_work);

	nvme_aen_uevent(ctrl);

	/*
	 * The transport drivers must guarantee AER submission here is safe by
	 * flushing ctrl async_event_work after changing the controller state
	 * from LIVE and before freeing the admin queue.
	 */
	if (ctrl->state == NVME_CTRL_LIVE)
		ctrl->ops->submit_async_event(ctrl);
}

static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
{
	u32 csts;

	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
		return false;

	if (csts == ~0)
		return false;

	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
}
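
/*
 * CSTS.PP is the Processing Paused bit, which the controller sets while it
 * pauses command processing to activate new firmware; nvme_fw_act_work()
 * below polls this helper until the bit clears or a timeout expires.
 */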

static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
	struct nvme_fw_slot_info_log *log;

	log = kmalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return;

	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
			 log, sizeof(*log), 0)) {
		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
		goto out_free_log;
	}

	if (log->afi & 0x70 || !(log->afi & 0x7)) {
		dev_info(ctrl->device,
			 "Firmware is activated after next Controller Level Reset\n");
		goto out_free_log;
	}

	memcpy(ctrl->subsys->firmware_rev, &log->frs[(log->afi & 0x7) - 1],
		sizeof(ctrl->subsys->firmware_rev));

out_free_log:
	kfree(log);
}

static void nvme_fw_act_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work,
				struct nvme_ctrl, fw_act_work);
	unsigned long fw_act_timeout;

	if (ctrl->mtfa)
		fw_act_timeout = jiffies +
				msecs_to_jiffies(ctrl->mtfa * 100);
	else
		fw_act_timeout = jiffies +
				msecs_to_jiffies(admin_timeout * 1000);

	nvme_quiesce_io_queues(ctrl);
	while (nvme_ctrl_pp_status(ctrl)) {
		if (time_after(jiffies, fw_act_timeout)) {
			dev_warn(ctrl->device,
				"Fw activation timeout, reset controller\n");
			nvme_try_sched_reset(ctrl);
			return;
		}
		msleep(100);
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
		return;

	nvme_unquiesce_io_queues(ctrl);
	/* read FW slot information to clear the AER */
	nvme_get_fw_slot_info(ctrl);

	queue_work(nvme_wq, &ctrl->async_event_work);
}

static u32 nvme_aer_type(u32 result)
{
	return result & 0x7;
}

static u32 nvme_aer_subtype(u32 result)
{
	return (result & 0xff00) >> 8;
}
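
/*
 * The AER completion result dword encodes the event type in bits 2:0 and the
 * event information (subtype) in bits 15:8; the two helpers above extract
 * those fields.
 */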

static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
	u32 aer_notice_type = nvme_aer_subtype(result);
	bool requeue = true;

	switch (aer_notice_type) {
	case NVME_AER_NOTICE_NS_CHANGED:
		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
		nvme_queue_scan(ctrl);
		break;
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		/*
		 * We are (ab)using the RESETTING state to prevent subsequent
		 * recovery actions from interfering with the controller's
		 * firmware activation.
		 */
		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
			nvme_auth_stop(ctrl);
			requeue = false;
			queue_work(nvme_wq, &ctrl->fw_act_work);
		}
		break;
#ifdef CONFIG_NVME_MULTIPATH
	case NVME_AER_NOTICE_ANA:
		if (!ctrl->ana_log_buf)
			break;
		queue_work(nvme_wq, &ctrl->ana_work);
		break;
#endif
	case NVME_AER_NOTICE_DISC_CHANGED:
		ctrl->aen_result = result;
		break;
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
	return requeue;
}

static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
	dev_warn(ctrl->device, "resetting controller due to AER\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		volatile union nvme_result *res)
{
	u32 result = le32_to_cpu(res->u32);
	u32 aer_type = nvme_aer_type(result);
	u32 aer_subtype = nvme_aer_subtype(result);
	bool requeue = true;

	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
		return;

	trace_nvme_async_event(ctrl, result);
	switch (aer_type) {
	case NVME_AER_NOTICE:
		requeue = nvme_handle_aen_notice(ctrl, result);
		break;
	case NVME_AER_ERROR:
		/*
		 * For a persistent internal error, don't run async_event_work
		 * to submit a new AER. The controller reset will do it.
		 */
		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
			nvme_handle_aer_persistent_error(ctrl);
			return;
		}
		fallthrough;
	case NVME_AER_SMART:
	case NVME_AER_CSS:
	case NVME_AER_VS:
		ctrl->aen_result = result;
		break;
	default:
		break;
	}

	if (requeue)
		queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
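
/*
 * Transport drivers are expected to call nvme_complete_async_event() from
 * their completion handling when an AER command completes; depending on the
 * event it schedules scan, firmware-activation or ANA work and, unless the
 * notice handler said otherwise, requeues a fresh AER via async_event_work.
 */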

int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int cmd_size)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
	if (ctrl->ops->flags & NVME_F_FABRICS)
		set->reserved_tags = NVMF_RESERVED_TAGS;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_NO_SCHED;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = 1;
	set->timeout = NVME_ADMIN_TIMEOUT;
	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	ctrl->admin_q = blk_mq_init_queue(set);
	if (IS_ERR(ctrl->admin_q)) {
		ret = PTR_ERR(ctrl->admin_q);
		goto out_free_tagset;
	}

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->fabrics_q = blk_mq_init_queue(set);
		if (IS_ERR(ctrl->fabrics_q)) {
			ret = PTR_ERR(ctrl->fabrics_q);
			goto out_cleanup_admin_q;
		}
	}

	ctrl->admin_tagset = set;
	return 0;

out_cleanup_admin_q:
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
out_free_tagset:
	blk_mq_free_tag_set(set);
	ctrl->admin_q = NULL;
	ctrl->fabrics_q = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
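
/*
 * Sketch of the expected pairing in a fabrics transport (my_dev,
 * my_admin_mq_ops and struct my_request are placeholders for the transport's
 * private data, blk_mq_ops and per-command context, not names used elsewhere
 * in the tree):
 *
 *	ret = nvme_alloc_admin_tag_set(ctrl, &my_dev->admin_tag_set,
 *				       &my_admin_mq_ops,
 *				       sizeof(struct my_request));
 *	if (ret)
 *		return ret;
 *	...
 *	nvme_remove_admin_tag_set(ctrl);
 */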

void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->fabrics_q);
		blk_put_queue(ctrl->fabrics_q);
	}
	blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);

int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int nr_maps,
		unsigned int cmd_size)
{
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
	/*
	 * Some Apple controllers require tags to be unique across admin and
	 * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
	 */
	if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
		set->reserved_tags = NVME_AQ_DEPTH;
	else if (ctrl->ops->flags & NVME_F_FABRICS)
		set->reserved_tags = NVMF_RESERVED_TAGS;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = ctrl->queue_count - 1;
	set->timeout = NVME_IO_TIMEOUT;
	set->nr_maps = nr_maps;
	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->connect_q = blk_mq_init_queue(set);
		if (IS_ERR(ctrl->connect_q)) {
			ret = PTR_ERR(ctrl->connect_q);
			goto out_free_tag_set;
		}
		blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
				   ctrl->connect_q);
	}

	ctrl->tagset = set;
	return 0;

out_free_tag_set:
	blk_mq_free_tag_set(set);
	ctrl->connect_q = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);

void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->connect_q);
		blk_put_queue(ctrl->connect_q);
	}
	blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);

void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_mpath_stop(ctrl);
	nvme_auth_stop(ctrl);
	nvme_stop_keep_alive(ctrl);
	nvme_stop_failfast_work(ctrl);
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fw_act_work);
	if (ctrl->ops->stop_ctrl)
		ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
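
/*
 * nvme_stop_ctrl() and nvme_start_ctrl() are expected to be paired by the
 * transports: stop is called when a controller is being torn down or reset,
 * start once the controller is (re)established and ready for I/O.
 */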

void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_enable_aen(ctrl);

	/*
	 * Persistent discovery controllers need to send an indication to
	 * userspace to re-read the discovery log page and learn about possible
	 * changes that were missed. We identify persistent discovery
	 * controllers by checking that they started once before, hence they
	 * are reconnecting back.
	 */
	if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
	    nvme_discovery_ctrl(ctrl))
		nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");

	if (ctrl->queue_count > 1) {
		nvme_queue_scan(ctrl);
		nvme_unquiesce_io_queues(ctrl);
		nvme_mpath_update(ctrl);
	}

	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
	set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);

void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_hwmon_exit(ctrl);
	nvme_fault_inject_fini(&ctrl->fault_inject);
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
	nvme_put_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);

static void nvme_free_cels(struct nvme_ctrl *ctrl)
{
	struct nvme_effects_log	*cel;
	unsigned long i;

	xa_for_each(&ctrl->cels, i, cel) {
		xa_erase(&ctrl->cels, i);
		kfree(cel);
	}

	xa_destroy(&ctrl->cels);
}

static void nvme_free_ctrl(struct device *dev)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvme_subsystem *subsys = ctrl->subsys;

	if (!subsys || ctrl->instance != subsys->instance)
		ida_free(&nvme_instance_ida, ctrl->instance);
	key_put(ctrl->tls_key);
	nvme_free_cels(ctrl);
	nvme_mpath_uninit(ctrl);
	nvme_auth_stop(ctrl);
	nvme_auth_free(ctrl);
	__free_page(ctrl->discard_page);
	free_opal_dev(ctrl->opal_dev);

	if (subsys) {
		mutex_lock(&nvme_subsystems_lock);
		list_del(&ctrl->subsys_entry);
		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
		mutex_unlock(&nvme_subsystems_lock);
	}

	ctrl->ops->free_ctrl(ctrl);

	if (subsys)
		nvme_put_subsystem(subsys);
}

/*
 * Initialize an NVMe controller structure.  This needs to be called during
 * earliest initialization so that we have the initialized structure around
 * during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	ctrl->state = NVME_CTRL_NEW;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	init_rwsem(&ctrl->namespaces_rwsem);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	ctrl->numa_node = NUMA_NO_NODE;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;

	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
			PAGE_SIZE);
	ctrl->discard_page = alloc_page(GFP_KERNEL);
	if (!ctrl->discard_page) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
	if (ret < 0)
		goto out;
	ctrl->instance = ret;

	device_initialize(&ctrl->ctrl_device);
	ctrl->device = &ctrl->ctrl_device;
	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
			ctrl->instance);
	ctrl->device->class = nvme_class;
	ctrl->device->parent = ctrl->dev;
	if (ops->dev_attr_groups)
		ctrl->device->groups = ops->dev_attr_groups;
	else
		ctrl->device->groups = nvme_dev_attr_groups;
	ctrl->device->release = nvme_free_ctrl;
	dev_set_drvdata(ctrl->device, ctrl);
	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
	if (ret)
		goto out_release_instance;

	nvme_get_ctrl(ctrl);
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		goto out_free_name;

	/*
	 * Initialize latency tolerance controls.  The sysfs files won't
	 * be visible to userspace unless the device actually supports APST.
	 */
	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
		min(default_ps_max_latency_us, (unsigned long)S32_MAX));

	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
	nvme_mpath_init_ctrl(ctrl);
	ret = nvme_auth_init_ctrl(ctrl);
	if (ret)
		goto out_free_cdev;

	return 0;
out_free_cdev:
	nvme_fault_inject_fini(&ctrl->fault_inject);
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
	nvme_put_ctrl(ctrl);
	kfree_const(ctrl->device->kobj.name);
out_release_instance:
	ida_free(&nvme_instance_ida, ctrl->instance);
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
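
/*
 * Minimal probe-side sketch (dev, parent_dev and my_ctrl_ops are placeholders
 * for the transport's private data, parent struct device and nvme_ctrl_ops):
 * nvme_init_ctrl() is called first, and teardown later undoes it via
 * nvme_uninit_ctrl() and a final nvme_put_ctrl().
 *
 *	ret = nvme_init_ctrl(&dev->ctrl, parent_dev, &my_ctrl_ops, quirks);
 *	if (ret)
 *		return ret;
 */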

/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mark_disk_dead(ns->disk);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unfreeze_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);

int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
		if (timeout <= 0)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);

void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_freeze_queue_wait(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);

void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_freeze_queue_start(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
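
/*
 * The freeze helpers above are meant to be combined as: nvme_start_freeze()
 * begins draining all namespace queues, nvme_wait_freeze() or
 * nvme_wait_freeze_timeout() waits for outstanding requests to complete, the
 * caller then performs its update (e.g. changing queue counts), and
 * nvme_unfreeze() lets I/O resume.
 */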

void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
	if (!ctrl->tagset)
		return;
	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_quiesce_tagset(ctrl->tagset);
	else
		blk_mq_wait_quiesce_done(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);

void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
	if (!ctrl->tagset)
		return;
	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_unquiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);

void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
{
	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
		blk_mq_quiesce_queue(ctrl->admin_q);
	else
		blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
}
EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);

void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
{
	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
		blk_mq_unquiesce_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
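
/*
 * The NVME_CTRL_STOPPED and NVME_CTRL_ADMIN_Q_STOPPED flags used above make
 * the quiesce/unquiesce helpers idempotent: a repeated quiesce only waits for
 * the already-started quiesce to complete, and unquiesce does nothing unless
 * a matching quiesce was recorded.
 */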

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_sync_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);

void nvme_sync_queues(struct nvme_ctrl *ctrl)
{
	nvme_sync_io_queues(ctrl);
	if (ctrl->admin_q)
		blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);

struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
{
	if (file->f_op != &nvme_dev_fops)
		return NULL;
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);

/*
 * Check we didn't inadvertently grow the command structure sizes:
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
			NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
}


static int __init nvme_core_init(void)
{
	int result = -ENOMEM;

	_nvme_check_size();

	nvme_wq = alloc_workqueue("nvme-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_wq)
		goto out;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
			NVME_MINORS, "nvme");
	if (result < 0)
		goto destroy_delete_wq;

	nvme_class = class_create("nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}
	nvme_class->dev_uevent = nvme_class_uevent;

	nvme_subsys_class = class_create("nvme-subsystem");
	if (IS_ERR(nvme_subsys_class)) {
		result = PTR_ERR(nvme_subsys_class);
		goto destroy_class;
	}

	result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
				     "nvme-generic");
	if (result < 0)
		goto destroy_subsys_class;

	nvme_ns_chr_class = class_create("nvme-generic");
	if (IS_ERR(nvme_ns_chr_class)) {
		result = PTR_ERR(nvme_ns_chr_class);
		goto unregister_generic_ns;
	}
	result = nvme_init_auth();
	if (result)
		goto destroy_ns_chr;
	return 0;

destroy_ns_chr:
	class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
	class_destroy(nvme_subsys_class);
destroy_class:
	class_destroy(nvme_class);
unregister_chrdev:
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
	destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
out:
	return result;
}

static void __exit nvme_core_exit(void)
{
	nvme_exit_auth();
	class_destroy(nvme_ns_chr_class);
	class_destroy(nvme_subsys_class);
	class_destroy(nvme_class);
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);
	ida_destroy(&nvme_ns_chr_minor_ida);
	ida_destroy(&nvme_instance_ida);
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);